diff --git a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp index d3e74bc530..dfa7dc756e 100644 --- a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp @@ -953,8 +953,8 @@ struct IlutWrap { const index_t l_nnz = L_new_values.extent(0); const index_t u_nnz = U_new_values.extent(0); - const auto l_filter_rank = std::max(0, l_nnz - l_nnz_limit - 1); - const auto u_filter_rank = std::max(0, u_nnz - u_nnz_limit - 1); + const auto l_filter_rank = std::max(index_t(0), l_nnz - l_nnz_limit - 1); + const auto u_filter_rank = std::max(index_t(0), u_nnz - u_nnz_limit - 1); const auto l_threshold = threshold_select(L_new_values, l_filter_rank, V_copy); diff --git a/sparse/impl/KokkosSparse_spgemm_jacobi_spec.hpp b/sparse/impl/KokkosSparse_spgemm_jacobi_spec.hpp index d95425c352..26ebb07e3d 100644 --- a/sparse/impl/KokkosSparse_spgemm_jacobi_spec.hpp +++ b/sparse/impl/KokkosSparse_spgemm_jacobi_spec.hpp @@ -57,6 +57,10 @@ #include "KokkosSparse_spgemm_jacobi_seq_impl.hpp" #endif +#if defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) +#include "KokkosSparse_spgemm_rocSPARSE_impl.hpp" +#endif + namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists @@ -194,6 +198,20 @@ struct SPGEMM_JACOBIget_algorithm_type() == SPGEMM_ROCSPARSE) { +#if defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) + jacobi_spgemm_numeric_rocsparse( + sh, m, n, k, + row_mapA, entriesA, valuesA, transposeA, + row_mapB, entriesB, valuesB, transposeB, + omega, dinv, + row_mapC, entriesC, valuesC); +#else + throw std::runtime_error( + "Requiring (jacobi) SPGEMM_ROCSPARSE but TPL_ROCSPARSE was not enabled!"); +#endif } else { KokkosSPGEMM @@ -292,7 +310,6 @@ struct SPGEMM_JACOBI >, \ false, true>; -#include #include #endif diff --git a/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp b/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp index 0a09040559..8359039dd0 100644 --- a/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp +++ b/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp @@ -223,16 +223,6 @@ struct SPGEMM_NUMERIC< #else throw std::runtime_error( "Requiring SPGEMM_CUSPARSE but TPL_CUSPARSE was not enabled!"); -#endif - break; - case SPGEMM_ROCSPARSE: -#if defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) - rocsparse_spgemm_numeric( - sh, m, n, k, row_mapA, entriesA, valuesA, transposeA, row_mapB, - entriesB, valuesB, transposeB, row_mapC, entriesC, valuesC); -#else - throw std::runtime_error( - "Requiring SPGEMM_ROCSPARSE but TPL_ROCSPARSE was not enabled!"); #endif break; case SPGEMM_CUSP: @@ -243,6 +233,18 @@ struct SPGEMM_NUMERIC< transposeA, row_mapB, entriesB, valuesB, transposeB, row_mapC, entriesC, valuesC); break; + case SPGEMM_ROCSPARSE: +#if defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) + spgemm_numeric_rocsparse( + sh, m, n, k, + row_mapA, entriesA, valuesA, transposeA, + row_mapB, entriesB, valuesB, transposeB, + row_mapC, entriesC, valuesC); +#else + throw std::runtime_error("Requested rocSPARSE for SpGEMM, but it is not available. Please recompile with rocsparse enabled."); +#endif + break; + case SPGEMM_MKL: #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL mkl_numeric(sh, m, n, k, row_mapA, entriesA, valuesA, transposeA, diff --git a/sparse/impl/KokkosSparse_spgemm_rocSPARSE_impl.hpp b/sparse/impl/KokkosSparse_spgemm_rocSPARSE_impl.hpp index f2b4f0e38f..b7dd33b34f 100644 --- a/sparse/impl/KokkosSparse_spgemm_rocSPARSE_impl.hpp +++ b/sparse/impl/KokkosSparse_spgemm_rocSPARSE_impl.hpp @@ -50,7 +50,7 @@ #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE #include "KokkosKernels_Controls.hpp" -#include "rocsparse.h" +#include "KokkosSparse_Utils_rocsparse.hpp" namespace KokkosSparse { @@ -58,378 +58,773 @@ namespace Impl { //============================================================================= // Overload rocsparse_Xcsrgemm_buffer_size() over scalar types -inline rocsparse_status rocsparse_Xcsrgemm_buffer_size( - rocsparse_handle handle, rocsparse_operation trans_A, - rocsparse_operation trans_B, rocsparse_int m, rocsparse_int n, - rocsparse_int k, const float *alpha, const rocsparse_mat_descr descr_A, - rocsparse_int nnz_A, const rocsparse_int *csr_row_ptr_A, - const rocsparse_int *csr_col_ind_A, const rocsparse_mat_descr descr_B, - rocsparse_int nnz_B, const rocsparse_int *csr_row_ptr_B, - const rocsparse_int *csr_col_ind_B, const float *beta, - const rocsparse_mat_descr descr_D, rocsparse_int nnz_D, - const rocsparse_int *csr_row_ptr_D, const rocsparse_int *csr_col_ind_D, - rocsparse_mat_info info_C, size_t *buffer_size) { - return rocsparse_scsrgemm_buffer_size( - handle, trans_A, trans_B, m, n, k, alpha, descr_A, nnz_A, csr_row_ptr_A, - csr_col_ind_A, descr_B, nnz_B, csr_row_ptr_B, csr_col_ind_B, beta, - descr_D, nnz_D, csr_row_ptr_D, csr_col_ind_D, info_C, buffer_size); -} - -inline rocsparse_status rocsparse_Xcsrgemm_buffer_size( - rocsparse_handle handle, rocsparse_operation trans_A, - rocsparse_operation trans_B, rocsparse_int m, rocsparse_int n, - rocsparse_int k, const double *alpha, const rocsparse_mat_descr descr_A, - rocsparse_int nnz_A, const rocsparse_int *csr_row_ptr_A, - const rocsparse_int *csr_col_ind_A, const rocsparse_mat_descr descr_B, - rocsparse_int nnz_B, const rocsparse_int *csr_row_ptr_B, - const rocsparse_int *csr_col_ind_B, const double *beta, - const rocsparse_mat_descr descr_D, rocsparse_int nnz_D, - const rocsparse_int *csr_row_ptr_D, const rocsparse_int *csr_col_ind_D, - rocsparse_mat_info info_C, size_t *buffer_size) { - return rocsparse_dcsrgemm_buffer_size( - handle, trans_A, trans_B, m, n, k, alpha, descr_A, nnz_A, csr_row_ptr_A, - csr_col_ind_A, descr_B, nnz_B, csr_row_ptr_B, csr_col_ind_B, beta, - descr_D, nnz_D, csr_row_ptr_D, csr_col_ind_D, info_C, buffer_size); -} - -inline rocsparse_status rocsparse_Xcsrgemm_buffer_size( - rocsparse_handle handle, rocsparse_operation trans_A, - rocsparse_operation trans_B, rocsparse_int m, rocsparse_int n, - rocsparse_int k, const Kokkos::complex *alpha, - const rocsparse_mat_descr descr_A, rocsparse_int nnz_A, - const rocsparse_int *csr_row_ptr_A, const rocsparse_int *csr_col_ind_A, - const rocsparse_mat_descr descr_B, rocsparse_int nnz_B, - const rocsparse_int *csr_row_ptr_B, const rocsparse_int *csr_col_ind_B, - const Kokkos::complex *beta, const rocsparse_mat_descr descr_D, - rocsparse_int nnz_D, const rocsparse_int *csr_row_ptr_D, - const rocsparse_int *csr_col_ind_D, rocsparse_mat_info info_C, - size_t *buffer_size) { - return rocsparse_ccsrgemm_buffer_size( - handle, trans_A, trans_B, m, n, k, - reinterpret_cast(alpha), descr_A, nnz_A, - csr_row_ptr_A, csr_col_ind_A, descr_B, nnz_B, csr_row_ptr_B, - csr_col_ind_B, reinterpret_cast(beta), - descr_D, nnz_D, csr_row_ptr_D, csr_col_ind_D, info_C, buffer_size); -} - -inline rocsparse_status rocsparse_Xcsrgemm_buffer_size( - rocsparse_handle handle, rocsparse_operation trans_A, - rocsparse_operation trans_B, rocsparse_int m, rocsparse_int n, - rocsparse_int k, const Kokkos::complex *alpha, - const rocsparse_mat_descr descr_A, rocsparse_int nnz_A, - const rocsparse_int *csr_row_ptr_A, const rocsparse_int *csr_col_ind_A, - const rocsparse_mat_descr descr_B, rocsparse_int nnz_B, - const rocsparse_int *csr_row_ptr_B, const rocsparse_int *csr_col_ind_B, - const Kokkos::complex *beta, const rocsparse_mat_descr descr_D, - rocsparse_int nnz_D, const rocsparse_int *csr_row_ptr_D, - const rocsparse_int *csr_col_ind_D, rocsparse_mat_info info_C, - size_t *buffer_size) { - return rocsparse_zcsrgemm_buffer_size( - handle, trans_A, trans_B, m, n, k, - reinterpret_cast(alpha), descr_A, nnz_A, - csr_row_ptr_A, csr_col_ind_A, descr_B, nnz_B, csr_row_ptr_B, - csr_col_ind_B, reinterpret_cast(beta), - descr_D, nnz_D, csr_row_ptr_D, csr_col_ind_D, info_C, buffer_size); -} +#define ROCSPARSE_XCSRGEMM_BUFFER_SIZE_SPEC(scalar_type, TOKEN) \ + inline rocsparse_status rocsparse_Xcsrgemm_buffer_size( \ + rocsparse_handle handle, rocsparse_operation trans_A, \ + rocsparse_operation trans_B, rocsparse_int m, rocsparse_int n, \ + rocsparse_int k, const scalar_type *alpha, \ + const rocsparse_mat_descr descr_A, rocsparse_int nnz_A, \ + const rocsparse_int *csr_row_ptr_A, const rocsparse_int *csr_col_ind_A, \ + const rocsparse_mat_descr descr_B, rocsparse_int nnz_B, \ + const rocsparse_int *csr_row_ptr_B, const rocsparse_int *csr_col_ind_B, \ + const scalar_type *beta, const rocsparse_mat_descr descr_D, \ + rocsparse_int nnz_D, const rocsparse_int *csr_row_ptr_D, \ + const rocsparse_int *csr_col_ind_D, rocsparse_mat_info info_C, \ + size_t *buffer_size) { \ + return rocsparse_##TOKEN##csrgemm_buffer_size( \ + handle, trans_A, trans_B, m, n, k, alpha, descr_A, nnz_A, \ + csr_row_ptr_A, csr_col_ind_A, descr_B, nnz_B, csr_row_ptr_B, \ + csr_col_ind_B, beta, descr_D, nnz_D, csr_row_ptr_D, csr_col_ind_D, \ + info_C, buffer_size); \ + } +ROCSPARSE_XCSRGEMM_BUFFER_SIZE_SPEC(float, s) +ROCSPARSE_XCSRGEMM_BUFFER_SIZE_SPEC(double, d) +ROCSPARSE_XCSRGEMM_BUFFER_SIZE_SPEC(rocsparse_float_complex, c) +ROCSPARSE_XCSRGEMM_BUFFER_SIZE_SPEC(rocsparse_double_complex, z) //============================================================================= // Overload rocsparse_Xcsrgemm_numeric() over scalar types -inline rocsparse_status rocsparse_Xcsrgemm_numeric( - rocsparse_handle handle, rocsparse_operation trans_A, - rocsparse_operation trans_B, rocsparse_int m, rocsparse_int n, - rocsparse_int k, const float *alpha, const rocsparse_mat_descr descr_A, - rocsparse_int nnz_A, const float *csr_val_A, - const rocsparse_int *csr_row_ptr_A, const rocsparse_int *csr_col_ind_A, - const rocsparse_mat_descr descr_B, rocsparse_int nnz_B, - const float *csr_val_B, const rocsparse_int *csr_row_ptr_B, - const rocsparse_int *csr_col_ind_B, const float *beta, - const rocsparse_mat_descr descr_D, rocsparse_int nnz_D, - const float *csr_val_D, const rocsparse_int *csr_row_ptr_D, - const rocsparse_int *csr_col_ind_D, const rocsparse_mat_descr descr_C, - rocsparse_int nnz_C, float *csr_val_C, const rocsparse_int *csr_row_ptr_C, - const rocsparse_int *csr_col_ind_C, const rocsparse_mat_info info_C, - void *buffer) { - return rocsparse_scsrgemm_numeric( - handle, trans_A, trans_B, m, n, k, alpha, descr_A, nnz_A, csr_val_A, - csr_row_ptr_A, csr_col_ind_A, descr_B, nnz_B, csr_val_B, csr_row_ptr_B, - csr_col_ind_B, beta, descr_D, nnz_D, csr_val_D, csr_row_ptr_D, - csr_col_ind_D, descr_C, nnz_C, csr_val_C, csr_row_ptr_C, csr_col_ind_C, - info_C, buffer); -} +#define ROCSPARSE_XCSRGEMM_NUMERIC_SPEC(scalar_type, TOKEN) \ + inline rocsparse_status rocsparse_Xcsrgemm_numeric( \ + rocsparse_handle handle, rocsparse_operation trans_A, \ + rocsparse_operation trans_B, rocsparse_int m, rocsparse_int n, \ + rocsparse_int k, const scalar_type *alpha, \ + const rocsparse_mat_descr descr_A, rocsparse_int nnz_A, \ + const scalar_type *csr_val_A, const rocsparse_int *csr_row_ptr_A, \ + const rocsparse_int *csr_col_ind_A, const rocsparse_mat_descr descr_B, \ + rocsparse_int nnz_B, const scalar_type *csr_val_B, \ + const rocsparse_int *csr_row_ptr_B, const rocsparse_int *csr_col_ind_B, \ + const scalar_type *beta, const rocsparse_mat_descr descr_D, \ + rocsparse_int nnz_D, const scalar_type *csr_val_D, \ + const rocsparse_int *csr_row_ptr_D, const rocsparse_int *csr_col_ind_D, \ + const rocsparse_mat_descr descr_C, rocsparse_int nnz_C, \ + scalar_type *csr_val_C, const rocsparse_int *csr_row_ptr_C, \ + const rocsparse_int *csr_col_ind_C, const rocsparse_mat_info info_C, \ + void *buffer) { \ + return rocsparse_##TOKEN##csrgemm_numeric( \ + handle, trans_A, trans_B, m, n, k, alpha, descr_A, nnz_A, csr_val_A, \ + csr_row_ptr_A, csr_col_ind_A, descr_B, nnz_B, csr_val_B, \ + csr_row_ptr_B, csr_col_ind_B, beta, descr_D, nnz_D, csr_val_D, \ + csr_row_ptr_D, csr_col_ind_D, descr_C, nnz_C, csr_val_C, \ + csr_row_ptr_C, csr_col_ind_C, info_C, buffer); \ + } +ROCSPARSE_XCSRGEMM_NUMERIC_SPEC(float, s) +ROCSPARSE_XCSRGEMM_NUMERIC_SPEC(double, d) +ROCSPARSE_XCSRGEMM_NUMERIC_SPEC(rocsparse_float_complex, c) +ROCSPARSE_XCSRGEMM_NUMERIC_SPEC(rocsparse_double_complex, z) + + /* + Rocsparse has its own datatype for complex numbers. + The datatype, however, has the exact form of the Kokkos::complex type + (i.e. struct {T real,imaginary;} comlex_type;), so we can reinterpret_cast + between the types. Note that this is not a 100% safe operation since the + compiler may not preserve the ordering of the real/imaginary entries. + */ +template struct rocsparse_type_conversion +{ + using Type=T; + static Type convert(T val){return val;} +}; +template<> struct rocsparse_type_conversion> +{ + using Type=rocsparse_float_complex; + static Type convert(Kokkos::complex val){ + return *reinterpret_cast(&val); + } +}; +template<> struct rocsparse_type_conversion> +{ + using Type=rocsparse_double_complex; + static Type convert(Kokkos::complex val){ + return *reinterpret_cast(&val); + } +}; + + /* + The rocsparse kernels only run for "rocsparse_int" (int) index datatypes. + Downstream applications may use a variety of signed/unsigned ordinal and + size types. To get around this we use a set of macros to copy/convert back + and forth between datatypes. Naturally this comes at a cost, but the + runtime of the conversion kernels is negligible compared to the SpGEMM + operation itself. + */ + +#define KOKKOSKERNELS_ROCSPARSE_USE_OR_CONVERT_VIEW(TYPE, \ + PTR_NAME, VIEW_TYPE, VIEW_NAME) \ + TYPE * PTR_NAME = nullptr; \ + if(std::is_same::type, \ + typename VIEW_TYPE::non_const_value_type>::value) { \ + PTR_NAME = (TYPE*) VIEW_NAME.data(); \ + } else { \ + if(VIEW_NAME.size() > 0 && (spgemm_handle.PTR_NAME.size() != \ + VIEW_NAME.size())) { \ + auto local_##PTR_NAME = spgemm_handle.PTR_NAME = \ + typename InputHandle::rocSparseSpgemmHandleType:: \ + rocsparse_index_array_t( \ + Kokkos::ViewAllocateWithoutInitializing("local copy " #VIEW_NAME),\ + VIEW_NAME.size()); \ + Kokkos::parallel_for("local copy from array " #VIEW_NAME , \ + VIEW_NAME.size(), \ + KOKKOS_LAMBDA(const size_t i){local_##PTR_NAME[i] = \ + static_cast::type>(VIEW_NAME[i]);}); \ + Kokkos::fence(); \ + } \ + PTR_NAME = (TYPE*) spgemm_handle.PTR_NAME.data(); \ + } -inline rocsparse_status rocsparse_Xcsrgemm_numeric( - rocsparse_handle handle, rocsparse_operation trans_A, - rocsparse_operation trans_B, rocsparse_int m, rocsparse_int n, - rocsparse_int k, const double *alpha, const rocsparse_mat_descr descr_A, - rocsparse_int nnz_A, const double *csr_val_A, - const rocsparse_int *csr_row_ptr_A, const rocsparse_int *csr_col_ind_A, - const rocsparse_mat_descr descr_B, rocsparse_int nnz_B, - const double *csr_val_B, const rocsparse_int *csr_row_ptr_B, - const rocsparse_int *csr_col_ind_B, const double *beta, - const rocsparse_mat_descr descr_D, rocsparse_int nnz_D, - const double *csr_val_D, const rocsparse_int *csr_row_ptr_D, - const rocsparse_int *csr_col_ind_D, const rocsparse_mat_descr descr_C, - rocsparse_int nnz_C, double *csr_val_C, const rocsparse_int *csr_row_ptr_C, - const rocsparse_int *csr_col_ind_C, const rocsparse_mat_info info_C, - void *buffer) { - return rocsparse_dcsrgemm_numeric( - handle, trans_A, trans_B, m, n, k, alpha, descr_A, nnz_A, csr_val_A, - csr_row_ptr_A, csr_col_ind_A, descr_B, nnz_B, csr_val_B, csr_row_ptr_B, - csr_col_ind_B, beta, descr_D, nnz_D, csr_val_D, csr_row_ptr_D, - csr_col_ind_D, descr_C, nnz_C, csr_val_C, csr_row_ptr_C, csr_col_ind_C, - info_C, buffer); -} +#define KOKKOSKERNELS_ROCSPARSE_USE_OR_ALLOC_VIEW(TYPE, \ + PTR_NAME, VIEW_TYPE, VIEW_NAME) \ + TYPE * PTR_NAME = nullptr; \ + if(std::is_same::type, \ + typename VIEW_TYPE::non_const_value_type>::value) { \ + PTR_NAME = (TYPE*) VIEW_NAME.data(); \ + } else { \ + if(VIEW_NAME.size() > 0 && (spgemm_handle.PTR_NAME.size() != \ + VIEW_NAME.size())) \ + spgemm_handle.PTR_NAME = \ + typename InputHandle::rocSparseSpgemmHandleType:: \ + rocsparse_index_array_t( \ + Kokkos::ViewAllocateWithoutInitializing("local copy " #VIEW_NAME), \ + VIEW_NAME.size()); \ + PTR_NAME = (TYPE*) spgemm_handle.PTR_NAME.data(); \ + } -inline rocsparse_status rocsparse_Xcsrgemm_numeric( - rocsparse_handle handle, rocsparse_operation trans_A, - rocsparse_operation trans_B, rocsparse_int m, rocsparse_int n, - rocsparse_int k, const Kokkos::complex *alpha, - const rocsparse_mat_descr descr_A, rocsparse_int nnz_A, - const Kokkos::complex *csr_val_A, const rocsparse_int *csr_row_ptr_A, - const rocsparse_int *csr_col_ind_A, const rocsparse_mat_descr descr_B, - rocsparse_int nnz_B, const Kokkos::complex *csr_val_B, - const rocsparse_int *csr_row_ptr_B, const rocsparse_int *csr_col_ind_B, - const Kokkos::complex *beta, const rocsparse_mat_descr descr_D, - rocsparse_int nnz_D, const Kokkos::complex *csr_val_D, - const rocsparse_int *csr_row_ptr_D, const rocsparse_int *csr_col_ind_D, - const rocsparse_mat_descr descr_C, rocsparse_int nnz_C, - Kokkos::complex *csr_val_C, const rocsparse_int *csr_row_ptr_C, - const rocsparse_int *csr_col_ind_C, const rocsparse_mat_info info_C, - void *buffer) { - return rocsparse_ccsrgemm_numeric( - handle, trans_A, trans_B, m, n, k, - reinterpret_cast(alpha), descr_A, nnz_A, - reinterpret_cast(csr_val_A), - csr_row_ptr_A, csr_col_ind_A, descr_B, nnz_B, - reinterpret_cast(csr_val_B), - csr_row_ptr_B, csr_col_ind_B, - reinterpret_cast(beta), descr_D, nnz_D, - reinterpret_cast(csr_val_D), - csr_row_ptr_D, csr_col_ind_D, descr_C, nnz_C, - reinterpret_cast(csr_val_C), csr_row_ptr_C, - csr_col_ind_C, info_C, buffer); -} +#define KOKKOSKERNELS_ROCSPARSE_COPY_VIEW(TYPE, \ + PTR_NAME, VIEW_TYPE, VIEW_NAME) \ + if(! std::is_same::type, \ + typename VIEW_TYPE::non_const_value_type>::value) { \ + auto local_##VIEW_NAME = spgemm_handle.PTR_NAME; \ + Kokkos::parallel_for("local copy to array " #VIEW_NAME , VIEW_NAME.size(),\ + KOKKOS_LAMBDA(const size_t i){VIEW_NAME[i] = local_##VIEW_NAME[i];}); \ + Kokkos::fence(); \ + } -inline rocsparse_status rocsparse_Xcsrgemm_numeric( - rocsparse_handle handle, rocsparse_operation trans_A, - rocsparse_operation trans_B, rocsparse_int m, rocsparse_int n, - rocsparse_int k, const Kokkos::complex *alpha, - const rocsparse_mat_descr descr_A, rocsparse_int nnz_A, - const Kokkos::complex *csr_val_A, - const rocsparse_int *csr_row_ptr_A, const rocsparse_int *csr_col_ind_A, - const rocsparse_mat_descr descr_B, rocsparse_int nnz_B, - const Kokkos::complex *csr_val_B, - const rocsparse_int *csr_row_ptr_B, const rocsparse_int *csr_col_ind_B, - const Kokkos::complex *beta, const rocsparse_mat_descr descr_D, - rocsparse_int nnz_D, const Kokkos::complex *csr_val_D, - const rocsparse_int *csr_row_ptr_D, const rocsparse_int *csr_col_ind_D, - const rocsparse_mat_descr descr_C, rocsparse_int nnz_C, - Kokkos::complex *csr_val_C, const rocsparse_int *csr_row_ptr_C, - const rocsparse_int *csr_col_ind_C, const rocsparse_mat_info info_C, - void *buffer) { - return rocsparse_zcsrgemm_numeric( - handle, trans_A, trans_B, m, n, k, - reinterpret_cast(alpha), descr_A, nnz_A, - reinterpret_cast(csr_val_A), - csr_row_ptr_A, csr_col_ind_A, descr_B, nnz_B, - reinterpret_cast(csr_val_B), - csr_row_ptr_B, csr_col_ind_B, - reinterpret_cast(beta), descr_D, nnz_D, - reinterpret_cast(csr_val_D), - csr_row_ptr_D, csr_col_ind_D, descr_C, nnz_C, - reinterpret_cast(csr_val_C), csr_row_ptr_C, - csr_col_ind_C, info_C, buffer); -} +/** + * @brief Symbolic call for rocsparse SpGEMM + * + * Call is used to count the nnz for C = A*B, and fill C's row map. + * + * @tparam InputHandle KernelHandle used to store descriptor data + * @tparam ScalarType Input scalar type (float, double, etc) + * @tparam InputRowView Row map view type + * @tparam InputColumnView Column index view type + * @tparam OutputRowView Output row map view type + * @param input_handle SpGEMM handle that will store various rocsparse content + * @param m Number of rows in A + * @param n Number of columns in A and rows in B + * @param k Number of columns in B + * @param row_map_A Row map for A + * @param columns_A Column indexes for A + * @param trans_A Use transpose of A (not supported by rocsparse) + * @param row_map_B Row map for B + * @param columns_B Column indexes for B + * @param trans_B Use transpose of B (not supported by rocsparse) + * @param row_map_C Output rowmap for C + */ +template +void +spgemm_symbolic_rocsparse(InputHandle * input_handle, + const rocsparse_int m, + const rocsparse_int n, + const rocsparse_int k, + InputRowView row_map_A, + InputColumnView columns_A, + const bool trans_A, + InputRowView row_map_B, + InputColumnView columns_B, + const bool trans_B, + OutputRowView row_map_C) +{ + + // rocSPARSE solves C = alpha * A * B + beta * D + // For our purposes, we only care about C = A * B + // If Jacobi SpGEMM is enabled, we instead use : + // C = B - omega * D^{-1} * A * B, + // where D is a diagonal matrix (no effect on A's sparsity pattern) + + // In the symbolic call we will only count nnz for C, + // fill the C row map, and allocate a buffer space. + + using RocsparseScalarType = + typename rocsparse_type_conversion::Type; + using OrdinalType = rocsparse_int; + + // This handle contains some useful rocsparse components + if(input_handle->get_rocsparse_spgemm_handle() == NULL) + input_handle->create_rocsparse_spgemm_handle(trans_A,trans_B); + auto & spgemm_handle = *input_handle->get_rocsparse_spgemm_handle(); + + const bool enable_jacobi = spgemm_handle.enable_jacobi; + + // Initialize scalar multipliers - not used at this point, + // but we need beta=0 to ignore matrix D (or 1 if Jacobi is enabled) + RocsparseScalarType alpha = 1; + RocsparseScalarType beta = enable_jacobi ? 1 : 0; + + // Create matrix descriptors + rocsparse_mat_descr descr_A = spgemm_handle.descr_A; + rocsparse_mat_descr descr_B = spgemm_handle.descr_B; + rocsparse_mat_descr descr_C = spgemm_handle.descr_C; + rocsparse_mat_descr descr_D = spgemm_handle.descr_D; + + // Acquire info object for C + rocsparse_mat_info & info_C = spgemm_handle.info_C; + + const int nnz_A = columns_A.size(); + const int nnz_B = columns_B.size(); + const int nnz_D = enable_jacobi ? columns_B.size() : 0; + + // row_map_C needs to have been sized externally + if(row_map_C.size() != static_cast(m+1)) + throw std::runtime_error("spgemm_symbolic_rocsparse : " \ + "row_map_C has not been properly allocated. row_map_C.size() = " \ + +std::to_string(row_map_C.size())+", expected "+std::to_string(m+1)); + + rocsparse_handle handle = spgemm_handle.handle; + + auto operation_A = trans_A ? rocsparse_operation_transpose : + rocsparse_operation_none; + auto operation_B = trans_B ? rocsparse_operation_transpose : + rocsparse_operation_none; + + if((spgemm_handle.opA != operation_A) || + (spgemm_handle.opB != operation_B)) + throw std::runtime_error("spgemm_symbolic_rocsparse : " \ + "Reusing handle with incorrect transpose state."); + + // Matrix CSR row offset arrays + KOKKOSKERNELS_ROCSPARSE_USE_OR_CONVERT_VIEW(const OrdinalType, csr_row_ptr_A, + InputRowView, row_map_A) + KOKKOSKERNELS_ROCSPARSE_USE_OR_CONVERT_VIEW(const OrdinalType, csr_row_ptr_B, + InputRowView, row_map_B) + KOKKOSKERNELS_ROCSPARSE_USE_OR_ALLOC_VIEW( OrdinalType, csr_row_ptr_C, + OutputRowView, row_map_C) + const OrdinalType * csr_row_ptr_D = (enable_jacobi) ? csr_row_ptr_B : nullptr; + + // Matrix CSR column index arrays + KOKKOSKERNELS_ROCSPARSE_USE_OR_CONVERT_VIEW(const OrdinalType, csr_col_ind_A, + InputColumnView, columns_A) + KOKKOSKERNELS_ROCSPARSE_USE_OR_CONVERT_VIEW(const OrdinalType, csr_col_ind_B, + InputColumnView, columns_B) +// OrdinalType * csr_col_ind_C = nullptr; + const OrdinalType * csr_col_ind_D = (enable_jacobi) ? csr_col_ind_B : nullptr; + + // Make sure device is ready + Kokkos::fence(); + + // Set pointer mode (applies to scalar values only) + rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host); + + // Make sure buffer is allocated + { + // Get the size of the buffer + size_t buffer_size; + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + rocsparse_Xcsrgemm_buffer_size(handle, + operation_A,operation_B, + m,n,k, + &alpha, + descr_A,nnz_A,csr_row_ptr_A,csr_col_ind_A, + descr_B,nnz_B,csr_row_ptr_B,csr_col_ind_B, + &beta, + descr_D,nnz_D,csr_row_ptr_D,csr_col_ind_D, + info_C, + &buffer_size)); + + // Allocate buffer + using buffer_t = typename InputHandle:: + rocSparseSpgemmHandleType::buffer_t; + spgemm_handle.buffer = buffer_t( + Kokkos::ViewAllocateWithoutInitializing("rocsparse_buffer"), + buffer_size); + } -//============================================================================= -template -inline typename std::enable_if< - (not std::is_same::value) or - (not std::is_same::value), - void>::type -rocsparse_spgemm_symbolic_internal(KernelHandle *handle, index_type m, - index_type n, index_type k, size_type nnz_A, - const size_type *rowptrA, - const index_type *colidxA, bool transposeA, - size_type nnz_B, const size_type *rowptrB, - const index_type *colidxB, bool transposeB, - size_type *rowptrC) { - // normal code should use the specializations and not go here - throw std::runtime_error( - "The installed rocsparse does not support the index type and size type"); -} + void* buffer = static_cast(spgemm_handle.buffer.data()); -template -inline - typename std::enable_if::value and - std::is_same::value, - void>::type - rocsparse_spgemm_symbolic_internal( - KernelHandle *handle, index_type m, index_type n, index_type k, - size_type nnz_A, const size_type *rowptrA, const index_type *colidxA, - bool transposeA, size_type nnz_B, const size_type *rowptrB, - const index_type *colidxB, bool transposeB, size_type *rowptrC) { - handle->create_rocsparse_spgemm_handle(transposeA, transposeB); - typename KernelHandle::rocSparseSpgemmHandleType *h = - handle->get_rocsparse_spgemm_handle(); - - // alpha, beta are on host, but since we use singleton on the rocsparse - // handle, we save/restore the pointer mode to not interference with - // others' use - const auto alpha = Kokkos::ArithTraits::one(); - const auto beta = Kokkos::ArithTraits::zero(); - rocsparse_pointer_mode oldPtrMode; + // Find number of nonzeros in C + rocsparse_int nnz_C = 0; + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + rocsparse_csrgemm_nnz(handle, + operation_A,operation_B, + m,n,k, + descr_A,nnz_A,csr_row_ptr_A,csr_col_ind_A, + descr_B,nnz_B,csr_row_ptr_B,csr_col_ind_B, + descr_D,nnz_D,csr_row_ptr_D,csr_col_ind_D, + descr_C,csr_row_ptr_C, + &nnz_C,info_C, + buffer)); + + KOKKOSKERNELS_ROCSPARSE_COPY_VIEW(OrdinalType, csr_row_ptr_C, + OutputRowView, row_map_C); + + // We pass the nnz_C out through the KernelHandle + input_handle->set_c_nnz(nnz_C); + + // Here is an interesting corner case that testing caught + // that is a side-effect of not initializing arrays to zero + if(nnz_C == 0) Kokkos::deep_copy(row_map_C,0); + + // While we have set nnz_C, and filled row_map_C, + // we have not filled C column indexes, which is + // handled in the numeric call. We reset the flag + // governing the evaluation of C column indexes here. + spgemm_handle.C_populated = false; +} + +/** + * @brief Numeric call for rocsparse SpGEMM + * + * Used to fill values for C = A*B + * Will also fill column indexes in C if that has not already been done + * + * @note The symbolic call must be called first or rocsparse will error out + * + * @tparam InputHandle KernelHandle used to store handle information + * @tparam InputRowView Row map view type for A and B + * @tparam InputColumnView Column index view type for A and B + * @tparam InputDataView Data view type for A and B + * @tparam OutputColumnView Output column index view type for C + * @tparam OutputRowView Output row map view type for C + * @tparam OutputDataView Data view type for C + * @param input_handle SpGEMM handle that will store various rocsparse content + * @param m Number of rows in A + * @param n Number of columns in A and rows in B + * @param k Number of columns in B + * @param row_map_A Row map for A + * @param columns_A Column indexes for A + * @param values_A Values for A + * @param trans_A Use transpose of A (not supported by rocsparse) + * @param row_map_B Row map for B + * @param columns_B Column indexes for B + * @param values_B Values for B + * @param trans_B Use transpose of B (not supported by rocsparse) + * @param row_map_C Input row map for C + * @param columns_C Output column indexes for C + * @param values_C Output values for C + */ +template +void +spgemm_numeric_rocsparse(InputHandle * input_handle, + const rocsparse_int m, + const rocsparse_int n, + const rocsparse_int k, + InputRowView row_map_A, + InputColumnView columns_A, + InputDataView values_A, + const bool trans_A, + InputRowView row_map_B, + InputColumnView columns_B, + InputDataView values_B, + const bool trans_B, + InputRowView row_map_C, + OutputColumnView columns_C, + OutputDataView values_C) +{ + + // rocSPARSE solves C = alpha * A * B + beta * D + // For our purposes, we only care about C = A * B + + // In the numeric call we will fill columns_C and values_C using what was + // learned in the symbolic call. + + using OrdinalType = rocsparse_int; + using ScalarType = typename InputDataView::non_const_value_type; + using RocsparseScalarType = + typename rocsparse_type_conversion::Type; + + // Initialize scalar multipliers - not used at this point, + // but we need beta=0 to ignore matrix D + RocsparseScalarType alpha = 1; + RocsparseScalarType beta = 0; + + // This handle contains some useful rocsparse components + auto & spgemm_handle = *input_handle->get_rocsparse_spgemm_handle(); + + // Create matrix descriptors + rocsparse_mat_descr descr_A = spgemm_handle.descr_A; + rocsparse_mat_descr descr_B = spgemm_handle.descr_B; + rocsparse_mat_descr descr_C = spgemm_handle.descr_C; + rocsparse_mat_descr descr_D = spgemm_handle.descr_D; + + // Acquire info object for C + rocsparse_mat_info & info_C = spgemm_handle.info_C; + + // Get the pre-allocated buffer (allocated in the symbolic call) + void* buffer = static_cast(spgemm_handle.buffer.data()); + if(buffer == nullptr) + throw std::runtime_error("spgemm_numeric_rocsparse : " \ + "Buffer has not been allocated."); + + const int nnz_A = columns_A.size(); + const int nnz_B = columns_B.size(); + const int nnz_C = columns_C.size(); + const int nnz_D = 0; + + // Make sure output allocations are properly sized + if(input_handle->get_c_nnz() != nnz_C) + throw std::runtime_error("spgemm_numeric_rocsparse : " \ + "columns_C has not been properly allocated. columns_C.size() = " \ + +std::to_string(columns_C.size())+", expected " \ + +std::to_string(input_handle->get_c_nnz())); + if(static_cast(nnz_C) != values_C.size()) + throw std::runtime_error("spgemm_numeric_rocsparse : " \ + "values_C has not been properly allocated. values_C.size() = " \ + +std::to_string(values_C.size())+", expected "+std::to_string(nnz_C)); + + rocsparse_handle & handle = spgemm_handle.handle; + + auto operation_A = trans_A ? rocsparse_operation_transpose : + rocsparse_operation_none; + auto operation_B = trans_B ? rocsparse_operation_transpose : + rocsparse_operation_none; + + if((spgemm_handle.opA != operation_A) || + (spgemm_handle.opB != operation_B)) + throw std::runtime_error("spgemm_numeric_rocsparse : " \ + "Using handle with incorrect transpose state."); + + // Matrix CSR row offset arrays + KOKKOSKERNELS_ROCSPARSE_USE_OR_CONVERT_VIEW(const OrdinalType, csr_row_ptr_A, + InputRowView, row_map_A) + KOKKOSKERNELS_ROCSPARSE_USE_OR_CONVERT_VIEW(const OrdinalType, csr_row_ptr_B, + InputRowView, row_map_B) + KOKKOSKERNELS_ROCSPARSE_USE_OR_CONVERT_VIEW(const OrdinalType, csr_row_ptr_C, + InputRowView, row_map_C) + const OrdinalType * csr_row_ptr_D = nullptr; + + // Matrix CSR column index arrays + KOKKOSKERNELS_ROCSPARSE_USE_OR_CONVERT_VIEW(const OrdinalType,csr_col_ind_A, + InputColumnView, columns_A) + KOKKOSKERNELS_ROCSPARSE_USE_OR_CONVERT_VIEW(const OrdinalType,csr_col_ind_B, + InputColumnView, columns_B) + KOKKOSKERNELS_ROCSPARSE_USE_OR_ALLOC_VIEW( OrdinalType,csr_col_ind_C, + OutputColumnView, columns_C) + const OrdinalType * csr_col_ind_D = nullptr; + + // Matrix CSR values arrays + const RocsparseScalarType * csr_val_A = + reinterpret_cast(values_A.data()); + const RocsparseScalarType * csr_val_B = + reinterpret_cast(values_B.data()); + RocsparseScalarType * csr_val_C = + reinterpret_cast< RocsparseScalarType*>(values_C.data()); + const RocsparseScalarType * csr_val_D = nullptr; + + // Set pointer mode (applies to scalar values only) KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( - rocsparse_get_pointer_mode(h->rocsparseHandle, &oldPtrMode)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_set_pointer_mode( - h->rocsparseHandle, rocsparse_pointer_mode_host)); - - // C = alpha * OpA(A) * OpB(B) + beta * D - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_Xcsrgemm_buffer_size( - h->rocsparseHandle, h->opA, h->opB, m, n, k, &alpha, h->descr_A, nnz_A, - rowptrA, colidxA, h->descr_B, nnz_B, rowptrB, colidxB, &beta, h->descr_D, - 0, NULL, NULL, h->info_C, &h->bufferSize)); - - KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&h->buffer, h->bufferSize)); - - rocsparse_int C_nnz = 0; - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_csrgemm_nnz( - h->rocsparseHandle, h->opA, h->opB, m, n, k, h->descr_A, nnz_A, rowptrA, - colidxA, h->descr_B, nnz_B, rowptrB, colidxB, h->descr_D, 0, NULL, NULL, - h->descr_C, rowptrC, &C_nnz, h->info_C, h->buffer)); - - handle->set_c_nnz(C_nnz); - h->C_populated = false; // sparsity pattern of C is not set yet, so this is a - // fake symbolic + rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + + // Make sure device is ready + Kokkos::fence(); + + // Symbolic spgemm call - fills columns_C + if(!spgemm_handle.C_populated){ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + rocsparse_csrgemm_symbolic(handle, + operation_A,operation_B, + m,n,k, + descr_A,nnz_A,csr_row_ptr_A,csr_col_ind_A, + descr_B,nnz_B,csr_row_ptr_B,csr_col_ind_B, + descr_D,nnz_D,csr_row_ptr_D,csr_col_ind_D, + descr_C,nnz_C,csr_row_ptr_C,csr_col_ind_C, + info_C, + buffer)); + + KOKKOSKERNELS_ROCSPARSE_COPY_VIEW(OrdinalType, csr_col_ind_C, + OutputColumnView, columns_C); + + // Make sure device is ready + Kokkos::fence(); + + spgemm_handle.C_populated = true; + } + + // Numeric spgemm call - fills values_C KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( - rocsparse_set_pointer_mode(h->rocsparseHandle, oldPtrMode)); -} + rocsparse_Xcsrgemm_numeric(handle, + operation_A,operation_B, + m,n,k, + &alpha, + descr_A,nnz_A,csr_val_A,csr_row_ptr_A,csr_col_ind_A, + descr_B,nnz_B,csr_val_B,csr_row_ptr_B,csr_col_ind_B, + &beta, + descr_D,nnz_D,csr_val_D,csr_row_ptr_D,csr_col_ind_D, + descr_C,nnz_C,csr_val_C,csr_row_ptr_C,csr_col_ind_C, + info_C, + buffer)); + + // Tell user the result is sorted + input_handle->set_sort_option(1); -template < - typename KernelHandle, typename ain_row_index_view_type, - typename ain_nonzero_index_view_type, typename bin_row_index_view_type, - typename bin_nonzero_index_view_type, typename cin_row_index_view_type> -void rocsparse_spgemm_symbolic( - KernelHandle *handle, typename KernelHandle::nnz_lno_t m, - typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k, - ain_row_index_view_type rowptrA, ain_nonzero_index_view_type colidxA, - bool transposeA, bin_row_index_view_type rowptrB, - bin_nonzero_index_view_type colidxB, bool transposeB, - cin_row_index_view_type rowptrC) { - using index_type = typename KernelHandle::nnz_lno_t; - using size_type = typename KernelHandle::size_type; - using scalar_type = typename KernelHandle::nnz_scalar_t; - - // In case the KernelHandle uses const types! - using non_const_index_type = typename std::remove_cv::type; - using non_const_size_type = typename std::remove_cv::type; - using non_const_scalar_type = typename std::remove_cv::type; - - handle->set_sort_option(1); // tells users the output matrix is sorted - rocsparse_spgemm_symbolic_internal( - handle, m, n, k, colidxA.extent(0), rowptrA.data(), colidxA.data(), - transposeA, colidxB.extent(0), rowptrB.data(), colidxB.data(), transposeB, - rowptrC.data()); } -//============================================================================= -template -inline typename std::enable_if< - (not std::is_same::value) or - (not std::is_same::value), - void>::type -rocsparse_spgemm_numeric_internal( - KernelHandle *handle, index_type m, index_type n, index_type k, - size_type nnz_A, const size_type *rowptrA, const index_type *colidxA, - const scalar_type *valuesA, size_type nnz_B, const size_type *rowptrB, - const index_type *colidxB, const scalar_type *valuesB, size_type nnz_C, - const size_type *rowptrC, index_type *colidxC, scalar_type *valuesC) { - // normal code should use the specializations and not go here - throw std::runtime_error( - "The installed rocsparse does not support the index type and size type"); -} +/** + * @brief Numeric call for rocsparse SpGEMM + * + * Used to fill values for C = B - omega * (D^{-1}*A) * B + * Will also fill column indexes in C if that has not already been done + * + * @note The symbolic call must be called first or rocsparse will error out + * @note In the symbolic call make sure the enable_jacobi flag is turned on + * + * @tparam InputHandle KernelHandle used to store handle information + * @tparam InputRowView Row map view type for A and B + * @tparam InputColumnView Column index view type for A and B + * @tparam InputDataView Data view type for A and B + * @tparam OutputColumnView Output column index view type for C + * @tparam OutputRowView Output row map view type for C + * @tparam OutputDataView Data view type for C + * @param input_handle SpGEMM handle that will store various rocsparse content + * @param m Number of rows in A + * @param n Number of columns in A and rows in B + * @param k Number of columns in B + * @param row_map_A Row map for A + * @param columns_A Column indexes for A + * @param values_A Values for A + * @param trans_A Use transpose of A (not supported by rocsparse) + * @param row_map_B Row map for B + * @param columns_B Column indexes for B + * @param values_B Values for B + * @param trans_B Use transpose of B (not supported by rocsparse) + * @param row_map_C Input row map for C + * @param columns_C Output column indexes for C + * @param values_C Output values for C + */ +template +void +jacobi_spgemm_numeric_rocsparse(InputHandle * input_handle, + const rocsparse_int m, + const rocsparse_int n, + const rocsparse_int k, + InputRowView row_map_A, + InputColumnView columns_A, + InputDataView values_A, + const bool trans_A, + InputRowView row_map_B, + InputColumnView columns_B, + InputDataView values_B, + const bool trans_B, + typename InputDataView::const_value_type omega, + InvDView values_invD, + InputRowView row_map_C, + OutputColumnView columns_C, + OutputDataView values_C) +{ + + // rocSPARSE solves C = alpha * A * B + beta * D + // We need to evaluate C = B - omega * (D^{-1}*A) * B + + // In the numeric call we will fill columns_C and values_C. + + using OrdinalType = rocsparse_int; + using ScalarType = typename InputDataView::non_const_value_type; + using RocsparseScalarType = + typename rocsparse_type_conversion::Type; + + // Initialize scalar multipliers - not used at this point, + // but we need beta=0 to ignore matrix D + RocsparseScalarType alpha = + rocsparse_type_conversion::convert(-omega); + RocsparseScalarType beta = 1; + + // This handle contains some useful rocsparse components + auto & spgemm_handle = *input_handle->get_rocsparse_spgemm_handle(); + + // Create matrix descriptors + rocsparse_mat_descr descr_A = spgemm_handle.descr_A; + rocsparse_mat_descr descr_B = spgemm_handle.descr_B; + rocsparse_mat_descr descr_C = spgemm_handle.descr_C; + rocsparse_mat_descr descr_D = spgemm_handle.descr_D; + + // Acquire info object for C + rocsparse_mat_info & info_C = spgemm_handle.info_C; + + // Get the pre-allocated buffer (allocated in the symbolic call) + void* buffer = static_cast(spgemm_handle.buffer.data()); + if(buffer == nullptr) + throw std::runtime_error("jacobi_spgemm_numeric_rocsparse : " \ + "Buffer has not been allocated."); + + // If this is not set, then it is likely the allocations and + // row map for C are incorrect (defined by rocsparse_spgemm_symbolic call) + // FIXME: We currently lack a clean way to implementent this flag. + // Hopefully if a user comes accross this issue the exception is verbose + // enough to describe the fix. + if(! spgemm_handle.enable_jacobi) + throw std::runtime_error("jacobi_spgemm_numeric_rocsparse : " \ + "Handle is not setup for Jacobi SpGEMM, this can lead to " \ + "undefined behavior so we do not allow it. Please set enable_jacobi " \ + "in rocsparse spgemm handle to true prior to running symbolic spgemm " \ + "call. i.e.:\nKokkosKernels::Experimental::KokkosKernelHandle::" \ + "get_spgemm_handle()->get_rocsparse_spgemm_handle()->enable_jacobi " \ + " = true"); + + // invD needs to have num_rows entries + if(static_cast(values_invD.size()) != m) + throw std::runtime_error("jacobi_spgemm_numeric_rocsparse : "\ + "invD should be of size A_num_rows ("+std::to_string(m)+ \ + "), but is instead size "+std::to_string(values_invD.size())+"."); + + const int nnz_A = columns_A.size(); + const int nnz_B = columns_B.size(); + const int nnz_C = columns_C.size(); + const int nnz_D = columns_B.size(); + + // Make sure output allocations are properly sized + if(input_handle->get_c_nnz() != nnz_C) + throw std::runtime_error("jacobi_spgemm_numeric_rocsparse : " \ + "columns_C has not been properly allocated. columns_C.size() = " \ + +std::to_string(columns_C.size())+", expected " \ + +std::to_string(input_handle->get_c_nnz())); + if(static_cast(nnz_C) != values_C.size()) + throw std::runtime_error("jacobi_spgemm_numeric_rocsparse : " \ + "values_C has not been properly allocated. values_C.size() = " \ + +std::to_string(values_C.size())+", expected "+std::to_string(nnz_C)); + + rocsparse_handle & handle = spgemm_handle.handle; + + auto operation_A = trans_A ? rocsparse_operation_transpose : + rocsparse_operation_none; + auto operation_B = trans_B ? rocsparse_operation_transpose : + rocsparse_operation_none; + + if((spgemm_handle.opA != operation_A) || + (spgemm_handle.opB != operation_B)) + throw std::runtime_error("jacobi_spgemm_numeric_rocsparse : " \ + "Using handle with incorrect transpose state."); + + // Matrix CSR row offset arrays + KOKKOSKERNELS_ROCSPARSE_USE_OR_CONVERT_VIEW(const OrdinalType, csr_row_ptr_A, + InputRowView, row_map_A) + KOKKOSKERNELS_ROCSPARSE_USE_OR_CONVERT_VIEW(const OrdinalType, csr_row_ptr_B, + InputRowView, row_map_B) + KOKKOSKERNELS_ROCSPARSE_USE_OR_CONVERT_VIEW(const OrdinalType, csr_row_ptr_C, + InputRowView, row_map_C) + const OrdinalType * csr_row_ptr_D = csr_row_ptr_B; + + // Matrix CSR column index arrays + KOKKOSKERNELS_ROCSPARSE_USE_OR_CONVERT_VIEW(const OrdinalType, csr_col_ind_A, + InputColumnView, columns_A) + KOKKOSKERNELS_ROCSPARSE_USE_OR_CONVERT_VIEW(const OrdinalType, csr_col_ind_B, + InputColumnView, columns_B) + KOKKOSKERNELS_ROCSPARSE_USE_OR_ALLOC_VIEW( OrdinalType, csr_col_ind_C, + OutputColumnView, columns_C) + const OrdinalType * csr_col_ind_D = csr_col_ind_B; + + // Before we call the numeric call for rocsparse, we need to fill out + // the invD*A entries. Note that D is diagonal so it has no impact on + // the row_map or columns of the resulting operation + + // Allocate invDA_values if it not correctly sized + if(spgemm_handle.csr_values_invDA.size() != nnz_A){ + spgemm_handle.csr_values_invDA = + typename InputHandle::scalar_temp_work_view_t( + Kokkos::ViewAllocateWithoutInitializing("local copy invDA"), nnz_A); + } -template -inline - typename std::enable_if::value and - std::is_same::value, - void>::type - rocsparse_spgemm_numeric_internal( - KernelHandle *handle, index_type m, index_type n, index_type k, - size_type nnz_A, const size_type *rowptrA, const index_type *colidxA, - const scalar_type *valuesA, size_type nnz_B, const size_type *rowptrB, - const index_type *colidxB, const scalar_type *valuesB, size_type nnz_C, - const size_type *rowptrC, index_type *colidxC, scalar_type *valuesC) { - typename KernelHandle::rocSparseSpgemmHandleType *h = - handle->get_rocsparse_spgemm_handle(); - - const auto alpha = Kokkos::ArithTraits::one(); - const auto beta = Kokkos::ArithTraits::zero(); - rocsparse_pointer_mode oldPtrMode; + // Matrix CSR values arrays + RocsparseScalarType * csr_val_A = + reinterpret_cast< RocsparseScalarType*>( + spgemm_handle.csr_values_invDA.data()); + const RocsparseScalarType * csr_val_B = + reinterpret_cast(values_B.data()); + RocsparseScalarType * csr_val_C = + reinterpret_cast< RocsparseScalarType*>(values_C.data()); + const RocsparseScalarType * csr_val_D = + reinterpret_cast(csr_val_B); + + // Now we need to fill the csr_val_A with the results of invD * A + { + + // Each team is a single wave/warp + const OrdinalType num_rows = row_map_A.size()-1; + const OrdinalType num_threads_per_row = 16; + const OrdinalType num_rows_per_team = 16; + using policy_t=Kokkos::TeamPolicy; + using team_t=typename policy_t::member_type; + + // invD can be a high rank view with a single value per row, + // but it should be flat so this is "safe" + const ScalarType * csr_val_A = values_A.data(); + const ScalarType * csr_val_invD = values_invD.data(); + ScalarType * csr_val_invDA = spgemm_handle.csr_values_invDA.data(); + + policy_t policy((num_rows + num_rows_per_team - 1) / num_rows_per_team, + num_rows_per_team, num_threads_per_row); + Kokkos::parallel_for("Filling invD*A values", policy, + KOKKOS_LAMBDA(const team_t & team){ + const OrdinalType row_begin = team.league_rank() * num_rows_per_team; + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, + row_begin, min(row_begin+num_rows_per_team, num_rows)), + [&](const OrdinalType & row) { + const auto mult = csr_val_invD[row]; + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, + csr_row_ptr_A[row], csr_row_ptr_A[row + 1]), + [&](const OrdinalType & idx) { + csr_val_invDA[idx] = mult*csr_val_A[idx]; + }); + }); + }); + + // Wait for kokkos to finish + Kokkos::fence(); + } + // Set pointer mode (applies to scalar values only) KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( - rocsparse_get_pointer_mode(h->rocsparseHandle, &oldPtrMode)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_set_pointer_mode( - h->rocsparseHandle, rocsparse_pointer_mode_host)); - - if (!h->C_populated) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_csrgemm_symbolic( - h->rocsparseHandle, h->opA, h->opB, m, n, k, h->descr_A, nnz_A, rowptrA, - colidxA, h->descr_B, nnz_B, rowptrB, colidxB, h->descr_D, 0, NULL, NULL, - h->descr_C, nnz_C, rowptrC, colidxC, h->info_C, h->buffer)); - h->C_populated = true; + rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + + // Symbolic spgemm call - fills columns_C + if(!spgemm_handle.C_populated){ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + rocsparse_csrgemm_symbolic(handle, + operation_A,operation_B, + m,n,k, + descr_A,nnz_A,csr_row_ptr_A,csr_col_ind_A, + descr_B,nnz_B,csr_row_ptr_B,csr_col_ind_B, + descr_D,nnz_D,csr_row_ptr_D,csr_col_ind_D, + descr_C,nnz_C,csr_row_ptr_C,csr_col_ind_C, + info_C, + buffer)); + + KOKKOSKERNELS_ROCSPARSE_COPY_VIEW(OrdinalType, csr_col_ind_C, + OutputColumnView, columns_C); + + spgemm_handle.C_populated = true; } - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_Xcsrgemm_numeric( - h->rocsparseHandle, h->opA, h->opB, m, n, k, &alpha, h->descr_A, nnz_A, - valuesA, rowptrA, colidxA, h->descr_B, nnz_B, valuesB, rowptrB, colidxB, - &beta, h->descr_D, 0, NULL, NULL, NULL, h->descr_C, nnz_C, valuesC, - rowptrC, colidxC, h->info_C, h->buffer)); + // Numeric spgemm call - fills values_C KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( - rocsparse_set_pointer_mode(h->rocsparseHandle, oldPtrMode)); -} + rocsparse_Xcsrgemm_numeric(handle, + operation_A, operation_B, + m, n, k, + &alpha, + descr_A,nnz_A,csr_val_A,csr_row_ptr_A,csr_col_ind_A, + descr_B,nnz_B,csr_val_B,csr_row_ptr_B,csr_col_ind_B, + &beta, + descr_D,nnz_D,csr_val_D,csr_row_ptr_D,csr_col_ind_D, + descr_C,nnz_C,csr_val_C,csr_row_ptr_C,csr_col_ind_C, + info_C, + buffer)); + + // Tell user the result is sorted + input_handle->set_sort_option(1); -template < - typename KernelHandle, typename ain_row_index_view_type, - typename ain_nonzero_index_view_type, typename ain_nonzero_value_view_type, - typename bin_row_index_view_type, typename bin_nonzero_index_view_type, - typename bin_nonzero_value_view_type, typename cin_row_index_view_type, - typename cin_nonzero_index_view_type, typename cin_nonzero_value_view_type> -void rocsparse_spgemm_numeric( - KernelHandle *handle, typename KernelHandle::nnz_lno_t m, - typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k, - ain_row_index_view_type rowptrA, ain_nonzero_index_view_type colidxA, - ain_nonzero_value_view_type valuesA, bool /* transposeA */, - bin_row_index_view_type rowptrB, bin_nonzero_index_view_type colidxB, - bin_nonzero_value_view_type valuesB, bool /* transposeB */, - cin_row_index_view_type rowptrC, cin_nonzero_index_view_type colidxC, - cin_nonzero_value_view_type valuesC) { - using index_type = typename KernelHandle::nnz_lno_t; - using size_type = typename KernelHandle::size_type; - using scalar_type = typename KernelHandle::nnz_scalar_t; - - // In case the KernelHandle uses const types! - using non_const_index_type = typename std::remove_cv::type; - using non_const_size_type = typename std::remove_cv::type; - using non_const_scalar_type = typename std::remove_cv::type; - - rocsparse_spgemm_numeric_internal( - handle, m, n, k, colidxA.extent(0), rowptrA.data(), colidxA.data(), - valuesA.data(), colidxB.extent(0), rowptrB.data(), colidxB.data(), - valuesB.data(), colidxC.extent(0), rowptrC.data(), colidxC.data(), - valuesC.data()); } } // namespace Impl } // namespace KokkosSparse +#undef KOKKOSKERNELS_ROCSPARSE_USE_OR_CONVERT_VIEW +#undef KOKKOSKERNELS_ROCSPARSE_USE_OR_ALLOC_VIEW +#undef KOKKOSKERNELS_ROCSPARSE_COPY_VIEW + #endif #endif + + diff --git a/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp b/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp index a85ac45a7a..808f3c1747 100644 --- a/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp +++ b/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp @@ -161,11 +161,13 @@ struct SPGEMM_SYMBOLIC( - sh, m, n, k, row_mapA, entriesA, transposeA, row_mapB, entriesB, - transposeB, row_mapC); + spgemm_symbolic_rocsparse( + sh, m, n, k, + row_mapA, entriesA, transposeA, + row_mapB, entriesB, transposeB, + row_mapC); #else throw std::runtime_error( "Requiring SPGEMM_ROCSPARSE but TPL_ROCSPARSE was not enabled!"); diff --git a/sparse/src/KokkosSparse_spgemm_handle.hpp b/sparse/src/KokkosSparse_spgemm_handle.hpp index 9f89083957..e9bcdc2724 100644 --- a/sparse/src/KokkosSparse_spgemm_handle.hpp +++ b/sparse/src/KokkosSparse_spgemm_handle.hpp @@ -149,44 +149,83 @@ class SPGEMMHandle { nnz_lno_persistent_work_host_view_t; // Host view type #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE - struct rocSparseSpgemmHandleType { - KokkosKernels::Experimental::Controls - kkControls; // give a singleton rocsparse handle - rocsparse_handle rocsparseHandle; - rocsparse_operation opA, opB; - rocsparse_mat_descr descr_A, descr_B, descr_C, descr_D; - rocsparse_mat_info info_C; - size_t bufferSize; - void *buffer; - bool C_populated; - rocSparseSpgemmHandleType(bool transposeA, bool transposeB) { - opA = - transposeA ? rocsparse_operation_transpose : rocsparse_operation_none; - opB = - transposeB ? rocsparse_operation_transpose : rocsparse_operation_none; + struct rocSparseSpgemmHandleType + { - bufferSize = 0; - buffer = NULL; - C_populated = false; + /// Constructor + rocSparseSpgemmHandleType(bool transposeA, bool transposeB): + opA(transposeA ? rocsparse_operation_transpose : rocsparse_operation_none), + opB(transposeB ? rocsparse_operation_transpose : rocsparse_operation_none), + enable_jacobi(false), + C_populated(false) + { + // Get singleton handle + handle = kkControls.getRocsparseHandle(); + + // Create matrix descriptors KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_descr(&descr_A)); KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_descr(&descr_B)); KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_descr(&descr_C)); KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_descr(&descr_D)); + + // Create C info object KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_info(&info_C)); - rocsparseHandle = kkControls.getRocsparseHandle(); + } - ~rocSparseSpgemmHandleType() { - rocsparse_destroy_mat_info(info_C); - rocsparse_destroy_mat_descr(descr_A); - rocsparse_destroy_mat_descr(descr_B); - rocsparse_destroy_mat_descr(descr_C); - rocsparse_destroy_mat_descr(descr_D); + // Destructor + ~rocSparseSpgemmHandleType() + { + + (void) rocsparse_destroy_mat_descr(descr_A); + (void) rocsparse_destroy_mat_descr(descr_B); + (void) rocsparse_destroy_mat_descr(descr_C); + (void) rocsparse_destroy_mat_descr(descr_D); + (void) rocsparse_destroy_mat_info(info_C); } + + // rocsparse handle wrappers + KokkosKernels::Experimental::Controls kkControls; // give a singleton rocsparse handle + rocsparse_handle handle; + rocsparse_operation opA, opB; + + // Matrix descriptors + rocsparse_mat_descr descr_A; + rocsparse_mat_descr descr_B; + rocsparse_mat_descr descr_C; + rocsparse_mat_descr descr_D; + rocsparse_mat_info info_C; + + // Scratch buffer allocation + using buffer_t = Kokkos::View>; + buffer_t buffer; + + // Currently SpGEMM in rocSPARSE only supports the int datatype + // We use these arrays to transform back and forth as needed + using rocsparse_index_array_t = Kokkos::View>; + rocsparse_index_array_t csr_row_ptr_A; + rocsparse_index_array_t csr_row_ptr_B; + rocsparse_index_array_t csr_row_ptr_C; + rocsparse_index_array_t csr_row_ptr_D; + rocsparse_index_array_t csr_col_ind_A; + rocsparse_index_array_t csr_col_ind_B; + rocsparse_index_array_t csr_col_ind_C; + rocsparse_index_array_t csr_col_ind_D; + + // (Jacobi only) Used to store D^{-1} * A entries for use with rocsparse csrgemm + scalar_temp_work_view_t csr_values_invDA; + + // Used to trigger Jacobi SpGEMM via rocsparse + bool enable_jacobi; + + // Used to ensure C column indexes are not recomputed in the spgemm_numeric call + bool C_populated; }; -#endif +#endif // KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE #if (CUDA_VERSION >= 11000) diff --git a/sparse/unit_test/Test_Sparse_spgemm.hpp b/sparse/unit_test/Test_Sparse_spgemm.hpp index 2f425b07d2..259102a6d4 100644 --- a/sparse/unit_test/Test_Sparse_spgemm.hpp +++ b/sparse/unit_test/Test_Sparse_spgemm.hpp @@ -295,6 +295,10 @@ void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth, algorithms.push_back(SPGEMM_MKL); #endif +#if defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) && defined(TEST_HIP_SPARSE_CPP) + algorithms.push_back(SPGEMM_ROCSPARSE); +#endif + for (auto spgemm_algorithm : algorithms) { const uint64_t max_integer = Kokkos::ArithTraits::max(); std::string algo = "UNKNOWN"; @@ -309,7 +313,13 @@ void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth, is_expected_to_fail = true; #endif break; - + case SPGEMM_ROCSPARSE: algo = "SPGEMM_ROCSPARSE"; + // The rocsparse implementation internally casts size types to int + if (A.values.extent(0) > max_integer) { + is_expected_to_fail = true; + } + + break; case SPGEMM_MKL: algo = "SPGEMM_MKL"; #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL if (!KokkosSparse::Impl::mkl_is_supported_value_type::value) { diff --git a/sparse/unit_test/Test_Sparse_spgemm_jacobi.hpp b/sparse/unit_test/Test_Sparse_spgemm_jacobi.hpp index 12cfd983f1..a6a141f82c 100644 --- a/sparse/unit_test/Test_Sparse_spgemm_jacobi.hpp +++ b/sparse/unit_test/Test_Sparse_spgemm_jacobi.hpp @@ -100,6 +100,11 @@ int run_spgemm_jacobi(crsMat_t input_mat, crsMat_t input_mat2, kh.create_spgemm_handle(spgemm_algorithm); +#if defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) && defined(TEST_HIP_SPARSE_CPP) + kh.get_spgemm_handle()->create_rocsparse_spgemm_handle(false,false); + kh.get_spgemm_handle()->get_rocsparse_spgemm_handle()->enable_jacobi=true; +#endif + const size_t num_rows_1 = input_mat.numRows(); const size_t num_rows_2 = input_mat2.numRows(); const size_t num_cols_2 = input_mat2.numCols(); @@ -256,15 +261,20 @@ void test_spgemm_jacobi(lno_t numRows, size_type nnz, lno_t bandwidth, run_spgemm_jacobi( input_mat, input_mat, omega, dinv, SPGEMM_SERIAL, output_mat2); - SPGEMMAlgorithm spgemm_algorithm = - SPGEMM_KK_MEMORY; // should we test other SpGEMM algorithms as well? + std::vector spgemm_algorithms; + spgemm_algorithms.push_back(SPGEMM_KK_MEMORY); +#if defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) && defined(TEST_HIP_SPARSE_CPP) + spgemm_algorithms.push_back(SPGEMM_ROCSPARSE); +#endif - crsMat_t output_mat; + for(const auto & spgemm_algorithm : spgemm_algorithms){ + crsMat_t output_mat; - run_spgemm_jacobi(input_mat, input_mat, omega, dinv, - spgemm_algorithm, output_mat); - bool is_identical = is_same_mat(output_mat, output_mat2); - EXPECT_TRUE(is_identical); + run_spgemm_jacobi(input_mat, input_mat, omega, dinv, + spgemm_algorithm, output_mat); + bool is_identical = is_same_mat(output_mat, output_mat2); + EXPECT_TRUE(is_identical); + } } #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \