From 373cf8bc9ce9c8888e0a6ff5b2c71fc0a9ff182e Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 8 Apr 2026 14:58:18 -0400 Subject: [PATCH 1/3] recalculate bins, dont store it Signed-off-by: Dan Hoeflinger --- .../parallel_backend_sycl_radix_sort_one_wg.h | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h index c929cd448d9..b82bb1f62f5 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h @@ -215,9 +215,6 @@ struct __subgroup_radix_sort { uint16_t __indices[__block_size]; //indices for indirect access in the "re-order" phase { - //pointers(by performance reasons) to bucket's counters - uint32_t* __counters[__block_size]; - //1. "counting" phase //counter initialization auto __pcounter = __dpl_sycl::__get_accessor_ptr(__counter_lacc) + __wi; @@ -239,9 +236,9 @@ struct __subgroup_radix_sort : __bin_count - 1 /*default bin for out of range elements (when idx >= n)*/; //"counting" and local offset calculation - __counters[__i] = &__pcounter[__bin * __wg_size]; - __indices[__i] = *__counters[__i]; - *__counters[__i] = __indices[__i] + 1; + auto* __p = &__pcounter[__bin * __wg_size]; + __indices[__i] = *__p; + *__p = __indices[__i] + 1; } __dpl_sycl::__group_barrier(__it, decltype(__buf_count)::get_fence()); @@ -274,8 +271,16 @@ struct __subgroup_radix_sort _ONEDPL_PRAGMA_UNROLL for (uint16_t __i = 0; __i < __block_size; ++__i) { - // a global index is a local offset plus a global base index - __indices[__i] += *__counters[__i]; + // a global index is a local offset plus a global base index; + // recompute the bucket to avoid caching pointers in registers + const uint16_t __idx = __wi * __block_size + __i; + const uint16_t __bin = __idx < __n + ? __get_bucket( + __order_preserving_cast<__is_asc>( + std::invoke(__proj, __values.__v[__i])), + __begin_bit) + : __bin_count - 1; + __indices[__i] += __pcounter[__bin * __wg_size]; } } From 523edf1d0d08cd5847c1bc74db62e02e65171501 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 8 Apr 2026 16:05:09 -0400 Subject: [PATCH 2/3] another try Signed-off-by: Dan Hoeflinger --- .../parallel_backend_sycl_radix_sort_one_wg.h | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h index b82bb1f62f5..d1a65ac0994 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h @@ -215,6 +215,10 @@ struct __subgroup_radix_sort { uint16_t __indices[__block_size]; //indices for indirect access in the "re-order" phase { + // Cache bin indices (1 byte each) instead of full pointers (8 bytes each) + // to reduce register pressure while avoiding recomputation from values + uint16_t __bins[__block_size]; + //1. "counting" phase //counter initialization auto __pcounter = __dpl_sycl::__get_accessor_ptr(__counter_lacc) + __wi; @@ -236,6 +240,7 @@ struct __subgroup_radix_sort : __bin_count - 1 /*default bin for out of range elements (when idx >= n)*/; //"counting" and local offset calculation + __bins[__i] = __bin; auto* __p = &__pcounter[__bin * __wg_size]; __indices[__i] = *__p; *__p = __indices[__i] + 1; @@ -271,16 +276,8 @@ struct __subgroup_radix_sort _ONEDPL_PRAGMA_UNROLL for (uint16_t __i = 0; __i < __block_size; ++__i) { - // a global index is a local offset plus a global base index; - // recompute the bucket to avoid caching pointers in registers - const uint16_t __idx = __wi * __block_size + __i; - const uint16_t __bin = __idx < __n - ? __get_bucket( - __order_preserving_cast<__is_asc>( - std::invoke(__proj, __values.__v[__i])), - __begin_bit) - : __bin_count - 1; - __indices[__i] += __pcounter[__bin * __wg_size]; + // a global index is a local offset plus a global base index + __indices[__i] += __pcounter[__bins[__i] * __wg_size]; } } From acca558f01bc029511ad883b9d51ddfa8c82f9f3 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 8 Apr 2026 16:54:55 -0400 Subject: [PATCH 3/3] correct comment, add std:: for types Signed-off-by: Dan Hoeflinger --- .../parallel_backend_sycl_radix_sort_one_wg.h | 68 +++++++++---------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h index d1a65ac0994..a6c5bcdd5c0 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h @@ -178,7 +178,7 @@ struct __subgroup_radix_sort { assert(__src.size() <= std::numeric_limits::max()); assert(__block_size * __wg_size <= std::numeric_limits::max()); - uint16_t __n = __src.size(); + std::uint16_t __n = __src.size(); assert(__n <= __block_size * __wg_size); using _ValT = oneapi::dpl::__internal::__value_t<_RangeIn>; @@ -186,7 +186,7 @@ struct __subgroup_radix_sort const auto __counter_buf_sz = __get_counter_buf_size(__wg_size); _TempBuf<_ValT, _SLM_tag_val> __buf_val(__block_size * __wg_size); - _TempBuf __buf_count(__counter_buf_sz); + _TempBuf __buf_count(__counter_buf_sz); sycl::nd_range __range{sycl::range{__wg_size}, sycl::range{__wg_size}}; return __q.submit([&](sycl::handler& __cgh) { @@ -202,9 +202,9 @@ struct __subgroup_radix_sort _ValT __v[__block_size]; __storage() {} } __values; - uint16_t __wi = __it.get_local_linear_id(); - uint16_t __begin_bit = 0; - constexpr uint16_t __end_bit = sizeof(_KeyT) * ::std::numeric_limits::digits; + std::uint16_t __wi = __it.get_local_linear_id(); + std::uint16_t __begin_bit = 0; + constexpr std::uint16_t __end_bit = sizeof(_KeyT) * std::numeric_limits::digits; //copy(move) values construction __block_load<_ValT>(__wi, __src, __values.__v, __n); @@ -213,25 +213,24 @@ struct __subgroup_radix_sort while (true) { - uint16_t __indices[__block_size]; //indices for indirect access in the "re-order" phase + std::uint16_t __indices[__block_size]; //indices for indirect access in the "re-order" phase { - // Cache bin indices (1 byte each) instead of full pointers (8 bytes each) - // to reduce register pressure while avoiding recomputation from values - uint16_t __bins[__block_size]; + // Cache bin indices to avoid recomputation from values + std::uint16_t __bins[__block_size]; //1. "counting" phase //counter initialization auto __pcounter = __dpl_sycl::__get_accessor_ptr(__counter_lacc) + __wi; _ONEDPL_PRAGMA_UNROLL - for (uint16_t __i = 0; __i < __bin_count; ++__i) + for (std::uint16_t __i = 0; __i < __bin_count; ++__i) __pcounter[__i * __wg_size] = 0; _ONEDPL_PRAGMA_UNROLL - for (uint16_t __i = 0; __i < __block_size; ++__i) + for (std::uint16_t __i = 0; __i < __block_size; ++__i) { - const uint16_t __idx = __wi * __block_size + __i; - const uint16_t __bin = + const std::uint16_t __idx = __wi * __block_size + __i; + const std::uint16_t __bin = __idx < __n ? __get_bucket( __order_preserving_cast<__is_asc>( @@ -241,7 +240,7 @@ struct __subgroup_radix_sort //"counting" and local offset calculation __bins[__i] = __bin; - auto* __p = &__pcounter[__bin * __wg_size]; + std::uint32_t* __p = &__pcounter[__bin * __wg_size]; __indices[__i] = *__p; *__p = __indices[__i] + 1; } @@ -252,20 +251,21 @@ struct __subgroup_radix_sort //TODO: probably can be further optimized //scan contiguous numbers - uint16_t __bin_sum[__bin_count]; + std::uint16_t __bin_sum[__bin_count]; __bin_sum[0] = __counter_lacc[__wi * __bin_count]; _ONEDPL_PRAGMA_UNROLL - for (uint16_t __i = 1; __i < __bin_count; ++__i) + for (std::uint16_t __i = 1; __i < __bin_count; ++__i) __bin_sum[__i] = __bin_sum[__i - 1] + __counter_lacc[__wi * __bin_count + __i]; __dpl_sycl::__group_barrier(__it, decltype(__buf_count)::get_fence()); //exclusive scan local sum - uint16_t __sum_scan = __dpl_sycl::__exclusive_scan_over_group( - __it.get_group(), __bin_sum[__bin_count - 1], __dpl_sycl::__plus()); + std::uint16_t __sum_scan = __dpl_sycl::__exclusive_scan_over_group( + __it.get_group(), __bin_sum[__bin_count - 1], + __dpl_sycl::__plus()); //add to local sum, generate exclusive scan result _ONEDPL_PRAGMA_UNROLL - for (uint16_t __i = 0; __i < __bin_count; ++__i) + for (std::uint16_t __i = 0; __i < __bin_count; ++__i) __counter_lacc[__wi * __bin_count + __i + 1] = __sum_scan + __bin_sum[__i]; if (__wi == 0) @@ -274,7 +274,7 @@ struct __subgroup_radix_sort } _ONEDPL_PRAGMA_UNROLL - for (uint16_t __i = 0; __i < __block_size; ++__i) + for (std::uint16_t __i = 0; __i < __block_size; ++__i) { // a global index is a local offset plus a global base index __indices[__i] += __pcounter[__bins[__i] * __wg_size]; @@ -289,22 +289,22 @@ struct __subgroup_radix_sort { // the last iteration - writing out the result _ONEDPL_PRAGMA_UNROLL - for (uint16_t __i = 0; __i < __block_size; ++__i) + for (std::uint16_t __i = 0; __i < __block_size; ++__i) { - const uint16_t __r = __indices[__i]; + const std::uint16_t __r = __indices[__i]; if (__r < __n) { //move the values to source range and destroy the values - __src[__r] = ::std::move(__values.__v[__i]); + __src[__r] = std::move(__values.__v[__i]); __values.__v[__i].~_ValT(); } } //destroy values in exchange buffer _ONEDPL_PRAGMA_UNROLL - for (uint16_t __i = 0; __i < __block_size; ++__i) + for (std::uint16_t __i = 0; __i < __block_size; ++__i) { - const uint16_t __idx = __wi * __block_size + __i; + const std::uint16_t __idx = __wi * __block_size + __i; if (__idx < __n) __exchange_lacc[__idx].~_ValT(); } @@ -315,31 +315,31 @@ struct __subgroup_radix_sort if (__begin_bit == __radix) //the first sort iteration { _ONEDPL_PRAGMA_UNROLL - for (uint16_t __i = 0; __i < __block_size; ++__i) + for (std::uint16_t __i = 0; __i < __block_size; ++__i) { - const uint16_t __r = __indices[__i]; + const std::uint16_t __r = __indices[__i]; if (__r < __n) - new (&__exchange_lacc[__r]) _ValT(::std::move(__values.__v[__i])); + new (&__exchange_lacc[__r]) _ValT(std::move(__values.__v[__i])); } } else { _ONEDPL_PRAGMA_UNROLL - for (uint16_t __i = 0; __i < __block_size; ++__i) + for (std::uint16_t __i = 0; __i < __block_size; ++__i) { - const uint16_t __r = __indices[__i]; + const std::uint16_t __r = __indices[__i]; if (__r < __n) - __exchange_lacc[__r] = ::std::move(__values.__v[__i]); + __exchange_lacc[__r] = std::move(__values.__v[__i]); } } __dpl_sycl::__group_barrier(__it, decltype(__buf_val)::get_fence()); _ONEDPL_PRAGMA_UNROLL - for (uint16_t __i = 0; __i < __block_size; ++__i) + for (std::uint16_t __i = 0; __i < __block_size; ++__i) { - const uint16_t __idx = __wi * __block_size + __i; + const std::uint16_t __idx = __wi * __block_size + __i; if (__idx < __n) - __values.__v[__i] = ::std::move(__exchange_lacc[__idx]); + __values.__v[__i] = std::move(__exchange_lacc[__idx]); } __dpl_sycl::__group_barrier(__it, decltype(__buf_val)::get_fence()); }