Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,7 @@ list(APPEND HWY_CONTRIB_SOURCES
hwy/contrib/sort/sorting_networks-inl.h
hwy/contrib/sort/traits-inl.h
hwy/contrib/sort/traits128-inl.h
hwy/contrib/sort/order-emulate-inl.h
hwy/contrib/sort/vqsort-inl.h
hwy/contrib/sort/vqsort.cc
hwy/contrib/sort/vqsort.h
Expand Down
1 change: 1 addition & 0 deletions hwy/contrib/sort/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ VQSORT_TEXTUAL_HDRS = [
"sorting_networks-inl.h",
"traits-inl.h",
"traits128-inl.h",
"order-emulate-inl.h",
"vqsort-inl.h",
# Placeholder for internal instrumentation. Do not remove.
]
Expand Down
224 changes: 224 additions & 0 deletions hwy/contrib/sort/order-emulate-inl.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
// Emulated floating-point total order
//
// This implementation sorts floating-point values by reinterpreting them as
// unsigned integer bit patterns instead of using the FPU. It does not depend on
// the floating-point control register, so there is no flush-to-zero handling.
//
// NaNs are already replaced by ±Inf before calling this code, so no special
// handling is needed here.
// Because ordering is emulated, we guarantee a stable rule for zeros: -0.0
// always comes before +0.0.
//
// SPDX-License-Identifier: BSD-3-Clause
#if defined(HIGHWAY_HWY_CONTRIB_SORT_ORDER_EMULATE_TOGGLE) == \
defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_ORDER_EMULATE_TOGGLE
#undef HIGHWAY_HWY_CONTRIB_SORT_ORDER_EMULATE_TOGGLE
#else
#define HIGHWAY_HWY_CONTRIB_SORT_ORDER_EMULATE_TOGGLE
#endif

#include <stddef.h>
#include <stdint.h>

#include "hwy/contrib/sort/order.h" // SortDescending
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {


// Converts the lanes of `a` into signed-integer sort keys.
//
// Lanes are reinterpreted as signed integers. Lanes that are negative under
// that view (sign bit set) get all bits below the sign bit inverted; other
// lanes are returned unchanged. For sign-magnitude floating-point bit
// patterns this reverses the order among negative inputs, so a plain signed
// `Lt` on the keys orders them, with -0.0 strictly before +0.0.
template <class VU, class D = DFromV<VU>, class DI = RebindToSigned<D>>
HWY_API Vec<DI> LtBinKey(VU a) {
  const DI di;
  using TI = TFromD<DI>;
  // Reinterpret once and reuse for both the test and the XOR.
  const Vec<DI> bits = BitCast(di, a);
  // Mask applied to negative lanes only (0x7FFF.. per the scalar path).
  const Vec<DI> magnitude_mask = Set(di, TI(SignMask<TI>() - 1));
  const Vec<DI> flip = IfNegativeThenElseZero(bits, magnitude_mask);
  return Xor(bits, flip);
}

// Returns a mask for D that is true in lanes where `a` sorts strictly before
// `b`, comparing the signed keys produced by LtBinKey. Only enabled for
// unsigned lane types.
template <class VU, class D = DFromV<VU>, class M = MFromD<D>,
          HWY_IF_UNSIGNED_D(D)>
HWY_API M LtBin(VU a, VU b) {
  const D d;
  const auto key_a = LtBinKey(a);
  const auto key_b = LtBinKey(b);
  // The comparison happens on signed lanes; convert the mask back to D.
  return RebindMask(d, Lt(key_a, key_b));
}

// Ascending sort-order traits that compare floating-point keys through their
// integer bit patterns rather than with floating-point instructions.
//
// `T` (Base::LaneType) is the integer lane type holding the bits of the key
// type `TF` (Base::KeyType). `Order_` is not referenced by this primary
// template; SortDescending has its own specialization.
template <class Base, class Order_>
struct OrderEmulate : public Base {
  using T = typename Base::LaneType;
  using TF = typename Base::KeyType;

  // Scalar equality of the raw lane values: -0.0 and +0.0 differ, and a NaN
  // bit pattern equals itself.
  HWY_INLINE bool Equal1(const T* a, const T* b) const {
    return *a == *b;
  }

  template <class D>
  HWY_INLINE Mask<D> EqualKeys(D, Vec<D> a, Vec<D> b) const {
    return Eq(a, b);  // Bitwise equality, -0 != +0, +-NaN is equal to itself
  }

  template <class D>
  HWY_INLINE Mask<D> NotEqualKeys(D, Vec<D> a, Vec<D> b) const {
    return Ne(a, b);  // Bitwise inequality, -0 != +0, +-NaN is equal to itself
  }

  // Scalar "less than" on bit patterns, with -0.0 < +0.0. Every bit pattern
  // (including NaN patterns, should any reach this point) has a definite
  // position because only integer comparisons are used.
  HWY_INLINE bool Compare1(const T* a_, const T* b_) const {
    const T a = *a_;
    const T b = *b_;
    using TI = MakeSigned<T>;
    constexpr int kMSB = 8 * sizeof(T) - 1;
    // All bits below the sign bit.
    constexpr T neg_flip = T((T(1) << kMSB) - 1);
    // All-ones if the sign bit is set, otherwise zero.
    const T a_neg = 0 - (a >> kMSB);
    const T b_neg = 0 - (b >> kMSB);
    // Signed-domain keys (xor 0x7FFF.. only for negatives)
    const T sa = a ^ (a_neg & neg_flip);
    const T sb = b ^ (b_neg & neg_flip);
    return static_cast<TI>(sa) < static_cast<TI>(sb);
  }

  // Vector counterpart of Compare1: true in lanes where a sorts before b.
  template <class D>
  HWY_INLINE Mask<D> Compare(D, Vec<D> a, Vec<D> b) const {
    return LtBin(a, b);
  }

  // Two halves of Sort2, used in ScanMinMax.
  template <class D>
  HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const {
    return IfThenElse(LtBin(a, b), a, b);
  }

  template <class D>
  HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const {
    return IfThenElse(LtBin(a, b), b, a);
  }

  // Returns the lane of `v` that sorts first. The minimum is found in key
  // space; lanes whose key differs from it are zeroed, and MaxOfLanes then
  // picks out the original bits of the matching lane(s).
  template <class D>
  HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
                                 T* HWY_RESTRICT /* buf */) const {
    const RebindToSigned<D> di;
    using VI = Vec<decltype(di)>;
    VI key = LtBinKey(v);
    VI min = MinOfLanes(di, key);
    Mask<D> m = RebindMask(d, Eq(min, key));
    return MaxOfLanes(d, IfThenElseZero(m, v));
  }

  // Returns the lane of `v` that sorts last; same scheme as FirstOfLanes but
  // with the maximum key.
  template <class D>
  HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
                                T* HWY_RESTRICT /* buf */) const {
    const RebindToSigned<D> di;
    using VI = Vec<decltype(di)>;
    VI key = LtBinKey(v);
    VI max = MaxOfLanes(di, key);
    Mask<D> m = RebindMask(d, Eq(max, key));
    return MaxOfLanes(d, IfThenElseZero(m, v));
  }

  // Bit pattern of NegativeInfOrLowestValue<TF>() in all lanes.
  template <class D>
  HWY_INLINE Vec<D> FirstValue(D d) const {
    return Set(d, BitCastScalar<T>(NegativeInfOrLowestValue<TF>()));
  }

  // Bit pattern of PositiveInfOrHighestValue<TF>() in all lanes.
  template <class D>
  HWY_INLINE Vec<D> LastValue(D d) const {
    return Set(d, BitCastScalar<T>(PositiveInfOrHighestValue<TF>()));
  }

  // Returns the next distinct smaller value unless already -inf.
  template <class D, class V = Vec<D>>
  HWY_INLINE V PrevValue(D, V v) const {
    return NextSortValueBits<true>(v);
  }

  // Next representable value in total order by ±1 ULP, saturating at ±Inf.
  //   IsDown = false → next larger
  //   IsDown = true  → next smaller
  // Stepping from one zero moves past the other zero (step of 2 in key
  // space), so the result is the nearest nonzero value in that direction.
  template <bool IsDown, class V>
  HWY_INLINE V NextSortValueBits(V u) const {
    const DFromV<V> d;
    using M = Mask<decltype(d)>;
    constexpr T kSignBit = SignMask<T>();
    constexpr T kBoundaryUp = SignMask<T>() - 1;
    const V sign_bit = Set(d, kSignBit);
    const V all1 = Set(d, T(~T(0)));
    const V one = Set(d, T(1));
    // Detect saturation at ±Inf
    const M is_target_inf = Eq(u, IsDown ? FirstValue(d) : LastValue(d));
    // Transform to monotonic space: flip sign for positives, invert for
    // negatives. Keys then increase with the sorted order when compared as
    // unsigned integers.
    const M is_neg = TestBit(u, sign_bit);
    const V key = Xor(u, IfThenElse(is_neg, all1, sign_bit));
    // Boundary detection: +0/-0 swap needs a step of 2 instead of 1.
    // In key space +0 is kSignBit and -0 is kSignBit - 1.
    const V boundary = Set(d, IsDown ? kSignBit : kBoundaryUp);
    const M at_boundary = Eq(key, boundary);
    // Step size: normally 1, but 2 at zero-boundary
    const V step = Add(one, IfThenElseZero(at_boundary, one));
    // Apply increment/decrement unless already at ±Inf
    const V key2 = IfThenElse(is_target_inf, key,
                              IsDown ? Sub(key, step) : Add(key, step));
    // Transform back from monotonic space: keys below kSignBit came from
    // negative inputs.
    const M neg_out = Lt(key2, sign_bit);
    return Xor(key2, IfThenElse(neg_out, all1, sign_bit));
  }
};

// Descending order: each operation forwards to the ascending implementation
// with the operands, or the first/last roles, swapped.
//
// The ascending traits are a direct base class, so they are reached with
// qualified calls (AscBase::...) instead of casting `this`.
template <class Base>
struct OrderEmulate<Base, SortDescending>
    : public OrderEmulate<Base, SortAscending> {
  using AscBase = OrderEmulate<Base, SortAscending>;
  using T = typename Base::LaneType;
  using TF = typename Base::KeyType;

  // True if *a sorts before *b in descending order.
  HWY_INLINE bool Compare1(const T* a, const T* b) const {
    return AscBase::Compare1(b, a);
  }

  // Vector counterpart of Compare1.
  template <class D>
  HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const {
    return AscBase::Compare(d, b, a);
  }

  // First/Last swap roles relative to the ascending order.
  template <class D>
  HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const {
    return AscBase::Last(d, a, b);
  }

  template <class D>
  HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const {
    return AscBase::First(d, a, b);
  }

  template <class D>
  HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
                                 T* HWY_RESTRICT b) const {
    return AscBase::LastOfLanes(d, v, b);
  }

  template <class D>
  HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
                                T* HWY_RESTRICT b) const {
    return AscBase::FirstOfLanes(d, v, b);
  }

  template <class D>
  HWY_INLINE Vec<D> FirstValue(D d) const {
    return AscBase::LastValue(d);
  }

  template <class D>
  HWY_INLINE Vec<D> LastValue(D d) const {
    return AscBase::FirstValue(d);
  }

  // The value preceding `v` in descending order is the next larger one.
  template <class D, class V = Vec<D>>
  HWY_INLINE V PrevValue(D, V v) const {
    return this->template NextSortValueBits<false>(v);
  }
};

} // namespace detail
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();

#endif // HIGHWAY_HWY_CONTRIB_SORT_ORDER_EMULATE_TOGGLE
26 changes: 20 additions & 6 deletions hwy/contrib/sort/sort_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -61,17 +61,32 @@ using detail::OrderDescendingKV128;
using detail::Traits128;
#endif // !HAVE_INTEL && HWY_TARGET != HWY_SCALAR

// Fills [first, last) with val, val + 1, val + 2, ... by forwarding to
// std::iota. Exists so that individual key types can be specialized.
template <typename T>
inline void IotaWrapper(T* first, T* last, T val) {
  std::iota(first, last, val);
}
// Emulate std::iota for hwy::float16_t: some compilers, mostly on ARM and
// LoongArch, complain about operator++ on that type.
template <>
inline void IotaWrapper<hwy::float16_t>(hwy::float16_t* first,
                                        hwy::float16_t* last,
                                        hwy::float16_t val) {
  // Count in float and convert each value back, so that no increment is
  // ever applied to a hwy::float16_t.
  float next = ConvertScalarTo<float>(val);
  while (first != last) {
    *first = ConvertScalarTo<hwy::float16_t>(next);
    ++first;
    ++next;
  }
}

template <typename Key>
void TestSortIota(hwy::ThreadPool& pool) {
pool.Run(128, 300, [](uint64_t task, size_t /*thread*/) {
const size_t num = static_cast<size_t>(task);
Key keys[300];
std::iota(keys, keys + num, Key{0});
IotaWrapper(keys, keys + num, ConvertScalarTo<Key>(0));
VQSort(keys, num, hwy::SortAscending());
for (size_t i = 0; i < num; ++i) {
if (keys[i] != static_cast<Key>(i)) {
if (keys[i] != ConvertScalarTo<Key>(i)) {
HWY_ABORT("num %zu i %zu: not iota, got %.0f\n", num, i,
static_cast<double>(keys[i]));
ConvertScalarTo<double>(keys[i]));
}
}
});
Expand All @@ -86,10 +101,9 @@ void TestAllSortIota() {
TestSortIota<int64_t>(pool);
TestSortIota<uint64_t>(pool);
}
TestSortIota<hwy::float16_t>(pool);
TestSortIota<float>(pool);
if (hwy::HaveFloat64()) {
TestSortIota<double>(pool);
}
TestSortIota<double>(pool);
#endif
}

Expand Down
Loading
Loading