From 61255d6df1580b84929d755376ab005e8c0aaa7f Mon Sep 17 00:00:00 2001 From: Kenneth Weiss Date: Wed, 10 Jun 2026 21:51:12 +0000 Subject: [PATCH 01/28] Fix FlatMap copy assignment -- need to compare addresses, not values Adds typed tests covering assignment over a non-empty target, source preservation, and self-assignment. --- src/axom/core/FlatMap.hpp | 2 +- src/axom/core/tests/core_flatmap.hpp | 39 ++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/src/axom/core/FlatMap.hpp b/src/axom/core/FlatMap.hpp index 9e6d8d5e98..13f83dfdda 100644 --- a/src/axom/core/FlatMap.hpp +++ b/src/axom/core/FlatMap.hpp @@ -179,7 +179,7 @@ class FlatMap : detail::flat_map::SequentialLookupPolicy::value, "Cannot copy an axom::FlatMap when value type is not " "copy-constructible."); - if(*this != other) + if(this != &other) { FlatMap new_map(other); swap(new_map); diff --git a/src/axom/core/tests/core_flatmap.hpp b/src/axom/core/tests/core_flatmap.hpp index 705b2eb073..b032c9887d 100644 --- a/src/axom/core/tests/core_flatmap.hpp +++ b/src/axom/core/tests/core_flatmap.hpp @@ -481,6 +481,45 @@ AXOM_TYPED_TEST(core_flatmap, init_and_copy) } } +AXOM_TYPED_TEST(core_flatmap, copy_assign) +{ + using MapType = typename TestFixture::MapType; + MapType test_map; + const int NUM_ELEMS = 40; + + for(int i = 0; i < NUM_ELEMS; i++) + { + test_map[this->getKey(i)] = this->getValue(i + 10.0); + } + + // Copy-assign over a non-empty map with different contents should replace prior contents + MapType copied_map; + copied_map[this->getKey(NUM_ELEMS + 5)] = this->getValue(0.0); + copied_map = test_map; + + EXPECT_EQ(copied_map.size(), NUM_ELEMS); + EXPECT_EQ(copied_map.find(this->getKey(NUM_ELEMS + 5)), copied_map.end()); + for(int i = 0; i < NUM_ELEMS; i++) + { + auto it = copied_map.find(this->getKey(i)); + ASSERT_NE(it, copied_map.end()); + EXPECT_EQ(it->second, this->getValue(i + 10.0)); + } + + // The source should be unchanged + EXPECT_EQ(test_map.size(), 
NUM_ELEMS); + + // Self-assignment is a no-op + copied_map = static_cast(copied_map); + EXPECT_EQ(copied_map.size(), NUM_ELEMS); + for(int i = 0; i < NUM_ELEMS; i++) + { + auto it = copied_map.find(this->getKey(i)); + ASSERT_NE(it, copied_map.end()); + EXPECT_EQ(it->second, this->getValue(i + 10.0)); + } +} + AXOM_TYPED_TEST(core_flatmap, insert_until_rehash) { using MapType = typename TestFixture::MapType; From 2baf6a5b6dd5c0d997594428edacbc5852592b0a Mon Sep 17 00:00:00 2001 From: Kenneth Weiss Date: Wed, 10 Jun 2026 21:52:27 +0000 Subject: [PATCH 02/28] Remove FlatMap's const operator[], which inserts for missing keys Removing it cannot break callers since this would not have compiled. Const callers should use find()/at()/count()/contains(). at() throws std::out_of_range on a missing key. --- src/axom/core/FlatMap.hpp | 9 ------- src/axom/core/tests/core_flatmap.hpp | 35 ++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/src/axom/core/FlatMap.hpp b/src/axom/core/FlatMap.hpp index 13f83dfdda..0a7b1209c4 100644 --- a/src/axom/core/FlatMap.hpp +++ b/src/axom/core/FlatMap.hpp @@ -332,7 +332,6 @@ class FlatMap : detail::flat_map::SequentialLookupPolicy::value, @@ -340,14 +339,6 @@ class FlatMap : detail::flat_map::SequentialLookupPolicytry_emplace(key).first->second; } - const ValueType& operator[](const KeyType& key) const - { - static_assert(std::is_default_constructible::value, - "Cannot use axom::FlatMap::operator[] when value type is not " - "default-constructible."); - return this->try_emplace(key).first->second; - } - /// @} /*! * \brief Return the number of entries matching a given key. 
diff --git a/src/axom/core/tests/core_flatmap.hpp b/src/axom/core/tests/core_flatmap.hpp index b032c9887d..89c6616fab 100644 --- a/src/axom/core/tests/core_flatmap.hpp +++ b/src/axom/core/tests/core_flatmap.hpp @@ -520,6 +520,41 @@ AXOM_TYPED_TEST(core_flatmap, copy_assign) } } +AXOM_TYPED_TEST(core_flatmap, const_lookup) +{ + using MapType = typename TestFixture::MapType; + MapType test_map; + const int NUM_ELEMS = 20; + + for(int i = 0; i < NUM_ELEMS; i++) + { + test_map[this->getKey(i)] = this->getValue(i + 10.0); + } + + // Read-only lookups must be through const reference (matching std::unordered_map) + // operator[] is intentionally non-const since it inserts on a missing key + const MapType& const_map = test_map; + EXPECT_EQ(const_map.size(), NUM_ELEMS); + for(int i = 0; i < NUM_ELEMS; i++) + { + auto key = this->getKey(i); + auto value = this->getValue(i + 10.0); + + auto it = const_map.find(key); + ASSERT_NE(it, const_map.end()); + EXPECT_EQ(it->second, value); + EXPECT_EQ(const_map.at(key), value); + EXPECT_EQ(const_map.count(key), 1); + EXPECT_TRUE(const_map.contains(key)); + } + + auto missing = this->getKey(NUM_ELEMS + 5); + EXPECT_EQ(const_map.find(missing), const_map.end()); + EXPECT_EQ(const_map.count(missing), 0); + EXPECT_FALSE(const_map.contains(missing)); + EXPECT_THROW(const_map.at(missing), std::out_of_range); +} + AXOM_TYPED_TEST(core_flatmap, insert_until_rehash) { using MapType = typename TestFixture::MapType; From d1cb266754c3afdfd2ba7d748b37139330b75d09 Mon Sep 17 00:00:00 2001 From: Kenneth Weiss Date: Wed, 10 Jun 2026 21:54:17 +0000 Subject: [PATCH 03/28] DeviceHash: hash in 64 bits regardless of IndexType width DeviceHashHelper returned axom::IndexType and integer keys were converted before the 64-bit mixer ran. With AXOM_USE_64BIT_INDEXTYPE=OFF every key wider than 32 bits is truncated first, so keys equal mod 2^32 produce identical final hashes. 
This was happening in the Morton codes in spin's SparseOctreeLevel and in numerics/quadrature. --- src/axom/core/DeviceHash.hpp | 36 +++++++++++++++--------- src/axom/core/tests/core_device_hash.hpp | 31 ++++++++++++++++++++ 2 files changed, 53 insertions(+), 14 deletions(-) diff --git a/src/axom/core/DeviceHash.hpp b/src/axom/core/DeviceHash.hpp index 0c934fbadc..cbd6d5e3e6 100644 --- a/src/axom/core/DeviceHash.hpp +++ b/src/axom/core/DeviceHash.hpp @@ -11,6 +11,7 @@ #include "axom/core/Macros.hpp" #include "axom/core/Types.hpp" +#include #include namespace axom @@ -25,8 +26,11 @@ template struct DeviceHashHelper::value>> { using argument_type = T; - using result_type = axom::IndexType; - AXOM_HOST_DEVICE axom::IndexType operator()(T value) const { return value; } + using result_type = std::uint64_t; + AXOM_HOST_DEVICE std::uint64_t operator()(T value) const + { + return static_cast(value); + } }; /// \brief Specialization for floating-point types @@ -34,15 +38,15 @@ template struct DeviceHashHelper::value>> { using argument_type = T; - using result_type = axom::IndexType; - AXOM_HOST_DEVICE axom::IndexType operator()(T value) const + using result_type = std::uint64_t; + AXOM_HOST_DEVICE std::uint64_t operator()(T value) const { // Special case: -0.0 and 0.0 compare equal but have different byte representations. 
if(value == T {0.}) { return 0; } - return value; + return static_cast(static_cast(value)); } }; @@ -51,10 +55,10 @@ template struct DeviceHashHelper::value>> { using argument_type = T; - using result_type = axom::IndexType; - AXOM_HOST_DEVICE axom::IndexType operator()(T value) const + using result_type = std::uint64_t; + AXOM_HOST_DEVICE std::uint64_t operator()(T value) const { - return static_cast(value); + return static_cast(value); } }; @@ -63,10 +67,10 @@ template struct DeviceHashHelper { using argument_type = T*; - using result_type = axom::IndexType; - AXOM_HOST_DEVICE axom::IndexType operator()(T* ptr) const + using result_type = std::uint64_t; + AXOM_HOST_DEVICE std::uint64_t operator()(T* ptr) const { - return static_cast(reinterpret_cast(ptr)); + return static_cast(reinterpret_cast(ptr)); } }; @@ -75,10 +79,10 @@ template struct DeviceHashHelper { using argument_type = T; - using result_type = axom::IndexType; - axom::IndexType operator()(const T& object) const + using result_type = std::uint64_t; + std::uint64_t operator()(const T& object) const { - return static_cast(std::hash {}(object)); + return static_cast(std::hash {}(object)); } }; @@ -89,6 +93,10 @@ struct DeviceHashHelper * * \brief Implements a host/device-callable hash function for supported types, * and passes through to std::hash otherwise. + * + * The result type is always std::uint64_t, independent of the configured axom::IndexType width. + * Hashes feed bit mixers and bucket selection, where truncating wide keys (e.g. 64-bit Morton codes) + * to a 32-bit IndexType before mixing would make keys equal mod 2^32 collide. 
*/ template struct DeviceHash : public detail::DeviceHashHelper diff --git a/src/axom/core/tests/core_device_hash.hpp b/src/axom/core/tests/core_device_hash.hpp index 8ccee915d9..d19ac69386 100644 --- a/src/axom/core/tests/core_device_hash.hpp +++ b/src/axom/core/tests/core_device_hash.hpp @@ -259,3 +259,34 @@ AXOM_TYPED_TEST(core_device_hash, hash_user_defined) } } } + +TEST(core_device_hash, hash_width_decoupled_from_indextype) +{ + // The hash result must be 64 bits wide regardless of the configured + // axom::IndexType. When the result type was IndexType, builds with + // AXOM_USE_64BIT_INDEXTYPE=OFF truncated integer keys to 32 bits before + // the FlatMap bit mixer ran, so keys equal mod 2^32 (e.g. deep Morton codes) + // produced identical hashes. The type assertions catch the coupling in every + // build configuration; the value checks fail in the truncating configuration itself. + static_assert(std::is_same::result_type, std::uint64_t>::value, + "integral hash result must be std::uint64_t"); + static_assert(std::is_same::result_type, std::uint64_t>::value, + "integral hash result must be std::uint64_t"); + static_assert(std::is_same::result_type, std::uint64_t>::value, + "floating-point hash result must be std::uint64_t"); + static_assert(std::is_same::result_type, std::uint64_t>::value, + "pointer hash result must be std::uint64_t"); + static_assert(std::is_same::result_type, std::uint64_t>::value, + "catch-all (std::hash) result must be std::uint64_t"); + static_assert( + std::is_same {}(std::uint64_t {})), std::uint64_t>::value, + "integral hash operator() must return std::uint64_t"); + + axom::DeviceHash device_hasher; + const std::uint64_t base = 1; + const std::uint64_t plus_2_32 = base + (std::uint64_t {1} << 32); + const std::uint64_t plus_2_33 = base + (std::uint64_t {1} << 33); + EXPECT_NE(device_hasher(base), device_hasher(plus_2_32)); + EXPECT_NE(device_hasher(base), device_hasher(plus_2_33)); + EXPECT_NE(device_hasher(plus_2_32), 
device_hasher(plus_2_33)); +} From 51f9a863c2a063d54db4d77918b94769ed5b344c Mon Sep 17 00:00:00 2001 From: Kenneth Weiss Date: Wed, 10 Jun 2026 22:00:30 +0000 Subject: [PATCH 04/28] DeviceHash: hash floating-point keys by bit pattern, not value The floating-point specialization returned the key converted to an integer. Every key sharing an integer part therefore collided -- e.g. all numbers between -1 and 1 converted to the integer 0, so a FlatMap keyed on fractional floats degenerated into one probe chain with O(size) inserts and finds --- src/axom/core/DeviceHash.hpp | 27 +++++++++++++++-- src/axom/core/tests/core_device_hash.hpp | 38 +++++++++++++++++++++++- src/axom/core/tests/core_flatmap.hpp | 27 +++++++++++++++++ 3 files changed, 88 insertions(+), 4 deletions(-) diff --git a/src/axom/core/DeviceHash.hpp b/src/axom/core/DeviceHash.hpp index cbd6d5e3e6..d1ac31ced2 100644 --- a/src/axom/core/DeviceHash.hpp +++ b/src/axom/core/DeviceHash.hpp @@ -12,6 +12,7 @@ #include "axom/core/Types.hpp" #include +#include #include namespace axom @@ -41,12 +42,32 @@ struct DeviceHashHelper::value>> using result_type = std::uint64_t; AXOM_HOST_DEVICE std::uint64_t operator()(T value) const { - // Special case: -0.0 and 0.0 compare equal but have different byte representations. + // -0.0 and 0.0 compare equal but have different bit patterns; normalize so both hash identically if(value == T {0.}) { - return 0; + value = T {0.}; } - return static_cast(static_cast(value)); + + // Hash the bit pattern, not the converted value. + // A float-to-integer value conversion collapses every key sharing an integer part, + // e.g. 
all numbers between -1 and 1 convert to integer 0 + + // NUM_WORDS is 1 for float or double, possibly 2 for long double + constexpr std::size_t NUM_WORDS = (sizeof(T) + sizeof(std::uint64_t) - 1) / sizeof(std::uint64_t); + // zero out words since we might only copy 4 bytes in for floats + std::uint64_t words[NUM_WORDS] = {0}; + memcpy(words, &value, sizeof(T)); + + std::uint64_t result = words[0]; + // Extra processing for types wider than 64 bits (long double). + // Use an odd multiplier (2^64/golden-ratio-phi), + // so the halves cannot cancel under a later XOR-style mixer + for(std::size_t i = 1; i < NUM_WORDS; i++) + { + result = result * std::uint64_t {0x9e3779b97f4a7c15} + words[i]; + } + + return result; } }; diff --git a/src/axom/core/tests/core_device_hash.hpp b/src/axom/core/tests/core_device_hash.hpp index d19ac69386..58173725e1 100644 --- a/src/axom/core/tests/core_device_hash.hpp +++ b/src/axom/core/tests/core_device_hash.hpp @@ -11,6 +11,9 @@ // gtest includes #include "gtest/gtest.h" +// C++ includes +#include + template class core_device_hash : public ::testing::Test { @@ -274,7 +277,7 @@ TEST(core_device_hash, hash_width_decoupled_from_indextype) "integral hash result must be std::uint64_t"); static_assert(std::is_same::result_type, std::uint64_t>::value, "floating-point hash result must be std::uint64_t"); - static_assert(std::is_same::result_type, std::uint64_t>::value, + static_assert(std::is_same::result_type, std::uint64_t>::value, "pointer hash result must be std::uint64_t"); static_assert(std::is_same::result_type, std::uint64_t>::value, "catch-all (std::hash) result must be std::uint64_t"); @@ -290,3 +293,36 @@ TEST(core_device_hash, hash_width_decoupled_from_indextype) EXPECT_NE(device_hasher(base), device_hasher(plus_2_32)); EXPECT_NE(device_hasher(base), device_hasher(plus_2_33)); EXPECT_NE(device_hasher(plus_2_32), device_hasher(plus_2_33)); } + +TEST(core_device_hash, hash_float_bit_pattern) +{ + // Floating-point keys must be hashed by bit pattern, not by integer value conversion. 
+ // This is a regression test for a previous implementation where the conversion collapsed + // every key with the same integer value, e.g. all numbers between -1 and 1 converted to integer 0 + // so a FlatMap keyed on fractional floats degenerated into a single probe chain. + axom::DeviceHash float_hasher; + axom::DeviceHash double_hasher; + + EXPECT_NE(float_hasher(0.25f), float_hasher(0.75f)); + EXPECT_NE(float_hasher(0.25f), std::uint64_t {0}); + EXPECT_NE(double_hasher(0.25), double_hasher(0.75)); + + // A spread of fractional keys must be collision-free at this scale + std::set float_hashes, double_hashes; + const int NUM_KEYS = 1000; + for(int i = 1; i <= NUM_KEYS; i++) + { + float_hashes.insert(float_hasher(i / static_cast(NUM_KEYS + 1))); + double_hashes.insert(double_hasher(i / static_cast(NUM_KEYS + 1))); + } + EXPECT_EQ(float_hashes.size(), NUM_KEYS); + EXPECT_EQ(double_hashes.size(), NUM_KEYS); + + // Signed zeros compare equal and must hash equal + EXPECT_EQ(float_hasher(0.0f), float_hasher(-0.0f)); + EXPECT_EQ(double_hasher(0.0), double_hasher(-0.0)); + + // Magnitudes beyond any integer type's range are now well-defined and distinct + EXPECT_NE(double_hasher(1e300), double_hasher(2e300)); + EXPECT_NE(float_hasher(-0.5f), float_hasher(0.5f)); +} diff --git a/src/axom/core/tests/core_flatmap.hpp b/src/axom/core/tests/core_flatmap.hpp index 89c6616fab..186fbbf70a 100644 --- a/src/axom/core/tests/core_flatmap.hpp +++ b/src/axom/core/tests/core_flatmap.hpp @@ -129,6 +129,33 @@ using MyTypes = ::testing::Types, TYPED_TEST_SUITE(core_flatmap, MyTypes); +TEST(core_flatmap_unit, float_keys_in_unit_interval) +{ + // Regression test for the floating-point DeviceHash specialization, which + // converted keys to integers by value: every key in (0, 1) hashed to 0, so + // this map was a single probe chain and each insert/find was O(size). + // With bit-pattern hashing the keys spread normally. 
+ axom::FlatMap test_map; + const int NUM_ELEMS = 512; + + for(int i = 1; i <= NUM_ELEMS; i++) + { + float key = i / static_cast(NUM_ELEMS + 2); + test_map[key] = i; + } + + EXPECT_EQ(test_map.size(), NUM_ELEMS); + for(int i = 1; i <= NUM_ELEMS; i++) + { + float key = i / static_cast(NUM_ELEMS + 2); + auto it = test_map.find(key); + ASSERT_NE(it, test_map.end()); + EXPECT_EQ(it->second, i); + } + EXPECT_EQ(test_map.find(1.5f), test_map.end()); + EXPECT_EQ(test_map.count(0.5f), 1); +} + AXOM_TYPED_TEST(core_flatmap, default_init) { using MapType = typename TestFixture::MapType; From 2c0060213e1337f7b93ee01630ae30cb9532ad7a Mon Sep 17 00:00:00 2001 From: Kenneth Weiss Date: Wed, 10 Jun 2026 22:03:17 +0000 Subject: [PATCH 05/28] FlatTable: wrap probe advance with a mask, not a signed division The quadratic probe advance in probeIndex and probeEmptyIndex wrapped using a mod (%) operator. Since the group count is always a power of two, we can use a bitmask instead. Adds a cross-group probe stress test: a degenerate hash drives 600 keys through one initial group so inserts, lookups, misses, erases, and reinserts all walk and wrap the group sequence. --- src/axom/core/detail/FlatTable.hpp | 27 ++++++----- src/axom/core/tests/core_flatmap.hpp | 72 ++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+), 11 deletions(-) diff --git a/src/axom/core/detail/FlatTable.hpp b/src/axom/core/detail/FlatTable.hpp index e99853bdcf..66ac4019b7 100644 --- a/src/axom/core/detail/FlatTable.hpp +++ b/src/axom/core/detail/FlatTable.hpp @@ -319,10 +319,11 @@ struct SequentialLookupPolicy : ProbePolicy IndexType probeEmptyIndex(int ngroups_pow_2, ArrayView metadata, HashType hash) const { // We use the k MSBs of the hash as the initial group probe point, - // where ngroups = 2^k. - int bitshift_right = ((CHAR_BIT * sizeof(HashType)) - ngroups_pow_2); - HashType curr_group = hash >> bitshift_right; - curr_group &= ((1 << ngroups_pow_2) - 1); + // where ngroups = 2^k. 
Since the group count is always a power of two, + // wrapping a group index is a bitwise AND with this mask. + const int bitshift_right = ((CHAR_BIT * sizeof(HashType)) - ngroups_pow_2); + const HashType group_mask = (HashType {1} << ngroups_pow_2) - 1; + HashType curr_group = (hash >> bitshift_right) & group_mask; int empty_group = NO_MATCH; int empty_bucket = NO_MATCH; @@ -347,7 +348,10 @@ struct SequentialLookupPolicy : ProbePolicy // Set the overflow bit and continue probing. metadata[curr_group].setOverflow(hash_8); } - curr_group = (curr_group + this->getNext(iteration)) % metadata.size(); + // Mask instead of "% metadata.size()": the group count is a power of + // two, and the modulo compiled to a 64-bit signed division on the + // critical path of every probe continuation. + curr_group = (curr_group + this->getNext(iteration)) & group_mask; } if(empty_group != NO_MATCH) { @@ -373,10 +377,11 @@ struct SequentialLookupPolicy : ProbePolicy FoundIndex&& on_hash_found) const { // We use the k MSBs of the hash as the initial group probe point, - // where ngroups = 2^k. - int bitshift_right = ((CHAR_BIT * sizeof(HashType)) - ngroups_pow_2); - HashType curr_group = hash >> bitshift_right; - curr_group &= ((1 << ngroups_pow_2) - 1); + // where ngroups = 2^k. Since the group count is always a power of two, + // wrapping a group index is a bitwise AND with this mask. + const int bitshift_right = ((CHAR_BIT * sizeof(HashType)) - ngroups_pow_2); + const HashType group_mask = (HashType {1} << ngroups_pow_2) - 1; + HashType curr_group = (hash >> bitshift_right) & group_mask; std::uint8_t hash_8 = static_cast(hash); bool keep_going = true; @@ -397,8 +402,8 @@ struct SequentialLookupPolicy : ProbePolicy { break; } - // Probe the next bucket. - curr_group = (curr_group + this->getNext(iteration)) % metadata.size(); + // Probe the next bucket. 
Note that the group count is a power of 2 so we can use a bit mask + curr_group = (curr_group + this->getNext(iteration)) & group_mask; } } diff --git a/src/axom/core/tests/core_flatmap.hpp b/src/axom/core/tests/core_flatmap.hpp index 186fbbf70a..4e3a2f4506 100644 --- a/src/axom/core/tests/core_flatmap.hpp +++ b/src/axom/core/tests/core_flatmap.hpp @@ -156,6 +156,78 @@ TEST(core_flatmap_unit, float_keys_in_unit_interval) EXPECT_EQ(test_map.count(0.5f), 1); } +// Hash functor whose group-selector bits (the top bits) are always zero, +// so every key lands in the same initial group and probing must walk across groups. +// Stress tests the cross-group probe sequence. +struct DegenerateGroupHash +{ + using argument_type = int; + using result_type = std::uint64_t; + std::uint64_t operator()(int key) const + { + return static_cast(static_cast(key) & 0xFF); + } +}; + +TEST(core_flatmap_unit, cross_group_probe_chains) +{ + // Forces hundreds of keys through a single initial group: + // inserts walk probeEmptyIndex's group sequence, lookups walk probeIndex's, + // both wrap around the group array, and erases punch holes mid-sequence. + // Guards the probe-advance arithmetic (group wrapping) against regressions. 
+ axom::FlatMap test_map; + const int NUM_ELEMS = 600; + + for(int i = 0; i < NUM_ELEMS; i++) + { + test_map[i] = i * 3; + } + EXPECT_EQ(test_map.size(), NUM_ELEMS); + for(int i = 0; i < NUM_ELEMS; i++) + { + auto it = test_map.find(i); + ASSERT_NE(it, test_map.end()); + EXPECT_EQ(it->second, i * 3); + } + for(int i = NUM_ELEMS; i < NUM_ELEMS + 64; i++) + { + EXPECT_EQ(test_map.find(i), test_map.end()); + } + + // Erase every third key (some mid-probe-sequence) and re-verify + for(int i = 0; i < NUM_ELEMS; i += 3) + { + EXPECT_EQ(test_map.erase(i), 1); + } + EXPECT_EQ(test_map.size(), NUM_ELEMS - (NUM_ELEMS + 2) / 3); + for(int i = 0; i < NUM_ELEMS; i++) + { + if(i % 3 == 0) + { + EXPECT_EQ(test_map.find(i), test_map.end()); + } + else + { + auto it = test_map.find(i); + ASSERT_NE(it, test_map.end()); + EXPECT_EQ(it->second, i * 3); + } + } + + // Reinsert over the holes and verify + for(int i = 0; i < NUM_ELEMS; i += 3) + { + test_map[i] = i * 7; + } + EXPECT_EQ(test_map.size(), NUM_ELEMS); + for(int i = 0; i < NUM_ELEMS; i++) + { + auto it = test_map.find(i); + ASSERT_NE(it, test_map.end()); + EXPECT_EQ(it->second, (i % 3 == 0) ? 
i * 7 : i * 3); + } +} + AXOM_TYPED_TEST(core_flatmap, default_init) { using MapType = typename TestFixture::MapType; From c6555a89ab3d7c04e8696319c7f3df7a4cbe327e Mon Sep 17 00:00:00 2001 From: Kenneth Weiss Date: Thu, 15 Jan 2026 19:55:17 -0800 Subject: [PATCH 06/28] Adds initial benchmark for flatmap vs map vs unordered_map vs sparsehash --- src/axom/core/tests/CMakeLists.txt | 3 +- .../core/tests/core_benchmark_flatmap.cpp | 463 ++++++++++++++++++ 2 files changed, 465 insertions(+), 1 deletion(-) create mode 100644 src/axom/core/tests/core_benchmark_flatmap.cpp diff --git a/src/axom/core/tests/CMakeLists.txt b/src/axom/core/tests/CMakeLists.txt index eecb1bc949..b30ebb25fc 100644 --- a/src/axom/core/tests/CMakeLists.txt +++ b/src/axom/core/tests/CMakeLists.txt @@ -212,7 +212,8 @@ endforeach() if (ENABLE_BENCHMARKS) set(core_benchmarks - core_benchmark_array.cpp ) + core_benchmark_array.cpp + core_benchmark_flatmap.cpp ) foreach(test ${core_benchmarks}) get_filename_component(test_name ${test} NAME_WE) diff --git a/src/axom/core/tests/core_benchmark_flatmap.cpp b/src/axom/core/tests/core_benchmark_flatmap.cpp new file mode 100644 index 0000000000..978c47f6d1 --- /dev/null +++ b/src/axom/core/tests/core_benchmark_flatmap.cpp @@ -0,0 +1,463 @@ +// Copyright (c) Lawrence Livermore National Security, LLC and other +// Axom Project Contributors. See top-level LICENSE and COPYRIGHT +// files for dates and other details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) + +#include "benchmark/benchmark.h" + +#include "axom/config.hpp" +#include "axom/core.hpp" +#include "axom/slic.hpp" + +#include "axom/CLI11.hpp" +#include "axom/fmt.hpp" + +#include "axom/core/FlatMap.hpp" +#include "axom/core/FlatMapUtil.hpp" + +#if defined(AXOM_USE_SPARSEHASH) + #include "axom/sparsehash/sparse_hash_map" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace +{ + +using KeyType = std::int64_t; +using ValueType = std::int64_t; + +enum class FlatMapFeatureBenchmarks +{ + None = 0, + Insertion = 1 << 0, + Lookup = 1 << 1, + BatchedInsertion = 1 << 2, + + All = Insertion | Lookup | BatchedInsertion +}; + +inline FlatMapFeatureBenchmarks operator|(FlatMapFeatureBenchmarks lhs, FlatMapFeatureBenchmarks rhs) +{ + using T = std::underlying_type_t; + return static_cast(static_cast(lhs) | static_cast(rhs)); +} + +inline FlatMapFeatureBenchmarks& operator|=(FlatMapFeatureBenchmarks& lhs, + FlatMapFeatureBenchmarks rhs) +{ + lhs = lhs | rhs; + return lhs; +} + +inline FlatMapFeatureBenchmarks operator&(FlatMapFeatureBenchmarks lhs, FlatMapFeatureBenchmarks rhs) +{ + using T = std::underlying_type_t; + return static_cast(static_cast(lhs) & static_cast(rhs)); +} + +std::vector args_benchmark_sizes; +FlatMapFeatureBenchmarks args_benchmark_features {FlatMapFeatureBenchmarks::None}; +int args_batch_size = 1 << 10; +} // namespace + +template <> +struct axom::fmt::formatter +{ + template + constexpr auto parse(ParseContext& ctx) + { + return ctx.begin(); + } + + template + auto format(FlatMapFeatureBenchmarks feature, FormatContext& ctx) const + { + static const std::map feature_map = { + {FlatMapFeatureBenchmarks::Insertion, "Insertion"}, + {FlatMapFeatureBenchmarks::Lookup, "Lookup"}, + {FlatMapFeatureBenchmarks::BatchedInsertion, "BatchedInsertion"}}; + + if(feature == FlatMapFeatureBenchmarks::None) + { + return axom::fmt::format_to(ctx.out(), 
"None"); + } + else if(feature == FlatMapFeatureBenchmarks::All) + { + return axom::fmt::format_to(ctx.out(), "All"); + } + + std::string name; + for(const auto& kv : feature_map) + { + if((feature & kv.first) != FlatMapFeatureBenchmarks::None) + { + name += name.empty() ? kv.second : "|" + kv.second; + } + } + return axom::fmt::format_to(ctx.out(), "{}", name); + } +}; + +namespace +{ + +void CustomArgs(benchmark::internal::Benchmark* b) +{ + for(int sz : ::args_benchmark_sizes) + { + b->Arg(sz); + } +} + +std::vector make_shuffled_keys(int n, std::uint64_t seed) +{ + std::vector keys; + keys.reserve(static_cast(n)); + for(int i = 0; i < n; ++i) + { + keys.push_back(static_cast(i)); + } + + std::mt19937_64 rng(seed); + std::shuffle(keys.begin(), keys.end(), rng); + return keys; +} + +std::vector> make_pairs(const std::vector& keys) +{ + std::vector> pairs; + pairs.reserve(keys.size()); + for(std::size_t i = 0; i < keys.size(); ++i) + { + pairs.emplace_back(keys[i], static_cast(i)); + } + return pairs; +} + +std::vector make_miss_keys(const std::vector& keys, KeyType offset) +{ + std::vector misses; + misses.reserve(keys.size()); + for(KeyType k : keys) + { + misses.push_back(k + offset); + } + return misses; +} + +template +struct MapFactory +{ + static MapType make_empty(std::size_t) { return MapType {}; } + static void reserve(MapType&, std::size_t) { } +}; + +template +struct MapFactory> +{ + using MapType = std::unordered_map; + static MapType make_empty(std::size_t) { return MapType {}; } + static void reserve(MapType& map, std::size_t n) { map.reserve(n); } +}; + +template +struct MapFactory> +{ + using MapType = axom::FlatMap; + static MapType make_empty(std::size_t) { return MapType {}; } + static void reserve(MapType& map, std::size_t n) { map.reserve(static_cast(n)); } +}; + +#if defined(AXOM_USE_SPARSEHASH) +template +void reserve_sparsehash(axom::google::sparse_hash_map& map, std::size_t n) +{ + map.max_load_factor(0.8f); + const auto buckets_needed = 
+ static_cast(static_cast(n) / map.max_load_factor()) + 1; + map.resize(buckets_needed); +} +#endif + +#if defined(AXOM_USE_SPARSEHASH) +template +struct MapFactory> +{ + using MapType = axom::google::sparse_hash_map; + static MapType make_empty(std::size_t) { return MapType {}; } + static void reserve(MapType& map, std::size_t n) { reserve_sparsehash(map, n); } +}; +#endif + +template +MapType make_reserved_map(std::size_t n) +{ + MapType map = MapFactory::make_empty(n); + MapFactory::reserve(map, n); + return map; +} + +template +MapType make_filled_map(const std::vector>& pairs) +{ + MapType map = make_reserved_map(pairs.size()); + map.insert(pairs.begin(), pairs.end()); + return map; +} + +template +void BM_Insert_StartEmpty(benchmark::State& state) +{ + const int n = state.range(0); + const auto keys = make_shuffled_keys(n, 0xA2D5B7C4ULL); + const auto pairs = make_pairs(keys); + + for(auto _ : state) + { + MapType map = MapFactory::make_empty(pairs.size()); + map.insert(pairs.begin(), pairs.end()); + benchmark::DoNotOptimize(map); + } +} + +template +void BM_Insert_Reserved(benchmark::State& state) +{ + const int n = state.range(0); + const auto keys = make_shuffled_keys(n, 0xA2D5B7C4ULL); + const auto pairs = make_pairs(keys); + + for(auto _ : state) + { + MapType map = make_reserved_map(pairs.size()); + map.insert(pairs.begin(), pairs.end()); + benchmark::DoNotOptimize(map); + } +} + +template +void BM_Find_Hit(benchmark::State& state) +{ + const int n = state.range(0); + const auto keys = make_shuffled_keys(n, 0xC0FFEEULL); + const auto pairs = make_pairs(keys); + const MapType map = make_filled_map(pairs); + + for(auto _ : state) + { + ValueType sum = 0; + for(KeyType k : keys) + { + auto it = map.find(k); + if(it != map.end()) + { + sum += it->second; + } + } + benchmark::DoNotOptimize(sum); + } +} + +template +void BM_Find_Miss(benchmark::State& state) +{ + const int n = state.range(0); + const auto keys = make_shuffled_keys(n, 0xC0FFEEULL); + const 
auto pairs = make_pairs(keys); + const MapType map = make_filled_map(pairs); + const auto miss_keys = make_miss_keys(keys, static_cast(n) + 11); + + for(auto _ : state) + { + std::int64_t misses = 0; + for(KeyType k : miss_keys) + { + misses += (map.find(k) == map.end()) ? 1 : 0; + } + benchmark::DoNotOptimize(misses); + } +} + +template +void insert_pairs_in_batches(MapType& map, + const std::vector>& pairs, + int batch_size) +{ + const std::size_t n = pairs.size(); + const std::size_t bs = static_cast(std::max(1, batch_size)); + for(std::size_t offset = 0; offset < n; offset += bs) + { + const std::size_t count = std::min(bs, n - offset); + map.insert(pairs.begin() + static_cast(offset), + pairs.begin() + static_cast(offset + count)); + } +} + +template +void insert_pairs_in_batches(axom::FlatMap& map, + const std::vector>& pairs, + int batch_size) +{ + const std::size_t n = pairs.size(); + const std::size_t bs = static_cast(std::max(1, batch_size)); + for(std::size_t offset = 0; offset < n; offset += bs) + { + const std::size_t count = std::min(bs, n - offset); + map.template insert(pairs.begin() + static_cast(offset), + pairs.begin() + static_cast(offset + count)); + } +} + +template +void BM_BatchedInsert_Reserved(benchmark::State& state) +{ + const int n = state.range(0); + const auto keys = make_shuffled_keys(n, 0x1CEB00DAULL); + const auto pairs = make_pairs(keys); + + for(auto _ : state) + { + MapType map = make_reserved_map(pairs.size()); + insert_pairs_in_batches(map, pairs, ::args_batch_size); + benchmark::DoNotOptimize(map); + } +} + +} // namespace + +//----------------------------------------------------------------------------- +// Register benchmarks +//----------------------------------------------------------------------------- + +template +void RegisterBenchmarksFor(const std::string& map_name) +{ + auto name = [&map_name](const std::string& op) { + return axom::fmt::format("{}::{}", map_name, op); + }; + + // clang-format off + 
if((::args_benchmark_features & FlatMapFeatureBenchmarks::Insertion) != FlatMapFeatureBenchmarks::None) + { + benchmark::RegisterBenchmark(name("insert_startEmpty"), &BM_Insert_StartEmpty)->Apply(CustomArgs); + benchmark::RegisterBenchmark(name("insert_reserved"), &BM_Insert_Reserved)->Apply(CustomArgs); + } + + if((::args_benchmark_features & FlatMapFeatureBenchmarks::Lookup) != FlatMapFeatureBenchmarks::None) + { + benchmark::RegisterBenchmark(name("find_hit"), &BM_Find_Hit)->Apply(CustomArgs); + benchmark::RegisterBenchmark(name("find_miss"), &BM_Find_Miss)->Apply(CustomArgs); + } + + if((::args_benchmark_features & FlatMapFeatureBenchmarks::BatchedInsertion) != FlatMapFeatureBenchmarks::None) + { + benchmark::RegisterBenchmark(name("insert_batched_reserved"), &BM_BatchedInsert_Reserved)->Apply(CustomArgs); + } + // clang-format on +} + +int main(int argc, char* argv[]) +{ + std::vector local_test_sizes; + FlatMapFeatureBenchmarks local_benchmark_features {FlatMapFeatureBenchmarks::None}; + int local_batch_size = ::args_batch_size; + + axom::CLI::App app {"Axom FlatMap benchmarks"}; + app.add_option("-s,--custom_sizes", local_test_sizes) + ->description("Adds custom map sizes to benchmark (positive numbers only)") + ->expected(-1) + ->default_val(std::vector {1 << 16}) + ->each([](const std::string& num_str) { + int num = std::stoi(num_str); + if(num < 0) + { + throw axom::CLI::ValidationError("Negative numbers are not allowed"); + } + }); + + app + .add_flag_callback("--use_cache_related_sizes", + [&local_test_sizes]() { + local_test_sizes.push_back(1 << 3); // small + local_test_sizes.push_back(1 << 16); // larger than 32K L1 cache + local_test_sizes.push_back(1 << 19); // larger than 256K L2 cache + //local_test_sizes.push_back(1 << 25); // larger than 25M L3 cache + }) + ->description("Test map sizes related to typical cache sizes"); + + app.add_option("--batch_size", local_batch_size) + ->description("Batch size for batched insertion benchmarks") + 
->default_val(local_batch_size) + ->check(axom::CLI::PositiveNumber); + + std::vector feature_strings; + auto feature_opt = + app.add_option("-f,--features", feature_strings) + ->description( + "Features to benchmark (Insertion, Lookup, BatchedInsertion, All); default is 'All'") + ->expected(-1) + ->each([&local_benchmark_features](const std::string& feature) { + static const std::map feature_map = { + {"insertion", FlatMapFeatureBenchmarks::Insertion}, + {"lookup", FlatMapFeatureBenchmarks::Lookup}, + {"batchedinsertion", FlatMapFeatureBenchmarks::BatchedInsertion}, + {"all", FlatMapFeatureBenchmarks::All}}; + + std::string lower_feature = feature; + std::transform(lower_feature.begin(), lower_feature.end(), lower_feature.begin(), ::tolower); + auto it = feature_map.find(lower_feature); + if(it == feature_map.end()) + { + throw axom::CLI::ValidationError("Invalid feature: " + feature); + } + + local_benchmark_features |= it->second; + }); + + app.allow_extras(); // pass additional args to gbenchmark + CLI11_PARSE(app, argc, argv); + + ::benchmark::Initialize(&argc, argv); + axom::slic::SimpleLogger logger; + + // process input into global variables + { + ::args_benchmark_features = + feature_opt->count() > 0 ? 
local_benchmark_features : FlatMapFeatureBenchmarks::All; + + std::sort(local_test_sizes.begin(), local_test_sizes.end()); + auto last = std::unique(local_test_sizes.begin(), local_test_sizes.end()); + local_test_sizes.erase(last, local_test_sizes.end()); + std::swap(::args_benchmark_sizes, local_test_sizes); + + ::args_batch_size = local_batch_size; + + SLIC_INFO("Parsed and processed command line arguments:"); + SLIC_INFO(axom::fmt::format("- Map sizes: {}", axom::fmt::join(::args_benchmark_sizes, ","))); + SLIC_INFO(axom::fmt::format("- Batch size: {}", ::args_batch_size)); + SLIC_INFO(axom::fmt::format("- Map features to test: {}", ::args_benchmark_features)); + } + + RegisterBenchmarksFor>("axom::FlatMap"); + RegisterBenchmarksFor>("std::unordered_map"); + RegisterBenchmarksFor>("std::map"); + +#if defined(AXOM_USE_SPARSEHASH) + RegisterBenchmarksFor>( + "axom::google::sparse_hash_map"); +#endif + + ::benchmark::RunSpecifiedBenchmarks(); + return 0; +} From 09dc808d31866b2c19561aa61c2501319d0f9b77 Mon Sep 17 00:00:00 2001 From: Kenneth Weiss Date: Thu, 15 Jan 2026 20:20:13 -0800 Subject: [PATCH 07/28] Improves performance of FlatMap batched insertion for SEQ policy --- src/axom/core/FlatMapUtil.hpp | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/axom/core/FlatMapUtil.hpp b/src/axom/core/FlatMapUtil.hpp index 5515607046..d1adb18f90 100644 --- a/src/axom/core/FlatMapUtil.hpp +++ b/src/axom/core/FlatMapUtil.hpp @@ -263,6 +263,27 @@ void FlatMap::insert(InputIt kv_begin, InputIt kv_end) typename std::iterator_traits::iterator_category>::value, "InputIt must be a random-access iterator for batched construction"); + // Fast path for sequential execution: + // The batched insertion algorithm below is designed for parallel execution and + // uses per-group locks and auxiliary arrays for deduplication. 
In SEQ, those + // structures add significant overhead; a simple sequential loop provides + // better performance while preserving the documented semantics that later + // duplicates overwrite earlier ones. + if constexpr(std::is_same_v) + { + const IndexType num_elems = std::distance(kv_begin, kv_end); + + // Ensure we have enough capacity up-front to avoid repeated rehashing. + this->reserve(this->size() + num_elems); + + for(IndexType idx = 0; idx < num_elems; ++idx) + { + auto kv = *(kv_begin + idx); + this->insert_or_assign(kv.first, kv.second); + } + return; + } + using HashResult = typename Hash::result_type; using GroupBucket = detail::flat_map::GroupBucket; From acdf68343bdefa4ffa71410a32d00fd4671fdbfb Mon Sep 17 00:00:00 2001 From: Kenneth Weiss Date: Thu, 15 Jan 2026 20:50:35 -0800 Subject: [PATCH 08/28] Adds FlatMap benchmarks for hits and misses of precached entities --- src/axom/core/FlatMap.hpp | 18 +++++ src/axom/core/detail/FlatTable.hpp | 4 +- .../core/tests/core_benchmark_flatmap.cpp | 75 +++++++++++++++++++ 3 files changed, 94 insertions(+), 3 deletions(-) diff --git a/src/axom/core/FlatMap.hpp b/src/axom/core/FlatMap.hpp index 0a7b1209c4..5f3aca38ae 100644 --- a/src/axom/core/FlatMap.hpp +++ b/src/axom/core/FlatMap.hpp @@ -74,6 +74,8 @@ class FlatMap : detail::flat_map::SequentialLookupPolicy; using const_iterator = IteratorImpl; @@ -289,6 +291,8 @@ class FlatMap : detail::flat_map::SequentialLookupPolicy auto FlatMap::find(const KeyType& key) -> iterator { auto hash = Hash {}(key); + return find_with_hash(key, hash); +} + +template +auto FlatMap::find_with_hash(const KeyType& key, hash_result_type hash) + -> iterator +{ iterator found_iter = end(); this->probeIndex(m_numGroups2, m_metadata, hash, [&](IndexType bucket_index) -> bool { if(this->m_buckets[bucket_index].get().first == key) @@ -849,6 +860,13 @@ template auto FlatMap::find(const KeyType& key) const -> const_iterator { auto hash = Hash {}(key); + return find_with_hash(key, hash); +} 
+ +template +auto FlatMap::find_with_hash(const KeyType& key, hash_result_type hash) const + -> const_iterator +{ const_iterator found_iter = end(); this->probeIndex(m_numGroups2, m_metadata, hash, [&](IndexType bucket_index) -> bool { if(this->m_buckets[bucket_index].get().first == key) diff --git a/src/axom/core/detail/FlatTable.hpp b/src/axom/core/detail/FlatTable.hpp index 66ac4019b7..6dec9f0970 100644 --- a/src/axom/core/detail/FlatTable.hpp +++ b/src/axom/core/detail/FlatTable.hpp @@ -348,9 +348,7 @@ struct SequentialLookupPolicy : ProbePolicy // Set the overflow bit and continue probing. metadata[curr_group].setOverflow(hash_8); } - // Mask instead of "% metadata.size()": the group count is a power of - // two, and the modulo compiled to a 64-bit signed division on the - // critical path of every probe continuation. + // The group count is a power of two, so we can use a bitmask (instead of a modulo) curr_group = (curr_group + this->getNext(iteration)) & group_mask; } if(empty_group != NO_MATCH) diff --git a/src/axom/core/tests/core_benchmark_flatmap.cpp b/src/axom/core/tests/core_benchmark_flatmap.cpp index 978c47f6d1..21ea611db6 100644 --- a/src/axom/core/tests/core_benchmark_flatmap.cpp +++ b/src/axom/core/tests/core_benchmark_flatmap.cpp @@ -289,6 +289,67 @@ void BM_Find_Miss(benchmark::State& state) } } +void BM_FlatMap_Find_Hit_Prehashed(benchmark::State& state) +{ + using MapType = axom::FlatMap; + using HashResult = typename MapType::hash_result_type; + + const int n = state.range(0); + const auto keys = make_shuffled_keys(n, 0xC0FFEEULL); + const auto pairs = make_pairs(keys); + const MapType map = make_filled_map(pairs); + + std::vector hashes; + hashes.reserve(keys.size()); + for(KeyType k : keys) + { + hashes.push_back(typename MapType::hasher {}(k)); + } + + for(auto _ : state) + { + ValueType sum = 0; + for(std::size_t i = 0; i < keys.size(); ++i) + { + auto it = map.find_with_hash(keys[i], hashes[i]); + if(it != map.end()) + { + sum += 
it->second; + } + } + benchmark::DoNotOptimize(sum); + } +} + +void BM_FlatMap_Find_Miss_Prehashed(benchmark::State& state) +{ + using MapType = axom::FlatMap; + using HashResult = typename MapType::hash_result_type; + + const int n = state.range(0); + const auto keys = make_shuffled_keys(n, 0xC0FFEEULL); + const auto pairs = make_pairs(keys); + const MapType map = make_filled_map(pairs); + const auto miss_keys = make_miss_keys(keys, static_cast(n) + 11); + + std::vector hashes; + hashes.reserve(miss_keys.size()); + for(KeyType k : miss_keys) + { + hashes.push_back(typename MapType::hasher {}(k)); + } + + for(auto _ : state) + { + std::int64_t misses = 0; + for(std::size_t i = 0; i < miss_keys.size(); ++i) + { + misses += (map.find_with_hash(miss_keys[i], hashes[i]) == map.end()) ? 1 : 0; + } + benchmark::DoNotOptimize(misses); + } +} + template void insert_pairs_in_batches(MapType& map, const std::vector>& pairs, @@ -367,6 +428,19 @@ void RegisterBenchmarksFor(const std::string& map_name) // clang-format on } +void RegisterFlatMapPrehashedBenchmarks() +{ + auto name = [](const std::string& op) { return axom::fmt::format("axom::FlatMap::{}", op); }; + + if((::args_benchmark_features & FlatMapFeatureBenchmarks::Lookup) != FlatMapFeatureBenchmarks::None) + { + benchmark::RegisterBenchmark(name("find_hit_prehashed"), &BM_FlatMap_Find_Hit_Prehashed) + ->Apply(CustomArgs); + benchmark::RegisterBenchmark(name("find_miss_prehashed"), &BM_FlatMap_Find_Miss_Prehashed) + ->Apply(CustomArgs); + } +} + int main(int argc, char* argv[]) { std::vector local_test_sizes; @@ -450,6 +524,7 @@ int main(int argc, char* argv[]) } RegisterBenchmarksFor>("axom::FlatMap"); + RegisterFlatMapPrehashedBenchmarks(); RegisterBenchmarksFor>("std::unordered_map"); RegisterBenchmarksFor>("std::map"); From 862195a78741a5ab162d196283fd694744416883 Mon Sep 17 00:00:00 2001 From: Kenneth Weiss Date: Thu, 15 Jan 2026 21:11:11 -0800 Subject: [PATCH 09/28] Exploring faster hash functions --- 
src/axom/core/FlatMap.hpp | 6 ++---- src/axom/core/detail/FlatTable.hpp | 21 +++++++++++++++++++ .../core/tests/core_benchmark_flatmap.cpp | 3 +++ 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/src/axom/core/FlatMap.hpp b/src/axom/core/FlatMap.hpp index 5f3aca38ae..5fbaceed7d 100644 --- a/src/axom/core/FlatMap.hpp +++ b/src/axom/core/FlatMap.hpp @@ -835,8 +835,7 @@ FlatMap::FlatMap(IndexType num_elems, template auto FlatMap::find(const KeyType& key) -> iterator { - auto hash = Hash {}(key); - return find_with_hash(key, hash); + return find_with_hash(key, Hash {}(key)); } template @@ -859,8 +858,7 @@ auto FlatMap::find_with_hash(const KeyType& key, hash_ template auto FlatMap::find(const KeyType& key) const -> const_iterator { - auto hash = Hash {}(key); - return find_with_hash(key, hash); + return find_with_hash(key, Hash {}(key)); } template diff --git a/src/axom/core/detail/FlatTable.hpp b/src/axom/core/detail/FlatTable.hpp index 6dec9f0970..5da5d9be3c 100644 --- a/src/axom/core/detail/FlatTable.hpp +++ b/src/axom/core/detail/FlatTable.hpp @@ -72,6 +72,27 @@ struct HashMixer64 } }; +/*! + * \brief A faster (but lower-cost) hash mixer for 64-bit hashing. + * + * Intended for performance experiments when the cost of hashing dominates + * lookup. Uses a single 64-bit multiply followed by an xor-fold. + */ +template class HashFunc> +struct FastHashMixer64 +{ + using argument_type = typename HashFunc::argument_type; + using result_type = typename HashFunc::result_type; + + AXOM_HOST_DEVICE uint64_t operator()(const KeyType& key) const + { + uint64_t hash = static_cast(HashFunc {}(key)); + hash *= 0x9e3779b97f4a7c15ULL; + hash ^= hash >> 32; + return hash; + } +}; + // We follow the design of boost::unordered_flat_map, which uses a 128-bit chunk // of metadata for each group of 15 buckets. 
// This is split up into an "overflow bit", and 15 bytes representing the diff --git a/src/axom/core/tests/core_benchmark_flatmap.cpp b/src/axom/core/tests/core_benchmark_flatmap.cpp index 21ea611db6..8164f234c5 100644 --- a/src/axom/core/tests/core_benchmark_flatmap.cpp +++ b/src/axom/core/tests/core_benchmark_flatmap.cpp @@ -15,6 +15,7 @@ #include "axom/core/FlatMap.hpp" #include "axom/core/FlatMapUtil.hpp" +#include "axom/core/detail/FlatTable.hpp" #if defined(AXOM_USE_SPARSEHASH) #include "axom/sparsehash/sparse_hash_map" @@ -525,6 +526,8 @@ int main(int argc, char* argv[]) RegisterBenchmarksFor>("axom::FlatMap"); RegisterFlatMapPrehashedBenchmarks(); + using FastHash = axom::detail::flat_map::FastHashMixer64; + RegisterBenchmarksFor>("axom::FlatMapFastHash"); RegisterBenchmarksFor>("std::unordered_map"); RegisterBenchmarksFor>("std::map"); From 296934c5cf6049580ff1b693de4a48083204740d Mon Sep 17 00:00:00 2001 From: Kenneth Weiss Date: Tue, 10 Mar 2026 15:08:25 -0700 Subject: [PATCH 10/28] Adds benchmark for flatmap load factor --- .../core/tests/core_benchmark_flatmap.cpp | 65 +++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/src/axom/core/tests/core_benchmark_flatmap.cpp b/src/axom/core/tests/core_benchmark_flatmap.cpp index 8164f234c5..3fe000ec62 100644 --- a/src/axom/core/tests/core_benchmark_flatmap.cpp +++ b/src/axom/core/tests/core_benchmark_flatmap.cpp @@ -22,6 +22,7 @@ #endif #include +#include #include #include #include @@ -217,6 +218,27 @@ MapType make_filled_map(const std::vector>& pairs) return map; } +template +axom::FlatMap make_filled_flatmap_with_target_load_factor( + const std::vector>& pairs, + double target_load_factor) +{ + using MapType = axom::FlatMap; + MapType map; + + const double max_lf = map.max_load_factor(); + const double lf = std::max(1e-3, std::min(target_load_factor, max_lf)); + const double n = static_cast(pairs.size()); + + // FlatMap's ctor/rehash argument is scaled internally by max_load_factor. 
+ // To target load factor `lf` for `n` elements, scale the count accordingly. + const axom::IndexType rehash_count = static_cast(std::ceil((n * max_lf) / lf)); + + map.rehash(rehash_count); + map.insert(pairs.begin(), pairs.end()); + return map; +} + template void BM_Insert_StartEmpty(benchmark::State& state) { @@ -290,6 +312,32 @@ void BM_Find_Miss(benchmark::State& state) } } +template +void BM_FlatMap_Find_Hit_TargetLoad(benchmark::State& state, double target_load_factor) +{ + using MapType = axom::FlatMap; + + const int n = state.range(0); + const auto keys = make_shuffled_keys(n, 0xC0FFEEULL); + const auto pairs = make_pairs(keys); + const MapType map = + make_filled_flatmap_with_target_load_factor(pairs, target_load_factor); + + for(auto _ : state) + { + ValueType sum = 0; + for(KeyType k : keys) + { + auto it = map.find(k); + if(it != map.end()) + { + sum += it->second; + } + } + benchmark::DoNotOptimize(sum); + } +} + void BM_FlatMap_Find_Hit_Prehashed(benchmark::State& state) { using MapType = axom::FlatMap; @@ -528,6 +576,23 @@ int main(int argc, char* argv[]) RegisterFlatMapPrehashedBenchmarks(); using FastHash = axom::detail::flat_map::FastHashMixer64; RegisterBenchmarksFor>("axom::FlatMapFastHash"); + + // Explore the impact of lower load factors on successful lookups. + // This trades memory for potentially fewer probes and fewer cache misses. 
+ using DefaultHash = axom::FlatMap::hasher; + benchmark::RegisterBenchmark("axom::FlatMap::find_hit_lf0p50", [](benchmark::State& st) { + BM_FlatMap_Find_Hit_TargetLoad(st, 0.50); + })->Apply(CustomArgs); + benchmark::RegisterBenchmark("axom::FlatMap::find_hit_lf0p70", [](benchmark::State& st) { + BM_FlatMap_Find_Hit_TargetLoad(st, 0.70); + })->Apply(CustomArgs); + benchmark::RegisterBenchmark("axom::FlatMapFastHash::find_hit_lf0p50", [](benchmark::State& st) { + BM_FlatMap_Find_Hit_TargetLoad(st, 0.50); + })->Apply(CustomArgs); + benchmark::RegisterBenchmark("axom::FlatMapFastHash::find_hit_lf0p70", [](benchmark::State& st) { + BM_FlatMap_Find_Hit_TargetLoad(st, 0.70); + })->Apply(CustomArgs); + RegisterBenchmarksFor>("std::unordered_map"); RegisterBenchmarksFor>("std::map"); From 8b6cb5851a8bf8e44fa6532f0103901ccd29557d Mon Sep 17 00:00:00 2001 From: Kenneth Weiss Date: Thu, 11 Jun 2026 05:09:42 +0000 Subject: [PATCH 11/28] Benchmark: decouple lookup order from insertion order in FlatMap suite BM_Find_Hit looks keys up in the order they were inserted. Since node-based maps walk the heap nearly sequentially, the hardware prefetcher hides their pointer-chasing latency. This commit adds find_hit_shuffled (same keys, independently shuffled lookup order) and find_hit_randkeys (distinct pseudorandom 64-bit keys, shuffled lookup order) to better exhibit expected lookup behavior. --- .../core/tests/core_benchmark_flatmap.cpp | 94 +++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/src/axom/core/tests/core_benchmark_flatmap.cpp b/src/axom/core/tests/core_benchmark_flatmap.cpp index 3fe000ec62..c87eb86e7c 100644 --- a/src/axom/core/tests/core_benchmark_flatmap.cpp +++ b/src/axom/core/tests/core_benchmark_flatmap.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -158,6 +159,46 @@ std::vector make_miss_keys(const std::vector& keys, KeyType of return misses; } +/*! 
+ * \brief Returns a copy of \a keys reshuffled with an independent seed. + * + * Looking keys up in the exact order they were inserted is rarely representative, + * and it systematically favors node-based containers: with libstdc++'s identity hash + * for integers and densely numbered keys, the i-th lookup touches the i-th allocated node, + * so the lookup loop streams through the heap nearly sequentially and the hardware prefetcher hides + * most of the pointer-chasing latency. An independently shuffled lookup order removes that correlation. + */ +std::vector make_lookup_order(const std::vector& keys, std::uint64_t seed) +{ + std::vector lookup = keys; + std::mt19937_64 rng(seed); + std::shuffle(lookup.begin(), lookup.end(), rng); + return lookup; +} + +/*! + * \brief Generates \a n distinct pseudorandom 64-bit keys. + * + * Dense keys in [0, n) are friendly to identity-style integer hashes and bucket layouts. + * Random keys exercise hashing and probing the way sparse or pointer-derived IDs do. + */ +std::vector make_random_unique_keys(int n, std::uint64_t seed) +{ + std::mt19937_64 rng(seed); + std::unordered_set seen; + std::vector keys; + keys.reserve(static_cast(n)); + while(keys.size() < static_cast(n)) + { + const KeyType k = static_cast(rng()); + if(seen.insert(k).second) + { + keys.push_back(k); + } + } + return keys; +} + template struct MapFactory { @@ -269,6 +310,9 @@ void BM_Insert_Reserved(benchmark::State& state) } } +// NOTE: BM_Find_Hit looks keys up in insertion order, which favors +// node-based maps as described on make_lookup_order() above. +// Prefer BM_Find_Hit_Shuffled and BM_Find_Hit_RandomKeys when comparing containers. 
template void BM_Find_Hit(benchmark::State& state) { @@ -292,6 +336,54 @@ void BM_Find_Hit(benchmark::State& state) } } +template +void BM_Find_Hit_Shuffled(benchmark::State& state) +{ + const int n = state.range(0); + const auto keys = make_shuffled_keys(n, 0xC0FFEEULL); + const auto pairs = make_pairs(keys); + const MapType map = make_filled_map(pairs); + const auto lookup_keys = make_lookup_order(keys, 0xBADC0DE5ULL); + + for(auto _ : state) + { + ValueType sum = 0; + for(KeyType k : lookup_keys) + { + auto it = map.find(k); + if(it != map.end()) + { + sum += it->second; + } + } + benchmark::DoNotOptimize(sum); + } +} + +template +void BM_Find_Hit_RandomKeys(benchmark::State& state) +{ + const int n = state.range(0); + const auto keys = make_random_unique_keys(n, 0xFEEDFACEULL); + const auto pairs = make_pairs(keys); + const MapType map = make_filled_map(pairs); + const auto lookup_keys = make_lookup_order(keys, 0xBADC0DE5ULL); + + for(auto _ : state) + { + ValueType sum = 0; + for(KeyType k : lookup_keys) + { + auto it = map.find(k); + if(it != map.end()) + { + sum += it->second; + } + } + benchmark::DoNotOptimize(sum); + } +} + template void BM_Find_Miss(benchmark::State& state) { @@ -467,6 +559,8 @@ void RegisterBenchmarksFor(const std::string& map_name) if((::args_benchmark_features & FlatMapFeatureBenchmarks::Lookup) != FlatMapFeatureBenchmarks::None) { benchmark::RegisterBenchmark(name("find_hit"), &BM_Find_Hit)->Apply(CustomArgs); + benchmark::RegisterBenchmark(name("find_hit_shuffled"), &BM_Find_Hit_Shuffled)->Apply(CustomArgs); + benchmark::RegisterBenchmark(name("find_hit_randkeys"), &BM_Find_Hit_RandomKeys)->Apply(CustomArgs); benchmark::RegisterBenchmark(name("find_miss"), &BM_Find_Miss)->Apply(CustomArgs); } From 324c90a03e064b5ec8a7f10744eb8befa356c5a3 Mon Sep 17 00:00:00 2001 From: Kenneth Weiss Date: Thu, 11 Jun 2026 05:11:25 +0000 Subject: [PATCH 12/28] FlatMap: force-inline the lookup hot path When find_with_hash() is not inlined, every lookup 
is more expensive (extra registers, and a stack spill for the key) and requires loop-invariant setup that cannot be hoisted out of the caller's lookup loop. Forcing the probe path inline removed 20-40% of find_hit time and 15-35% of find_miss time for FlatMap at n = 2^16 and 2^20. --- src/axom/core/FlatMap.hpp | 9 +++++---- src/axom/core/detail/FlatTable.hpp | 23 +++++++++++++++++------ 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/src/axom/core/FlatMap.hpp b/src/axom/core/FlatMap.hpp index 5fbaceed7d..875db27c1a 100644 --- a/src/axom/core/FlatMap.hpp +++ b/src/axom/core/FlatMap.hpp @@ -289,10 +289,11 @@ class FlatMap : detail::flat_map::SequentialLookupPolicy #endif +// Force-inline annotation for the FlatMap/FlatTable lookup hot path. +#if defined(__CUDACC__) || defined(__HIPCC__) + #define AXOM_FLATMAP_FORCE_INLINE __forceinline__ +#elif defined(__GNUC__) || defined(__clang__) + #define AXOM_FLATMAP_FORCE_INLINE inline __attribute__((always_inline)) +#elif defined(_MSC_VER) + #define AXOM_FLATMAP_FORCE_INLINE __forceinline +#else + #define AXOM_FLATMAP_FORCE_INLINE inline +#endif + namespace axom { namespace detail @@ -162,7 +173,7 @@ struct GroupBucket } template - AXOM_HOST_DEVICE int visitHashBucket(std::uint8_t hash, Func&& visitor) const + AXOM_FLATMAP_FORCE_INLINE AXOM_HOST_DEVICE int visitHashBucket(std::uint8_t hash, Func&& visitor) const { std::uint8_t reducedHash = reduceHash(hash); #if !defined(AXOM_DEVICE_CODE) && defined(_AXOM_CORE_HAVE_SSE2) @@ -273,7 +284,7 @@ struct GroupBucket } template - AXOM_HOST_DEVICE bool getMaybeOverflowed(std::uint8_t hash) const + AXOM_FLATMAP_FORCE_INLINE AXOM_HOST_DEVICE bool getMaybeOverflowed(std::uint8_t hash) const { std::uint8_t hashOfwBit = 1 << (hash % 8); std::uint8_t curr_ofw; @@ -390,10 +401,10 @@ struct SequentialLookupPolicy : ProbePolicy * matching hash */ template - AXOM_HOST_DEVICE void probeIndex(int ngroups_pow_2, - ArrayView metadata, - HashType hash, - FoundIndex&& on_hash_found) 
const + AXOM_FLATMAP_FORCE_INLINE AXOM_HOST_DEVICE void probeIndex(int ngroups_pow_2, + ArrayView metadata, + HashType hash, + FoundIndex&& on_hash_found) const { // We use the k MSBs of the hash as the initial group probe point, // where ngroups = 2^k. Since the group count is always a power of two, From 03ac4d99017df403f67ae8268368ecc041a0955c Mon Sep 17 00:00:00 2001 From: Kenneth Weiss Date: Thu, 11 Jun 2026 05:13:25 +0000 Subject: [PATCH 13/28] FlatMap: hash once and avoid FP division in getEmplacePos `getEmplacePos()` computed `Hash{}(key)`, then called `find(key)`, which hashed the same key a second time. It then performed a floating-point division against MAX_LOAD_FACTOR on every insertion to decide whether to grow. Note: This reduced the instruction count, but the performance improvements were within run-to-run noise in our measurements. --- src/axom/core/FlatMap.hpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/axom/core/FlatMap.hpp b/src/axom/core/FlatMap.hpp index 875db27c1a..087eb75810 100644 --- a/src/axom/core/FlatMap.hpp +++ b/src/axom/core/FlatMap.hpp @@ -7,6 +7,7 @@ #ifndef Axom_Core_FlatMap_HPP #define Axom_Core_FlatMap_HPP +#include #include #include #include @@ -897,14 +898,21 @@ auto FlatMap::getEmplacePos(const KeyType& key) auto hash = Hash {}(key); // If the key already exists, return the existing iterator. - iterator existing_elem = this->find(key); + // Reuse the hash computed above rather than re-hashing inside find(). + iterator existing_elem = this->find_with_hash(key, hash); if(existing_elem != this->end()) { return {existing_elem, false}; } // Resize to double the number of bucket groups if insertion would put us // above the maximum load factor. - if(((m_loadCount + 1) / (double)bucket_count()) >= MAX_LOAD_FACTOR) + // MAX_LOAD_FACTOR is exactly 7/8, so (count + 1) / buckets >= 7/8 is + // equivalent to 8 * (count + 1) >= 7 * buckets in exact integer arithmetic. 
+ // This avoids a floating-point division on every insertion. + static_assert(MAX_LOAD_FACTOR == 0.875, + "Integer load-factor check below assumes MAX_LOAD_FACTOR == 7/8."); + if(8 * (static_cast(m_loadCount) + 1) >= + 7 * static_cast(bucket_count())) { IndexType newNumGroups = m_metadata.size() * 2; rehash(newNumGroups * BucketsPerGroup - 1); From f658482439a76424278a87de7a4dca1f16af4f29 Mon Sep 17 00:00:00 2001 From: Kenneth Weiss Date: Thu, 11 Jun 2026 05:14:16 +0000 Subject: [PATCH 14/28] Benchmark: report realized load factor in target-load-factor scenarios FlatMap rounds its group count up to a power of two, so for a fixed element count the achievable load factors form a geometric ladder and a nominal target is quantized to the next rung at or below it. At n = 2^16 the 0.70 target and the default reserve(n) geometry coincide (actual load factor 0.533, which is why find_hit_lf0p70 reproduced find_hit to within noise), and the 0.50 target lands at 0.267 -- a table twice as large. That scenario was really measuring a larger working set, not a shorter probe sequence. --- src/axom/core/tests/core_benchmark_flatmap.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/axom/core/tests/core_benchmark_flatmap.cpp b/src/axom/core/tests/core_benchmark_flatmap.cpp index c87eb86e7c..a06d9be21a 100644 --- a/src/axom/core/tests/core_benchmark_flatmap.cpp +++ b/src/axom/core/tests/core_benchmark_flatmap.cpp @@ -273,6 +273,15 @@ axom::FlatMap make_filled_flatmap_with_target_load_factor( // FlatMap's ctor/rehash argument is scaled internally by max_load_factor. // To target load factor `lf` for `n` elements, scale the count accordingly. + // + // NOTE: FlatMap rounds its group count up to a power of two, so for a + // fixed n the achievable load factors form a geometric ladder + // (n / (15 * 2^k - 1) for integer k) and the request is quantized to the + // next rung at or below the target. 
At n = 2^16 this means a 0.70 target + // and the default reserve(n) geometry coincide at an actual load factor + // of 0.533, and a 0.50 target lands at 0.267 (a table twice as large). + // The benchmarks below export the realized load factor and bucket count + // as counters; compare those, not the nominal targets. const axom::IndexType rehash_count = static_cast(std::ceil((n * max_lf) / lf)); map.rehash(rehash_count); @@ -415,6 +424,11 @@ void BM_FlatMap_Find_Hit_TargetLoad(benchmark::State& state, double target_load_ const MapType map = make_filled_flatmap_with_target_load_factor(pairs, target_load_factor); + // Export the geometry actually realized after power-of-two rounding so + // that runs with different nominal targets can be compared meaningfully. + state.counters["load_factor"] = map.load_factor(); + state.counters["buckets"] = static_cast(map.bucket_count()); + for(auto _ : state) { ValueType sum = 0; From 1935fe63ebaa6b9c756f80088e146058377ad34b Mon Sep 17 00:00:00 2001 From: Kenneth Weiss Date: Thu, 11 Jun 2026 05:15:00 +0000 Subject: [PATCH 15/28] FlatTable: honor visitor early-exit in scalar visitHashBucket The SSE2 path of GroupBucket::visitHashBucket() stops visiting as soon as the visitor returns false, but the scalar fallback (including GPU path) ignored the return value and kept scanning all 15 slots. In-tree visitors and the duplicate check in the batched insert path return false to mean 'stop', and extra visits load and compare a key which could incur a cache miss per probe group. 
--- src/axom/core/detail/FlatTable.hpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/axom/core/detail/FlatTable.hpp b/src/axom/core/detail/FlatTable.hpp index 846960597f..45276cdf8f 100644 --- a/src/axom/core/detail/FlatTable.hpp +++ b/src/axom/core/detail/FlatTable.hpp @@ -201,7 +201,11 @@ struct GroupBucket { if(metadata.buckets[i] == reducedHash) { - visitor(i); + if(!visitor(i)) + { + // Found a match - stop visiting, mirroring the SSE2 path above. + break; + } } } #endif From cb5793a21c4bd10331a76d6a9572476f95ff95cf Mon Sep 17 00:00:00 2001 From: Kenneth Weiss Date: Thu, 11 Jun 2026 11:16:57 -0700 Subject: [PATCH 16/28] Fixes hip build via missing AXOM_HOST_DEVICE --- src/axom/core/tests/core_flatmap.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/axom/core/tests/core_flatmap.hpp b/src/axom/core/tests/core_flatmap.hpp index 4e3a2f4506..69241c2c2a 100644 --- a/src/axom/core/tests/core_flatmap.hpp +++ b/src/axom/core/tests/core_flatmap.hpp @@ -163,7 +163,7 @@ struct DegenerateGroupHash { using argument_type = int; using result_type = std::uint64_t; - std::uint64_t operator()(int key) const + AXOM_HOST_DEVICE std::uint64_t operator()(int key) const { return static_cast(static_cast(key) & 0xFF); } From 6691dbbebad57650c76089acbdf287f702f5f9a8 Mon Sep 17 00:00:00 2001 From: Kenneth Weiss Date: Thu, 11 Jun 2026 12:06:08 -0700 Subject: [PATCH 17/28] FlatMap: Fuse the find and empty-slot probes in getEmplacePos() Emplacing a new key walked the probe sequence twice -- first to check for a key and then to find an empty slot for the key. We now do both within a single call. 
--- src/axom/core/FlatMap.hpp | 20 +++-- src/axom/core/detail/FlatTable.hpp | 83 ++++++++++++++++++++ src/axom/core/tests/core_flatmap.hpp | 108 +++++++++++++++++++++++++++ 3 files changed, 205 insertions(+), 6 deletions(-) diff --git a/src/axom/core/FlatMap.hpp b/src/axom/core/FlatMap.hpp index 087eb75810..c51d246414 100644 --- a/src/axom/core/FlatMap.hpp +++ b/src/axom/core/FlatMap.hpp @@ -897,9 +897,18 @@ auto FlatMap::getEmplacePos(const KeyType& key) { auto hash = Hash {}(key); - // If the key already exists, return the existing iterator. - // Reuse the hash computed above rather than re-hashing inside find(). - iterator existing_elem = this->find_with_hash(key, hash); + // Single fused probe: visit key matches and locate the insertion slot in a single pass + iterator existing_elem = this->end(); + IndexType newBucket = + this->probeEmplaceIndex(m_numGroups2, m_metadata, hash, [&](IndexType bucket_index) -> bool { + if(this->m_buckets[bucket_index].get().first == key) + { + existing_elem = iterator(this, bucket_index); + return false; + } + return true; + }); + if(existing_elem != this->end()) { return {existing_elem, false}; @@ -916,11 +925,10 @@ auto FlatMap::getEmplacePos(const KeyType& key) { IndexType newNumGroups = m_metadata.size() * 2; rehash(newNumGroups * BucketsPerGroup - 1); + // The table was rebuilt, so the slot is stale. If we got here, the key is missing + newBucket = this->probeEmptyIndex(m_numGroups2, m_metadata, hash); } - // Get an empty index to place the element into. - IndexType newBucket = this->probeEmptyIndex(m_numGroups2, m_metadata, hash); - // Add a hash to the corresponding bucket slot. 
this->setBucketHash(m_metadata, newBucket, hash); m_size++; diff --git a/src/axom/core/detail/FlatTable.hpp b/src/axom/core/detail/FlatTable.hpp index 45276cdf8f..2a9307bd73 100644 --- a/src/axom/core/detail/FlatTable.hpp +++ b/src/axom/core/detail/FlatTable.hpp @@ -394,6 +394,89 @@ struct SequentialLookupPolicy : ProbePolicy return NO_MATCH; } + /*! + * \brief Fused find-or-locate-empty probe for single-key emplacement. + * + * Walks the probe sequence once, simultaneously visiting key matches and + * tracking the first empty slot. Overflow bits are maintained in the + * same way as probeEmptyIndex(): a full group that hasn't yielded an insertion + * slot is marked overflowed for this hash before moving on. + * + * \param [in] ngroups_pow_2 the number of groups, expressed as a power of 2 + * \param [in] metadata the array of metadata for the groups in the hash map + * \param [in] hash the hash to search for and, if absent, insert + * \param [in] on_hash_found functor called for each matching bucket slot; + * returns false to stop the probe (existing key found) + * + * \return the bucket index to insert into, or NO_MATCH if the visitor stopped the probe + */ + template + IndexType probeEmplaceIndex(int ngroups_pow_2, + ArrayView metadata, + HashType hash, + FoundIndex&& on_hash_found) const + { + const int bitshift_right = ((CHAR_BIT * sizeof(HashType)) - ngroups_pow_2); + const HashType group_mask = (HashType {1} << ngroups_pow_2) - 1; + HashType curr_group = (hash >> bitshift_right) & group_mask; + int empty_group = NO_MATCH; + int empty_bucket = NO_MATCH; + + std::uint8_t hash_8 = static_cast(hash); + bool may_exist = true; + for(int iteration = 0; iteration < metadata.size(); ++iteration) + { + if(may_exist) + { + // The key may be in the current group, so scan for matches just as probeIndex() does + bool keep_going = true; + metadata[curr_group].visitHashBucket(hash_8, [&](IndexType bucket_index) -> bool { + keep_going = on_hash_found(curr_group * 
GroupBucket::Size + bucket_index); + return keep_going; + }); + if(!keep_going) + { + // Visitor stopped the probe: the key already exists + return NO_MATCH; + } + } + + if(empty_group == NO_MATCH) + { + int tentative_empty_bucket = metadata[curr_group].getEmptyBucket(); + if(tentative_empty_bucket != GroupBucket::InvalidSlot) + { + empty_group = curr_group; + empty_bucket = tentative_empty_bucket; + } + } + + if(!metadata[curr_group].getMaybeOverflowed(hash_8)) + { + // The key cannot exist past this group + may_exist = false; + if(empty_group != NO_MATCH) + { + break; + } + // Full group at the end of the trail, mark as overflowed and keep looking for an empty slot + metadata[curr_group].setOverflow(hash_8); + } + else if(empty_group == NO_MATCH) + { + // Full group inside the trail + metadata[curr_group].setOverflow(hash_8); + } + // The group count is a power of two, so we can use a bitmask (instead of a modulo) + curr_group = (curr_group + this->getNext(iteration)) & group_mask; + } + if(empty_group != NO_MATCH) + { + return empty_group * GroupBucket::Size + empty_bucket; + } + return NO_MATCH; + } + /*! * \brief Finds the next potential bucket index for a given hash in a group * array for an open-addressing hash map. 
diff --git a/src/axom/core/tests/core_flatmap.hpp b/src/axom/core/tests/core_flatmap.hpp index 69241c2c2a..1d134550c4 100644 --- a/src/axom/core/tests/core_flatmap.hpp +++ b/src/axom/core/tests/core_flatmap.hpp @@ -228,6 +228,114 @@ TEST(core_flatmap_unit, cross_group_probe_chains) } } +// Hash functor that maps every key to the same hash value +// This forces long probe chains and stresses fused "find + empty-slot" probing +struct ConstantHash64 +{ + using argument_type = int; + using result_type = std::uint64_t; + + AXOM_HOST_DEVICE std::uint64_t operator()(int) const { return std::uint64_t {0}; } +}; + +TEST(core_flatmap_unit, fused_emplace_probe_no_duplicate_across_tombstone) +{ + using MapType = axom::FlatMap; + MapType test_map; + + const int NUM_ELEMS = 40; + for(int i = 0; i < NUM_ELEMS; i++) + { + test_map.insert_or_assign(i, i * 10); + } + ASSERT_EQ(test_map.size(), NUM_ELEMS); + + // Create a tombstone early in the probe trail + EXPECT_EQ(test_map.erase(1), 1); + ASSERT_EQ(test_map.size(), NUM_ELEMS - 1); + + // Update a key that should live beyond the first group. The fused emplace probe + // must keep probing past the earlier empty slot and find the existing key. + const int existing_key = NUM_ELEMS - 1; + auto result = test_map.insert_or_assign(existing_key, 12345); + EXPECT_FALSE(result.second); + EXPECT_EQ(test_map.size(), NUM_ELEMS - 1); + EXPECT_EQ(test_map.at(existing_key), 12345); + + // Verify we did not accidentally insert a duplicate key + // (which would be possible if the probe stopped at the tombstone) + int occurrences = 0; + for(const auto& kv : test_map) + { + occurrences += (kv.first == existing_key) ? 
1 : 0; + } + EXPECT_EQ(occurrences, 1); +} + +TEST(core_flatmap_unit, fused_emplace_probe_try_emplace_respects_existing_after_tombstone) +{ + using MapType = axom::FlatMap; + MapType test_map; + + const int NUM_ELEMS = 40; + for(int i = 0; i < NUM_ELEMS; i++) + { + test_map.insert_or_assign(i, i * 10); + } + ASSERT_EQ(test_map.size(), NUM_ELEMS); + + // Create a tombstone early in the probe trail + EXPECT_EQ(test_map.erase(2), 1); + ASSERT_EQ(test_map.size(), NUM_ELEMS - 1); + + const int existing_key = NUM_ELEMS - 2; + test_map.insert_or_assign(existing_key, 777); + ASSERT_EQ(test_map.at(existing_key), 777); + + // try_emplace must not insert or overwrite when the key exists + auto emplace_res = test_map.try_emplace(existing_key, 999); + EXPECT_FALSE(emplace_res.second); + EXPECT_EQ(emplace_res.first->second, 777); + + int occurrences = 0; + for(const auto& kv : test_map) + { + occurrences += (kv.first == existing_key) ? 1 : 0; + } + EXPECT_EQ(occurrences, 1); +} + +TEST(core_flatmap_unit, fused_emplace_probe_recomputes_slot_after_rehash) +{ + using MapType = axom::FlatMap; + MapType test_map; + + const int init_buckets = test_map.bucket_count(); + const int size_no_rehash = static_cast(test_map.max_load_factor() * init_buckets); + + // Fill right up to the no-rehash threshold. + for(int i = 0; i < size_no_rehash; i++) + { + test_map.insert_or_assign(i, i); + } + ASSERT_EQ(test_map.bucket_count(), init_buckets); + ASSERT_EQ(test_map.size(), size_no_rehash); + + // Create a mid-sequence tombstone. With ConstantHash64 and a full trail, this should + // preserve loadCount, so the next insertion triggers a rehash even though an empty slot exists. 
+ EXPECT_EQ(test_map.erase(0), 1); + ASSERT_EQ(test_map.bucket_count(), init_buckets); + ASSERT_EQ(test_map.size(), size_no_rehash - 1); + + const int buckets_before = test_map.bucket_count(); + const int new_key = 100000; + test_map.insert_or_assign(new_key, 42); + + EXPECT_GT(test_map.bucket_count(), buckets_before); + EXPECT_EQ(test_map.at(new_key), 42); + EXPECT_EQ(test_map.count(0), 0); +} + AXOM_TYPED_TEST(core_flatmap, default_init) { using MapType = typename TestFixture::MapType; From 6d8a86f394d461704eebd4d488c50687e2d058e3 Mon Sep 17 00:00:00 2001 From: Kenneth Weiss Date: Thu, 11 Jun 2026 13:32:25 -0700 Subject: [PATCH 18/28] FlatMap: Keep move semantics during batch insertion --- src/axom/core/FlatMapUtil.hpp | 8 +++-- src/axom/core/tests/core_flatmap.hpp | 48 ++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 2 deletions(-) diff --git a/src/axom/core/FlatMapUtil.hpp b/src/axom/core/FlatMapUtil.hpp index d1adb18f90..4eb1c41a05 100644 --- a/src/axom/core/FlatMapUtil.hpp +++ b/src/axom/core/FlatMapUtil.hpp @@ -278,8 +278,12 @@ void FlatMap::insert(InputIt kv_begin, InputIt kv_end) for(IndexType idx = 0; idx < num_elems; ++idx) { - auto kv = *(kv_begin + idx); - this->insert_or_assign(kv.first, kv.second); + // Preserve the value category of the input pair. In particular, when + // kv_begin/kv_end are move iterators, we must forward the mapped value + // so move-only types remain supported. 
+ decltype(auto) kv = *(kv_begin + idx); + this->insert_or_assign(std::forward(kv).first, + std::forward(kv).second); } return; } diff --git a/src/axom/core/tests/core_flatmap.hpp b/src/axom/core/tests/core_flatmap.hpp index 1d134550c4..021bcd741d 100644 --- a/src/axom/core/tests/core_flatmap.hpp +++ b/src/axom/core/tests/core_flatmap.hpp @@ -13,6 +13,9 @@ // gtest includes #include "gtest/gtest.h" +// C++ includes +#include + // Unit test for QuadraticProbing TEST(core_flatmap_unit, quadratic_probing) { @@ -656,6 +659,51 @@ TEST(core_flatmap_moveonly, init_and_move_moveonly) } } +TEST(core_flatmap_moveonly, insert_batched_seq_move_iterators) +{ + using MapType = axom::FlatMap>; + MapType test_map; + + using PairType = std::pair>; + std::vector pairs; + + const int NUM_ELEMS = 64; + pairs.reserve(NUM_ELEMS + 1); + for(int i = 0; i < NUM_ELEMS; i++) + { + pairs.emplace_back(i, std::make_unique(i + 1.0)); + } + + // Include a duplicate so "later duplicates overwrite earlier ones" is exercised, + // while also ensuring the value is moved in all cases. + pairs.emplace_back(NUM_ELEMS / 2, std::make_unique(123.0)); + + test_map.template insert(std::make_move_iterator(pairs.begin()), + std::make_move_iterator(pairs.end())); + + EXPECT_EQ(test_map.size(), NUM_ELEMS); + for(int i = 0; i < NUM_ELEMS; i++) + { + ASSERT_EQ(test_map.count(i), 1); + auto& ptr = test_map.at(i); + ASSERT_NE(ptr.get(), nullptr); + if(i == NUM_ELEMS / 2) + { + EXPECT_EQ(*ptr, 123.0); + } + else + { + EXPECT_EQ(*ptr, i + 1.0); + } + } + + // All source values should have been moved-from. 
+ for(const auto& kv : pairs) + { + EXPECT_EQ(kv.second.get(), nullptr); + } +} + AXOM_TYPED_TEST(core_flatmap, init_and_copy) { using MapType = typename TestFixture::MapType; From f083e50a927eb417235f0f5cee90ef3d16f19e39 Mon Sep 17 00:00:00 2001 From: Kenneth Weiss Date: Thu, 11 Jun 2026 13:47:37 -0700 Subject: [PATCH 19/28] Improves FlatMap benchmark * Disables sequential find_hit search by default since it is not representative. * Guards several tests by the feature they are testing --- .../core/tests/core_benchmark_flatmap.cpp | 51 ++++++++++++------- 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/src/axom/core/tests/core_benchmark_flatmap.cpp b/src/axom/core/tests/core_benchmark_flatmap.cpp index a06d9be21a..2f013e35df 100644 --- a/src/axom/core/tests/core_benchmark_flatmap.cpp +++ b/src/axom/core/tests/core_benchmark_flatmap.cpp @@ -72,6 +72,7 @@ inline FlatMapFeatureBenchmarks operator&(FlatMapFeatureBenchmarks lhs, FlatMapF std::vector args_benchmark_sizes; FlatMapFeatureBenchmarks args_benchmark_features {FlatMapFeatureBenchmarks::None}; int args_batch_size = 1 << 10; +bool args_include_insertion_order_lookup = false; } // namespace template <> @@ -162,9 +163,9 @@ std::vector make_miss_keys(const std::vector& keys, KeyType of /*! * \brief Returns a copy of \a keys reshuffled with an independent seed. 
* - * Looking keys up in the exact order they were inserted is rarely representative, - * and it systematically favors node-based containers: with libstdc++'s identity hash - * for integers and densely numbered keys, the i-th lookup touches the i-th allocated node, + * Looking keys up in the exact order they were inserted is rarely representative, + * and it systematically favors node-based containers: with libstdc++'s identity hash + * for integers and densely numbered keys, the i-th lookup touches the i-th allocated node, * so the lookup loop streams through the heap nearly sequentially and the hardware prefetcher hides * most of the pointer-chasing latency. An independently shuffled lookup order removes that correlation. */ @@ -423,6 +424,7 @@ void BM_FlatMap_Find_Hit_TargetLoad(benchmark::State& state, double target_load_ const auto pairs = make_pairs(keys); const MapType map = make_filled_flatmap_with_target_load_factor(pairs, target_load_factor); + const auto lookup_keys = make_lookup_order(keys, 0xF00DBA11ULL); // Export the geometry actually realized after power-of-two rounding so // that runs with different nominal targets can be compared meaningfully. 
@@ -432,7 +434,7 @@ void BM_FlatMap_Find_Hit_TargetLoad(benchmark::State& state, double target_load_ for(auto _ : state) { ValueType sum = 0; - for(KeyType k : keys) + for(KeyType k : lookup_keys) { auto it = map.find(k); if(it != map.end()) @@ -572,7 +574,10 @@ void RegisterBenchmarksFor(const std::string& map_name) if((::args_benchmark_features & FlatMapFeatureBenchmarks::Lookup) != FlatMapFeatureBenchmarks::None) { - benchmark::RegisterBenchmark(name("find_hit"), &BM_Find_Hit)->Apply(CustomArgs); + if(::args_include_insertion_order_lookup) + { + benchmark::RegisterBenchmark(name("find_hit"), &BM_Find_Hit)->Apply(CustomArgs); + } benchmark::RegisterBenchmark(name("find_hit_shuffled"), &BM_Find_Hit_Shuffled)->Apply(CustomArgs); benchmark::RegisterBenchmark(name("find_hit_randkeys"), &BM_Find_Hit_RandomKeys)->Apply(CustomArgs); benchmark::RegisterBenchmark(name("find_miss"), &BM_Find_Miss)->Apply(CustomArgs); @@ -603,6 +608,7 @@ int main(int argc, char* argv[]) std::vector local_test_sizes; FlatMapFeatureBenchmarks local_benchmark_features {FlatMapFeatureBenchmarks::None}; int local_batch_size = ::args_batch_size; + bool local_include_insertion_order_lookup = ::args_include_insertion_order_lookup; axom::CLI::App app {"Axom FlatMap benchmarks"}; app.add_option("-s,--custom_sizes", local_test_sizes) @@ -632,6 +638,9 @@ int main(int argc, char* argv[]) ->default_val(local_batch_size) ->check(axom::CLI::PositiveNumber); + app.add_flag("--include_insertion_order_lookup", local_include_insertion_order_lookup) + ->description("Includes insertion-order lookup benchmark (biased; for diagnosis)"); + std::vector feature_strings; auto feature_opt = app.add_option("-f,--features", feature_strings) @@ -673,11 +682,14 @@ int main(int argc, char* argv[]) std::swap(::args_benchmark_sizes, local_test_sizes); ::args_batch_size = local_batch_size; + ::args_include_insertion_order_lookup = local_include_insertion_order_lookup; SLIC_INFO("Parsed and processed command line arguments:"); 
SLIC_INFO(axom::fmt::format("- Map sizes: {}", axom::fmt::join(::args_benchmark_sizes, ","))); SLIC_INFO(axom::fmt::format("- Batch size: {}", ::args_batch_size)); SLIC_INFO(axom::fmt::format("- Map features to test: {}", ::args_benchmark_features)); + SLIC_INFO(axom::fmt::format("- Include insertion-order lookup: {}", + ::args_include_insertion_order_lookup ? "true" : "false")); } RegisterBenchmarksFor>("axom::FlatMap"); @@ -687,19 +699,22 @@ int main(int argc, char* argv[]) // Explore the impact of lower load factors on successful lookups. // This trades memory for potentially fewer probes and fewer cache misses. - using DefaultHash = axom::FlatMap::hasher; - benchmark::RegisterBenchmark("axom::FlatMap::find_hit_lf0p50", [](benchmark::State& st) { - BM_FlatMap_Find_Hit_TargetLoad(st, 0.50); - })->Apply(CustomArgs); - benchmark::RegisterBenchmark("axom::FlatMap::find_hit_lf0p70", [](benchmark::State& st) { - BM_FlatMap_Find_Hit_TargetLoad(st, 0.70); - })->Apply(CustomArgs); - benchmark::RegisterBenchmark("axom::FlatMapFastHash::find_hit_lf0p50", [](benchmark::State& st) { - BM_FlatMap_Find_Hit_TargetLoad(st, 0.50); - })->Apply(CustomArgs); - benchmark::RegisterBenchmark("axom::FlatMapFastHash::find_hit_lf0p70", [](benchmark::State& st) { - BM_FlatMap_Find_Hit_TargetLoad(st, 0.70); - })->Apply(CustomArgs); + if((::args_benchmark_features & FlatMapFeatureBenchmarks::Lookup) != FlatMapFeatureBenchmarks::None) + { + using DefaultHash = axom::FlatMap::hasher; + benchmark::RegisterBenchmark("axom::FlatMap::find_hit_lf0p50", [](benchmark::State& st) { + BM_FlatMap_Find_Hit_TargetLoad(st, 0.50); + })->Apply(CustomArgs); + benchmark::RegisterBenchmark("axom::FlatMap::find_hit_lf0p70", [](benchmark::State& st) { + BM_FlatMap_Find_Hit_TargetLoad(st, 0.70); + })->Apply(CustomArgs); + benchmark::RegisterBenchmark("axom::FlatMapFastHash::find_hit_lf0p50", [](benchmark::State& st) { + BM_FlatMap_Find_Hit_TargetLoad(st, 0.50); + })->Apply(CustomArgs); + 
benchmark::RegisterBenchmark("axom::FlatMapFastHash::find_hit_lf0p70", [](benchmark::State& st) { + BM_FlatMap_Find_Hit_TargetLoad(st, 0.70); + })->Apply(CustomArgs); + } RegisterBenchmarksFor>("std::unordered_map"); RegisterBenchmarksFor>("std::map"); From e10cd0260e78bb3196d5a525c8465ad3f1da5dd0 Mon Sep 17 00:00:00 2001 From: Kenneth Weiss Date: Thu, 11 Jun 2026 14:00:47 -0700 Subject: [PATCH 20/28] FlatMap: Device hash type must be 64 bits Also adds more device hashing tests --- src/axom/core/tests/core_device_hash.hpp | 131 ++++++++++++++++++++--- 1 file changed, 116 insertions(+), 15 deletions(-) diff --git a/src/axom/core/tests/core_device_hash.hpp b/src/axom/core/tests/core_device_hash.hpp index 58173725e1..6cab7708b2 100644 --- a/src/axom/core/tests/core_device_hash.hpp +++ b/src/axom/core/tests/core_device_hash.hpp @@ -12,6 +12,7 @@ #include "gtest/gtest.h" // C++ includes +#include #include template @@ -37,6 +38,7 @@ AXOM_TYPED_TEST(core_device_hash, hash_int) using ExecSpace = typename TestFixture::ExecSpace; axom::DeviceHash device_hasher; + using HashResult = typename decltype(device_hasher)::result_type; constexpr int NUM_HASHES = 4; @@ -44,7 +46,7 @@ AXOM_TYPED_TEST(core_device_hash, hash_int) // Allocate space for hash results. int allocatorID = axom::execution_space::allocatorID(); - axom::IndexType *computed_hashes = axom::allocate(NUM_HASHES, allocatorID); + HashResult* computed_hashes = axom::allocate(NUM_HASHES, allocatorID); // Compute hashes. axom::for_all( @@ -52,8 +54,8 @@ AXOM_TYPED_TEST(core_device_hash, hash_int) AXOM_LAMBDA(int i) { computed_hashes[i] = device_hasher(things_to_hash[i]); }); // Copy back to host. 
- axom::IndexType computed_hashes_host[NUM_HASHES]; - axom::copy(computed_hashes_host, computed_hashes, sizeof(axom::IndexType) * NUM_HASHES); + HashResult computed_hashes_host[NUM_HASHES]; + axom::copy(computed_hashes_host, computed_hashes, sizeof(HashResult) * NUM_HASHES); axom::deallocate(computed_hashes); for(int i = 0; i < NUM_HASHES; i++) @@ -74,6 +76,7 @@ AXOM_TYPED_TEST(core_device_hash, hash_float) using ExecSpace = typename TestFixture::ExecSpace; axom::DeviceHash device_hasher; + using HashResult = typename decltype(device_hasher)::result_type; constexpr int NUM_HASHES = 4; @@ -81,7 +84,7 @@ AXOM_TYPED_TEST(core_device_hash, hash_float) // Allocate space for hash results. int allocatorID = axom::execution_space::allocatorID(); - axom::IndexType *computed_hashes = axom::allocate(NUM_HASHES, allocatorID); + HashResult* computed_hashes = axom::allocate(NUM_HASHES, allocatorID); // Compute hashes. axom::for_all( @@ -89,8 +92,8 @@ AXOM_TYPED_TEST(core_device_hash, hash_float) AXOM_LAMBDA(int i) { computed_hashes[i] = device_hasher(things_to_hash[i]); }); // Copy back to host. - axom::IndexType computed_hashes_host[NUM_HASHES]; - axom::copy(computed_hashes_host, computed_hashes, sizeof(axom::IndexType) * NUM_HASHES); + HashResult computed_hashes_host[NUM_HASHES]; + axom::copy(computed_hashes_host, computed_hashes, sizeof(HashResult) * NUM_HASHES); axom::deallocate(computed_hashes); for(int i = 0; i < NUM_HASHES; i++) @@ -112,12 +115,13 @@ AXOM_TYPED_TEST(core_device_hash, hash_float) TEST(core_device_hash, hash_string) { axom::DeviceHash device_hasher; + using HashResult = typename decltype(device_hasher)::result_type; constexpr int NUM_HASHES = 4; std::string things_to_hash[NUM_HASHES] {"0", "1", "37", "1100"}; - axom::IndexType computed_hashes[NUM_HASHES]; + HashResult computed_hashes[NUM_HASHES]; // Compute hashes. 
for(int i = 0; i < NUM_HASHES; i++) @@ -151,6 +155,7 @@ AXOM_TYPED_TEST(core_device_hash, hash_enum) using ExecSpace = typename TestFixture::ExecSpace; axom::DeviceHash device_hasher; + using HashResult = typename decltype(device_hasher)::result_type; constexpr int NUM_HASHES = 4; @@ -161,7 +166,7 @@ AXOM_TYPED_TEST(core_device_hash, hash_enum) // Allocate space for hash results. int allocatorID = axom::execution_space::allocatorID(); - axom::IndexType *computed_hashes = axom::allocate(NUM_HASHES, allocatorID); + HashResult* computed_hashes = axom::allocate(NUM_HASHES, allocatorID); // Compute hashes. axom::for_all( @@ -169,8 +174,8 @@ AXOM_TYPED_TEST(core_device_hash, hash_enum) AXOM_LAMBDA(int i) { computed_hashes[i] = device_hasher(things_to_hash[i]); }); // Copy back to host. - axom::IndexType computed_hashes_host[NUM_HASHES]; - axom::copy(computed_hashes_host, computed_hashes, sizeof(axom::IndexType) * NUM_HASHES); + HashResult computed_hashes_host[NUM_HASHES]; + axom::copy(computed_hashes_host, computed_hashes, sizeof(HashResult) * NUM_HASHES); axom::deallocate(computed_hashes); for(int i = 0; i < NUM_HASHES; i++) @@ -210,7 +215,7 @@ struct DeviceHash> constexpr int NWORDS = sizeof(axom_testing::UserVector) / sizeof(int); alignas(axom_testing::UserVector) int bytes[NWORDS]; // NOTE: Separating these statements fixes a warning about strict-aliasing. - auto ptr = reinterpret_cast *>(bytes); + auto ptr = reinterpret_cast*>(bytes); *ptr = value; axom::IndexType hash_result {}; @@ -228,6 +233,7 @@ AXOM_TYPED_TEST(core_device_hash, hash_user_defined) using ExecSpace = typename TestFixture::ExecSpace; axom::DeviceHash> device_hasher; + using HashResult = typename decltype(device_hasher)::result_type; constexpr int NUM_HASHES = 4; @@ -238,7 +244,7 @@ AXOM_TYPED_TEST(core_device_hash, hash_user_defined) // Allocate space for hash results. 
int allocatorID = axom::execution_space::allocatorID(); - axom::IndexType *computed_hashes = axom::allocate(NUM_HASHES, allocatorID); + HashResult* computed_hashes = axom::allocate(NUM_HASHES, allocatorID); // Compute hashes. axom::for_all( @@ -246,8 +252,8 @@ AXOM_TYPED_TEST(core_device_hash, hash_user_defined) AXOM_LAMBDA(int i) { computed_hashes[i] = device_hasher(things_to_hash[i]); }); // Copy back to host. - axom::IndexType computed_hashes_host[NUM_HASHES]; - axom::copy(computed_hashes_host, computed_hashes, sizeof(axom::IndexType) * NUM_HASHES); + HashResult computed_hashes_host[NUM_HASHES]; + axom::copy(computed_hashes_host, computed_hashes, sizeof(HashResult) * NUM_HASHES); axom::deallocate(computed_hashes); for(int i = 0; i < NUM_HASHES; i++) @@ -263,6 +269,101 @@ AXOM_TYPED_TEST(core_device_hash, hash_user_defined) } } +AXOM_TYPED_TEST(core_device_hash, hash_uint64_distinguishes_high_bits) +{ + using ExecSpace = typename TestFixture::ExecSpace; + + axom::DeviceHash device_hasher; + using HashResult = typename decltype(device_hasher)::result_type; + + constexpr int NUM_HASHES = 3; + std::uint64_t things_to_hash[NUM_HASHES] = {std::uint64_t {1}, + std::uint64_t {1} + (std::uint64_t {1} << 32), + std::uint64_t {1} + (std::uint64_t {1} << 33)}; + + int allocatorID = axom::execution_space::allocatorID(); + HashResult* computed_hashes = axom::allocate(NUM_HASHES, allocatorID); + + axom::for_all( + NUM_HASHES, + AXOM_LAMBDA(int i) { computed_hashes[i] = device_hasher(things_to_hash[i]); }); + + HashResult computed_hashes_host[NUM_HASHES]; + axom::copy(computed_hashes_host, computed_hashes, sizeof(HashResult) * NUM_HASHES); + axom::deallocate(computed_hashes); + + EXPECT_NE(computed_hashes_host[0], computed_hashes_host[1]); + EXPECT_NE(computed_hashes_host[0], computed_hashes_host[2]); + EXPECT_NE(computed_hashes_host[1], computed_hashes_host[2]); +} + +AXOM_TYPED_TEST(core_device_hash, hash_fractional_float_device) +{ + using ExecSpace = typename 
TestFixture::ExecSpace; + + axom::DeviceHash device_hasher; + using HashResult = typename decltype(device_hasher)::result_type; + + constexpr int NUM_HASHES = 8; + float things_to_hash[NUM_HASHES] = {0.25f, 0.75f, -0.5f, 0.5f, 0.125f, 0.625f, 0.875f, 1.25f}; + + int allocatorID = axom::execution_space::allocatorID(); + HashResult* computed_hashes = axom::allocate(NUM_HASHES, allocatorID); + + axom::for_all( + NUM_HASHES, + AXOM_LAMBDA(int i) { computed_hashes[i] = device_hasher(things_to_hash[i]); }); + + HashResult computed_hashes_host[NUM_HASHES]; + axom::copy(computed_hashes_host, computed_hashes, sizeof(HashResult) * NUM_HASHES); + axom::deallocate(computed_hashes); + + // Idempotence and pairwise distinctness for these chosen values. + for(int i = 0; i < NUM_HASHES; i++) + { + EXPECT_EQ(computed_hashes_host[i], device_hasher(things_to_hash[i])); + for(int j = i + 1; j < NUM_HASHES; j++) + { + EXPECT_NE(computed_hashes_host[i], computed_hashes_host[j]); + } + } + + EXPECT_EQ(device_hasher(0.0f), device_hasher(-0.0f)); +} + +AXOM_TYPED_TEST(core_device_hash, hash_fractional_double_device) +{ + using ExecSpace = typename TestFixture::ExecSpace; + + axom::DeviceHash device_hasher; + using HashResult = typename decltype(device_hasher)::result_type; + + constexpr int NUM_HASHES = 8; + double things_to_hash[NUM_HASHES] = {0.25, 0.75, -0.5, 0.5, 0.125, 0.625, 0.875, 1.25}; + + int allocatorID = axom::execution_space::allocatorID(); + HashResult* computed_hashes = axom::allocate(NUM_HASHES, allocatorID); + + axom::for_all( + NUM_HASHES, + AXOM_LAMBDA(int i) { computed_hashes[i] = device_hasher(things_to_hash[i]); }); + + HashResult computed_hashes_host[NUM_HASHES]; + axom::copy(computed_hashes_host, computed_hashes, sizeof(HashResult) * NUM_HASHES); + axom::deallocate(computed_hashes); + + for(int i = 0; i < NUM_HASHES; i++) + { + EXPECT_EQ(computed_hashes_host[i], device_hasher(things_to_hash[i])); + for(int j = i + 1; j < NUM_HASHES; j++) + { + 
EXPECT_NE(computed_hashes_host[i], computed_hashes_host[j]); + } + } + + EXPECT_EQ(device_hasher(0.0), device_hasher(-0.0)); +} + TEST(core_device_hash, hash_width_decoupled_from_indextype) { // The hash result must be 64 bits wide regardless of the configured @@ -277,7 +378,7 @@ TEST(core_device_hash, hash_width_decoupled_from_indextype) "integral hash result must be std::uint64_t"); static_assert(std::is_same::result_type, std::uint64_t>::value, "floating-point hash result must be std::uint64_t"); - static_assert(std::is_same::result_type, std::uint64_t>::value, + static_assert(std::is_same::result_type, std::uint64_t>::value, "pointer hash result must be std::uint64_t"); static_assert(std::is_same::result_type, std::uint64_t>::value, "catch-all (std::hash) result must be std::uint64_t"); From e95e7df5271ebe6cb8d45e8f1288b5a6a67811fe Mon Sep 17 00:00:00 2001 From: Kenneth Weiss Date: Thu, 11 Jun 2026 14:23:00 -0700 Subject: [PATCH 21/28] Moves AXOM_FORCE_INLINE to core's Macros.hpp --- src/axom/core/FlatMap.hpp | 9 ++++----- src/axom/core/Macros.hpp | 18 ++++++++++++++++++ src/axom/core/detail/FlatTable.hpp | 23 ++++++----------------- 3 files changed, 28 insertions(+), 22 deletions(-) diff --git a/src/axom/core/FlatMap.hpp b/src/axom/core/FlatMap.hpp index c51d246414..8f92292ab4 100644 --- a/src/axom/core/FlatMap.hpp +++ b/src/axom/core/FlatMap.hpp @@ -290,11 +290,10 @@ class FlatMap : detail::flat_map::SequentialLookupPolicy #endif -// Force-inline annotation for the FlatMap/FlatTable lookup hot path. 
-#if defined(__CUDACC__) || defined(__HIPCC__) - #define AXOM_FLATMAP_FORCE_INLINE __forceinline__ -#elif defined(__GNUC__) || defined(__clang__) - #define AXOM_FLATMAP_FORCE_INLINE inline __attribute__((always_inline)) -#elif defined(_MSC_VER) - #define AXOM_FLATMAP_FORCE_INLINE __forceinline -#else - #define AXOM_FLATMAP_FORCE_INLINE inline -#endif - namespace axom { namespace detail @@ -173,7 +162,7 @@ struct GroupBucket } template - AXOM_FLATMAP_FORCE_INLINE AXOM_HOST_DEVICE int visitHashBucket(std::uint8_t hash, Func&& visitor) const + AXOM_FORCE_INLINE AXOM_HOST_DEVICE int visitHashBucket(std::uint8_t hash, Func&& visitor) const { std::uint8_t reducedHash = reduceHash(hash); #if !defined(AXOM_DEVICE_CODE) && defined(_AXOM_CORE_HAVE_SSE2) @@ -288,7 +277,7 @@ struct GroupBucket } template - AXOM_FLATMAP_FORCE_INLINE AXOM_HOST_DEVICE bool getMaybeOverflowed(std::uint8_t hash) const + AXOM_FORCE_INLINE AXOM_HOST_DEVICE bool getMaybeOverflowed(std::uint8_t hash) const { std::uint8_t hashOfwBit = 1 << (hash % 8); std::uint8_t curr_ofw; @@ -488,10 +477,10 @@ struct SequentialLookupPolicy : ProbePolicy * matching hash */ template - AXOM_FLATMAP_FORCE_INLINE AXOM_HOST_DEVICE void probeIndex(int ngroups_pow_2, - ArrayView metadata, - HashType hash, - FoundIndex&& on_hash_found) const + AXOM_FORCE_INLINE AXOM_HOST_DEVICE void probeIndex(int ngroups_pow_2, + ArrayView metadata, + HashType hash, + FoundIndex&& on_hash_found) const { // We use the k MSBs of the hash as the initial group probe point, // where ngroups = 2^k. 
Since the group count is always a power of two, From 4f34f61d46bf1c3356bb24201d7d3a1d7cd8f432 Mon Sep 17 00:00:00 2001 From: Kenneth Weiss Date: Thu, 11 Jun 2026 14:32:35 -0700 Subject: [PATCH 22/28] Adds utility function for initializing initial probe group via bitshift and masking --- src/axom/core/FlatMapUtil.hpp | 12 ++++----- src/axom/core/detail/FlatTable.hpp | 41 +++++++++++++++++++----------- 2 files changed, 32 insertions(+), 21 deletions(-) diff --git a/src/axom/core/FlatMapUtil.hpp b/src/axom/core/FlatMapUtil.hpp index 4eb1c41a05..da0b363631 100644 --- a/src/axom/core/FlatMapUtil.hpp +++ b/src/axom/core/FlatMapUtil.hpp @@ -359,11 +359,11 @@ void FlatMap::insert(InputIt kv_begin, InputIt kv_end) // Hash keys. auto hash = Hash {}(key); - // We use the k MSBs of the hash as the initial group probe point, - // where ngroups = 2^k. - int bitshift_right = ((CHAR_BIT * sizeof(HashResult)) - ngroups_pow_2); - HashResult curr_group = hash >> bitshift_right; - curr_group &= ((1 << ngroups_pow_2) - 1); + // We use the k MSBs of the hash as the initial group probe point, where ngroups = 2^k. + const auto init = + detail::flat_map::SequentialLookupPolicy::initGroupProbe(hash, ngroups_pow_2); + const HashResult group_mask = init.group_mask; + HashResult curr_group = init.curr_group; std::uint8_t hash_8 = static_cast(hash); @@ -469,7 +469,7 @@ void FlatMap::insert(InputIt kv_begin, InputIt kv_end) else { // Move to next group. 
- curr_group = (curr_group + LookupPolicy {}.getNext(iteration)) % meta_group.size(); + curr_group = (curr_group + LookupPolicy {}.getNext(iteration)) & group_mask; iteration++; } } diff --git a/src/axom/core/detail/FlatTable.hpp b/src/axom/core/detail/FlatTable.hpp index dafef72b9f..c033ef7795 100644 --- a/src/axom/core/detail/FlatTable.hpp +++ b/src/axom/core/detail/FlatTable.hpp @@ -333,6 +333,21 @@ struct SequentialLookupPolicy : ProbePolicy { constexpr static int NO_MATCH = -1; + struct GroupProbeInit + { + HashType group_mask; + HashType curr_group; + }; + + AXOM_FORCE_INLINE AXOM_HOST_DEVICE static GroupProbeInit initGroupProbe(HashType hash, + int ngroups_pow_2) + { + const int bitshift_right = (CHAR_BIT * sizeof(HashType)) - ngroups_pow_2; + const HashType group_mask = (HashType {1} << ngroups_pow_2) - 1; + const HashType curr_group = (hash >> bitshift_right) & group_mask; + return {group_mask, curr_group}; + } + /*! * \brief Inserts a hash into the first empty bucket in an array of groups * for an open-addressing hash map. @@ -343,12 +358,10 @@ struct SequentialLookupPolicy : ProbePolicy */ IndexType probeEmptyIndex(int ngroups_pow_2, ArrayView metadata, HashType hash) const { - // We use the k MSBs of the hash as the initial group probe point, - // where ngroups = 2^k. Since the group count is always a power of two, - // wrapping a group index is a bitwise AND with this mask. - const int bitshift_right = ((CHAR_BIT * sizeof(HashType)) - ngroups_pow_2); - const HashType group_mask = (HashType {1} << ngroups_pow_2) - 1; - HashType curr_group = (hash >> bitshift_right) & group_mask; + // We use the k MSBs of the hash as the initial group probe point, where ngroups = 2^k. 
+ const auto init = initGroupProbe(hash, ngroups_pow_2); + const HashType group_mask = init.group_mask; + HashType curr_group = init.curr_group; int empty_group = NO_MATCH; int empty_bucket = NO_MATCH; @@ -405,9 +418,9 @@ struct SequentialLookupPolicy : ProbePolicy HashType hash, FoundIndex&& on_hash_found) const { - const int bitshift_right = ((CHAR_BIT * sizeof(HashType)) - ngroups_pow_2); - const HashType group_mask = (HashType {1} << ngroups_pow_2) - 1; - HashType curr_group = (hash >> bitshift_right) & group_mask; + const auto init = initGroupProbe(hash, ngroups_pow_2); + const HashType group_mask = init.group_mask; + HashType curr_group = init.curr_group; int empty_group = NO_MATCH; int empty_bucket = NO_MATCH; @@ -482,12 +495,10 @@ struct SequentialLookupPolicy : ProbePolicy HashType hash, FoundIndex&& on_hash_found) const { - // We use the k MSBs of the hash as the initial group probe point, - // where ngroups = 2^k. Since the group count is always a power of two, - // wrapping a group index is a bitwise AND with this mask. - const int bitshift_right = ((CHAR_BIT * sizeof(HashType)) - ngroups_pow_2); - const HashType group_mask = (HashType {1} << ngroups_pow_2) - 1; - HashType curr_group = (hash >> bitshift_right) & group_mask; + // We use the k MSBs of the hash as the initial group probe point, where ngroups = 2^k. + const auto init = initGroupProbe(hash, ngroups_pow_2); + const HashType group_mask = init.group_mask; + HashType curr_group = init.curr_group; std::uint8_t hash_8 = static_cast(hash); bool keep_going = true; From d409eced2bade2816998bfa34be91eeee813ef87 Mon Sep 17 00:00:00 2001 From: Kenneth Weiss Date: Thu, 11 Jun 2026 14:58:49 -0700 Subject: [PATCH 23/28] FlatMap: Improves documentation and testing of find_with_hash Also improves device hashing of floating point types (float and long double). 
--- src/axom/core/DeviceHash.hpp | 22 ++++++------- src/axom/core/FlatMap.hpp | 15 +++++++++ src/axom/core/detail/FlatTable.hpp | 4 +-- .../core/tests/core_benchmark_flatmap.cpp | 9 +++--- src/axom/core/tests/core_device_hash.hpp | 13 ++++++++ src/axom/core/tests/core_flatmap.hpp | 32 +++++++++++++++++++ 6 files changed, 77 insertions(+), 18 deletions(-) diff --git a/src/axom/core/DeviceHash.hpp b/src/axom/core/DeviceHash.hpp index d1ac31ced2..0dc1bb4caa 100644 --- a/src/axom/core/DeviceHash.hpp +++ b/src/axom/core/DeviceHash.hpp @@ -52,21 +52,19 @@ struct DeviceHashHelper::value>> // A float-to-integer value conversion collapses every key sharing an integer part, // e.g. all numbers between -1 and 1 converts to integer 0 - // NUM_WORDS is 1 for float or double, possibly 2 for long double - constexpr std::size_t NUM_WORDS = (sizeof(T) + sizeof(std::uint64_t) - 1) / sizeof(std::uint64_t); - // zero out words since we might only copy 4 bytes in for floats - std::uint64_t words[NUM_WORDS] = {0}; - memcpy(words, &value, sizeof(T)); - - std::uint64_t result = words[0]; - // Extra processing fortypes wider than 64 bits (long double). - // Use an odd multiplier (2^64/golden-ratio-phi), - // so the halves cannot cancel under a later XOR-style mixer - for(std::size_t i = 1; i < NUM_WORDS; i++) + if constexpr(sizeof(T) <= sizeof(std::uint64_t)) { - result = result * std::uint64_t {0x9e3779b97f4a7c15} + words[i]; + // Zero-initialize first since we only copy 4 bytes for floats. + std::uint64_t result = 0; + memcpy(&result, &value, sizeof(T)); + return result; } + // Avoid hashing padding bytes for wider floating types such as x86 long double. + // Collisions are acceptable for a hash; equal values must hash identically. 
+ double narrowed_value = static_cast(value); + std::uint64_t result = 0; + memcpy(&result, &narrowed_value, sizeof(narrowed_value)); return result; } }; diff --git a/src/axom/core/FlatMap.hpp b/src/axom/core/FlatMap.hpp index 8f92292ab4..be1958a9bf 100644 --- a/src/axom/core/FlatMap.hpp +++ b/src/axom/core/FlatMap.hpp @@ -292,6 +292,21 @@ class FlatMap : detail::flat_map::SequentialLookupPolicy(pairs); + const auto lookup_keys = make_lookup_order(keys, 0xBADC0DE5ULL); std::vector hashes; - hashes.reserve(keys.size()); - for(KeyType k : keys) + hashes.reserve(lookup_keys.size()); + for(KeyType k : lookup_keys) { hashes.push_back(typename MapType::hasher {}(k)); } @@ -466,9 +467,9 @@ void BM_FlatMap_Find_Hit_Prehashed(benchmark::State& state) for(auto _ : state) { ValueType sum = 0; - for(std::size_t i = 0; i < keys.size(); ++i) + for(std::size_t i = 0; i < lookup_keys.size(); ++i) { - auto it = map.find_with_hash(keys[i], hashes[i]); + auto it = map.find_with_hash(lookup_keys[i], hashes[i]); if(it != map.end()) { sum += it->second; diff --git a/src/axom/core/tests/core_device_hash.hpp b/src/axom/core/tests/core_device_hash.hpp index 6cab7708b2..dcdb278c37 100644 --- a/src/axom/core/tests/core_device_hash.hpp +++ b/src/axom/core/tests/core_device_hash.hpp @@ -14,6 +14,7 @@ // C++ includes #include #include +#include template class core_device_hash : public ::testing::Test @@ -427,3 +428,15 @@ TEST(core_device_hash, hash_float_bit_pattern) EXPECT_NE(double_hasher(1e300), double_hasher(2e300)); EXPECT_NE(float_hasher(-0.5f), float_hasher(0.5f)); } + +TEST(core_device_hash, hash_long_double_has_stable_equal_value_hash) +{ + axom::DeviceHash long_double_hasher; + + static_assert(std::is_same::result_type, std::uint64_t>::value, + "long double hash result must be std::uint64_t"); + + EXPECT_EQ(long_double_hasher(0.0L), long_double_hasher(-0.0L)); + EXPECT_EQ(long_double_hasher(0.25L), long_double_hasher(static_cast(0.25))); + EXPECT_NE(long_double_hasher(0.25L), 
long_double_hasher(0.75L)); +} diff --git a/src/axom/core/tests/core_flatmap.hpp b/src/axom/core/tests/core_flatmap.hpp index 021bcd741d..c0665ce21b 100644 --- a/src/axom/core/tests/core_flatmap.hpp +++ b/src/axom/core/tests/core_flatmap.hpp @@ -339,6 +339,38 @@ TEST(core_flatmap_unit, fused_emplace_probe_recomputes_slot_after_rehash) EXPECT_EQ(test_map.count(0), 0); } +TEST(core_flatmap_unit, find_with_hash_uses_precomputed_hash) +{ + using MapType = axom::FlatMap; + MapType test_map; + + const int NUM_ELEMS = 64; + for(int i = 0; i < NUM_ELEMS; i++) + { + test_map.insert_or_assign(i, i * 10); + } + + const int key = 37; + const auto hash = MapType::hasher {}(key); + + auto it = test_map.find_with_hash(key, hash); + ASSERT_NE(it, test_map.end()); + EXPECT_EQ(it->first, key); + EXPECT_EQ(it->second, key * 10); + + const MapType& const_map = test_map; + auto const_it = const_map.find_with_hash(key, hash); + ASSERT_NE(const_it, const_map.end()); + EXPECT_EQ(const_it->first, key); + EXPECT_EQ(const_it->second, key * 10); + + // A precomputed hash is part of the lookup key. Supplying a mismatched hash + // may miss an existing key; this guards the documented precondition. 
+ const auto mismatched_hash = hash ^ MapType::hash_result_type {0x80}; + EXPECT_EQ(test_map.find_with_hash(key, mismatched_hash), test_map.end()); + EXPECT_EQ(const_map.find_with_hash(key, mismatched_hash), const_map.end()); +} + AXOM_TYPED_TEST(core_flatmap, default_init) { using MapType = typename TestFixture::MapType; From 9b14075602dfade3bd9ed8b8569aa55a09247b88 Mon Sep 17 00:00:00 2001 From: Kenneth Weiss Date: Thu, 11 Jun 2026 16:07:54 -0700 Subject: [PATCH 24/28] Adds benchmarks for device construction and lookup --- .../core/tests/core_benchmark_flatmap.cpp | 291 +++++++++++++++++- 1 file changed, 287 insertions(+), 4 deletions(-) diff --git a/src/axom/core/tests/core_benchmark_flatmap.cpp b/src/axom/core/tests/core_benchmark_flatmap.cpp index ccd79a9408..edb8a12cdb 100644 --- a/src/axom/core/tests/core_benchmark_flatmap.cpp +++ b/src/axom/core/tests/core_benchmark_flatmap.cpp @@ -13,10 +13,6 @@ #include "axom/CLI11.hpp" #include "axom/fmt.hpp" -#include "axom/core/FlatMap.hpp" -#include "axom/core/FlatMapUtil.hpp" -#include "axom/core/detail/FlatTable.hpp" - #if defined(AXOM_USE_SPARSEHASH) #include "axom/sparsehash/sparse_hash_map" #endif @@ -553,6 +549,269 @@ void BM_BatchedInsert_Reserved(benchmark::State& state) } } +#if defined(AXOM_USE_RAJA) && defined(AXOM_USE_UMPIRE) + + // Device execution policy for benchmarking + #if defined(AXOM_USE_HIP) +using DeviceExec = axom::HIP_EXEC<256>; + #elif defined(AXOM_USE_CUDA) +using DeviceExec = axom::CUDA_EXEC<256>; + #endif + + #if defined(AXOM_USE_CUDA) || defined(AXOM_USE_HIP) + +/*! + * \brief Device/parallel benchmarks for FlatMap + * + * These benchmarks measure GPU kernel execution + data transfer overhead. + * Google Benchmark measures wall-clock time, which includes: + * - Host-to-device memory transfers + * - Kernel launch overhead + * - GPU execution time + * - Device-to-host synchronization + * + * For pure kernel performance, profile with nsys/rocprof separately. 
+ * These benchmarks characterize end-to-end device operation cost. + */ + +/*! + * \brief Simple device sanity check + * + * Minimal test to verify device operations work before trying FlatMap. + */ +void BM_Device_Sanity_Check(benchmark::State& state) +{ + const int n = state.range(0); + + const int device_allocator_id = axom::execution_space::allocatorID(); + if(device_allocator_id == axom::INVALID_ALLOCATOR_ID) + { + state.SkipWithError("Device allocator not available"); + return; + } + + // Allocate simple arrays on device + int* device_input = axom::allocate(n, device_allocator_id); + int* device_output = axom::allocate(n, device_allocator_id); + + // Initialize on host + std::vector host_data(n, 42); + axom::copy(device_input, host_data.data(), sizeof(int) * n); + + for(auto _ : state) + { + // Simple kernel: copy input to output + axom::for_all(n, [=] AXOM_HOST_DEVICE(int i) { + device_output[i] = device_input[i] + 1; + }); + + axom::synchronize(); + benchmark::DoNotOptimize(device_output); + } + + axom::deallocate(device_input); + axom::deallocate(device_output); +} + +/*! 
+ * \brief Benchmark parallel batched insertion on device + */ +void BM_FlatMap_Insert_Device_Reserved(benchmark::State& state) +{ + using MapType = axom::FlatMap; + + const int n = state.range(0); + const auto keys = make_shuffled_keys(n, 0x1CEB00DAULL); + const auto pairs = make_pairs(keys); + + // Check if device allocator is available + const int device_allocator_id = axom::execution_space::allocatorID(); + if(device_allocator_id == axom::INVALID_ALLOCATOR_ID) + { + state.SkipWithError("Device allocator not available"); + return; + } + + // Use axom::Array for host data + using PairType = std::pair; + axom::Array host_pairs(pairs.size(), pairs.size()); + std::copy(pairs.begin(), pairs.end(), host_pairs.data()); + + // Copy to device using axom::Array with device allocator + axom::Array device_pairs(pairs.size(), pairs.size(), device_allocator_id); + axom::copy(device_pairs.data(), host_pairs.data(), sizeof(PairType) * pairs.size()); + + const std::size_t bs = static_cast(std::max(1, ::args_batch_size)); + + // Get device-safe ArrayView and extract raw pointer for template instantiation + auto pairs_view = device_pairs.view(); + PairType* device_pairs_ptr = pairs_view.data(); + const std::size_t total_size = pairs.size(); + + for(auto _ : state) + { + // Create map with device allocator and reserve capacity + MapType map(axom::Allocator {device_allocator_id}); + map.reserve(static_cast(pairs.size())); + + // Benchmark parallel batched insertion using raw pointers from ArrayView + for(std::size_t offset = 0; offset < total_size; offset += bs) + { + const std::size_t count = std::min(bs, total_size - offset); + map.template insert(device_pairs_ptr + offset, device_pairs_ptr + offset + count); + } + + // Synchronize to ensure device operations complete + axom::synchronize(); + + benchmark::DoNotOptimize(map); + } +} + +/*! 
+ * \brief Benchmark parallel lookup on device + */ +void BM_FlatMap_Find_Hit_Device(benchmark::State& state) +{ + using MapType = axom::FlatMap; + + const int n = state.range(0); + const auto keys = make_shuffled_keys(n, 0xC0FFEEULL); + const auto pairs = make_pairs(keys); + const auto lookup_keys = make_lookup_order(keys, 0xBADC0DE5ULL); + + // Check if device allocator is available + const int device_allocator_id = axom::execution_space::allocatorID(); + if(device_allocator_id == axom::INVALID_ALLOCATOR_ID) + { + state.SkipWithError("Device allocator not available"); + return; + } + + // Use axom::Array for host data + using PairType = std::pair; + axom::Array host_pairs(pairs.size(), pairs.size()); + std::copy(pairs.begin(), pairs.end(), host_pairs.data()); + + // Copy to device using axom::Array with device allocator + axom::Array device_pairs(pairs.size(), pairs.size(), device_allocator_id); + axom::copy(device_pairs.data(), host_pairs.data(), sizeof(PairType) * pairs.size()); + + // Create and populate map on device + MapType map(axom::Allocator {device_allocator_id}); + map.reserve(static_cast(pairs.size())); + + // Use raw pointer from ArrayView for template instantiation + auto pairs_view = device_pairs.view(); + map.template insert(pairs_view.data(), pairs_view.data() + pairs_view.size()); + + // Copy lookup keys to device using axom::Array + axom::Array host_lookup_keys(lookup_keys.size(), lookup_keys.size()); + std::copy(lookup_keys.begin(), lookup_keys.end(), host_lookup_keys.data()); + + axom::Array device_lookup_keys(lookup_keys.size(), lookup_keys.size(), device_allocator_id); + axom::copy(device_lookup_keys.data(), host_lookup_keys.data(), sizeof(KeyType) * lookup_keys.size()); + + // Allocate result array on device using axom::Array + axom::Array device_results(lookup_keys.size(), lookup_keys.size(), device_allocator_id); + + // Get device-safe views for kernel capture + // Note: Using explicit [=] AXOM_HOST_DEVICE instead of AXOM_LAMBDA to avoid 
+ // RAJA privatizer issues on HIP with non-trivial types in capture + auto map_view = map.view(); + auto lookup_keys_view = device_lookup_keys.view(); + auto results_view = device_results.view(); + + for(auto _ : state) + { + // Perform lookups in parallel using ArrayViews + axom::for_all(static_cast(lookup_keys.size()), + [=] AXOM_HOST_DEVICE(axom::IndexType i) { + auto it = map_view.find(lookup_keys_view[i]); + results_view[i] = + (it != map_view.end()) ? it->second : ValueType {-1}; + }); + + // Synchronize to ensure device operations complete + axom::synchronize(); + + benchmark::DoNotOptimize(device_results.data()); + } +} + +/*! + * \brief Benchmark parallel lookup misses on device + */ +void BM_FlatMap_Find_Miss_Device(benchmark::State& state) +{ + using MapType = axom::FlatMap; + + const int n = state.range(0); + const auto keys = make_shuffled_keys(n, 0xC0FFEEULL); + const auto pairs = make_pairs(keys); + const auto miss_keys = make_miss_keys(keys, static_cast(n) + 11); + + // Check if device allocator is available + const int device_allocator_id = axom::execution_space::allocatorID(); + if(device_allocator_id == axom::INVALID_ALLOCATOR_ID) + { + state.SkipWithError("Device allocator not available"); + return; + } + + // Use axom::Array for host data + using PairType = std::pair; + axom::Array host_pairs(pairs.size(), pairs.size()); + std::copy(pairs.begin(), pairs.end(), host_pairs.data()); + + // Copy to device using axom::Array with device allocator + axom::Array device_pairs(pairs.size(), pairs.size(), device_allocator_id); + axom::copy(device_pairs.data(), host_pairs.data(), sizeof(PairType) * pairs.size()); + + // Create and populate map on device + MapType map(axom::Allocator {device_allocator_id}); + map.reserve(static_cast(pairs.size())); + + // Use raw pointer from ArrayView for template instantiation + auto pairs_view = device_pairs.view(); + map.template insert(pairs_view.data(), pairs_view.data() + pairs_view.size()); + + // Copy miss keys to 
device using axom::Array + axom::Array host_miss_keys(miss_keys.size(), miss_keys.size()); + std::copy(miss_keys.begin(), miss_keys.end(), host_miss_keys.data()); + + axom::Array device_miss_keys(miss_keys.size(), miss_keys.size(), device_allocator_id); + axom::copy(device_miss_keys.data(), host_miss_keys.data(), sizeof(KeyType) * miss_keys.size()); + + // Allocate result array on device using axom::Array + axom::Array device_misses(miss_keys.size(), miss_keys.size(), device_allocator_id); + + // Get device-safe views for kernel capture + // Note: Using explicit [=] AXOM_HOST_DEVICE instead of AXOM_LAMBDA to avoid + // RAJA privatizer issues on HIP with non-trivial types in capture + auto map_view = map.view(); + auto miss_keys_view = device_miss_keys.view(); + auto misses_view = device_misses.view(); + + for(auto _ : state) + { + // Perform lookups in parallel using ArrayViews + axom::for_all(static_cast(miss_keys.size()), + [=] AXOM_HOST_DEVICE(axom::IndexType i) { + misses_view[i] = + (map_view.find(miss_keys_view[i]) == map_view.end()) ? 
1 : 0; + }); + + // Synchronize to ensure device operations complete + axom::synchronize(); + + benchmark::DoNotOptimize(device_misses.data()); + } +} + + #endif // AXOM_USE_CUDA || AXOM_USE_HIP +#endif // AXOM_USE_RAJA && AXOM_USE_UMPIRE + } // namespace //----------------------------------------------------------------------------- @@ -725,6 +984,30 @@ int main(int argc, char* argv[]) "axom::google::sparse_hash_map"); #endif + // Device/parallel benchmarks for debugging +#if defined(AXOM_USE_RAJA) && defined(AXOM_USE_UMPIRE) && \ + (defined(AXOM_USE_CUDA) || defined(AXOM_USE_HIP)) + + // Device benchmarks enabled with raw pointers (iterators cause host stack address faults) + benchmark::RegisterBenchmark("Device::sanity_check", &BM_Device_Sanity_Check)->Apply(CustomArgs); + + if((::args_benchmark_features & FlatMapFeatureBenchmarks::BatchedInsertion) != + FlatMapFeatureBenchmarks::None) + { + benchmark::RegisterBenchmark("axom::FlatMap::insert_device_reserved", + &BM_FlatMap_Insert_Device_Reserved) + ->Apply(CustomArgs); + } + + if((::args_benchmark_features & FlatMapFeatureBenchmarks::Lookup) != FlatMapFeatureBenchmarks::None) + { + benchmark::RegisterBenchmark("axom::FlatMap::find_hit_device", &BM_FlatMap_Find_Hit_Device) + ->Apply(CustomArgs); + benchmark::RegisterBenchmark("axom::FlatMap::find_miss_device", &BM_FlatMap_Find_Miss_Device) + ->Apply(CustomArgs); + } +#endif // AXOM_USE_RAJA && AXOM_USE_UMPIRE && (AXOM_USE_CUDA || AXOM_USE_HIP) + ::benchmark::RunSpecifiedBenchmarks(); return 0; } From 489d9ff20694823c614918bc78f8c17324809b7d Mon Sep 17 00:00:00 2001 From: Kenneth Weiss Date: Thu, 11 Jun 2026 16:46:07 -0700 Subject: [PATCH 25/28] FlatMap: Generalizes the device benchmarks to other execution spaces, including omp --- src/axom/core/tests/CMakeLists.txt | 8 +- .../core/tests/core_benchmark_flatmap.cpp | 307 ++++++++---------- 2 files changed, 139 insertions(+), 176 deletions(-) diff --git a/src/axom/core/tests/CMakeLists.txt 
b/src/axom/core/tests/CMakeLists.txt index b30ebb25fc..7322f589f6 100644 --- a/src/axom/core/tests/CMakeLists.txt +++ b/src/axom/core/tests/CMakeLists.txt @@ -218,6 +218,11 @@ if (ENABLE_BENCHMARKS) foreach(test ${core_benchmarks}) get_filename_component(test_name ${test} NAME_WE) + set(_num_threads) + if(test STREQUAL "core_benchmark_flatmap.cpp" AND AXOM_ENABLE_OPENMP) + set(_num_threads ${AXOM_TEST_NUM_OMP_THREADS}) + endif() + axom_add_executable(NAME ${test_name} SOURCES ${test} OUTPUT_DIR ${TEST_OUTPUT_DIRECTORY} @@ -225,6 +230,7 @@ if (ENABLE_BENCHMARKS) FOLDER axom/core/benchmarks) blt_add_benchmark(NAME ${test_name} - COMMAND ${test_name} --benchmark_min_time=0.0001s) + COMMAND ${test_name} --benchmark_min_time=0.0001s + NUM_OMP_THREADS ${_num_threads}) endforeach() endif() diff --git a/src/axom/core/tests/core_benchmark_flatmap.cpp b/src/axom/core/tests/core_benchmark_flatmap.cpp index edb8a12cdb..c99dc61e20 100644 --- a/src/axom/core/tests/core_benchmark_flatmap.cpp +++ b/src/axom/core/tests/core_benchmark_flatmap.cpp @@ -549,269 +549,211 @@ void BM_BatchedInsert_Reserved(benchmark::State& state) } } -#if defined(AXOM_USE_RAJA) && defined(AXOM_USE_UMPIRE) - - // Device execution policy for benchmarking - #if defined(AXOM_USE_HIP) -using DeviceExec = axom::HIP_EXEC<256>; - #elif defined(AXOM_USE_CUDA) -using DeviceExec = axom::CUDA_EXEC<256>; - #endif - - #if defined(AXOM_USE_CUDA) || defined(AXOM_USE_HIP) - /*! - * \brief Device/parallel benchmarks for FlatMap + * \brief Execution-space benchmarks for FlatMap. * - * These benchmarks measure GPU kernel execution + data transfer overhead. - * Google Benchmark measures wall-clock time, which includes: - * - Host-to-device memory transfers - * - Kernel launch overhead - * - GPU execution time - * - Device-to-host synchronization - * - * For pure kernel performance, profile with nsys/rocprof separately. - * These benchmarks characterize end-to-end device operation cost. 
+ * These benchmarks measure execution-space operation cost. Host-to-exec-space + * data setup is outside the timed loop; kernel launch/execution and + * synchronization are included. */ -/*! - * \brief Simple device sanity check - * - * Minimal test to verify device operations work before trying FlatMap. - */ -void BM_Device_Sanity_Check(benchmark::State& state) +template +bool get_allocator_or_skip(benchmark::State& state, int& allocator_id) +{ + allocator_id = axom::execution_space::allocatorID(); + if(allocator_id == axom::INVALID_ALLOCATOR_ID) + { + state.SkipWithError("Execution-space allocator not available"); + return false; + } + + return true; +} + +template +axom::Array copy_to_allocator(const std::vector& values, int allocator_id) +{ + axom::Array copied_values(values.size(), values.size(), allocator_id); + if(!values.empty()) + { + axom::copy(copied_values.data(), values.data(), sizeof(T) * values.size()); + } + return copied_values; +} + +template +void BM_ExecSpace_Sanity_Check(benchmark::State& state) { const int n = state.range(0); - const int device_allocator_id = axom::execution_space::allocatorID(); - if(device_allocator_id == axom::INVALID_ALLOCATOR_ID) + int allocator_id = axom::INVALID_ALLOCATOR_ID; + if(!get_allocator_or_skip(state, allocator_id)) { - state.SkipWithError("Device allocator not available"); return; } - // Allocate simple arrays on device - int* device_input = axom::allocate(n, device_allocator_id); - int* device_output = axom::allocate(n, device_allocator_id); + int* input = axom::allocate(n, allocator_id); + int* output = axom::allocate(n, allocator_id); - // Initialize on host std::vector host_data(n, 42); - axom::copy(device_input, host_data.data(), sizeof(int) * n); + axom::copy(input, host_data.data(), sizeof(int) * n); for(auto _ : state) { - // Simple kernel: copy input to output - axom::for_all(n, [=] AXOM_HOST_DEVICE(int i) { - device_output[i] = device_input[i] + 1; - }); + axom::for_all(n, [=] AXOM_HOST_DEVICE(int 
i) { output[i] = input[i] + 1; }); - axom::synchronize(); - benchmark::DoNotOptimize(device_output); + axom::synchronize(); + benchmark::DoNotOptimize(output); } - axom::deallocate(device_input); - axom::deallocate(device_output); + axom::deallocate(input); + axom::deallocate(output); } /*! - * \brief Benchmark parallel batched insertion on device + * \brief Benchmark parallel batched insertion using an execution space. */ -void BM_FlatMap_Insert_Device_Reserved(benchmark::State& state) +template +void BM_FlatMap_Insert_ExecSpace_Reserved(benchmark::State& state) { using MapType = axom::FlatMap; + using PairType = std::pair; const int n = state.range(0); const auto keys = make_shuffled_keys(n, 0x1CEB00DAULL); const auto pairs = make_pairs(keys); - // Check if device allocator is available - const int device_allocator_id = axom::execution_space::allocatorID(); - if(device_allocator_id == axom::INVALID_ALLOCATOR_ID) + int allocator_id = axom::INVALID_ALLOCATOR_ID; + if(!get_allocator_or_skip(state, allocator_id)) { - state.SkipWithError("Device allocator not available"); return; } - // Use axom::Array for host data - using PairType = std::pair; - axom::Array host_pairs(pairs.size(), pairs.size()); - std::copy(pairs.begin(), pairs.end(), host_pairs.data()); - - // Copy to device using axom::Array with device allocator - axom::Array device_pairs(pairs.size(), pairs.size(), device_allocator_id); - axom::copy(device_pairs.data(), host_pairs.data(), sizeof(PairType) * pairs.size()); - const std::size_t bs = static_cast(std::max(1, ::args_batch_size)); - - // Get device-safe ArrayView and extract raw pointer for template instantiation - auto pairs_view = device_pairs.view(); - PairType* device_pairs_ptr = pairs_view.data(); + axom::Array exec_pairs = copy_to_allocator(pairs, allocator_id); + auto pairs_view = exec_pairs.view(); + PairType* pairs_ptr = pairs_view.data(); const std::size_t total_size = pairs.size(); for(auto _ : state) { - // Create map with device allocator 
and reserve capacity - MapType map(axom::Allocator {device_allocator_id}); + MapType map(axom::Allocator {allocator_id}); map.reserve(static_cast(pairs.size())); - // Benchmark parallel batched insertion using raw pointers from ArrayView for(std::size_t offset = 0; offset < total_size; offset += bs) { const std::size_t count = std::min(bs, total_size - offset); - map.template insert(device_pairs_ptr + offset, device_pairs_ptr + offset + count); + map.template insert(pairs_ptr + offset, pairs_ptr + offset + count); } - // Synchronize to ensure device operations complete - axom::synchronize(); + axom::synchronize(); benchmark::DoNotOptimize(map); } } /*! - * \brief Benchmark parallel lookup on device + * \brief Benchmark parallel successful lookup using an execution space. */ -void BM_FlatMap_Find_Hit_Device(benchmark::State& state) +template +void BM_FlatMap_Find_Hit_ExecSpace(benchmark::State& state) { using MapType = axom::FlatMap; + using PairType = std::pair; const int n = state.range(0); const auto keys = make_shuffled_keys(n, 0xC0FFEEULL); const auto pairs = make_pairs(keys); const auto lookup_keys = make_lookup_order(keys, 0xBADC0DE5ULL); - // Check if device allocator is available - const int device_allocator_id = axom::execution_space::allocatorID(); - if(device_allocator_id == axom::INVALID_ALLOCATOR_ID) + int allocator_id = axom::INVALID_ALLOCATOR_ID; + if(!get_allocator_or_skip(state, allocator_id)) { - state.SkipWithError("Device allocator not available"); return; } - // Use axom::Array for host data - using PairType = std::pair; - axom::Array host_pairs(pairs.size(), pairs.size()); - std::copy(pairs.begin(), pairs.end(), host_pairs.data()); - - // Copy to device using axom::Array with device allocator - axom::Array device_pairs(pairs.size(), pairs.size(), device_allocator_id); - axom::copy(device_pairs.data(), host_pairs.data(), sizeof(PairType) * pairs.size()); - - // Create and populate map on device - MapType map(axom::Allocator 
{device_allocator_id}); + axom::Array exec_pairs = copy_to_allocator(pairs, allocator_id); + MapType map(axom::Allocator {allocator_id}); map.reserve(static_cast(pairs.size())); - // Use raw pointer from ArrayView for template instantiation - auto pairs_view = device_pairs.view(); - map.template insert(pairs_view.data(), pairs_view.data() + pairs_view.size()); - - // Copy lookup keys to device using axom::Array - axom::Array host_lookup_keys(lookup_keys.size(), lookup_keys.size()); - std::copy(lookup_keys.begin(), lookup_keys.end(), host_lookup_keys.data()); + auto pairs_view = exec_pairs.view(); + map.template insert(pairs_view.data(), pairs_view.data() + pairs_view.size()); + axom::synchronize(); - axom::Array device_lookup_keys(lookup_keys.size(), lookup_keys.size(), device_allocator_id); - axom::copy(device_lookup_keys.data(), host_lookup_keys.data(), sizeof(KeyType) * lookup_keys.size()); + axom::Array exec_lookup_keys = copy_to_allocator(lookup_keys, allocator_id); + axom::Array exec_results(lookup_keys.size(), lookup_keys.size(), allocator_id); - // Allocate result array on device using axom::Array - axom::Array device_results(lookup_keys.size(), lookup_keys.size(), device_allocator_id); - - // Get device-safe views for kernel capture // Note: Using explicit [=] AXOM_HOST_DEVICE instead of AXOM_LAMBDA to avoid // RAJA privatizer issues on HIP with non-trivial types in capture auto map_view = map.view(); - auto lookup_keys_view = device_lookup_keys.view(); - auto results_view = device_results.view(); + auto lookup_keys_view = exec_lookup_keys.view(); + auto results_view = exec_results.view(); for(auto _ : state) { - // Perform lookups in parallel using ArrayViews - axom::for_all(static_cast(lookup_keys.size()), - [=] AXOM_HOST_DEVICE(axom::IndexType i) { - auto it = map_view.find(lookup_keys_view[i]); - results_view[i] = - (it != map_view.end()) ? 
it->second : ValueType {-1}; - }); + axom::for_all(static_cast(lookup_keys.size()), + [=] AXOM_HOST_DEVICE(axom::IndexType i) { + auto it = map_view.find(lookup_keys_view[i]); + results_view[i] = (it != map_view.end()) ? it->second : ValueType {-1}; + }); - // Synchronize to ensure device operations complete - axom::synchronize(); + axom::synchronize(); - benchmark::DoNotOptimize(device_results.data()); + benchmark::DoNotOptimize(exec_results.data()); } } /*! - * \brief Benchmark parallel lookup misses on device + * \brief Benchmark parallel missed lookup using an execution space. */ -void BM_FlatMap_Find_Miss_Device(benchmark::State& state) +template +void BM_FlatMap_Find_Miss_ExecSpace(benchmark::State& state) { using MapType = axom::FlatMap; + using PairType = std::pair; const int n = state.range(0); const auto keys = make_shuffled_keys(n, 0xC0FFEEULL); const auto pairs = make_pairs(keys); const auto miss_keys = make_miss_keys(keys, static_cast(n) + 11); - // Check if device allocator is available - const int device_allocator_id = axom::execution_space::allocatorID(); - if(device_allocator_id == axom::INVALID_ALLOCATOR_ID) + int allocator_id = axom::INVALID_ALLOCATOR_ID; + if(!get_allocator_or_skip(state, allocator_id)) { - state.SkipWithError("Device allocator not available"); return; } - // Use axom::Array for host data - using PairType = std::pair; - axom::Array host_pairs(pairs.size(), pairs.size()); - std::copy(pairs.begin(), pairs.end(), host_pairs.data()); - - // Copy to device using axom::Array with device allocator - axom::Array device_pairs(pairs.size(), pairs.size(), device_allocator_id); - axom::copy(device_pairs.data(), host_pairs.data(), sizeof(PairType) * pairs.size()); - - // Create and populate map on device - MapType map(axom::Allocator {device_allocator_id}); + axom::Array exec_pairs = copy_to_allocator(pairs, allocator_id); + MapType map(axom::Allocator {allocator_id}); map.reserve(static_cast(pairs.size())); - // Use raw pointer from 
ArrayView for template instantiation - auto pairs_view = device_pairs.view(); - map.template insert(pairs_view.data(), pairs_view.data() + pairs_view.size()); - - // Copy miss keys to device using axom::Array - axom::Array host_miss_keys(miss_keys.size(), miss_keys.size()); - std::copy(miss_keys.begin(), miss_keys.end(), host_miss_keys.data()); + auto pairs_view = exec_pairs.view(); + map.template insert(pairs_view.data(), pairs_view.data() + pairs_view.size()); + axom::synchronize(); - axom::Array device_miss_keys(miss_keys.size(), miss_keys.size(), device_allocator_id); - axom::copy(device_miss_keys.data(), host_miss_keys.data(), sizeof(KeyType) * miss_keys.size()); + axom::Array exec_miss_keys = copy_to_allocator(miss_keys, allocator_id); + axom::Array exec_misses(miss_keys.size(), miss_keys.size(), allocator_id); - // Allocate result array on device using axom::Array - axom::Array device_misses(miss_keys.size(), miss_keys.size(), device_allocator_id); - - // Get device-safe views for kernel capture // Note: Using explicit [=] AXOM_HOST_DEVICE instead of AXOM_LAMBDA to avoid // RAJA privatizer issues on HIP with non-trivial types in capture auto map_view = map.view(); - auto miss_keys_view = device_miss_keys.view(); - auto misses_view = device_misses.view(); + auto miss_keys_view = exec_miss_keys.view(); + auto misses_view = exec_misses.view(); for(auto _ : state) { - // Perform lookups in parallel using ArrayViews - axom::for_all(static_cast(miss_keys.size()), - [=] AXOM_HOST_DEVICE(axom::IndexType i) { - misses_view[i] = - (map_view.find(miss_keys_view[i]) == map_view.end()) ? 1 : 0; - }); + axom::for_all(static_cast(miss_keys.size()), + [=] AXOM_HOST_DEVICE(axom::IndexType i) { + misses_view[i] = + (map_view.find(miss_keys_view[i]) == map_view.end()) ? 
1 : 0; + }); - // Synchronize to ensure device operations complete - axom::synchronize(); + axom::synchronize(); - benchmark::DoNotOptimize(device_misses.data()); + benchmark::DoNotOptimize(exec_misses.data()); } } - #endif // AXOM_USE_CUDA || AXOM_USE_HIP -#endif // AXOM_USE_RAJA && AXOM_USE_UMPIRE - } // namespace //----------------------------------------------------------------------------- @@ -863,6 +805,33 @@ void RegisterFlatMapPrehashedBenchmarks() } } +template +void RegisterFlatMapExecSpaceBenchmarks(const std::string& exec_suffix, + const std::string& sanity_prefix) +{ + benchmark::RegisterBenchmark(axom::fmt::format("{}::sanity_check", sanity_prefix), + &BM_ExecSpace_Sanity_Check) + ->Apply(CustomArgs); + + if((::args_benchmark_features & FlatMapFeatureBenchmarks::BatchedInsertion) != + FlatMapFeatureBenchmarks::None) + { + benchmark::RegisterBenchmark(axom::fmt::format("axom::FlatMap::insert_{}_reserved", exec_suffix), + &BM_FlatMap_Insert_ExecSpace_Reserved) + ->Apply(CustomArgs); + } + + if((::args_benchmark_features & FlatMapFeatureBenchmarks::Lookup) != FlatMapFeatureBenchmarks::None) + { + benchmark::RegisterBenchmark(axom::fmt::format("axom::FlatMap::find_hit_{}", exec_suffix), + &BM_FlatMap_Find_Hit_ExecSpace) + ->Apply(CustomArgs); + benchmark::RegisterBenchmark(axom::fmt::format("axom::FlatMap::find_miss_{}", exec_suffix), + &BM_FlatMap_Find_Miss_ExecSpace) + ->Apply(CustomArgs); + } +} + int main(int argc, char* argv[]) { std::vector local_test_sizes; @@ -984,29 +953,17 @@ int main(int argc, char* argv[]) "axom::google::sparse_hash_map"); #endif - // Device/parallel benchmarks for debugging -#if defined(AXOM_USE_RAJA) && defined(AXOM_USE_UMPIRE) && \ - (defined(AXOM_USE_CUDA) || defined(AXOM_USE_HIP)) - - // Device benchmarks enabled with raw pointers (iterators cause host stack address faults) - benchmark::RegisterBenchmark("Device::sanity_check", &BM_Device_Sanity_Check)->Apply(CustomArgs); + RegisterFlatMapExecSpaceBenchmarks("seq", 
"SEQ"); - if((::args_benchmark_features & FlatMapFeatureBenchmarks::BatchedInsertion) != - FlatMapFeatureBenchmarks::None) - { - benchmark::RegisterBenchmark("axom::FlatMap::insert_device_reserved", - &BM_FlatMap_Insert_Device_Reserved) - ->Apply(CustomArgs); - } +#if defined(AXOM_USE_OPENMP) && defined(AXOM_USE_RAJA) + RegisterFlatMapExecSpaceBenchmarks("omp", "OMP"); +#endif - if((::args_benchmark_features & FlatMapFeatureBenchmarks::Lookup) != FlatMapFeatureBenchmarks::None) - { - benchmark::RegisterBenchmark("axom::FlatMap::find_hit_device", &BM_FlatMap_Find_Hit_Device) - ->Apply(CustomArgs); - benchmark::RegisterBenchmark("axom::FlatMap::find_miss_device", &BM_FlatMap_Find_Miss_Device) - ->Apply(CustomArgs); - } -#endif // AXOM_USE_RAJA && AXOM_USE_UMPIRE && (AXOM_USE_CUDA || AXOM_USE_HIP) +#if defined(AXOM_USE_RAJA) && defined(AXOM_USE_UMPIRE) && defined(AXOM_USE_HIP) + RegisterFlatMapExecSpaceBenchmarks>("device", "Device"); +#elif defined(AXOM_USE_RAJA) && defined(AXOM_USE_UMPIRE) && defined(AXOM_USE_CUDA) + RegisterFlatMapExecSpaceBenchmarks>("device", "Device"); +#endif ::benchmark::RunSpecifiedBenchmarks(); return 0; From fd31c5e5d63f7d0bad89ed2b6506abb1582ede7f Mon Sep 17 00:00:00 2001 From: Kenneth Weiss Date: Thu, 11 Jun 2026 17:01:01 -0700 Subject: [PATCH 26/28] Add number of threads to omp benchmarks --- .../core/tests/core_benchmark_flatmap.cpp | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/src/axom/core/tests/core_benchmark_flatmap.cpp b/src/axom/core/tests/core_benchmark_flatmap.cpp index c99dc61e20..35119b1a7a 100644 --- a/src/axom/core/tests/core_benchmark_flatmap.cpp +++ b/src/axom/core/tests/core_benchmark_flatmap.cpp @@ -13,6 +13,10 @@ #include "axom/CLI11.hpp" #include "axom/fmt.hpp" +#if defined(AXOM_USE_OPENMP) + #include +#endif + #if defined(AXOM_USE_SPARSEHASH) #include "axom/sparsehash/sparse_hash_map" #endif @@ -805,6 +809,18 @@ void RegisterFlatMapPrehashedBenchmarks() } } +#if 
defined(AXOM_USE_OPENMP) && defined(AXOM_USE_RAJA) +std::string make_openmp_exec_suffix() +{ + return axom::fmt::format("omp_{}t", omp_get_max_threads()); +} + +std::string make_openmp_sanity_prefix() +{ + return axom::fmt::format("OMP_{}t", omp_get_max_threads()); +} +#endif + template void RegisterFlatMapExecSpaceBenchmarks(const std::string& exec_suffix, const std::string& sanity_prefix) @@ -956,7 +972,8 @@ int main(int argc, char* argv[]) RegisterFlatMapExecSpaceBenchmarks("seq", "SEQ"); #if defined(AXOM_USE_OPENMP) && defined(AXOM_USE_RAJA) - RegisterFlatMapExecSpaceBenchmarks("omp", "OMP"); + RegisterFlatMapExecSpaceBenchmarks(make_openmp_exec_suffix(), + make_openmp_sanity_prefix()); #endif #if defined(AXOM_USE_RAJA) && defined(AXOM_USE_UMPIRE) && defined(AXOM_USE_HIP) From a85db8211547ab4724d1f8c916ddd89f0e223457 Mon Sep 17 00:00:00 2001 From: Kenneth Weiss Date: Thu, 11 Jun 2026 17:05:21 -0700 Subject: [PATCH 27/28] Updates RELEASE-NOTES --- RELEASE-NOTES.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md index a4bd67cc64..059766fcb2 100644 --- a/RELEASE-NOTES.md +++ b/RELEASE-NOTES.md @@ -59,6 +59,8 @@ The Axom project release numbers follow [Semantic Versioning](http://semver.org/ - Primal: Improves reproducibility of 3D GWN methods by removing some sources of randomness - Core: ArrayView assigments/copies now copy the stride - Core: Array construction from strided ArrayView now correctly copies the strided elements +- Core: Improved `axom::FlatMap` insertion performance by fusing duplicate-key lookup with empty-slot probing. +- Core: Updated DeviceHash to use 64-bit hash results and improved coverage for integer and floating-point hashing. 
## [Version 0.14.0] - Release date 2026-03-31 From d8bb8e9d3ae99fddfa806581d6257aae8e31b5c8 Mon Sep 17 00:00:00 2001 From: Kenneth Weiss Date: Fri, 12 Jun 2026 18:55:32 -0700 Subject: [PATCH 28/28] Bugfix for rzvector -- `if constexpr` needs an `else` --- src/CMakeLists.txt | 2 +- src/axom/core/FlatMapUtil.hpp | 393 +++++++++++++++++----------------- 2 files changed, 198 insertions(+), 197 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 362e28914d..08eabf44f7 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -42,7 +42,7 @@ else() endif() endif() -if (“${PROJECT_SOURCE_DIR}” STREQUAL “${CMAKE_SOURCE_DIR}”) +if ("${PROJECT_SOURCE_DIR}" STREQUAL "${CMAKE_SOURCE_DIR}") # Set some default BLT options before loading BLT only if not included in # another project if (NOT BLT_CXX_STD) diff --git a/src/axom/core/FlatMapUtil.hpp b/src/axom/core/FlatMapUtil.hpp index da0b363631..24cb9e38d6 100644 --- a/src/axom/core/FlatMapUtil.hpp +++ b/src/axom/core/FlatMapUtil.hpp @@ -285,233 +285,234 @@ void FlatMap::insert(InputIt kv_begin, InputIt kv_end) this->insert_or_assign(std::forward(kv).first, std::forward(kv).second); } - return; } - - using HashResult = typename Hash::result_type; - using GroupBucket = detail::flat_map::GroupBucket; - - IndexType num_elems = std::distance(kv_begin, kv_end); - - // Batched insertion assumes probing sequences are gap-free - // (i.e., there are no tombstones from prior erase() operations). - // When tombstones exist, the parallel insertion logic can mishandle duplicates - // under contention (e.g. OpenMP) and produce incorrect size/value results. - // - // If tombstones exist, rehash to compact the table and restore the invariants required by this algorithm. 
- if(this->m_loadCount != static_cast(this->m_size)) + else { - this->rehash(this->m_size + num_elems); - } + using HashResult = typename Hash::result_type; + using GroupBucket = detail::flat_map::GroupBucket; + + IndexType num_elems = std::distance(kv_begin, kv_end); + + // Batched insertion assumes probing sequences are gap-free + // (i.e., there are no tombstones from prior erase() operations). + // When tombstones exist, the parallel insertion logic can mishandle duplicates + // under contention (e.g. OpenMP) and produce incorrect size/value results. + // + // If tombstones exist, rehash to compact the table and restore the invariants required by this algorithm. + if(this->m_loadCount != static_cast(this->m_size)) + { + this->rehash(this->m_size + num_elems); + } - const bool is_gap_free = (this->m_loadCount == static_cast(this->m_size)); + const bool is_gap_free = (this->m_loadCount == static_cast(this->m_size)); - // Assume that all elements will be inserted into an empty slot. - this->reserve(this->size() + num_elems); + // Assume that all elements will be inserted into an empty slot. + this->reserve(this->size() + num_elems); - FlatMap temp; - bool allocate_temp_map = false; + FlatMap temp; + bool allocate_temp_map = false; #if defined(AXOM_USE_CUDA) && defined(AXOM_USE_UMPIRE) - if(this->m_allocator.getSpace() == MemorySpace::Pinned) - { - // Pinned memory is allocated on the CPU, and is not always coherent with respect to the GPU. - // Instead of using system-scope atomics, we just construct a temporary map in device memory - // and copy it back to the pinned space. - axom::Allocator device_allocator {axom::detail::getAllocatorID()}; - temp = FlatMap(*this, device_allocator); - allocate_temp_map = true; - } + if(this->m_allocator.getSpace() == MemorySpace::Pinned) + { + // Pinned memory is allocated on the CPU, and is not always coherent with respect to the GPU. 
+ // Instead of using system-scope atomics, we just construct a temporary map in device memory + // and copy it back to the pinned space. + axom::Allocator device_allocator {axom::detail::getAllocatorID()}; + temp = FlatMap(*this, device_allocator); + allocate_temp_map = true; + } #endif - FlatMap& map = allocate_temp_map ? temp : *this; - - // Grab some needed internal fields from the flat map. - // We're going to be constructing metadata and the K-V pairs directly - // in-place. - const int ngroups_pow_2 = map.m_numGroups2; - const auto meta_group = map.m_metadata.view(); - const auto buckets = map.m_buckets.view(); - - // Construct an array of locks per-group. This guards metadata updates for - // each insertion. - const IndexType num_groups = 1 << ngroups_pow_2; - Array lock_vec(num_groups, num_groups, map.m_allocator.getID()); - const auto group_locks = lock_vec.view(); - - // Map bucket slots to k-v pair indices. This is used to deduplicate pairs - // with the same key value. - Array key_index_dedup_vec(0, 0, map.m_allocator.getID()); - key_index_dedup_vec.resize(num_groups * GroupBucket::Size, -1); - const auto key_index_dedup = key_index_dedup_vec.view(); - - // Map k-v pair indices to bucket slots. This is essentially the inverse of - // the above mapping. - Array key_index_to_bucket_vec(num_elems, num_elems, map.m_allocator.getID()); - const auto key_index_to_bucket = key_index_to_bucket_vec.view(); - - axom::ReduceSum total_overwrites(0); - - for_all( - num_elems, - AXOM_LAMBDA(IndexType idx) { - // Construct key. - KeyType key = (*(kv_begin + idx)).first; - - // Hash keys. - auto hash = Hash {}(key); - - // We use the k MSBs of the hash as the initial group probe point, where ngroups = 2^k. 
- const auto init = - detail::flat_map::SequentialLookupPolicy::initGroupProbe(hash, ngroups_pow_2); - const HashResult group_mask = init.group_mask; - HashResult curr_group = init.curr_group; - - std::uint8_t hash_8 = static_cast(hash); - - IndexType duplicate_bucket_index = -1; - IndexType empty_bucket_index = -1; - int iteration = 0; - while(iteration < meta_group.size()) - { - // Try to lock the group. We do this in a non-blocking manner to avoid - // intra-warp progress hazards. - bool group_locked = group_locks[curr_group].tryLock(); - - if(group_locked) + FlatMap& map = allocate_temp_map ? temp : *this; + + // Grab some needed internal fields from the flat map. + // We're going to be constructing metadata and the K-V pairs directly + // in-place. + const int ngroups_pow_2 = map.m_numGroups2; + const auto meta_group = map.m_metadata.view(); + const auto buckets = map.m_buckets.view(); + + // Construct an array of locks per-group. This guards metadata updates for + // each insertion. + const IndexType num_groups = 1 << ngroups_pow_2; + Array lock_vec(num_groups, num_groups, map.m_allocator.getID()); + const auto group_locks = lock_vec.view(); + + // Map bucket slots to k-v pair indices. This is used to deduplicate pairs + // with the same key value. + Array key_index_dedup_vec(0, 0, map.m_allocator.getID()); + key_index_dedup_vec.resize(num_groups * GroupBucket::Size, -1); + const auto key_index_dedup = key_index_dedup_vec.view(); + + // Map k-v pair indices to bucket slots. This is essentially the inverse of + // the above mapping. + Array key_index_to_bucket_vec(num_elems, num_elems, map.m_allocator.getID()); + const auto key_index_to_bucket = key_index_to_bucket_vec.view(); + + axom::ReduceSum total_overwrites(0); + + for_all( + num_elems, + AXOM_LAMBDA(IndexType idx) { + // Construct key. + KeyType key = (*(kv_begin + idx)).first; + + // Hash keys. 
+ auto hash = Hash {}(key); + + // We use the k MSBs of the hash as the initial group probe point, where ngroups = 2^k. + const auto init = + detail::flat_map::SequentialLookupPolicy::initGroupProbe(hash, ngroups_pow_2); + const HashResult group_mask = init.group_mask; + HashResult curr_group = init.curr_group; + + std::uint8_t hash_8 = static_cast(hash); + + IndexType duplicate_bucket_index = -1; + IndexType empty_bucket_index = -1; + int iteration = 0; + while(iteration < meta_group.size()) { - // Every bucket visit - check prior filled buckets for duplicate - // keys. - meta_group[curr_group].visitHashBucket(hash_8, [&](int matching_slot) -> bool { - IndexType bucket_index = curr_group * GroupBucket::Size + matching_slot; + // Try to lock the group. We do this in a non-blocking manner to avoid + // intra-warp progress hazards. + bool group_locked = group_locks[curr_group].tryLock(); - if(buckets[bucket_index].get().first == key) - { - duplicate_bucket_index = bucket_index; - return false; // Don't need to search other buckets. - } - return true; - }); - int empty_slot_index = meta_group[curr_group].getEmptyBucket(); - - if(duplicate_bucket_index == -1 && empty_bucket_index == -1) + if(group_locked) { - // Default probing behavior: no duplicate found yet, and no empty - // bucket found prior. - if(empty_slot_index == GroupBucket::InvalidSlot) - { - // Group is full. Set overflow bit for the group. - meta_group[curr_group].template setOverflow(hash_8); - } - else + // Every bucket visit - check prior filled buckets for duplicate + // keys. + meta_group[curr_group].visitHashBucket(hash_8, [&](int matching_slot) -> bool { + IndexType bucket_index = curr_group * GroupBucket::Size + matching_slot; + + if(buckets[bucket_index].get().first == key) + { + duplicate_bucket_index = bucket_index; + return false; // Don't need to search other buckets. 
+ } + return true; + }); + int empty_slot_index = meta_group[curr_group].getEmptyBucket(); + + if(duplicate_bucket_index == -1 && empty_bucket_index == -1) { - // Update empty bucket index with first empty slot we encounter. - empty_bucket_index = curr_group * GroupBucket::Size + empty_slot_index; - key_index_dedup[empty_bucket_index] = idx; - key_index_to_bucket[idx] = empty_bucket_index; - - // Insert initial element, this will be updated with the value of - // the "winning" key-value pair. - meta_group[curr_group].template setBucket(empty_slot_index, hash_8); + // Default probing behavior: no duplicate found yet, and no empty + // bucket found prior. + if(empty_slot_index == GroupBucket::InvalidSlot) + { + // Group is full. Set overflow bit for the group. + meta_group[curr_group].template setOverflow(hash_8); + } + else + { + // Update empty bucket index with first empty slot we encounter. + empty_bucket_index = curr_group * GroupBucket::Size + empty_slot_index; + key_index_dedup[empty_bucket_index] = idx; + key_index_to_bucket[idx] = empty_bucket_index; + + // Insert initial element, this will be updated with the value of + // the "winning" key-value pair. + meta_group[curr_group].template setBucket(empty_slot_index, hash_8); #if defined(__CUDA_ARCH__) - detail::constructPairInPlace(buckets[empty_bucket_index].get(), - key, - (*(kv_begin + idx)).second); + detail::constructPairInPlace(buckets[empty_bucket_index].get(), + key, + (*(kv_begin + idx)).second); #else - new(&buckets[empty_bucket_index]) KeyValuePair(*(kv_begin + idx)); + new(&buckets[empty_bucket_index]) KeyValuePair(*(kv_begin + idx)); #endif + } } - } - else if(duplicate_bucket_index != -1) - { - // Found a duplicate bucket. - if(!is_gap_free && empty_bucket_index != -1) - { - // We've already encountered an empty bucket earlier to place a - // k-v pair. This may occur if a probing sequence contains gaps - // (insertions followed by erasures). - // - // Just erase this element. 
- total_overwrites += 1; - - int slot_index = duplicate_bucket_index - curr_group * GroupBucket::Size; - buckets[duplicate_bucket_index].get().~KeyValuePair(); - meta_group[curr_group].clearBucket(slot_index); - } - else + else if(duplicate_bucket_index != -1) { - if(key_index_dedup[duplicate_bucket_index] == -1) + // Found a duplicate bucket. + if(!is_gap_free && empty_bucket_index != -1) { - // The k-v pair matches an already-existing pair in the map. - // Keep track of the number of overwrites so that we don't - // double-count them when incrementing the size. + // We've already encountered an empty bucket earlier to place a + // k-v pair. This may occur if a probing sequence contains gaps + // (insertions followed by erasures). + // + // Just erase this element. total_overwrites += 1; + + int slot_index = duplicate_bucket_index - curr_group * GroupBucket::Size; + buckets[duplicate_bucket_index].get().~KeyValuePair(); + meta_group[curr_group].clearBucket(slot_index); + } + else + { + if(key_index_dedup[duplicate_bucket_index] == -1) + { + // The k-v pair matches an already-existing pair in the map. + // Keep track of the number of overwrites so that we don't + // double-count them when incrementing the size. + total_overwrites += 1; + } + // Highest-indexed kv pair wins. + axom::atomicMax(&key_index_dedup[duplicate_bucket_index], idx); + key_index_to_bucket[idx] = duplicate_bucket_index; } - // Highest-indexed kv pair wins. - axom::atomicMax(&key_index_dedup[duplicate_bucket_index], idx); - key_index_to_bucket[idx] = duplicate_bucket_index; } - } - // Unlock group once we're done. - group_locks[curr_group].unlock(); + // Unlock group once we're done. + group_locks[curr_group].unlock(); - if(duplicate_bucket_index != -1) - { - // We've found a duplicate key to overwrite. 
- break; - } - else if(empty_bucket_index != -1 && - (is_gap_free || !meta_group[curr_group].getMaybeOverflowed(hash_8))) - { - // If we're inserting into a gap-free map, empty bucket signals the - // end of the probing sequence. - // Otherwise, we need to check the overflow mask to continue probing. - break; - } - else - { - // Move to next group. - curr_group = (curr_group + LookupPolicy {}.getNext(iteration)) & group_mask; - iteration++; + if(duplicate_bucket_index != -1) + { + // We've found a duplicate key to overwrite. + break; + } + else if(empty_bucket_index != -1 && + (is_gap_free || !meta_group[curr_group].getMaybeOverflowed(hash_8))) + { + // If we're inserting into a gap-free map, empty bucket signals the + // end of the probing sequence. + // Otherwise, we need to check the overflow mask to continue probing. + break; + } + else + { + // Move to next group. + curr_group = (curr_group + LookupPolicy {}.getNext(iteration)) & group_mask; + iteration++; + } } } - } - }); - - // Add a counter for duplicated inserts. - axom::ReduceSum total_inserts(0); - - // Using key-deduplication map, assign unique k-v pairs to buckets. - for_all( - num_elems, - AXOM_LAMBDA(IndexType kv_idx) { - IndexType bucket_idx = key_index_to_bucket[kv_idx]; - IndexType winning_idx = key_index_dedup[bucket_idx]; - // Place k-v pair at bucket_idx. - if(kv_idx == winning_idx) - { + }); + + // Add a counter for duplicated inserts. + axom::ReduceSum total_inserts(0); + + // Using key-deduplication map, assign unique k-v pairs to buckets. + for_all( + num_elems, + AXOM_LAMBDA(IndexType kv_idx) { + IndexType bucket_idx = key_index_to_bucket[kv_idx]; + IndexType winning_idx = key_index_dedup[bucket_idx]; + // Place k-v pair at bucket_idx. 
+ if(kv_idx == winning_idx) + { #if defined(__CUDA_ARCH__) - detail::constructPairInPlace(buckets[bucket_idx].get(), - (*(kv_begin + kv_idx)).first, - (*(kv_begin + kv_idx)).second); + detail::constructPairInPlace(buckets[bucket_idx].get(), + (*(kv_begin + kv_idx)).first, + (*(kv_begin + kv_idx)).second); #else - new(&buckets[bucket_idx]) KeyValuePair(*(kv_begin + kv_idx)); + new(&buckets[bucket_idx]) KeyValuePair(*(kv_begin + kv_idx)); #endif - total_inserts += 1; - } - }); + total_inserts += 1; + } + }); - map.m_size += total_inserts.get() - total_overwrites.get(); - map.m_loadCount += total_inserts.get() - total_overwrites.get(); + map.m_size += total_inserts.get() - total_overwrites.get(); + map.m_loadCount += total_inserts.get() - total_overwrites.get(); #if defined(AXOM_USE_CUDA) && defined(AXOM_USE_UMPIRE) - if(allocate_temp_map) - { - // Original pinned map is in temp. - axom::Allocator pinned_allocator = temp.getAllocator(); + if(allocate_temp_map) + { + // Original pinned map is in temp. + axom::Allocator pinned_allocator = temp.getAllocator(); - // Move new FlatMap to pinned memory. - *this = FlatMap(map, pinned_allocator); - } + // Move new FlatMap to pinned memory. + *this = FlatMap(map, pinned_allocator); + } #endif + } } } // namespace axom