From 61255d6df1580b84929d755376ab005e8c0aaa7f Mon Sep 17 00:00:00 2001
From: Kenneth Weiss <weiss27@llnl.gov>
Date: Wed, 10 Jun 2026 21:51:12 +0000
Subject: [PATCH 01/28] Fix FlatMap copy assignment -- need to compare
 addresses, not values

Adds typed tests covering assignment over a non-empty target,
source preservation, and self-assignment.
---
 src/axom/core/FlatMap.hpp            |  2 +-
 src/axom/core/tests/core_flatmap.hpp | 39 ++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 1 deletion(-)
diff --git a/src/axom/core/FlatMap.hpp b/src/axom/core/FlatMap.hpp
index 9e6d8d5e98..13f83dfdda 100644
--- a/src/axom/core/FlatMap.hpp
+++ b/src/axom/core/FlatMap.hpp
@@ -179,7 +179,7 @@ class FlatMap : detail::flat_map::SequentialLookupPolicy<typename Hash::result_t
     static_assert(std::is_copy_constructible<ValueType>::value,
                   "Cannot copy an axom::FlatMap when value type is not "
                   "copy-constructible.");
-    if(*this != other)
+    if(this != &other)
     {
       FlatMap new_map(other);
       swap(new_map);
diff --git a/src/axom/core/tests/core_flatmap.hpp b/src/axom/core/tests/core_flatmap.hpp
index 705b2eb073..b032c9887d 100644
--- a/src/axom/core/tests/core_flatmap.hpp
+++ b/src/axom/core/tests/core_flatmap.hpp
@@ -481,6 +481,45 @@ AXOM_TYPED_TEST(core_flatmap, init_and_copy)
   }
 }
 
+AXOM_TYPED_TEST(core_flatmap, copy_assign)
+{
+  using MapType = typename TestFixture::MapType;
+  MapType test_map;
+  const int NUM_ELEMS = 40;
+
+  for(int i = 0; i < NUM_ELEMS; i++)
+  {
+    test_map[this->getKey(i)] = this->getValue(i + 10.0);
+  }
+
+  // Copy-assign over a non-empty map with different contents should replace prior contents
+  MapType copied_map;
+  copied_map[this->getKey(NUM_ELEMS + 5)] = this->getValue(0.0);
+  copied_map = test_map;
+
+  EXPECT_EQ(copied_map.size(), NUM_ELEMS);
+  EXPECT_EQ(copied_map.find(this->getKey(NUM_ELEMS + 5)), copied_map.end());
+  for(int i = 0; i < NUM_ELEMS; i++)
+  {
+    auto it = copied_map.find(this->getKey(i));
+    ASSERT_NE(it, copied_map.end());
+    EXPECT_EQ(it->second, this->getValue(i + 10.0));
+  }
+
+  // The source should be unchanged
+  EXPECT_EQ(test_map.size(), NUM_ELEMS);
+
+  // Self-assignment is a no-op
+  copied_map = static_cast<const MapType&>(copied_map);
+  EXPECT_EQ(copied_map.size(), NUM_ELEMS);
+  for(int i = 0; i < NUM_ELEMS; i++)
+  {
+    auto it = copied_map.find(this->getKey(i));
+    ASSERT_NE(it, copied_map.end());
+    EXPECT_EQ(it->second, this->getValue(i + 10.0));
+  }
+}
+
 AXOM_TYPED_TEST(core_flatmap, insert_until_rehash)
 {
   using MapType = typename TestFixture::MapType;

From 2baf6a5b6dd5c0d997594428edacbc5852592b0a Mon Sep 17 00:00:00 2001
From: Kenneth Weiss <weiss27@llnl.gov>
Date: Wed, 10 Jun 2026 21:52:27 +0000
Subject: [PATCH 02/28] Remove FlatMap's const operator[], which inserts for
 missing keys

Removing it cannot break callers since this would not have compiled.
Const callers should use find()/at()/count()/contains().
at() throws std::out_of_range on a missing key.
---
 src/axom/core/FlatMap.hpp            |  9 -------
 src/axom/core/tests/core_flatmap.hpp | 35 ++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/src/axom/core/FlatMap.hpp b/src/axom/core/FlatMap.hpp
index 13f83dfdda..0a7b1209c4 100644
--- a/src/axom/core/FlatMap.hpp
+++ b/src/axom/core/FlatMap.hpp
@@ -332,7 +332,6 @@ class FlatMap : detail::flat_map::SequentialLookupPolicy<typename Hash::result_t
    *
    * \pre ValueType is default-constructible
    */
-  /// @{
   ValueType& operator[](const KeyType& key)
   {
     static_assert(std::is_default_constructible<ValueType>::value,
@@ -340,14 +339,6 @@ class FlatMap : detail::flat_map::SequentialLookupPolicy<typename Hash::result_t
                   "default-constructible.");
     return this->try_emplace(key).first->second;
   }
-  const ValueType& operator[](const KeyType& key) const
-  {
-    static_assert(std::is_default_constructible<ValueType>::value,
-                  "Cannot use axom::FlatMap::operator[] when value type is not "
-                  "default-constructible.");
-    return this->try_emplace(key).first->second;
-  }
-  /// @}
 
   /*!
    * \brief Return the number of entries matching a given key.
diff --git a/src/axom/core/tests/core_flatmap.hpp b/src/axom/core/tests/core_flatmap.hpp
index b032c9887d..89c6616fab 100644
--- a/src/axom/core/tests/core_flatmap.hpp
+++ b/src/axom/core/tests/core_flatmap.hpp
@@ -520,6 +520,41 @@ AXOM_TYPED_TEST(core_flatmap, copy_assign)
   }
 }
 
+AXOM_TYPED_TEST(core_flatmap, const_lookup)
+{
+  using MapType = typename TestFixture::MapType;
+  MapType test_map;
+  const int NUM_ELEMS = 20;
+
+  for(int i = 0; i < NUM_ELEMS; i++)
+  {
+    test_map[this->getKey(i)] = this->getValue(i + 10.0);
+  }
+
+  // Read-only lookups must be through const reference (matching std::unordered_map)
+  // operator[] is intentionally non-const since it inserts on a missing key
+  const MapType& const_map = test_map;
+  EXPECT_EQ(const_map.size(), NUM_ELEMS);
+  for(int i = 0; i < NUM_ELEMS; i++)
+  {
+    auto key = this->getKey(i);
+    auto value = this->getValue(i + 10.0);
+
+    auto it = const_map.find(key);
+    ASSERT_NE(it, const_map.end());
+    EXPECT_EQ(it->second, value);
+    EXPECT_EQ(const_map.at(key), value);
+    EXPECT_EQ(const_map.count(key), 1);
+    EXPECT_TRUE(const_map.contains(key));
+  }
+
+  auto missing = this->getKey(NUM_ELEMS + 5);
+  EXPECT_EQ(const_map.find(missing), const_map.end());
+  EXPECT_EQ(const_map.count(missing), 0);
+  EXPECT_FALSE(const_map.contains(missing));
+  EXPECT_THROW(const_map.at(missing), std::out_of_range);
+}
+
 AXOM_TYPED_TEST(core_flatmap, insert_until_rehash)
 {
   using MapType = typename TestFixture::MapType;

From d1cb266754c3afdfd2ba7d748b37139330b75d09 Mon Sep 17 00:00:00 2001
From: Kenneth Weiss <weiss27@llnl.gov>
Date: Wed, 10 Jun 2026 21:54:17 +0000
Subject: [PATCH 03/28] DeviceHash: hash in 64 bits regardless of IndexType
 width

DeviceHashHelper returned axom::IndexType and integer keys were converted
before the 64-bit mixer ran. With AXOM_USE_64BIT_INDEXTYPE=OFF every key wider than 32 bits
is truncated first, so keys equal mod 2^32 produce identical final hashes.
This was happening in the Morton codes in spin's SparseOctreeLevel and in numerics/quadrature.
---
 src/axom/core/DeviceHash.hpp             | 36 +++++++++++++++---------
 src/axom/core/tests/core_device_hash.hpp | 31 ++++++++++++++++++++
 2 files changed, 53 insertions(+), 14 deletions(-)

diff --git a/src/axom/core/DeviceHash.hpp b/src/axom/core/DeviceHash.hpp
index 0c934fbadc..cbd6d5e3e6 100644
--- a/src/axom/core/DeviceHash.hpp
+++ b/src/axom/core/DeviceHash.hpp
@@ -11,6 +11,7 @@
 #include "axom/core/Macros.hpp"
 #include "axom/core/Types.hpp"
 
+#include <cstdint>
 #include <type_traits>
 
 namespace axom
@@ -25,8 +26,11 @@ template <typename T>
 struct DeviceHashHelper<T, std::enable_if_t<std::is_integral<T>::value>>
 {
   using argument_type = T;
-  using result_type = axom::IndexType;
-  AXOM_HOST_DEVICE axom::IndexType operator()(T value) const { return value; }
+  using result_type = std::uint64_t;
+  AXOM_HOST_DEVICE std::uint64_t operator()(T value) const
+  {
+    return static_cast<std::uint64_t>(value);
+  }
 };
 
 /// \brief Specialization for floating-point types
@@ -34,15 +38,15 @@ template <typename T>
 struct DeviceHashHelper<T, std::enable_if_t<std::is_floating_point<T>::value>>
 {
   using argument_type = T;
-  using result_type = axom::IndexType;
-  AXOM_HOST_DEVICE axom::IndexType operator()(T value) const
+  using result_type = std::uint64_t;
+  AXOM_HOST_DEVICE std::uint64_t operator()(T value) const
   {
     // Special case: -0.0 and 0.0 compare equal but have different byte representations.
     if(value == T {0.})
     {
       return 0;
     }
-    return value;
+    return static_cast<std::uint64_t>(static_cast<std::int64_t>(value));
   }
 };
 
@@ -51,10 +55,10 @@ template <typename T>
 struct DeviceHashHelper<T, std::enable_if_t<std::is_enum<T>::value>>
 {
   using argument_type = T;
-  using result_type = axom::IndexType;
-  AXOM_HOST_DEVICE axom::IndexType operator()(T value) const
+  using result_type = std::uint64_t;
+  AXOM_HOST_DEVICE std::uint64_t operator()(T value) const
   {
-    return static_cast<axom::IndexType>(value);
+    return static_cast<std::uint64_t>(value);
   }
 };
 
@@ -63,10 +67,10 @@ template <typename T>
 struct DeviceHashHelper<T*, void>
 {
   using argument_type = T*;
-  using result_type = axom::IndexType;
-  AXOM_HOST_DEVICE axom::IndexType operator()(T* ptr) const
+  using result_type = std::uint64_t;
+  AXOM_HOST_DEVICE std::uint64_t operator()(T* ptr) const
   {
-    return static_cast<axom::IndexType>(reinterpret_cast<std::uintptr_t>(ptr));
+    return static_cast<std::uint64_t>(reinterpret_cast<std::uintptr_t>(ptr));
   }
 };
 
@@ -75,10 +79,10 @@ template <typename T, typename Enable>
 struct DeviceHashHelper
 {
   using argument_type = T;
-  using result_type = axom::IndexType;
-  axom::IndexType operator()(const T& object) const
+  using result_type = std::uint64_t;
+  std::uint64_t operator()(const T& object) const
   {
-    return static_cast<axom::IndexType>(std::hash<T> {}(object));
+    return static_cast<std::uint64_t>(std::hash<T> {}(object));
   }
 };
 
@@ -89,6 +93,10 @@ struct DeviceHashHelper
  *
  * \brief Implements a host/device-callable hash function for supported types,
  *  and passes through to std::hash otherwise.
+ *
+ *  The result type is always std::uint64_t, independent of the configured axom::IndexType width.
+ *  Hashes feed bit mixers and bucket selection, where truncating wide keys (e.g. 64-bit Morton codes)
+ *  to a 32-bit IndexType before mixing would make keys equal mod 2^32 collide.
  */
 template <typename T>
 struct DeviceHash : public detail::DeviceHashHelper<T>
diff --git a/src/axom/core/tests/core_device_hash.hpp b/src/axom/core/tests/core_device_hash.hpp
index 8ccee915d9..d19ac69386 100644
--- a/src/axom/core/tests/core_device_hash.hpp
+++ b/src/axom/core/tests/core_device_hash.hpp
@@ -259,3 +259,34 @@ AXOM_TYPED_TEST(core_device_hash, hash_user_defined)
     }
   }
 }
+
+TEST(core_device_hash, hash_width_decoupled_from_indextype)
+{
+  // The hash result must be 64 bits wide regardless of the configured
+  // axom::IndexType. When the result type was IndexType, builds with
+  // AXOM_USE_64BIT_INDEXTYPE=OFF truncated integer keys to 32 bits before
+  // the FlatMap bit mixer ran, so keys equal mod 2^32 (e.g. deep Morton codes)
+  // produced identical hashes. The type assertions catch the coupling in every
+  // build configuration; the value checks fail in the truncating configuration itself.
+  static_assert(std::is_same<axom::DeviceHash<std::uint64_t>::result_type, std::uint64_t>::value,
+                "integral hash result must be std::uint64_t");
+  static_assert(std::is_same<axom::DeviceHash<int>::result_type, std::uint64_t>::value,
+                "integral hash result must be std::uint64_t");
+  static_assert(std::is_same<axom::DeviceHash<double>::result_type, std::uint64_t>::value,
+                "floating-point hash result must be std::uint64_t");
+  static_assert(std::is_same<axom::DeviceHash<int*>::result_type, std::uint64_t>::value,
+                "pointer hash result must be std::uint64_t");
+  static_assert(std::is_same<axom::DeviceHash<std::string>::result_type, std::uint64_t>::value,
+                "catch-all (std::hash) result must be std::uint64_t");
+  static_assert(
+    std::is_same<decltype(axom::DeviceHash<std::uint64_t> {}(std::uint64_t {})), std::uint64_t>::value,
+    "integral hash operator() must return std::uint64_t");
+
+  axom::DeviceHash<std::uint64_t> device_hasher;
+  const std::uint64_t base = 1;
+  const std::uint64_t plus_2_32 = base + (std::uint64_t {1} << 32);
+  const std::uint64_t plus_2_33 = base + (std::uint64_t {1} << 33);
+  EXPECT_NE(device_hasher(base), device_hasher(plus_2_32));
+  EXPECT_NE(device_hasher(base), device_hasher(plus_2_33));
+  EXPECT_NE(device_hasher(plus_2_32), device_hasher(plus_2_33));
+}

From 51f9a863c2a063d54db4d77918b94769ed5b344c Mon Sep 17 00:00:00 2001
From: Kenneth Weiss <weiss27@llnl.gov>
Date: Wed, 10 Jun 2026 22:00:30 +0000
Subject: [PATCH 04/28] DeviceHash: hash floating-point keys by bit pattern,
 not value

The floating-point specialization returned the key converted to an integer.
Every key sharing an integer part therefore collided --
e.g. all numbers between -1 and 1 converted to the integer 0,
so a FlatMap keyed on fractional floats degenerated into one probe chain with O(size) inserts and finds
---
 src/axom/core/DeviceHash.hpp             | 27 +++++++++++++++--
 src/axom/core/tests/core_device_hash.hpp | 38 +++++++++++++++++++++++-
 src/axom/core/tests/core_flatmap.hpp     | 27 +++++++++++++++++
 3 files changed, 88 insertions(+), 4 deletions(-)

diff --git a/src/axom/core/DeviceHash.hpp b/src/axom/core/DeviceHash.hpp
index cbd6d5e3e6..d1ac31ced2 100644
--- a/src/axom/core/DeviceHash.hpp
+++ b/src/axom/core/DeviceHash.hpp
@@ -12,6 +12,7 @@
 #include "axom/core/Types.hpp"
 
 #include <cstdint>
+#include <cstring>
 #include <type_traits>
 
 namespace axom
@@ -41,12 +42,32 @@ struct DeviceHashHelper<T, std::enable_if_t<std::is_floating_point<T>::value>>
   using result_type = std::uint64_t;
   AXOM_HOST_DEVICE std::uint64_t operator()(T value) const
   {
-    // Special case: -0.0 and 0.0 compare equal but have different byte representations.
+    // -0.0 and 0.0 compare equal but have different bit patterns; normalize so both hash identically
     if(value == T {0.})
     {
-      return 0;
+      value = T {0.};
     }
-    return static_cast<std::uint64_t>(static_cast<std::int64_t>(value));
+
+    // Hash the bit pattern, not the converted value.
+    // A float-to-integer value conversion collapses every key sharing an integer part,
+    // e.g. all numbers between -1 and 1 converts to integer 0
+
+    // NUM_WORDS is 1 for float or double, possibly 2 for long double
+    constexpr std::size_t NUM_WORDS = (sizeof(T) + sizeof(std::uint64_t) - 1) / sizeof(std::uint64_t);
+    // zero out words since we might only copy 4 bytes in for floats
+    std::uint64_t words[NUM_WORDS] = {0};
+    memcpy(words, &value, sizeof(T));
+
+    std::uint64_t result = words[0];
+    // Extra processing fortypes wider than 64 bits (long double).
+    // Use an odd multiplier (2^64/golden-ratio-phi),
+    // so the halves cannot cancel under a later XOR-style mixer
+    for(std::size_t i = 1; i < NUM_WORDS; i++)
+    {
+      result = result * std::uint64_t {0x9e3779b97f4a7c15} + words[i];
+    }
+
+    return result;
   }
 };
 
diff --git a/src/axom/core/tests/core_device_hash.hpp b/src/axom/core/tests/core_device_hash.hpp
index d19ac69386..58173725e1 100644
--- a/src/axom/core/tests/core_device_hash.hpp
+++ b/src/axom/core/tests/core_device_hash.hpp
@@ -11,6 +11,9 @@
 // gtest includes
 #include "gtest/gtest.h"
 
+// C++ includes
+#include <set>
+
 template <typename TheExecSpace>
 class core_device_hash : public ::testing::Test
 {
@@ -274,7 +277,7 @@ TEST(core_device_hash, hash_width_decoupled_from_indextype)
                 "integral hash result must be std::uint64_t");
   static_assert(std::is_same<axom::DeviceHash<double>::result_type, std::uint64_t>::value,
                 "floating-point hash result must be std::uint64_t");
-  static_assert(std::is_same<axom::DeviceHash<int*>::result_type, std::uint64_t>::value,
+  static_assert(std::is_same<axom::DeviceHash<int *>::result_type, std::uint64_t>::value,
                 "pointer hash result must be std::uint64_t");
   static_assert(std::is_same<axom::DeviceHash<std::string>::result_type, std::uint64_t>::value,
                 "catch-all (std::hash) result must be std::uint64_t");
@@ -290,3 +293,36 @@ TEST(core_device_hash, hash_width_decoupled_from_indextype)
   EXPECT_NE(device_hasher(base), device_hasher(plus_2_33));
   EXPECT_NE(device_hasher(plus_2_32), device_hasher(plus_2_33));
 }
+
+TEST(core_device_hash, hash_float_bit_pattern)
+{
+  // Floating-point keys must be hashed by bit pattern, not by integer value conversion.
+  // This is a regression test for a previous implementation where the conversion collapsed
+  // every key with the same integer value, e.g. all numbers between -1 and 1 converted to integer 0
+  // so a FlatMap keyed on fractional floats degenerated into a single probe chain.
+  axom::DeviceHash<float> float_hasher;
+  axom::DeviceHash<double> double_hasher;
+
+  EXPECT_NE(float_hasher(0.25f), float_hasher(0.75f));
+  EXPECT_NE(float_hasher(0.25f), std::uint64_t {0});
+  EXPECT_NE(double_hasher(0.25), double_hasher(0.75));
+
+  // A spread of fractional keys must be collision-free at this scale
+  std::set<std::uint64_t> float_hashes, double_hashes;
+  const int NUM_KEYS = 1000;
+  for(int i = 1; i <= NUM_KEYS; i++)
+  {
+    float_hashes.insert(float_hasher(i / static_cast<float>(NUM_KEYS + 1)));
+    double_hashes.insert(double_hasher(i / static_cast<double>(NUM_KEYS + 1)));
+  }
+  EXPECT_EQ(float_hashes.size(), NUM_KEYS);
+  EXPECT_EQ(double_hashes.size(), NUM_KEYS);
+
+  // Signed zeros compare equal and must hash equal
+  EXPECT_EQ(float_hasher(0.0f), float_hasher(-0.0f));
+  EXPECT_EQ(double_hasher(0.0), double_hasher(-0.0));
+
+  // Magnitudes beyond any integer type's range are now well-defined and distinct
+  EXPECT_NE(double_hasher(1e300), double_hasher(2e300));
+  EXPECT_NE(float_hasher(-0.5f), float_hasher(0.5f));
+}
diff --git a/src/axom/core/tests/core_flatmap.hpp b/src/axom/core/tests/core_flatmap.hpp
index 89c6616fab..186fbbf70a 100644
--- a/src/axom/core/tests/core_flatmap.hpp
+++ b/src/axom/core/tests/core_flatmap.hpp
@@ -129,6 +129,33 @@ using MyTypes = ::testing::Types<axom::FlatMap<int, double>,
 
 TYPED_TEST_SUITE(core_flatmap, MyTypes);
 
+TEST(core_flatmap_unit, float_keys_in_unit_interval)
+{
+  // Regression test for the floating-point DeviceHash specialization, which
+  // converted keys to integers by value: every key in (0, 1) hashed to 0, so
+  // this map was a single probe chain and each insert/find was O(size).
+  // With bit-pattern hashing the keys spread normally.
+  axom::FlatMap<float, int> test_map;
+  const int NUM_ELEMS = 512;
+
+  for(int i = 1; i <= NUM_ELEMS; i++)
+  {
+    float key = i / static_cast<float>(NUM_ELEMS + 2);
+    test_map[key] = i;
+  }
+
+  EXPECT_EQ(test_map.size(), NUM_ELEMS);
+  for(int i = 1; i <= NUM_ELEMS; i++)
+  {
+    float key = i / static_cast<float>(NUM_ELEMS + 2);
+    auto it = test_map.find(key);
+    ASSERT_NE(it, test_map.end());
+    EXPECT_EQ(it->second, i);
+  }
+  EXPECT_EQ(test_map.find(1.5f), test_map.end());
+  EXPECT_EQ(test_map.count(0.5f), 1);
+}
+
 AXOM_TYPED_TEST(core_flatmap, default_init)
 {
   using MapType = typename TestFixture::MapType;

From 2c0060213e1337f7b93ee01630ae30cb9532ad7a Mon Sep 17 00:00:00 2001
From: Kenneth Weiss <weiss27@llnl.gov>
Date: Wed, 10 Jun 2026 22:03:17 +0000
Subject: [PATCH 05/28] FlatTable: wrap probe advance with a mask, not a signed
 division

The quadratic probe advance in probeIndex and probeEmptyIndex wrapped
using a mod (%) operator. Since the group count is always a power of two,
we can use a bitmask instead.

Adds a cross-group probe stress test: a degenerate hash drives 600
keys through one initial group so inserts, lookups, misses, erases,
and reinserts all walk and wrap the group sequence.
---
 src/axom/core/detail/FlatTable.hpp   | 27 ++++++-----
 src/axom/core/tests/core_flatmap.hpp | 72 ++++++++++++++++++++++++++++
 2 files changed, 88 insertions(+), 11 deletions(-)

diff --git a/src/axom/core/detail/FlatTable.hpp b/src/axom/core/detail/FlatTable.hpp
index e99853bdcf..66ac4019b7 100644
--- a/src/axom/core/detail/FlatTable.hpp
+++ b/src/axom/core/detail/FlatTable.hpp
@@ -319,10 +319,11 @@ struct SequentialLookupPolicy : ProbePolicy
   IndexType probeEmptyIndex(int ngroups_pow_2, ArrayView<GroupBucket> metadata, HashType hash) const
   {
     // We use the k MSBs of the hash as the initial group probe point,
-    // where ngroups = 2^k.
-    int bitshift_right = ((CHAR_BIT * sizeof(HashType)) - ngroups_pow_2);
-    HashType curr_group = hash >> bitshift_right;
-    curr_group &= ((1 << ngroups_pow_2) - 1);
+    // where ngroups = 2^k. Since the group count is always a power of two,
+    // wrapping a group index is a bitwise AND with this mask.
+    const int bitshift_right = ((CHAR_BIT * sizeof(HashType)) - ngroups_pow_2);
+    const HashType group_mask = (HashType {1} << ngroups_pow_2) - 1;
+    HashType curr_group = (hash >> bitshift_right) & group_mask;
     int empty_group = NO_MATCH;
     int empty_bucket = NO_MATCH;
 
@@ -347,7 +348,10 @@ struct SequentialLookupPolicy : ProbePolicy
         // Set the overflow bit and continue probing.
         metadata[curr_group].setOverflow(hash_8);
       }
-      curr_group = (curr_group + this->getNext(iteration)) % metadata.size();
+      // Mask instead of "% metadata.size()": the group count is a power of
+      // two, and the modulo compiled to a 64-bit signed division on the
+      // critical path of every probe continuation.
+      curr_group = (curr_group + this->getNext(iteration)) & group_mask;
     }
     if(empty_group != NO_MATCH)
     {
@@ -373,10 +377,11 @@ struct SequentialLookupPolicy : ProbePolicy
                                    FoundIndex&& on_hash_found) const
   {
     // We use the k MSBs of the hash as the initial group probe point,
-    // where ngroups = 2^k.
-    int bitshift_right = ((CHAR_BIT * sizeof(HashType)) - ngroups_pow_2);
-    HashType curr_group = hash >> bitshift_right;
-    curr_group &= ((1 << ngroups_pow_2) - 1);
+    // where ngroups = 2^k. Since the group count is always a power of two,
+    // wrapping a group index is a bitwise AND with this mask.
+    const int bitshift_right = ((CHAR_BIT * sizeof(HashType)) - ngroups_pow_2);
+    const HashType group_mask = (HashType {1} << ngroups_pow_2) - 1;
+    HashType curr_group = (hash >> bitshift_right) & group_mask;
 
     std::uint8_t hash_8 = static_cast<std::uint8_t>(hash);
     bool keep_going = true;
@@ -397,8 +402,8 @@ struct SequentialLookupPolicy : ProbePolicy
       {
         break;
       }
-      // Probe the next bucket.
-      curr_group = (curr_group + this->getNext(iteration)) % metadata.size();
+      // Probe the next bucket. Note that the group count is a power of 2 so we can use a bit mask
+      curr_group = (curr_group + this->getNext(iteration)) & group_mask;
     }
   }
 
diff --git a/src/axom/core/tests/core_flatmap.hpp b/src/axom/core/tests/core_flatmap.hpp
index 186fbbf70a..4e3a2f4506 100644
--- a/src/axom/core/tests/core_flatmap.hpp
+++ b/src/axom/core/tests/core_flatmap.hpp
@@ -156,6 +156,78 @@ TEST(core_flatmap_unit, float_keys_in_unit_interval)
   EXPECT_EQ(test_map.count(0.5f), 1);
 }
 
+// Hash functor whose group-selector bits (the top bits) are always zero,
+// so every key lands in the same initial group and probing must walk across groups.
+// Stress tests the cross-group probe sequence.
+struct DegenerateGroupHash
+{
+  using argument_type = int;
+  using result_type = std::uint64_t;
+  std::uint64_t operator()(int key) const
+  {
+    return static_cast<std::uint64_t>(static_cast<unsigned>(key) & 0xFF);
+  }
+};
+
+TEST(core_flatmap_unit, cross_group_probe_chains)
+{
+  // Forces hundreds of keys through a single initial group:
+  // inserts walk probeEmptyIndex's group sequence, lookups walk probeIndex's,
+  // both wrap around the group array, and erases punch holes mid-sequence.
+  // Guards the probe-advance arithmetic (group wrapping) against regressions.
+  axom::FlatMap<int, int, DegenerateGroupHash> test_map;
+  const int NUM_ELEMS = 600;
+
+  for(int i = 0; i < NUM_ELEMS; i++)
+  {
+    test_map[i] = i * 3;
+  }
+  EXPECT_EQ(test_map.size(), NUM_ELEMS);
+  for(int i = 0; i < NUM_ELEMS; i++)
+  {
+    auto it = test_map.find(i);
+    ASSERT_NE(it, test_map.end());
+    EXPECT_EQ(it->second, i * 3);
+  }
+  for(int i = NUM_ELEMS; i < NUM_ELEMS + 64; i++)
+  {
+    EXPECT_EQ(test_map.find(i), test_map.end());
+  }
+
+  // Erase every third key (some mid-probe-sequence) and re-verify
+  for(int i = 0; i < NUM_ELEMS; i += 3)
+  {
+    EXPECT_EQ(test_map.erase(i), 1);
+  }
+  EXPECT_EQ(test_map.size(), NUM_ELEMS - (NUM_ELEMS + 2) / 3);
+  for(int i = 0; i < NUM_ELEMS; i++)
+  {
+    if(i % 3 == 0)
+    {
+      EXPECT_EQ(test_map.find(i), test_map.end());
+    }
+    else
+    {
+      auto it = test_map.find(i);
+      ASSERT_NE(it, test_map.end());
+      EXPECT_EQ(it->second, i * 3);
+    }
+  }
+
+  // Reinsert over the holes and verify
+  for(int i = 0; i < NUM_ELEMS; i += 3)
+  {
+    test_map[i] = i * 7;
+  }
+  EXPECT_EQ(test_map.size(), NUM_ELEMS);
+  for(int i = 0; i < NUM_ELEMS; i++)
+  {
+    auto it = test_map.find(i);
+    ASSERT_NE(it, test_map.end());
+    EXPECT_EQ(it->second, (i % 3 == 0) ? i * 7 : i * 3);
+  }
+}
+
 AXOM_TYPED_TEST(core_flatmap, default_init)
 {
   using MapType = typename TestFixture::MapType;

From c6555a89ab3d7c04e8696319c7f3df7a4cbe327e Mon Sep 17 00:00:00 2001
From: Kenneth Weiss <weiss27@llnl.gov>
Date: Thu, 15 Jan 2026 19:55:17 -0800
Subject: [PATCH 06/28] Adds initial benchmark for flatmap vs map vs
 unordered_map vs sparsehash

---
 src/axom/core/tests/CMakeLists.txt            |   3 +-
 .../core/tests/core_benchmark_flatmap.cpp     | 463 ++++++++++++++++++
 2 files changed, 465 insertions(+), 1 deletion(-)
 create mode 100644 src/axom/core/tests/core_benchmark_flatmap.cpp

diff --git a/src/axom/core/tests/CMakeLists.txt b/src/axom/core/tests/CMakeLists.txt
index eecb1bc949..b30ebb25fc 100644
--- a/src/axom/core/tests/CMakeLists.txt
+++ b/src/axom/core/tests/CMakeLists.txt
@@ -212,7 +212,8 @@ endforeach()
 if (ENABLE_BENCHMARKS)
 
   set(core_benchmarks
-      core_benchmark_array.cpp )
+      core_benchmark_array.cpp
+      core_benchmark_flatmap.cpp )
 
   foreach(test ${core_benchmarks})
     get_filename_component(test_name ${test} NAME_WE)
diff --git a/src/axom/core/tests/core_benchmark_flatmap.cpp b/src/axom/core/tests/core_benchmark_flatmap.cpp
new file mode 100644
index 0000000000..978c47f6d1
--- /dev/null
+++ b/src/axom/core/tests/core_benchmark_flatmap.cpp
@@ -0,0 +1,463 @@
+// Copyright (c) Lawrence Livermore National Security, LLC and other
+// Axom Project Contributors. See top-level LICENSE and COPYRIGHT
+// files for dates and other details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+
+#include "benchmark/benchmark.h"
+
+#include "axom/config.hpp"
+#include "axom/core.hpp"
+#include "axom/slic.hpp"
+
+#include "axom/CLI11.hpp"
+#include "axom/fmt.hpp"
+
+#include "axom/core/FlatMap.hpp"
+#include "axom/core/FlatMapUtil.hpp"
+
+#if defined(AXOM_USE_SPARSEHASH)
+  #include "axom/sparsehash/sparse_hash_map"
+#endif
+
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+#include <map>
+#include <random>
+#include <string>
+#include <type_traits>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+namespace
+{
+
+using KeyType = std::int64_t;
+using ValueType = std::int64_t;
+
+enum class FlatMapFeatureBenchmarks
+{
+  None = 0,
+  Insertion = 1 << 0,
+  Lookup = 1 << 1,
+  BatchedInsertion = 1 << 2,
+
+  All = Insertion | Lookup | BatchedInsertion
+};
+
+inline FlatMapFeatureBenchmarks operator|(FlatMapFeatureBenchmarks lhs, FlatMapFeatureBenchmarks rhs)
+{
+  using T = std::underlying_type_t<FlatMapFeatureBenchmarks>;
+  return static_cast<FlatMapFeatureBenchmarks>(static_cast<T>(lhs) | static_cast<T>(rhs));
+}
+
+inline FlatMapFeatureBenchmarks& operator|=(FlatMapFeatureBenchmarks& lhs,
+                                            FlatMapFeatureBenchmarks rhs)
+{
+  lhs = lhs | rhs;
+  return lhs;
+}
+
+inline FlatMapFeatureBenchmarks operator&(FlatMapFeatureBenchmarks lhs, FlatMapFeatureBenchmarks rhs)
+{
+  using T = std::underlying_type_t<FlatMapFeatureBenchmarks>;
+  return static_cast<FlatMapFeatureBenchmarks>(static_cast<T>(lhs) & static_cast<T>(rhs));
+}
+
+std::vector<int> args_benchmark_sizes;
+FlatMapFeatureBenchmarks args_benchmark_features {FlatMapFeatureBenchmarks::None};
+int args_batch_size = 1 << 10;
+}  // namespace
+
+template <>
+struct axom::fmt::formatter<FlatMapFeatureBenchmarks>
+{
+  template <typename ParseContext>
+  constexpr auto parse(ParseContext& ctx)
+  {
+    return ctx.begin();
+  }
+
+  template <typename FormatContext>
+  auto format(FlatMapFeatureBenchmarks feature, FormatContext& ctx) const
+  {
+    static const std::map<FlatMapFeatureBenchmarks, std::string> feature_map = {
+      {FlatMapFeatureBenchmarks::Insertion, "Insertion"},
+      {FlatMapFeatureBenchmarks::Lookup, "Lookup"},
+      {FlatMapFeatureBenchmarks::BatchedInsertion, "BatchedInsertion"}};
+
+    if(feature == FlatMapFeatureBenchmarks::None)
+    {
+      return axom::fmt::format_to(ctx.out(), "None");
+    }
+    else if(feature == FlatMapFeatureBenchmarks::All)
+    {
+      return axom::fmt::format_to(ctx.out(), "All");
+    }
+
+    std::string name;
+    for(const auto& kv : feature_map)
+    {
+      if((feature & kv.first) != FlatMapFeatureBenchmarks::None)
+      {
+        name += name.empty() ? kv.second : "|" + kv.second;
+      }
+    }
+    return axom::fmt::format_to(ctx.out(), "{}", name);
+  }
+};
+
+namespace
+{
+
+void CustomArgs(benchmark::internal::Benchmark* b)
+{
+  for(int sz : ::args_benchmark_sizes)
+  {
+    b->Arg(sz);
+  }
+}
+
+std::vector<KeyType> make_shuffled_keys(int n, std::uint64_t seed)
+{
+  std::vector<KeyType> keys;
+  keys.reserve(static_cast<std::size_t>(n));
+  for(int i = 0; i < n; ++i)
+  {
+    keys.push_back(static_cast<KeyType>(i));
+  }
+
+  std::mt19937_64 rng(seed);
+  std::shuffle(keys.begin(), keys.end(), rng);
+  return keys;
+}
+
+std::vector<std::pair<KeyType, ValueType>> make_pairs(const std::vector<KeyType>& keys)
+{
+  std::vector<std::pair<KeyType, ValueType>> pairs;
+  pairs.reserve(keys.size());
+  for(std::size_t i = 0; i < keys.size(); ++i)
+  {
+    pairs.emplace_back(keys[i], static_cast<ValueType>(i));
+  }
+  return pairs;
+}
+
+std::vector<KeyType> make_miss_keys(const std::vector<KeyType>& keys, KeyType offset)
+{
+  std::vector<KeyType> misses;
+  misses.reserve(keys.size());
+  for(KeyType k : keys)
+  {
+    misses.push_back(k + offset);
+  }
+  return misses;
+}
+
+template <typename MapType>
+struct MapFactory
+{
+  static MapType make_empty(std::size_t) { return MapType {}; }
+  static void reserve(MapType&, std::size_t) { }
+};
+
+template <typename Key, typename Value, typename Hash, typename Eq, typename Alloc>
+struct MapFactory<std::unordered_map<Key, Value, Hash, Eq, Alloc>>
+{
+  using MapType = std::unordered_map<Key, Value, Hash, Eq, Alloc>;
+  static MapType make_empty(std::size_t) { return MapType {}; }
+  static void reserve(MapType& map, std::size_t n) { map.reserve(n); }
+};
+
+template <typename Key, typename Value, typename Hash>
+struct MapFactory<axom::FlatMap<Key, Value, Hash>>
+{
+  using MapType = axom::FlatMap<Key, Value, Hash>;
+  static MapType make_empty(std::size_t) { return MapType {}; }
+  static void reserve(MapType& map, std::size_t n) { map.reserve(static_cast<axom::IndexType>(n)); }
+};
+
+#if defined(AXOM_USE_SPARSEHASH)
+template <typename Key, typename Value, typename Hash, typename Eq, typename Alloc>
+void reserve_sparsehash(axom::google::sparse_hash_map<Key, Value, Hash, Eq, Alloc>& map, std::size_t n)
+{
+  map.max_load_factor(0.8f);
+  const auto buckets_needed =
+    static_cast<std::size_t>(static_cast<double>(n) / map.max_load_factor()) + 1;
+  map.resize(buckets_needed);
+}
+#endif
+
+#if defined(AXOM_USE_SPARSEHASH)
+template <typename Key, typename Value, typename Hash, typename Eq, typename Alloc>
+struct MapFactory<axom::google::sparse_hash_map<Key, Value, Hash, Eq, Alloc>>
+{
+  using MapType = axom::google::sparse_hash_map<Key, Value, Hash, Eq, Alloc>;
+  static MapType make_empty(std::size_t) { return MapType {}; }
+  static void reserve(MapType& map, std::size_t n) { reserve_sparsehash(map, n); }
+};
+#endif
+
+template <typename MapType>
+MapType make_reserved_map(std::size_t n)
+{
+  MapType map = MapFactory<MapType>::make_empty(n);
+  MapFactory<MapType>::reserve(map, n);
+  return map;
+}
+
+template <typename MapType>
+MapType make_filled_map(const std::vector<std::pair<KeyType, ValueType>>& pairs)
+{
+  MapType map = make_reserved_map<MapType>(pairs.size());
+  map.insert(pairs.begin(), pairs.end());
+  return map;
+}
+
+template <typename MapType>
+void BM_Insert_StartEmpty(benchmark::State& state)
+{
+  const int n = state.range(0);
+  const auto keys = make_shuffled_keys(n, 0xA2D5B7C4ULL);
+  const auto pairs = make_pairs(keys);
+
+  for(auto _ : state)
+  {
+    MapType map = MapFactory<MapType>::make_empty(pairs.size());
+    map.insert(pairs.begin(), pairs.end());
+    benchmark::DoNotOptimize(map);
+  }
+}
+
+template <typename MapType>
+void BM_Insert_Reserved(benchmark::State& state)
+{
+  const int n = state.range(0);
+  const auto keys = make_shuffled_keys(n, 0xA2D5B7C4ULL);
+  const auto pairs = make_pairs(keys);
+
+  for(auto _ : state)
+  {
+    MapType map = make_reserved_map<MapType>(pairs.size());
+    map.insert(pairs.begin(), pairs.end());
+    benchmark::DoNotOptimize(map);
+  }
+}
+
+template <typename MapType>
+void BM_Find_Hit(benchmark::State& state)
+{
+  const int n = state.range(0);
+  const auto keys = make_shuffled_keys(n, 0xC0FFEEULL);
+  const auto pairs = make_pairs(keys);
+  const MapType map = make_filled_map<MapType>(pairs);
+
+  for(auto _ : state)
+  {
+    ValueType sum = 0;
+    for(KeyType k : keys)
+    {
+      auto it = map.find(k);
+      if(it != map.end())
+      {
+        sum += it->second;
+      }
+    }
+    benchmark::DoNotOptimize(sum);
+  }
+}
+
+template <typename MapType>
+void BM_Find_Miss(benchmark::State& state)
+{
+  const int n = state.range(0);
+  const auto keys = make_shuffled_keys(n, 0xC0FFEEULL);
+  const auto pairs = make_pairs(keys);
+  const MapType map = make_filled_map<MapType>(pairs);
+  const auto miss_keys = make_miss_keys(keys, static_cast<KeyType>(n) + 11);
+
+  for(auto _ : state)
+  {
+    std::int64_t misses = 0;
+    for(KeyType k : miss_keys)
+    {
+      misses += (map.find(k) == map.end()) ? 1 : 0;
+    }
+    benchmark::DoNotOptimize(misses);
+  }
+}
+
+template <typename MapType>
+void insert_pairs_in_batches(MapType& map,
+                             const std::vector<std::pair<KeyType, ValueType>>& pairs,
+                             int batch_size)
+{
+  const std::size_t n = pairs.size();
+  const std::size_t bs = static_cast<std::size_t>(std::max(1, batch_size));
+  for(std::size_t offset = 0; offset < n; offset += bs)
+  {
+    const std::size_t count = std::min(bs, n - offset);
+    map.insert(pairs.begin() + static_cast<std::ptrdiff_t>(offset),
+               pairs.begin() + static_cast<std::ptrdiff_t>(offset + count));
+  }
+}
+
+template <typename Key, typename Value, typename Hash>
+void insert_pairs_in_batches(axom::FlatMap<Key, Value, Hash>& map,
+                             const std::vector<std::pair<KeyType, ValueType>>& pairs,
+                             int batch_size)
+{
+  const std::size_t n = pairs.size();
+  const std::size_t bs = static_cast<std::size_t>(std::max(1, batch_size));
+  for(std::size_t offset = 0; offset < n; offset += bs)
+  {
+    const std::size_t count = std::min(bs, n - offset);
+    map.template insert<axom::SEQ_EXEC>(pairs.begin() + static_cast<std::ptrdiff_t>(offset),
+                                        pairs.begin() + static_cast<std::ptrdiff_t>(offset + count));
+  }
+}
+
+template <typename MapType>
+void BM_BatchedInsert_Reserved(benchmark::State& state)
+{
+  const int n = state.range(0);
+  const auto keys = make_shuffled_keys(n, 0x1CEB00DAULL);
+  const auto pairs = make_pairs(keys);
+
+  for(auto _ : state)
+  {
+    MapType map = make_reserved_map<MapType>(pairs.size());
+    insert_pairs_in_batches(map, pairs, ::args_batch_size);
+    benchmark::DoNotOptimize(map);
+  }
+}
+
+}  // namespace
+
+//-----------------------------------------------------------------------------
+// Register benchmarks
+//-----------------------------------------------------------------------------
+
+template <typename MapType>
+void RegisterBenchmarksFor(const std::string& map_name)
+{
+  auto name = [&map_name](const std::string& op) {
+    return axom::fmt::format("{}::{}", map_name, op);
+  };
+
+  // clang-format off
+  if((::args_benchmark_features & FlatMapFeatureBenchmarks::Insertion) != FlatMapFeatureBenchmarks::None)
+  {
+    benchmark::RegisterBenchmark(name("insert_startEmpty"), &BM_Insert_StartEmpty<MapType>)->Apply(CustomArgs);
+    benchmark::RegisterBenchmark(name("insert_reserved"), &BM_Insert_Reserved<MapType>)->Apply(CustomArgs);
+  }
+
+  if((::args_benchmark_features & FlatMapFeatureBenchmarks::Lookup) != FlatMapFeatureBenchmarks::None)
+  {
+    benchmark::RegisterBenchmark(name("find_hit"), &BM_Find_Hit<MapType>)->Apply(CustomArgs);
+    benchmark::RegisterBenchmark(name("find_miss"), &BM_Find_Miss<MapType>)->Apply(CustomArgs);
+  }
+
+  if((::args_benchmark_features & FlatMapFeatureBenchmarks::BatchedInsertion) != FlatMapFeatureBenchmarks::None)
+  {
+    benchmark::RegisterBenchmark(name("insert_batched_reserved"), &BM_BatchedInsert_Reserved<MapType>)->Apply(CustomArgs);
+  }
+  // clang-format on
+}
+
+int main(int argc, char* argv[])
+{
+  std::vector<int> local_test_sizes;
+  FlatMapFeatureBenchmarks local_benchmark_features {FlatMapFeatureBenchmarks::None};
+  int local_batch_size = ::args_batch_size;
+
+  axom::CLI::App app {"Axom FlatMap benchmarks"};
+  app.add_option("-s,--custom_sizes", local_test_sizes)
+    ->description("Adds custom map sizes to benchmark (positive numbers only)")
+    ->expected(-1)
+    ->default_val(std::vector<int> {1 << 16})
+    ->each([](const std::string& num_str) {
+      int num = std::stoi(num_str);
+      if(num < 0)
+      {
+        throw axom::CLI::ValidationError("Negative numbers are not allowed");
+      }
+    });
+
+  app
+    .add_flag_callback("--use_cache_related_sizes",
+                       [&local_test_sizes]() {
+                         local_test_sizes.push_back(1 << 3);   // small
+                         local_test_sizes.push_back(1 << 16);  // larger than  32K L1 cache
+                         local_test_sizes.push_back(1 << 19);  // larger than 256K L2 cache
+                         //local_test_sizes.push_back(1 << 25);  // larger than  25M L3 cache
+                       })
+    ->description("Test map sizes related to typical cache sizes");
+
+  app.add_option("--batch_size", local_batch_size)
+    ->description("Batch size for batched insertion benchmarks")
+    ->default_val(local_batch_size)
+    ->check(axom::CLI::PositiveNumber);
+
+  std::vector<std::string> feature_strings;
+  auto feature_opt =
+    app.add_option("-f,--features", feature_strings)
+      ->description(
+        "Features to benchmark (Insertion, Lookup, BatchedInsertion, All); default is 'All'")
+      ->expected(-1)
+      ->each([&local_benchmark_features](const std::string& feature) {
+        static const std::map<std::string, FlatMapFeatureBenchmarks> feature_map = {
+          {"insertion", FlatMapFeatureBenchmarks::Insertion},
+          {"lookup", FlatMapFeatureBenchmarks::Lookup},
+          {"batchedinsertion", FlatMapFeatureBenchmarks::BatchedInsertion},
+          {"all", FlatMapFeatureBenchmarks::All}};
+
+        std::string lower_feature = feature;
+        std::transform(lower_feature.begin(), lower_feature.end(), lower_feature.begin(), ::tolower);
+        auto it = feature_map.find(lower_feature);
+        if(it == feature_map.end())
+        {
+          throw axom::CLI::ValidationError("Invalid feature: " + feature);
+        }
+
+        local_benchmark_features |= it->second;
+      });
+
+  app.allow_extras();  // pass additional args to gbenchmark
+  CLI11_PARSE(app, argc, argv);
+
+  ::benchmark::Initialize(&argc, argv);
+  axom::slic::SimpleLogger logger;
+
+  // process input into global variables
+  {
+    ::args_benchmark_features =
+      feature_opt->count() > 0 ? local_benchmark_features : FlatMapFeatureBenchmarks::All;
+
+    std::sort(local_test_sizes.begin(), local_test_sizes.end());
+    auto last = std::unique(local_test_sizes.begin(), local_test_sizes.end());
+    local_test_sizes.erase(last, local_test_sizes.end());
+    std::swap(::args_benchmark_sizes, local_test_sizes);
+
+    ::args_batch_size = local_batch_size;
+
+    SLIC_INFO("Parsed and processed command line arguments:");
+    SLIC_INFO(axom::fmt::format("- Map sizes: {}", axom::fmt::join(::args_benchmark_sizes, ",")));
+    SLIC_INFO(axom::fmt::format("- Batch size: {}", ::args_batch_size));
+    SLIC_INFO(axom::fmt::format("- Map features to test: {}", ::args_benchmark_features));
+  }
+
+  RegisterBenchmarksFor<axom::FlatMap<KeyType, ValueType>>("axom::FlatMap");
+  RegisterBenchmarksFor<std::unordered_map<KeyType, ValueType>>("std::unordered_map");
+  RegisterBenchmarksFor<std::map<KeyType, ValueType>>("std::map");
+
+#if defined(AXOM_USE_SPARSEHASH)
+  RegisterBenchmarksFor<axom::google::sparse_hash_map<KeyType, ValueType>>(
+    "axom::google::sparse_hash_map");
+#endif
+
+  ::benchmark::RunSpecifiedBenchmarks();
+  return 0;
+}

From 09dc808d31866b2c19561aa61c2501319d0f9b77 Mon Sep 17 00:00:00 2001
From: Kenneth Weiss <weiss27@llnl.gov>
Date: Thu, 15 Jan 2026 20:20:13 -0800
Subject: [PATCH 07/28] Improves performance of FlatMap batched insertion for
 SEQ policy

---
 src/axom/core/FlatMapUtil.hpp | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/src/axom/core/FlatMapUtil.hpp b/src/axom/core/FlatMapUtil.hpp
index 5515607046..d1adb18f90 100644
--- a/src/axom/core/FlatMapUtil.hpp
+++ b/src/axom/core/FlatMapUtil.hpp
@@ -263,6 +263,27 @@ void FlatMap<KeyType, ValueType, Hash>::insert(InputIt kv_begin, InputIt kv_end)
                                 typename std::iterator_traits<InputIt>::iterator_category>::value,
                 "InputIt must be a random-access iterator for batched construction");
 
+  // Fast path for sequential execution:
+  // The batched insertion algorithm below is designed for parallel execution and
+  // uses per-group locks and auxiliary arrays for deduplication. In SEQ, those
+  // structures add significant overhead; a simple sequential loop provides
+  // better performance while preserving the documented semantics that later
+  // duplicates overwrite earlier ones.
+  if constexpr(std::is_same_v<ExecSpace, axom::SEQ_EXEC>)
+  {
+    const IndexType num_elems = std::distance(kv_begin, kv_end);
+
+    // Ensure we have enough capacity up-front to avoid repeated rehashing.
+    this->reserve(this->size() + num_elems);
+
+    for(IndexType idx = 0; idx < num_elems; ++idx)
+    {
+      auto kv = *(kv_begin + idx);
+      this->insert_or_assign(kv.first, kv.second);
+    }
+    return;
+  }
+
   using HashResult = typename Hash::result_type;
   using GroupBucket = detail::flat_map::GroupBucket;
 

From acdf68343bdefa4ffa71410a32d00fd4671fdbfb Mon Sep 17 00:00:00 2001
From: Kenneth Weiss <weiss27@llnl.gov>
Date: Thu, 15 Jan 2026 20:50:35 -0800
Subject: [PATCH 08/28] Adds FlatMap benchmarks for hits and misses of
 precached entities

---
 src/axom/core/FlatMap.hpp                     | 18 +++++
 src/axom/core/detail/FlatTable.hpp            |  4 +-
 .../core/tests/core_benchmark_flatmap.cpp     | 75 +++++++++++++++++++
 3 files changed, 94 insertions(+), 3 deletions(-)

diff --git a/src/axom/core/FlatMap.hpp b/src/axom/core/FlatMap.hpp
index 0a7b1209c4..5f3aca38ae 100644
--- a/src/axom/core/FlatMap.hpp
+++ b/src/axom/core/FlatMap.hpp
@@ -74,6 +74,8 @@ class FlatMap : detail::flat_map::SequentialLookupPolicy<typename Hash::result_t
   using mapped_type = ValueType;
   using size_type = IndexType;
   using value_type = KeyValuePair;
+  using hasher = Hash;
+  using hash_result_type = typename Hash::result_type;
   using iterator = IteratorImpl<false>;
   using const_iterator = IteratorImpl<true>;
 
@@ -289,6 +291,8 @@ class FlatMap : detail::flat_map::SequentialLookupPolicy<typename Hash::result_t
   /// @{
   iterator find(const KeyType& key);
   const_iterator find(const KeyType& key) const;
+  iterator find_with_hash(const KeyType& key, hash_result_type hash);
+  const_iterator find_with_hash(const KeyType& key, hash_result_type hash) const;
   /// @}
 
   /*!
@@ -832,6 +836,13 @@ template <typename KeyType, typename ValueType, typename Hash>
 auto FlatMap<KeyType, ValueType, Hash>::find(const KeyType& key) -> iterator
 {
   auto hash = Hash {}(key);
+  return find_with_hash(key, hash);
+}
+
+template <typename KeyType, typename ValueType, typename Hash>
+auto FlatMap<KeyType, ValueType, Hash>::find_with_hash(const KeyType& key, hash_result_type hash)
+  -> iterator
+{
   iterator found_iter = end();
   this->probeIndex(m_numGroups2, m_metadata, hash, [&](IndexType bucket_index) -> bool {
     if(this->m_buckets[bucket_index].get().first == key)
@@ -849,6 +860,13 @@ template <typename KeyType, typename ValueType, typename Hash>
 auto FlatMap<KeyType, ValueType, Hash>::find(const KeyType& key) const -> const_iterator
 {
   auto hash = Hash {}(key);
+  return find_with_hash(key, hash);
+}
+
+template <typename KeyType, typename ValueType, typename Hash>
+auto FlatMap<KeyType, ValueType, Hash>::find_with_hash(const KeyType& key, hash_result_type hash) const
+  -> const_iterator
+{
   const_iterator found_iter = end();
   this->probeIndex(m_numGroups2, m_metadata, hash, [&](IndexType bucket_index) -> bool {
     if(this->m_buckets[bucket_index].get().first == key)
diff --git a/src/axom/core/detail/FlatTable.hpp b/src/axom/core/detail/FlatTable.hpp
index 66ac4019b7..6dec9f0970 100644
--- a/src/axom/core/detail/FlatTable.hpp
+++ b/src/axom/core/detail/FlatTable.hpp
@@ -348,9 +348,7 @@ struct SequentialLookupPolicy : ProbePolicy
         // Set the overflow bit and continue probing.
         metadata[curr_group].setOverflow(hash_8);
       }
-      // Mask instead of "% metadata.size()": the group count is a power of
-      // two, and the modulo compiled to a 64-bit signed division on the
-      // critical path of every probe continuation.
+      // The group count is a power of two,  so we can use a bitmask (instead of a modulo)
       curr_group = (curr_group + this->getNext(iteration)) & group_mask;
     }
     if(empty_group != NO_MATCH)
diff --git a/src/axom/core/tests/core_benchmark_flatmap.cpp b/src/axom/core/tests/core_benchmark_flatmap.cpp
index 978c47f6d1..21ea611db6 100644
--- a/src/axom/core/tests/core_benchmark_flatmap.cpp
+++ b/src/axom/core/tests/core_benchmark_flatmap.cpp
@@ -289,6 +289,67 @@ void BM_Find_Miss(benchmark::State& state)
   }
 }
 
+void BM_FlatMap_Find_Hit_Prehashed(benchmark::State& state)
+{
+  using MapType = axom::FlatMap<KeyType, ValueType>;
+  using HashResult = typename MapType::hash_result_type;
+
+  const int n = state.range(0);
+  const auto keys = make_shuffled_keys(n, 0xC0FFEEULL);
+  const auto pairs = make_pairs(keys);
+  const MapType map = make_filled_map<MapType>(pairs);
+
+  std::vector<HashResult> hashes;
+  hashes.reserve(keys.size());
+  for(KeyType k : keys)
+  {
+    hashes.push_back(typename MapType::hasher {}(k));
+  }
+
+  for(auto _ : state)
+  {
+    ValueType sum = 0;
+    for(std::size_t i = 0; i < keys.size(); ++i)
+    {
+      auto it = map.find_with_hash(keys[i], hashes[i]);
+      if(it != map.end())
+      {
+        sum += it->second;
+      }
+    }
+    benchmark::DoNotOptimize(sum);
+  }
+}
+
+void BM_FlatMap_Find_Miss_Prehashed(benchmark::State& state)
+{
+  using MapType = axom::FlatMap<KeyType, ValueType>;
+  using HashResult = typename MapType::hash_result_type;
+
+  const int n = state.range(0);
+  const auto keys = make_shuffled_keys(n, 0xC0FFEEULL);
+  const auto pairs = make_pairs(keys);
+  const MapType map = make_filled_map<MapType>(pairs);
+  const auto miss_keys = make_miss_keys(keys, static_cast<KeyType>(n) + 11);
+
+  std::vector<HashResult> hashes;
+  hashes.reserve(miss_keys.size());
+  for(KeyType k : miss_keys)
+  {
+    hashes.push_back(typename MapType::hasher {}(k));
+  }
+
+  for(auto _ : state)
+  {
+    std::int64_t misses = 0;
+    for(std::size_t i = 0; i < miss_keys.size(); ++i)
+    {
+      misses += (map.find_with_hash(miss_keys[i], hashes[i]) == map.end()) ? 1 : 0;
+    }
+    benchmark::DoNotOptimize(misses);
+  }
+}
+
 template <typename MapType>
 void insert_pairs_in_batches(MapType& map,
                              const std::vector<std::pair<KeyType, ValueType>>& pairs,
@@ -367,6 +428,19 @@ void RegisterBenchmarksFor(const std::string& map_name)
   // clang-format on
 }
 
+void RegisterFlatMapPrehashedBenchmarks()
+{
+  auto name = [](const std::string& op) { return axom::fmt::format("axom::FlatMap::{}", op); };
+
+  if((::args_benchmark_features & FlatMapFeatureBenchmarks::Lookup) != FlatMapFeatureBenchmarks::None)
+  {
+    benchmark::RegisterBenchmark(name("find_hit_prehashed"), &BM_FlatMap_Find_Hit_Prehashed)
+      ->Apply(CustomArgs);
+    benchmark::RegisterBenchmark(name("find_miss_prehashed"), &BM_FlatMap_Find_Miss_Prehashed)
+      ->Apply(CustomArgs);
+  }
+}
+
 int main(int argc, char* argv[])
 {
   std::vector<int> local_test_sizes;
@@ -450,6 +524,7 @@ int main(int argc, char* argv[])
   }
 
   RegisterBenchmarksFor<axom::FlatMap<KeyType, ValueType>>("axom::FlatMap");
+  RegisterFlatMapPrehashedBenchmarks();
   RegisterBenchmarksFor<std::unordered_map<KeyType, ValueType>>("std::unordered_map");
   RegisterBenchmarksFor<std::map<KeyType, ValueType>>("std::map");
 

From 862195a78741a5ab162d196283fd694744416883 Mon Sep 17 00:00:00 2001
From: Kenneth Weiss <weiss27@llnl.gov>
Date: Thu, 15 Jan 2026 21:11:11 -0800
Subject: [PATCH 09/28] Exploring faster hash functions

---
 src/axom/core/FlatMap.hpp                     |  6 ++----
 src/axom/core/detail/FlatTable.hpp            | 21 +++++++++++++++++++
 .../core/tests/core_benchmark_flatmap.cpp     |  3 +++
 3 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/src/axom/core/FlatMap.hpp b/src/axom/core/FlatMap.hpp
index 5f3aca38ae..5fbaceed7d 100644
--- a/src/axom/core/FlatMap.hpp
+++ b/src/axom/core/FlatMap.hpp
@@ -835,8 +835,7 @@ FlatMap<KeyType, ValueType, Hash>::FlatMap(IndexType num_elems,
 template <typename KeyType, typename ValueType, typename Hash>
 auto FlatMap<KeyType, ValueType, Hash>::find(const KeyType& key) -> iterator
 {
-  auto hash = Hash {}(key);
-  return find_with_hash(key, hash);
+  return find_with_hash(key, Hash {}(key));
 }
 
 template <typename KeyType, typename ValueType, typename Hash>
@@ -859,8 +858,7 @@ auto FlatMap<KeyType, ValueType, Hash>::find_with_hash(const KeyType& key, hash_
 template <typename KeyType, typename ValueType, typename Hash>
 auto FlatMap<KeyType, ValueType, Hash>::find(const KeyType& key) const -> const_iterator
 {
-  auto hash = Hash {}(key);
-  return find_with_hash(key, hash);
+  return find_with_hash(key, Hash {}(key));
 }
 
 template <typename KeyType, typename ValueType, typename Hash>
diff --git a/src/axom/core/detail/FlatTable.hpp b/src/axom/core/detail/FlatTable.hpp
index 6dec9f0970..5da5d9be3c 100644
--- a/src/axom/core/detail/FlatTable.hpp
+++ b/src/axom/core/detail/FlatTable.hpp
@@ -72,6 +72,27 @@ struct HashMixer64
   }
 };
 
+/*!
+ * \brief A faster (but lower-cost) hash mixer for 64-bit hashing.
+ *
+ * Intended for performance experiments when the cost of hashing dominates
+ * lookup. Uses a single 64-bit multiply followed by an xor-fold.
+ */
+template <typename KeyType, template <typename> class HashFunc>
+struct FastHashMixer64
+{
+  using argument_type = typename HashFunc<KeyType>::argument_type;
+  using result_type = typename HashFunc<KeyType>::result_type;
+
+  AXOM_HOST_DEVICE uint64_t operator()(const KeyType& key) const
+  {
+    uint64_t hash = static_cast<uint64_t>(HashFunc<KeyType> {}(key));
+    hash *= 0x9e3779b97f4a7c15ULL;
+    hash ^= hash >> 32;
+    return hash;
+  }
+};
+
 // We follow the design of boost::unordered_flat_map, which uses a 128-bit chunk
 // of metadata for each group of 15 buckets.
 // This is split up into an "overflow bit", and 15 bytes representing the
diff --git a/src/axom/core/tests/core_benchmark_flatmap.cpp b/src/axom/core/tests/core_benchmark_flatmap.cpp
index 21ea611db6..8164f234c5 100644
--- a/src/axom/core/tests/core_benchmark_flatmap.cpp
+++ b/src/axom/core/tests/core_benchmark_flatmap.cpp
@@ -15,6 +15,7 @@
 
 #include "axom/core/FlatMap.hpp"
 #include "axom/core/FlatMapUtil.hpp"
+#include "axom/core/detail/FlatTable.hpp"
 
 #if defined(AXOM_USE_SPARSEHASH)
   #include "axom/sparsehash/sparse_hash_map"
@@ -525,6 +526,8 @@ int main(int argc, char* argv[])
 
   RegisterBenchmarksFor<axom::FlatMap<KeyType, ValueType>>("axom::FlatMap");
   RegisterFlatMapPrehashedBenchmarks();
+  using FastHash = axom::detail::flat_map::FastHashMixer64<KeyType, axom::DeviceHash>;
+  RegisterBenchmarksFor<axom::FlatMap<KeyType, ValueType, FastHash>>("axom::FlatMapFastHash");
   RegisterBenchmarksFor<std::unordered_map<KeyType, ValueType>>("std::unordered_map");
   RegisterBenchmarksFor<std::map<KeyType, ValueType>>("std::map");
 

From 296934c5cf6049580ff1b693de4a48083204740d Mon Sep 17 00:00:00 2001
From: Kenneth Weiss <weiss27@llnl.gov>
Date: Tue, 10 Mar 2026 15:08:25 -0700
Subject: [PATCH 10/28] Adds benchmark for flatmap load factor

---
 .../core/tests/core_benchmark_flatmap.cpp     | 65 +++++++++++++++++++
 1 file changed, 65 insertions(+)

diff --git a/src/axom/core/tests/core_benchmark_flatmap.cpp b/src/axom/core/tests/core_benchmark_flatmap.cpp
index 8164f234c5..3fe000ec62 100644
--- a/src/axom/core/tests/core_benchmark_flatmap.cpp
+++ b/src/axom/core/tests/core_benchmark_flatmap.cpp
@@ -22,6 +22,7 @@
 #endif
 
 #include <algorithm>
+#include <cmath>
 #include <cstdint>
 #include <limits>
 #include <map>
@@ -217,6 +218,27 @@ MapType make_filled_map(const std::vector<std::pair<KeyType, ValueType>>& pairs)
   return map;
 }
 
+template <typename Key, typename Value, typename Hash>
+axom::FlatMap<Key, Value, Hash> make_filled_flatmap_with_target_load_factor(
+  const std::vector<std::pair<KeyType, ValueType>>& pairs,
+  double target_load_factor)
+{
+  using MapType = axom::FlatMap<Key, Value, Hash>;
+  MapType map;
+
+  const double max_lf = map.max_load_factor();
+  const double lf = std::max(1e-3, std::min(target_load_factor, max_lf));
+  const double n = static_cast<double>(pairs.size());
+
+  // FlatMap's ctor/rehash argument is scaled internally by max_load_factor.
+  // To target load factor `lf` for `n` elements, scale the count accordingly.
+  const axom::IndexType rehash_count = static_cast<axom::IndexType>(std::ceil((n * max_lf) / lf));
+
+  map.rehash(rehash_count);
+  map.insert(pairs.begin(), pairs.end());
+  return map;
+}
+
 template <typename MapType>
 void BM_Insert_StartEmpty(benchmark::State& state)
 {
@@ -290,6 +312,32 @@ void BM_Find_Miss(benchmark::State& state)
   }
 }
 
+template <typename Hash>
+void BM_FlatMap_Find_Hit_TargetLoad(benchmark::State& state, double target_load_factor)
+{
+  using MapType = axom::FlatMap<KeyType, ValueType, Hash>;
+
+  const int n = state.range(0);
+  const auto keys = make_shuffled_keys(n, 0xC0FFEEULL);
+  const auto pairs = make_pairs(keys);
+  const MapType map =
+    make_filled_flatmap_with_target_load_factor<KeyType, ValueType, Hash>(pairs, target_load_factor);
+
+  for(auto _ : state)
+  {
+    ValueType sum = 0;
+    for(KeyType k : keys)
+    {
+      auto it = map.find(k);
+      if(it != map.end())
+      {
+        sum += it->second;
+      }
+    }
+    benchmark::DoNotOptimize(sum);
+  }
+}
+
 void BM_FlatMap_Find_Hit_Prehashed(benchmark::State& state)
 {
   using MapType = axom::FlatMap<KeyType, ValueType>;
@@ -528,6 +576,23 @@ int main(int argc, char* argv[])
   RegisterFlatMapPrehashedBenchmarks();
   using FastHash = axom::detail::flat_map::FastHashMixer64<KeyType, axom::DeviceHash>;
   RegisterBenchmarksFor<axom::FlatMap<KeyType, ValueType, FastHash>>("axom::FlatMapFastHash");
+
+  // Explore the impact of lower load factors on successful lookups.
+  // This trades memory for potentially fewer probes and fewer cache misses.
+  using DefaultHash = axom::FlatMap<KeyType, ValueType>::hasher;
+  benchmark::RegisterBenchmark("axom::FlatMap::find_hit_lf0p50", [](benchmark::State& st) {
+    BM_FlatMap_Find_Hit_TargetLoad<DefaultHash>(st, 0.50);
+  })->Apply(CustomArgs);
+  benchmark::RegisterBenchmark("axom::FlatMap::find_hit_lf0p70", [](benchmark::State& st) {
+    BM_FlatMap_Find_Hit_TargetLoad<DefaultHash>(st, 0.70);
+  })->Apply(CustomArgs);
+  benchmark::RegisterBenchmark("axom::FlatMapFastHash::find_hit_lf0p50", [](benchmark::State& st) {
+    BM_FlatMap_Find_Hit_TargetLoad<FastHash>(st, 0.50);
+  })->Apply(CustomArgs);
+  benchmark::RegisterBenchmark("axom::FlatMapFastHash::find_hit_lf0p70", [](benchmark::State& st) {
+    BM_FlatMap_Find_Hit_TargetLoad<FastHash>(st, 0.70);
+  })->Apply(CustomArgs);
+
   RegisterBenchmarksFor<std::unordered_map<KeyType, ValueType>>("std::unordered_map");
   RegisterBenchmarksFor<std::map<KeyType, ValueType>>("std::map");
 

From 8b6cb5851a8bf8e44fa6532f0103901ccd29557d Mon Sep 17 00:00:00 2001
From: Kenneth Weiss <weiss27@llnl.gov>
Date: Thu, 11 Jun 2026 05:09:42 +0000
Subject: [PATCH 11/28] Benchmark: decouple lookup order from insertion order
 in FlatMap suite

BM_Find_Hit looks keys up in the order they were inserted.
Since node-based maps walk the heap nearly sequentially,
the hardware prefetcher hides their pointer-chasing latency.

This commit adds find_hit_shuffled (same keys, independently shuffled lookup order)
and find_hit_randkeys (distinct pseudorandom 64-bit keys, shuffled lookup order)
to better exhibit expected lookup behavior.
---
 .../core/tests/core_benchmark_flatmap.cpp     | 94 +++++++++++++++++++
 1 file changed, 94 insertions(+)

diff --git a/src/axom/core/tests/core_benchmark_flatmap.cpp b/src/axom/core/tests/core_benchmark_flatmap.cpp
index 3fe000ec62..c87eb86e7c 100644
--- a/src/axom/core/tests/core_benchmark_flatmap.cpp
+++ b/src/axom/core/tests/core_benchmark_flatmap.cpp
@@ -30,6 +30,7 @@
 #include <string>
 #include <type_traits>
 #include <unordered_map>
+#include <unordered_set>
 #include <utility>
 #include <vector>
 
@@ -158,6 +159,46 @@ std::vector<KeyType> make_miss_keys(const std::vector<KeyType>& keys, KeyType of
   return misses;
 }
 
+/*!
+ * \brief Returns a copy of \a keys reshuffled with an independent seed.
+ *
+ *  Looking keys up in the exact order they were inserted is rarely representative, 
+ *  and it systematically favors node-based containers: with libstdc++'s identity hash 
+ *  for integers and densely numbered keys, the i-th lookup touches the i-th allocated node, 
+ *  so the lookup loop streams through the heap nearly sequentially and the hardware prefetcher hides
+ *  most of the pointer-chasing latency. An independently shuffled lookup order removes that correlation.
+ */
+std::vector<KeyType> make_lookup_order(const std::vector<KeyType>& keys, std::uint64_t seed)
+{
+  std::vector<KeyType> lookup = keys;
+  std::mt19937_64 rng(seed);
+  std::shuffle(lookup.begin(), lookup.end(), rng);
+  return lookup;
+}
+
+/*!
+ * \brief Generates \a n distinct pseudorandom 64-bit keys.
+ *
+ *  Dense keys in [0, n) are friendly to identity-style integer hashes and bucket layouts.
+ *  Random keys exercise hashing and probing the way sparse or pointer-derived IDs do.
+ */
+std::vector<KeyType> make_random_unique_keys(int n, std::uint64_t seed)
+{
+  std::mt19937_64 rng(seed);
+  std::unordered_set<KeyType> seen;
+  std::vector<KeyType> keys;
+  keys.reserve(static_cast<std::size_t>(n));
+  while(keys.size() < static_cast<std::size_t>(n))
+  {
+    const KeyType k = static_cast<KeyType>(rng());
+    if(seen.insert(k).second)
+    {
+      keys.push_back(k);
+    }
+  }
+  return keys;
+}
+
 template <typename MapType>
 struct MapFactory
 {
@@ -269,6 +310,9 @@ void BM_Insert_Reserved(benchmark::State& state)
   }
 }
 
+// NOTE: BM_Find_Hit looks keys up in insertion order, which favors
+// node-based maps as described on make_lookup_order() above.
+// Prefer BM_Find_Hit_Shuffled and BM_Find_Hit_RandomKeys when comparing containers.
 template <typename MapType>
 void BM_Find_Hit(benchmark::State& state)
 {
@@ -292,6 +336,54 @@ void BM_Find_Hit(benchmark::State& state)
   }
 }
 
+template <typename MapType>
+void BM_Find_Hit_Shuffled(benchmark::State& state)
+{
+  const int n = state.range(0);
+  const auto keys = make_shuffled_keys(n, 0xC0FFEEULL);
+  const auto pairs = make_pairs(keys);
+  const MapType map = make_filled_map<MapType>(pairs);
+  const auto lookup_keys = make_lookup_order(keys, 0xBADC0DE5ULL);
+
+  for(auto _ : state)
+  {
+    ValueType sum = 0;
+    for(KeyType k : lookup_keys)
+    {
+      auto it = map.find(k);
+      if(it != map.end())
+      {
+        sum += it->second;
+      }
+    }
+    benchmark::DoNotOptimize(sum);
+  }
+}
+
+template <typename MapType>
+void BM_Find_Hit_RandomKeys(benchmark::State& state)
+{
+  const int n = state.range(0);
+  const auto keys = make_random_unique_keys(n, 0xFEEDFACEULL);
+  const auto pairs = make_pairs(keys);
+  const MapType map = make_filled_map<MapType>(pairs);
+  const auto lookup_keys = make_lookup_order(keys, 0xBADC0DE5ULL);
+
+  for(auto _ : state)
+  {
+    ValueType sum = 0;
+    for(KeyType k : lookup_keys)
+    {
+      auto it = map.find(k);
+      if(it != map.end())
+      {
+        sum += it->second;
+      }
+    }
+    benchmark::DoNotOptimize(sum);
+  }
+}
+
 template <typename MapType>
 void BM_Find_Miss(benchmark::State& state)
 {
@@ -467,6 +559,8 @@ void RegisterBenchmarksFor(const std::string& map_name)
   if((::args_benchmark_features & FlatMapFeatureBenchmarks::Lookup) != FlatMapFeatureBenchmarks::None)
   {
     benchmark::RegisterBenchmark(name("find_hit"), &BM_Find_Hit<MapType>)->Apply(CustomArgs);
+    benchmark::RegisterBenchmark(name("find_hit_shuffled"), &BM_Find_Hit_Shuffled<MapType>)->Apply(CustomArgs);
+    benchmark::RegisterBenchmark(name("find_hit_randkeys"), &BM_Find_Hit_RandomKeys<MapType>)->Apply(CustomArgs);
     benchmark::RegisterBenchmark(name("find_miss"), &BM_Find_Miss<MapType>)->Apply(CustomArgs);
   }
 

From 324c90a03e064b5ec8a7f10744eb8befa356c5a3 Mon Sep 17 00:00:00 2001
From: Kenneth Weiss <weiss27@llnl.gov>
Date: Thu, 11 Jun 2026 05:11:25 +0000
Subject: [PATCH 12/28] FlatMap: force-inline the lookup hot path

When find_with_hash() in not inlined, every lookup is more expensive
(extra registers, and a stack spill for the key) and requires loop-invariant setup
that cannot be hoisted out of the caller's lookup loop.

Forcing the probe path inline removed 20-40% of find_hit time and 15-35%
of find_miss time for FlatMap<int64,int64> at n = 2^16 and 2^20.
---
 src/axom/core/FlatMap.hpp          |  9 +++++----
 src/axom/core/detail/FlatTable.hpp | 23 +++++++++++++++++------
 2 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/src/axom/core/FlatMap.hpp b/src/axom/core/FlatMap.hpp
index 5fbaceed7d..875db27c1a 100644
--- a/src/axom/core/FlatMap.hpp
+++ b/src/axom/core/FlatMap.hpp
@@ -289,10 +289,11 @@ class FlatMap : detail::flat_map::SequentialLookupPolicy<typename Hash::result_t
    *  if the key wasn't found.
    */
   /// @{
-  iterator find(const KeyType& key);
-  const_iterator find(const KeyType& key) const;
-  iterator find_with_hash(const KeyType& key, hash_result_type hash);
-  const_iterator find_with_hash(const KeyType& key, hash_result_type hash) const;
+  AXOM_FLATMAP_FORCE_INLINE iterator find(const KeyType& key);
+  AXOM_FLATMAP_FORCE_INLINE const_iterator find(const KeyType& key) const;
+  AXOM_FLATMAP_FORCE_INLINE iterator find_with_hash(const KeyType& key, hash_result_type hash);
+  AXOM_FLATMAP_FORCE_INLINE const_iterator find_with_hash(const KeyType& key,
+                                                          hash_result_type hash) const;
   /// @}
 
   /*!
diff --git a/src/axom/core/detail/FlatTable.hpp b/src/axom/core/detail/FlatTable.hpp
index 5da5d9be3c..846960597f 100644
--- a/src/axom/core/detail/FlatTable.hpp
+++ b/src/axom/core/detail/FlatTable.hpp
@@ -28,6 +28,17 @@
   #include <emmintrin.h>
 #endif
 
+// Force-inline annotation for the FlatMap/FlatTable lookup hot path.
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  #define AXOM_FLATMAP_FORCE_INLINE __forceinline__
+#elif defined(__GNUC__) || defined(__clang__)
+  #define AXOM_FLATMAP_FORCE_INLINE inline __attribute__((always_inline))
+#elif defined(_MSC_VER)
+  #define AXOM_FLATMAP_FORCE_INLINE __forceinline
+#else
+  #define AXOM_FLATMAP_FORCE_INLINE inline
+#endif
+
 namespace axom
 {
 namespace detail
@@ -162,7 +173,7 @@ struct GroupBucket
   }
 
   template <typename Func>
-  AXOM_HOST_DEVICE int visitHashBucket(std::uint8_t hash, Func&& visitor) const
+  AXOM_FLATMAP_FORCE_INLINE AXOM_HOST_DEVICE int visitHashBucket(std::uint8_t hash, Func&& visitor) const
   {
     std::uint8_t reducedHash = reduceHash(hash);
 #if !defined(AXOM_DEVICE_CODE) && defined(_AXOM_CORE_HAVE_SSE2)
@@ -273,7 +284,7 @@ struct GroupBucket
   }
 
   template <bool Atomic = false>
-  AXOM_HOST_DEVICE bool getMaybeOverflowed(std::uint8_t hash) const
+  AXOM_FLATMAP_FORCE_INLINE AXOM_HOST_DEVICE bool getMaybeOverflowed(std::uint8_t hash) const
   {
     std::uint8_t hashOfwBit = 1 << (hash % 8);
     std::uint8_t curr_ofw;
@@ -390,10 +401,10 @@ struct SequentialLookupPolicy : ProbePolicy
    *  matching hash
    */
   template <typename FoundIndex>
-  AXOM_HOST_DEVICE void probeIndex(int ngroups_pow_2,
-                                   ArrayView<const GroupBucket> metadata,
-                                   HashType hash,
-                                   FoundIndex&& on_hash_found) const
+  AXOM_FLATMAP_FORCE_INLINE AXOM_HOST_DEVICE void probeIndex(int ngroups_pow_2,
+                                                             ArrayView<const GroupBucket> metadata,
+                                                             HashType hash,
+                                                             FoundIndex&& on_hash_found) const
   {
     // We use the k MSBs of the hash as the initial group probe point,
     // where ngroups = 2^k. Since the group count is always a power of two,

From 03ac4d99017df403f67ae8268368ecc041a0955c Mon Sep 17 00:00:00 2001
From: Kenneth Weiss <weiss27@llnl.gov>
Date: Thu, 11 Jun 2026 05:13:25 +0000
Subject: [PATCH 13/28] FlatMap: hash once and avoid FP division in
 getEmplacePos

`getEmplacePos()` computed `Hash{}(key)`, then called `find(key)`,
which hashed the same key a second time.
It then performed a floating-point division against MAX_LOAD_FACTOR
on every insertion to decide whether to grow.

Note: This reduced instruction count but the performance improvements
within run-to-run noise in our measurements.
---
 src/axom/core/FlatMap.hpp | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/axom/core/FlatMap.hpp b/src/axom/core/FlatMap.hpp
index 875db27c1a..087eb75810 100644
--- a/src/axom/core/FlatMap.hpp
+++ b/src/axom/core/FlatMap.hpp
@@ -7,6 +7,7 @@
 #ifndef Axom_Core_FlatMap_HPP
 #define Axom_Core_FlatMap_HPP
 
+#include <cstdint>
 #include <tuple>
 #include <type_traits>
 #include <utility>
@@ -897,14 +898,21 @@ auto FlatMap<KeyType, ValueType, Hash>::getEmplacePos(const KeyType& key)
   auto hash = Hash {}(key);
 
   // If the key already exists, return the existing iterator.
-  iterator existing_elem = this->find(key);
+  // Reuse the hash computed above rather than re-hashing inside find().
+  iterator existing_elem = this->find_with_hash(key, hash);
   if(existing_elem != this->end())
   {
     return {existing_elem, false};
   }
   // Resize to double the number of bucket groups if insertion would put us
   // above the maximum load factor.
-  if(((m_loadCount + 1) / (double)bucket_count()) >= MAX_LOAD_FACTOR)
+  // MAX_LOAD_FACTOR is exactly 7/8, so (count + 1) / buckets >= 7/8 is
+  // equivalent to 8 * (count + 1) >= 7 * buckets in exact integer arithmetic.
+  // This avoids a floating-point division on every insertion.
+  static_assert(MAX_LOAD_FACTOR == 0.875,
+                "Integer load-factor check below assumes MAX_LOAD_FACTOR == 7/8.");
+  if(8 * (static_cast<std::uint64_t>(m_loadCount) + 1) >=
+     7 * static_cast<std::uint64_t>(bucket_count()))
   {
     IndexType newNumGroups = m_metadata.size() * 2;
     rehash(newNumGroups * BucketsPerGroup - 1);

From f658482439a76424278a87de7a4dca1f16af4f29 Mon Sep 17 00:00:00 2001
From: Kenneth Weiss <weiss27@llnl.gov>
Date: Thu, 11 Jun 2026 05:14:16 +0000
Subject: [PATCH 14/28] Benchmark: report realized load factor in
 target-load-factor scenarios

FlatMap rounds its group count up to a power of two, so for a fixed
element count the achievable load factors form a geometric ladder and a
nominal target is quantized to the next rung at or below it. At n = 2^16
the 0.70 target and the default reserve(n) geometry coincide (actual load
factor 0.533, which is why find_hit_lf0p70 reproduced find_hit to within
noise), and the 0.50 target lands at 0.267 -- a table twice as large.
That scenario was really measuring a larger working set, not a shorter
probe sequence.
---
 src/axom/core/tests/core_benchmark_flatmap.cpp | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/axom/core/tests/core_benchmark_flatmap.cpp b/src/axom/core/tests/core_benchmark_flatmap.cpp
index c87eb86e7c..a06d9be21a 100644
--- a/src/axom/core/tests/core_benchmark_flatmap.cpp
+++ b/src/axom/core/tests/core_benchmark_flatmap.cpp
@@ -273,6 +273,15 @@ axom::FlatMap<Key, Value, Hash> make_filled_flatmap_with_target_load_factor(
 
   // FlatMap's ctor/rehash argument is scaled internally by max_load_factor.
   // To target load factor `lf` for `n` elements, scale the count accordingly.
+  //
+  // NOTE: FlatMap rounds its group count up to a power of two, so for a
+  // fixed n the achievable load factors form a geometric ladder
+  // (n / (15 * 2^k - 1) for integer k) and the request is quantized to the
+  // next rung at or below the target. At n = 2^16 this means a 0.70 target
+  // and the default reserve(n) geometry coincide at an actual load factor
+  // of 0.533, and a 0.50 target lands at 0.267 (a table twice as large).
+  // The benchmarks below export the realized load factor and bucket count
+  // as counters; compare those, not the nominal targets.
   const axom::IndexType rehash_count = static_cast<axom::IndexType>(std::ceil((n * max_lf) / lf));
 
   map.rehash(rehash_count);
@@ -415,6 +424,11 @@ void BM_FlatMap_Find_Hit_TargetLoad(benchmark::State& state, double target_load_
   const MapType map =
     make_filled_flatmap_with_target_load_factor<KeyType, ValueType, Hash>(pairs, target_load_factor);
 
+  // Export the geometry actually realized after power-of-two rounding so
+  // that runs with different nominal targets can be compared meaningfully.
+  state.counters["load_factor"] = map.load_factor();
+  state.counters["buckets"] = static_cast<double>(map.bucket_count());
+
   for(auto _ : state)
   {
     ValueType sum = 0;

From 1935fe63ebaa6b9c756f80088e146058377ad34b Mon Sep 17 00:00:00 2001
From: Kenneth Weiss <weiss27@llnl.gov>
Date: Thu, 11 Jun 2026 05:15:00 +0000
Subject: [PATCH 15/28] FlatTable: honor visitor early-exit in scalar
 visitHashBucket

The SSE2 path of GroupBucket::visitHashBucket() stops visiting as soon as
the visitor returns false, but the scalar fallback (including GPU path)
ignored the return value and kept scanning all 15 slots.

In-tree visitors and the duplicate check in the batched insert path
return false to mean 'stop', and extra visits load and compare a key
which could incur a cache miss per probe group.
---
 src/axom/core/detail/FlatTable.hpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/axom/core/detail/FlatTable.hpp b/src/axom/core/detail/FlatTable.hpp
index 846960597f..45276cdf8f 100644
--- a/src/axom/core/detail/FlatTable.hpp
+++ b/src/axom/core/detail/FlatTable.hpp
@@ -201,7 +201,11 @@ struct GroupBucket
     {
       if(metadata.buckets[i] == reducedHash)
       {
-        visitor(i);
+        if(!visitor(i))
+        {
+          // Found a match - stop visiting, mirroring the SSE2 path above.
+          break;
+        }
       }
     }
 #endif

From cb5793a21c4bd10331a76d6a9572476f95ff95cf Mon Sep 17 00:00:00 2001
From: Kenneth Weiss <weiss27@llnl.gov>
Date: Thu, 11 Jun 2026 11:16:57 -0700
Subject: [PATCH 16/28] Fixes hip build via missing AXOM_HOST_DEVICE

---
 src/axom/core/tests/core_flatmap.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/axom/core/tests/core_flatmap.hpp b/src/axom/core/tests/core_flatmap.hpp
index 4e3a2f4506..69241c2c2a 100644
--- a/src/axom/core/tests/core_flatmap.hpp
+++ b/src/axom/core/tests/core_flatmap.hpp
@@ -163,7 +163,7 @@ struct DegenerateGroupHash
 {
   using argument_type = int;
   using result_type = std::uint64_t;
-  std::uint64_t operator()(int key) const
+  AXOM_HOST_DEVICE std::uint64_t operator()(int key) const
   {
     return static_cast<std::uint64_t>(static_cast<unsigned>(key) & 0xFF);
   }

From 6691dbbebad57650c76089acbdf287f702f5f9a8 Mon Sep 17 00:00:00 2001
From: Kenneth Weiss <weiss27@llnl.gov>
Date: Thu, 11 Jun 2026 12:06:08 -0700
Subject: [PATCH 17/28] FlatMap: Fuse the find and empty-slot probes in
 getEmplacePos()

Emplacing a new key walked the probe sequence twice -- first to check
for a key and then to find an empty slot within the key. We now do
both within a single call.
---
 src/axom/core/FlatMap.hpp            |  20 +++--
 src/axom/core/detail/FlatTable.hpp   |  83 ++++++++++++++++++++
 src/axom/core/tests/core_flatmap.hpp | 108 +++++++++++++++++++++++++++
 3 files changed, 205 insertions(+), 6 deletions(-)

diff --git a/src/axom/core/FlatMap.hpp b/src/axom/core/FlatMap.hpp
index 087eb75810..c51d246414 100644
--- a/src/axom/core/FlatMap.hpp
+++ b/src/axom/core/FlatMap.hpp
@@ -897,9 +897,18 @@ auto FlatMap<KeyType, ValueType, Hash>::getEmplacePos(const KeyType& key)
 {
   auto hash = Hash {}(key);
 
-  // If the key already exists, return the existing iterator.
-  // Reuse the hash computed above rather than re-hashing inside find().
-  iterator existing_elem = this->find_with_hash(key, hash);
+  // Single fused probe: visit key matches and locate the insertion slot in a single pass
+  iterator existing_elem = this->end();
+  IndexType newBucket =
+    this->probeEmplaceIndex(m_numGroups2, m_metadata, hash, [&](IndexType bucket_index) -> bool {
+      if(this->m_buckets[bucket_index].get().first == key)
+      {
+        existing_elem = iterator(this, bucket_index);
+        return false;
+      }
+      return true;
+    });
+
   if(existing_elem != this->end())
   {
     return {existing_elem, false};
@@ -916,11 +925,10 @@ auto FlatMap<KeyType, ValueType, Hash>::getEmplacePos(const KeyType& key)
   {
     IndexType newNumGroups = m_metadata.size() * 2;
     rehash(newNumGroups * BucketsPerGroup - 1);
+    // The table was rebuilt, so the slot is stale. If we got here, the key is missing
+    newBucket = this->probeEmptyIndex(m_numGroups2, m_metadata, hash);
   }
 
-  // Get an empty index to place the element into.
-  IndexType newBucket = this->probeEmptyIndex(m_numGroups2, m_metadata, hash);
-
   // Add a hash to the corresponding bucket slot.
   this->setBucketHash(m_metadata, newBucket, hash);
   m_size++;
diff --git a/src/axom/core/detail/FlatTable.hpp b/src/axom/core/detail/FlatTable.hpp
index 45276cdf8f..2a9307bd73 100644
--- a/src/axom/core/detail/FlatTable.hpp
+++ b/src/axom/core/detail/FlatTable.hpp
@@ -394,6 +394,89 @@ struct SequentialLookupPolicy : ProbePolicy
     return NO_MATCH;
   }
 
+  /*!
+   * \brief Fused find-or-locate-empty probe for single-key emplacement.
+   *
+   *  Walks the probe sequence once, simultaneously visiting key matches and
+   *  tracking the first empty slot. Overflow bits are maintained in the 
+   *  same way as probeEmptyIndex(): a full group that hasn't yielded an insertion 
+   *  slot is marked overflowed for this hash before moving on.
+   *
+   * \param [in] ngroups_pow_2 the number of groups, expressed as a power of 2
+   * \param [in] metadata the array of metadata for the groups in the hash map
+   * \param [in] hash the hash to search for and, if absent, insert
+   * \param [in] on_hash_found functor called for each matching bucket slot;
+   *  returns false to stop the probe (existing key found)
+   *
+   * \return the bucket index to insert into, or NO_MATCH if the visitor stopped the probe
+   */
+  template <typename FoundIndex>
+  IndexType probeEmplaceIndex(int ngroups_pow_2,
+                              ArrayView<GroupBucket> metadata,
+                              HashType hash,
+                              FoundIndex&& on_hash_found) const
+  {
+    const int bitshift_right = ((CHAR_BIT * sizeof(HashType)) - ngroups_pow_2);
+    const HashType group_mask = (HashType {1} << ngroups_pow_2) - 1;
+    HashType curr_group = (hash >> bitshift_right) & group_mask;
+    int empty_group = NO_MATCH;
+    int empty_bucket = NO_MATCH;
+
+    std::uint8_t hash_8 = static_cast<std::uint8_t>(hash);
+    bool may_exist = true;
+    for(int iteration = 0; iteration < metadata.size(); ++iteration)
+    {
+      if(may_exist)
+      {
+        // The key may be in the current group, so scan for matches just as probeIndex() does
+        bool keep_going = true;
+        metadata[curr_group].visitHashBucket(hash_8, [&](IndexType bucket_index) -> bool {
+          keep_going = on_hash_found(curr_group * GroupBucket::Size + bucket_index);
+          return keep_going;
+        });
+        if(!keep_going)
+        {
+          // Visitor stopped the probe: the key already exists
+          return NO_MATCH;
+        }
+      }
+
+      if(empty_group == NO_MATCH)
+      {
+        int tentative_empty_bucket = metadata[curr_group].getEmptyBucket();
+        if(tentative_empty_bucket != GroupBucket::InvalidSlot)
+        {
+          empty_group = curr_group;
+          empty_bucket = tentative_empty_bucket;
+        }
+      }
+
+      if(!metadata[curr_group].getMaybeOverflowed(hash_8))
+      {
+        // The key cannot exist past this group
+        may_exist = false;
+        if(empty_group != NO_MATCH)
+        {
+          break;
+        }
+        // Full group at the end of the trail, mark as overflowed and keep looking for an empty slot
+        metadata[curr_group].setOverflow(hash_8);
+      }
+      else if(empty_group == NO_MATCH)
+      {
+        // Full group inside the trail
+        metadata[curr_group].setOverflow(hash_8);
+      }
+      // The group count is a power of two, so we can use a bitmask (instead of a modulo)
+      curr_group = (curr_group + this->getNext(iteration)) & group_mask;
+    }
+    if(empty_group != NO_MATCH)
+    {
+      return empty_group * GroupBucket::Size + empty_bucket;
+    }
+    return NO_MATCH;
+  }
+
   /*!
    * \brief Finds the next potential bucket index for a given hash in a group
    *  array for an open-addressing hash map.
diff --git a/src/axom/core/tests/core_flatmap.hpp b/src/axom/core/tests/core_flatmap.hpp
index 69241c2c2a..1d134550c4 100644
--- a/src/axom/core/tests/core_flatmap.hpp
+++ b/src/axom/core/tests/core_flatmap.hpp
@@ -228,6 +228,114 @@ TEST(core_flatmap_unit, cross_group_probe_chains)
   }
 }
 
+// Hash functor that maps every key to the same hash value
+// This forces long probe chains and stresses fused "find + empty-slot" probing
+struct ConstantHash64
+{
+  using argument_type = int;
+  using result_type = std::uint64_t;
+
+  AXOM_HOST_DEVICE std::uint64_t operator()(int) const { return std::uint64_t {0}; }
+};
+
+TEST(core_flatmap_unit, fused_emplace_probe_no_duplicate_across_tombstone)
+{
+  using MapType = axom::FlatMap<int, int, ConstantHash64>;
+  MapType test_map;
+
+  const int NUM_ELEMS = 40;
+  for(int i = 0; i < NUM_ELEMS; i++)
+  {
+    test_map.insert_or_assign(i, i * 10);
+  }
+  ASSERT_EQ(test_map.size(), NUM_ELEMS);
+
+  // Create a tombstone early in the probe trail
+  EXPECT_EQ(test_map.erase(1), 1);
+  ASSERT_EQ(test_map.size(), NUM_ELEMS - 1);
+
+  // Update a key that should live beyond the first group. The fused emplace probe
+  // must keep probing past the earlier empty slot and find the existing key.
+  const int existing_key = NUM_ELEMS - 1;
+  auto result = test_map.insert_or_assign(existing_key, 12345);
+  EXPECT_FALSE(result.second);
+  EXPECT_EQ(test_map.size(), NUM_ELEMS - 1);
+  EXPECT_EQ(test_map.at(existing_key), 12345);
+
+  // Verify we did not accidentally insert a duplicate key
+  // (which would be possible if the probe stopped at the tombstone)
+  int occurrences = 0;
+  for(const auto& kv : test_map)
+  {
+    occurrences += (kv.first == existing_key) ? 1 : 0;
+  }
+  EXPECT_EQ(occurrences, 1);
+}
+
+TEST(core_flatmap_unit, fused_emplace_probe_try_emplace_respects_existing_after_tombstone)
+{
+  using MapType = axom::FlatMap<int, int, ConstantHash64>;
+  MapType test_map;
+
+  const int NUM_ELEMS = 40;
+  for(int i = 0; i < NUM_ELEMS; i++)
+  {
+    test_map.insert_or_assign(i, i * 10);
+  }
+  ASSERT_EQ(test_map.size(), NUM_ELEMS);
+
+  // Create a tombstone early in the probe trail
+  EXPECT_EQ(test_map.erase(2), 1);
+  ASSERT_EQ(test_map.size(), NUM_ELEMS - 1);
+
+  const int existing_key = NUM_ELEMS - 2;
+  test_map.insert_or_assign(existing_key, 777);
+  ASSERT_EQ(test_map.at(existing_key), 777);
+
+  // try_emplace must not insert or overwrite when the key exists
+  auto emplace_res = test_map.try_emplace(existing_key, 999);
+  EXPECT_FALSE(emplace_res.second);
+  EXPECT_EQ(emplace_res.first->second, 777);
+
+  int occurrences = 0;
+  for(const auto& kv : test_map)
+  {
+    occurrences += (kv.first == existing_key) ? 1 : 0;
+  }
+  EXPECT_EQ(occurrences, 1);
+}
+
+TEST(core_flatmap_unit, fused_emplace_probe_recomputes_slot_after_rehash)
+{
+  using MapType = axom::FlatMap<int, int, ConstantHash64>;
+  MapType test_map;
+
+  const int init_buckets = test_map.bucket_count();
+  const int size_no_rehash = static_cast<int>(test_map.max_load_factor() * init_buckets);
+
+  // Fill right up to the no-rehash threshold.
+  for(int i = 0; i < size_no_rehash; i++)
+  {
+    test_map.insert_or_assign(i, i);
+  }
+  ASSERT_EQ(test_map.bucket_count(), init_buckets);
+  ASSERT_EQ(test_map.size(), size_no_rehash);
+
+  // Create a mid-sequence tombstone. With ConstantHash64 and a full trail, this should
+  // preserve loadCount, so the next insertion triggers a rehash even though an empty slot exists.
+  EXPECT_EQ(test_map.erase(0), 1);
+  ASSERT_EQ(test_map.bucket_count(), init_buckets);
+  ASSERT_EQ(test_map.size(), size_no_rehash - 1);
+
+  const int buckets_before = test_map.bucket_count();
+  const int new_key = 100000;
+  test_map.insert_or_assign(new_key, 42);
+
+  EXPECT_GT(test_map.bucket_count(), buckets_before);
+  EXPECT_EQ(test_map.at(new_key), 42);
+  EXPECT_EQ(test_map.count(0), 0);
+}
+
 AXOM_TYPED_TEST(core_flatmap, default_init)
 {
   using MapType = typename TestFixture::MapType;

From 6d8a86f394d461704eebd4d488c50687e2d058e3 Mon Sep 17 00:00:00 2001
From: Kenneth Weiss <weiss27@llnl.gov>
Date: Thu, 11 Jun 2026 13:32:25 -0700
Subject: [PATCH 18/28] FlatMap: Keep move semantics during batch insertion

---
 src/axom/core/FlatMapUtil.hpp        |  8 +++--
 src/axom/core/tests/core_flatmap.hpp | 48 ++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/src/axom/core/FlatMapUtil.hpp b/src/axom/core/FlatMapUtil.hpp
index d1adb18f90..4eb1c41a05 100644
--- a/src/axom/core/FlatMapUtil.hpp
+++ b/src/axom/core/FlatMapUtil.hpp
@@ -278,8 +278,12 @@ void FlatMap<KeyType, ValueType, Hash>::insert(InputIt kv_begin, InputIt kv_end)
 
     for(IndexType idx = 0; idx < num_elems; ++idx)
     {
-      auto kv = *(kv_begin + idx);
-      this->insert_or_assign(kv.first, kv.second);
+      // Preserve the value category of the input pair. In particular, when
+      // kv_begin/kv_end are move iterators, we must forward the mapped value
+      // so move-only types remain supported.
+      decltype(auto) kv = *(kv_begin + idx);
+      this->insert_or_assign(std::forward<decltype(kv)>(kv).first,
+                             std::forward<decltype(kv)>(kv).second);
     }
     return;
   }
diff --git a/src/axom/core/tests/core_flatmap.hpp b/src/axom/core/tests/core_flatmap.hpp
index 1d134550c4..021bcd741d 100644
--- a/src/axom/core/tests/core_flatmap.hpp
+++ b/src/axom/core/tests/core_flatmap.hpp
@@ -13,6 +13,9 @@
 // gtest includes
 #include "gtest/gtest.h"
 
+// C++ includes
+#include <memory>
+
 // Unit test for QuadraticProbing
 TEST(core_flatmap_unit, quadratic_probing)
 {
@@ -656,6 +659,51 @@ TEST(core_flatmap_moveonly, init_and_move_moveonly)
   }
 }
 
+TEST(core_flatmap_moveonly, insert_batched_seq_move_iterators)
+{
+  using MapType = axom::FlatMap<int, std::unique_ptr<double>>;
+  MapType test_map;
+
+  using PairType = std::pair<int, std::unique_ptr<double>>;
+  std::vector<PairType> pairs;
+
+  const int NUM_ELEMS = 64;
+  pairs.reserve(NUM_ELEMS + 1);
+  for(int i = 0; i < NUM_ELEMS; i++)
+  {
+    pairs.emplace_back(i, std::make_unique<double>(i + 1.0));
+  }
+
+  // Include a duplicate so "later duplicates overwrite earlier ones" is exercised,
+  // while also ensuring the value is moved in all cases.
+  pairs.emplace_back(NUM_ELEMS / 2, std::make_unique<double>(123.0));
+
+  test_map.template insert<axom::SEQ_EXEC>(std::make_move_iterator(pairs.begin()),
+                                           std::make_move_iterator(pairs.end()));
+
+  EXPECT_EQ(test_map.size(), NUM_ELEMS);
+  for(int i = 0; i < NUM_ELEMS; i++)
+  {
+    ASSERT_EQ(test_map.count(i), 1);
+    auto& ptr = test_map.at(i);
+    ASSERT_NE(ptr.get(), nullptr);
+    if(i == NUM_ELEMS / 2)
+    {
+      EXPECT_EQ(*ptr, 123.0);
+    }
+    else
+    {
+      EXPECT_EQ(*ptr, i + 1.0);
+    }
+  }
+
+  // All source values should have been moved-from.
+  for(const auto& kv : pairs)
+  {
+    EXPECT_EQ(kv.second.get(), nullptr);
+  }
+}
+
 AXOM_TYPED_TEST(core_flatmap, init_and_copy)
 {
   using MapType = typename TestFixture::MapType;

From f083e50a927eb417235f0f5cee90ef3d16f19e39 Mon Sep 17 00:00:00 2001
From: Kenneth Weiss <weiss27@llnl.gov>
Date: Thu, 11 Jun 2026 13:47:37 -0700
Subject: [PATCH 19/28] Improves FlatMap benchmark

* Disables sequential find_hit search by default since it is not representative.
* Guards several tests by the feature they are testing
---
 .../core/tests/core_benchmark_flatmap.cpp     | 51 ++++++++++++-------
 1 file changed, 33 insertions(+), 18 deletions(-)

diff --git a/src/axom/core/tests/core_benchmark_flatmap.cpp b/src/axom/core/tests/core_benchmark_flatmap.cpp
index a06d9be21a..2f013e35df 100644
--- a/src/axom/core/tests/core_benchmark_flatmap.cpp
+++ b/src/axom/core/tests/core_benchmark_flatmap.cpp
@@ -72,6 +72,7 @@ inline FlatMapFeatureBenchmarks operator&(FlatMapFeatureBenchmarks lhs, FlatMapF
 std::vector<int> args_benchmark_sizes;
 FlatMapFeatureBenchmarks args_benchmark_features {FlatMapFeatureBenchmarks::None};
 int args_batch_size = 1 << 10;
+bool args_include_insertion_order_lookup = false;
 }  // namespace
 
 template <>
@@ -162,9 +163,9 @@ std::vector<KeyType> make_miss_keys(const std::vector<KeyType>& keys, KeyType of
 /*!
  * \brief Returns a copy of \a keys reshuffled with an independent seed.
  *
- *  Looking keys up in the exact order they were inserted is rarely representative, 
- *  and it systematically favors node-based containers: with libstdc++'s identity hash 
- *  for integers and densely numbered keys, the i-th lookup touches the i-th allocated node, 
+ *  Looking keys up in the exact order they were inserted is rarely representative,
+ *  and it systematically favors node-based containers: with libstdc++'s identity hash
+ *  for integers and densely numbered keys, the i-th lookup touches the i-th allocated node,
  *  so the lookup loop streams through the heap nearly sequentially and the hardware prefetcher hides
  *  most of the pointer-chasing latency. An independently shuffled lookup order removes that correlation.
  */
@@ -423,6 +424,7 @@ void BM_FlatMap_Find_Hit_TargetLoad(benchmark::State& state, double target_load_
   const auto pairs = make_pairs(keys);
   const MapType map =
     make_filled_flatmap_with_target_load_factor<KeyType, ValueType, Hash>(pairs, target_load_factor);
+  const auto lookup_keys = make_lookup_order(keys, 0xF00DBA11ULL);
 
   // Export the geometry actually realized after power-of-two rounding so
   // that runs with different nominal targets can be compared meaningfully.
@@ -432,7 +434,7 @@ void BM_FlatMap_Find_Hit_TargetLoad(benchmark::State& state, double target_load_
   for(auto _ : state)
   {
     ValueType sum = 0;
-    for(KeyType k : keys)
+    for(KeyType k : lookup_keys)
     {
       auto it = map.find(k);
       if(it != map.end())
@@ -572,7 +574,10 @@ void RegisterBenchmarksFor(const std::string& map_name)
 
   if((::args_benchmark_features & FlatMapFeatureBenchmarks::Lookup) != FlatMapFeatureBenchmarks::None)
   {
-    benchmark::RegisterBenchmark(name("find_hit"), &BM_Find_Hit<MapType>)->Apply(CustomArgs);
+    if(::args_include_insertion_order_lookup)
+    {
+      benchmark::RegisterBenchmark(name("find_hit"), &BM_Find_Hit<MapType>)->Apply(CustomArgs);
+    }
     benchmark::RegisterBenchmark(name("find_hit_shuffled"), &BM_Find_Hit_Shuffled<MapType>)->Apply(CustomArgs);
     benchmark::RegisterBenchmark(name("find_hit_randkeys"), &BM_Find_Hit_RandomKeys<MapType>)->Apply(CustomArgs);
     benchmark::RegisterBenchmark(name("find_miss"), &BM_Find_Miss<MapType>)->Apply(CustomArgs);
@@ -603,6 +608,7 @@ int main(int argc, char* argv[])
   std::vector<int> local_test_sizes;
   FlatMapFeatureBenchmarks local_benchmark_features {FlatMapFeatureBenchmarks::None};
   int local_batch_size = ::args_batch_size;
+  bool local_include_insertion_order_lookup = ::args_include_insertion_order_lookup;
 
   axom::CLI::App app {"Axom FlatMap benchmarks"};
   app.add_option("-s,--custom_sizes", local_test_sizes)
@@ -632,6 +638,9 @@ int main(int argc, char* argv[])
     ->default_val(local_batch_size)
     ->check(axom::CLI::PositiveNumber);
 
+  app.add_flag("--include_insertion_order_lookup", local_include_insertion_order_lookup)
+    ->description("Includes insertion-order lookup benchmark (biased; for diagnosis)");
+
   std::vector<std::string> feature_strings;
   auto feature_opt =
     app.add_option("-f,--features", feature_strings)
@@ -673,11 +682,14 @@ int main(int argc, char* argv[])
     std::swap(::args_benchmark_sizes, local_test_sizes);
 
     ::args_batch_size = local_batch_size;
+    ::args_include_insertion_order_lookup = local_include_insertion_order_lookup;
 
     SLIC_INFO("Parsed and processed command line arguments:");
     SLIC_INFO(axom::fmt::format("- Map sizes: {}", axom::fmt::join(::args_benchmark_sizes, ",")));
     SLIC_INFO(axom::fmt::format("- Batch size: {}", ::args_batch_size));
     SLIC_INFO(axom::fmt::format("- Map features to test: {}", ::args_benchmark_features));
+    SLIC_INFO(axom::fmt::format("- Include insertion-order lookup: {}",
+                                ::args_include_insertion_order_lookup ? "true" : "false"));
   }
 
   RegisterBenchmarksFor<axom::FlatMap<KeyType, ValueType>>("axom::FlatMap");
@@ -687,19 +699,22 @@ int main(int argc, char* argv[])
 
   // Explore the impact of lower load factors on successful lookups.
   // This trades memory for potentially fewer probes and fewer cache misses.
-  using DefaultHash = axom::FlatMap<KeyType, ValueType>::hasher;
-  benchmark::RegisterBenchmark("axom::FlatMap::find_hit_lf0p50", [](benchmark::State& st) {
-    BM_FlatMap_Find_Hit_TargetLoad<DefaultHash>(st, 0.50);
-  })->Apply(CustomArgs);
-  benchmark::RegisterBenchmark("axom::FlatMap::find_hit_lf0p70", [](benchmark::State& st) {
-    BM_FlatMap_Find_Hit_TargetLoad<DefaultHash>(st, 0.70);
-  })->Apply(CustomArgs);
-  benchmark::RegisterBenchmark("axom::FlatMapFastHash::find_hit_lf0p50", [](benchmark::State& st) {
-    BM_FlatMap_Find_Hit_TargetLoad<FastHash>(st, 0.50);
-  })->Apply(CustomArgs);
-  benchmark::RegisterBenchmark("axom::FlatMapFastHash::find_hit_lf0p70", [](benchmark::State& st) {
-    BM_FlatMap_Find_Hit_TargetLoad<FastHash>(st, 0.70);
-  })->Apply(CustomArgs);
+  if((::args_benchmark_features & FlatMapFeatureBenchmarks::Lookup) != FlatMapFeatureBenchmarks::None)
+  {
+    using DefaultHash = axom::FlatMap<KeyType, ValueType>::hasher;
+    benchmark::RegisterBenchmark("axom::FlatMap::find_hit_lf0p50", [](benchmark::State& st) {
+      BM_FlatMap_Find_Hit_TargetLoad<DefaultHash>(st, 0.50);
+    })->Apply(CustomArgs);
+    benchmark::RegisterBenchmark("axom::FlatMap::find_hit_lf0p70", [](benchmark::State& st) {
+      BM_FlatMap_Find_Hit_TargetLoad<DefaultHash>(st, 0.70);
+    })->Apply(CustomArgs);
+    benchmark::RegisterBenchmark("axom::FlatMapFastHash::find_hit_lf0p50", [](benchmark::State& st) {
+      BM_FlatMap_Find_Hit_TargetLoad<FastHash>(st, 0.50);
+    })->Apply(CustomArgs);
+    benchmark::RegisterBenchmark("axom::FlatMapFastHash::find_hit_lf0p70", [](benchmark::State& st) {
+      BM_FlatMap_Find_Hit_TargetLoad<FastHash>(st, 0.70);
+    })->Apply(CustomArgs);
+  }
 
   RegisterBenchmarksFor<std::unordered_map<KeyType, ValueType>>("std::unordered_map");
   RegisterBenchmarksFor<std::map<KeyType, ValueType>>("std::map");

From e10cd0260e78bb3196d5a525c8465ad3f1da5dd0 Mon Sep 17 00:00:00 2001
From: Kenneth Weiss <weiss27@llnl.gov>
Date: Thu, 11 Jun 2026 14:00:47 -0700
Subject: [PATCH 20/28] FlatMap: Device hash type must be 64 bits

Also adds more device hashing tests
---
 src/axom/core/tests/core_device_hash.hpp | 131 ++++++++++++++++++++---
 1 file changed, 116 insertions(+), 15 deletions(-)

diff --git a/src/axom/core/tests/core_device_hash.hpp b/src/axom/core/tests/core_device_hash.hpp
index 58173725e1..6cab7708b2 100644
--- a/src/axom/core/tests/core_device_hash.hpp
+++ b/src/axom/core/tests/core_device_hash.hpp
@@ -12,6 +12,7 @@
 #include "gtest/gtest.h"
 
 // C++ includes
+#include <cstdint>
 #include <set>
 
 template <typename TheExecSpace>
@@ -37,6 +38,7 @@ AXOM_TYPED_TEST(core_device_hash, hash_int)
   using ExecSpace = typename TestFixture::ExecSpace;
 
   axom::DeviceHash<int> device_hasher;
+  using HashResult = typename decltype(device_hasher)::result_type;
 
   constexpr int NUM_HASHES = 4;
 
@@ -44,7 +46,7 @@ AXOM_TYPED_TEST(core_device_hash, hash_int)
 
   // Allocate space for hash results.
   int allocatorID = axom::execution_space<ExecSpace>::allocatorID();
-  axom::IndexType *computed_hashes = axom::allocate<axom::IndexType>(NUM_HASHES, allocatorID);
+  HashResult* computed_hashes = axom::allocate<HashResult>(NUM_HASHES, allocatorID);
 
   // Compute hashes.
   axom::for_all<ExecSpace>(
@@ -52,8 +54,8 @@ AXOM_TYPED_TEST(core_device_hash, hash_int)
     AXOM_LAMBDA(int i) { computed_hashes[i] = device_hasher(things_to_hash[i]); });
 
   // Copy back to host.
-  axom::IndexType computed_hashes_host[NUM_HASHES];
-  axom::copy(computed_hashes_host, computed_hashes, sizeof(axom::IndexType) * NUM_HASHES);
+  HashResult computed_hashes_host[NUM_HASHES];
+  axom::copy(computed_hashes_host, computed_hashes, sizeof(HashResult) * NUM_HASHES);
   axom::deallocate(computed_hashes);
 
   for(int i = 0; i < NUM_HASHES; i++)
@@ -74,6 +76,7 @@ AXOM_TYPED_TEST(core_device_hash, hash_float)
   using ExecSpace = typename TestFixture::ExecSpace;
 
   axom::DeviceHash<float> device_hasher;
+  using HashResult = typename decltype(device_hasher)::result_type;
 
   constexpr int NUM_HASHES = 4;
 
@@ -81,7 +84,7 @@ AXOM_TYPED_TEST(core_device_hash, hash_float)
 
   // Allocate space for hash results.
   int allocatorID = axom::execution_space<ExecSpace>::allocatorID();
-  axom::IndexType *computed_hashes = axom::allocate<axom::IndexType>(NUM_HASHES, allocatorID);
+  HashResult* computed_hashes = axom::allocate<HashResult>(NUM_HASHES, allocatorID);
 
   // Compute hashes.
   axom::for_all<ExecSpace>(
@@ -89,8 +92,8 @@ AXOM_TYPED_TEST(core_device_hash, hash_float)
     AXOM_LAMBDA(int i) { computed_hashes[i] = device_hasher(things_to_hash[i]); });
 
   // Copy back to host.
-  axom::IndexType computed_hashes_host[NUM_HASHES];
-  axom::copy(computed_hashes_host, computed_hashes, sizeof(axom::IndexType) * NUM_HASHES);
+  HashResult computed_hashes_host[NUM_HASHES];
+  axom::copy(computed_hashes_host, computed_hashes, sizeof(HashResult) * NUM_HASHES);
   axom::deallocate(computed_hashes);
 
   for(int i = 0; i < NUM_HASHES; i++)
@@ -112,12 +115,13 @@ AXOM_TYPED_TEST(core_device_hash, hash_float)
 TEST(core_device_hash, hash_string)
 {
   axom::DeviceHash<std::string> device_hasher;
+  using HashResult = typename decltype(device_hasher)::result_type;
 
   constexpr int NUM_HASHES = 4;
 
   std::string things_to_hash[NUM_HASHES] {"0", "1", "37", "1100"};
 
-  axom::IndexType computed_hashes[NUM_HASHES];
+  HashResult computed_hashes[NUM_HASHES];
 
   // Compute hashes.
   for(int i = 0; i < NUM_HASHES; i++)
@@ -151,6 +155,7 @@ AXOM_TYPED_TEST(core_device_hash, hash_enum)
   using ExecSpace = typename TestFixture::ExecSpace;
 
   axom::DeviceHash<TestEnumHash> device_hasher;
+  using HashResult = typename decltype(device_hasher)::result_type;
 
   constexpr int NUM_HASHES = 4;
 
@@ -161,7 +166,7 @@ AXOM_TYPED_TEST(core_device_hash, hash_enum)
 
   // Allocate space for hash results.
   int allocatorID = axom::execution_space<ExecSpace>::allocatorID();
-  axom::IndexType *computed_hashes = axom::allocate<axom::IndexType>(NUM_HASHES, allocatorID);
+  HashResult* computed_hashes = axom::allocate<HashResult>(NUM_HASHES, allocatorID);
 
   // Compute hashes.
   axom::for_all<ExecSpace>(
@@ -169,8 +174,8 @@ AXOM_TYPED_TEST(core_device_hash, hash_enum)
     AXOM_LAMBDA(int i) { computed_hashes[i] = device_hasher(things_to_hash[i]); });
 
   // Copy back to host.
-  axom::IndexType computed_hashes_host[NUM_HASHES];
-  axom::copy(computed_hashes_host, computed_hashes, sizeof(axom::IndexType) * NUM_HASHES);
+  HashResult computed_hashes_host[NUM_HASHES];
+  axom::copy(computed_hashes_host, computed_hashes, sizeof(HashResult) * NUM_HASHES);
   axom::deallocate(computed_hashes);
 
   for(int i = 0; i < NUM_HASHES; i++)
@@ -210,7 +215,7 @@ struct DeviceHash<axom_testing::UserVector<T>>
     constexpr int NWORDS = sizeof(axom_testing::UserVector<T>) / sizeof(int);
     alignas(axom_testing::UserVector<T>) int bytes[NWORDS];
     // NOTE: Separating these statements fixes a warning about strict-aliasing.
-    auto ptr = reinterpret_cast<axom_testing::UserVector<T> *>(bytes);
+    auto ptr = reinterpret_cast<axom_testing::UserVector<T>*>(bytes);
     *ptr = value;
 
     axom::IndexType hash_result {};
@@ -228,6 +233,7 @@ AXOM_TYPED_TEST(core_device_hash, hash_user_defined)
   using ExecSpace = typename TestFixture::ExecSpace;
 
   axom::DeviceHash<axom_testing::UserVector<float>> device_hasher;
+  using HashResult = typename decltype(device_hasher)::result_type;
 
   constexpr int NUM_HASHES = 4;
 
@@ -238,7 +244,7 @@ AXOM_TYPED_TEST(core_device_hash, hash_user_defined)
 
   // Allocate space for hash results.
   int allocatorID = axom::execution_space<ExecSpace>::allocatorID();
-  axom::IndexType *computed_hashes = axom::allocate<axom::IndexType>(NUM_HASHES, allocatorID);
+  HashResult* computed_hashes = axom::allocate<HashResult>(NUM_HASHES, allocatorID);
 
   // Compute hashes.
   axom::for_all<ExecSpace>(
@@ -246,8 +252,8 @@ AXOM_TYPED_TEST(core_device_hash, hash_user_defined)
     AXOM_LAMBDA(int i) { computed_hashes[i] = device_hasher(things_to_hash[i]); });
 
   // Copy back to host.
-  axom::IndexType computed_hashes_host[NUM_HASHES];
-  axom::copy(computed_hashes_host, computed_hashes, sizeof(axom::IndexType) * NUM_HASHES);
+  HashResult computed_hashes_host[NUM_HASHES];
+  axom::copy(computed_hashes_host, computed_hashes, sizeof(HashResult) * NUM_HASHES);
   axom::deallocate(computed_hashes);
 
   for(int i = 0; i < NUM_HASHES; i++)
@@ -263,6 +269,101 @@ AXOM_TYPED_TEST(core_device_hash, hash_user_defined)
   }
 }
 
+AXOM_TYPED_TEST(core_device_hash, hash_uint64_distinguishes_high_bits)
+{
+  using ExecSpace = typename TestFixture::ExecSpace;
+
+  axom::DeviceHash<std::uint64_t> device_hasher;
+  using HashResult = typename decltype(device_hasher)::result_type;
+
+  constexpr int NUM_HASHES = 3;
+  std::uint64_t things_to_hash[NUM_HASHES] = {std::uint64_t {1},
+                                              std::uint64_t {1} + (std::uint64_t {1} << 32),
+                                              std::uint64_t {1} + (std::uint64_t {1} << 33)};
+
+  int allocatorID = axom::execution_space<ExecSpace>::allocatorID();
+  HashResult* computed_hashes = axom::allocate<HashResult>(NUM_HASHES, allocatorID);
+
+  axom::for_all<ExecSpace>(
+    NUM_HASHES,
+    AXOM_LAMBDA(int i) { computed_hashes[i] = device_hasher(things_to_hash[i]); });
+
+  HashResult computed_hashes_host[NUM_HASHES];
+  axom::copy(computed_hashes_host, computed_hashes, sizeof(HashResult) * NUM_HASHES);
+  axom::deallocate(computed_hashes);
+
+  EXPECT_NE(computed_hashes_host[0], computed_hashes_host[1]);
+  EXPECT_NE(computed_hashes_host[0], computed_hashes_host[2]);
+  EXPECT_NE(computed_hashes_host[1], computed_hashes_host[2]);
+}
+
+AXOM_TYPED_TEST(core_device_hash, hash_fractional_float_device)
+{
+  using ExecSpace = typename TestFixture::ExecSpace;
+
+  axom::DeviceHash<float> device_hasher;
+  using HashResult = typename decltype(device_hasher)::result_type;
+
+  constexpr int NUM_HASHES = 8;
+  float things_to_hash[NUM_HASHES] = {0.25f, 0.75f, -0.5f, 0.5f, 0.125f, 0.625f, 0.875f, 1.25f};
+
+  int allocatorID = axom::execution_space<ExecSpace>::allocatorID();
+  HashResult* computed_hashes = axom::allocate<HashResult>(NUM_HASHES, allocatorID);
+
+  axom::for_all<ExecSpace>(
+    NUM_HASHES,
+    AXOM_LAMBDA(int i) { computed_hashes[i] = device_hasher(things_to_hash[i]); });
+
+  HashResult computed_hashes_host[NUM_HASHES];
+  axom::copy(computed_hashes_host, computed_hashes, sizeof(HashResult) * NUM_HASHES);
+  axom::deallocate(computed_hashes);
+
+  // Idempotence and pairwise distinctness for these chosen values.
+  for(int i = 0; i < NUM_HASHES; i++)
+  {
+    EXPECT_EQ(computed_hashes_host[i], device_hasher(things_to_hash[i]));
+    for(int j = i + 1; j < NUM_HASHES; j++)
+    {
+      EXPECT_NE(computed_hashes_host[i], computed_hashes_host[j]);
+    }
+  }
+
+  EXPECT_EQ(device_hasher(0.0f), device_hasher(-0.0f));
+}
+
+AXOM_TYPED_TEST(core_device_hash, hash_fractional_double_device)
+{
+  using ExecSpace = typename TestFixture::ExecSpace;
+
+  axom::DeviceHash<double> device_hasher;
+  using HashResult = typename decltype(device_hasher)::result_type;
+
+  constexpr int NUM_HASHES = 8;
+  double things_to_hash[NUM_HASHES] = {0.25, 0.75, -0.5, 0.5, 0.125, 0.625, 0.875, 1.25};
+
+  int allocatorID = axom::execution_space<ExecSpace>::allocatorID();
+  HashResult* computed_hashes = axom::allocate<HashResult>(NUM_HASHES, allocatorID);
+
+  axom::for_all<ExecSpace>(
+    NUM_HASHES,
+    AXOM_LAMBDA(int i) { computed_hashes[i] = device_hasher(things_to_hash[i]); });
+
+  HashResult computed_hashes_host[NUM_HASHES];
+  axom::copy(computed_hashes_host, computed_hashes, sizeof(HashResult) * NUM_HASHES);
+  axom::deallocate(computed_hashes);
+
+  for(int i = 0; i < NUM_HASHES; i++)
+  {
+    EXPECT_EQ(computed_hashes_host[i], device_hasher(things_to_hash[i]));
+    for(int j = i + 1; j < NUM_HASHES; j++)
+    {
+      EXPECT_NE(computed_hashes_host[i], computed_hashes_host[j]);
+    }
+  }
+
+  EXPECT_EQ(device_hasher(0.0), device_hasher(-0.0));
+}
+
 TEST(core_device_hash, hash_width_decoupled_from_indextype)
 {
   // The hash result must be 64 bits wide regardless of the configured
@@ -277,7 +378,7 @@ TEST(core_device_hash, hash_width_decoupled_from_indextype)
                 "integral hash result must be std::uint64_t");
   static_assert(std::is_same<axom::DeviceHash<double>::result_type, std::uint64_t>::value,
                 "floating-point hash result must be std::uint64_t");
-  static_assert(std::is_same<axom::DeviceHash<int *>::result_type, std::uint64_t>::value,
+  static_assert(std::is_same<axom::DeviceHash<int*>::result_type, std::uint64_t>::value,
                 "pointer hash result must be std::uint64_t");
   static_assert(std::is_same<axom::DeviceHash<std::string>::result_type, std::uint64_t>::value,
                 "catch-all (std::hash) result must be std::uint64_t");

From e95e7df5271ebe6cb8d45e8f1288b5a6a67811fe Mon Sep 17 00:00:00 2001
From: Kenneth Weiss <weiss27@llnl.gov>
Date: Thu, 11 Jun 2026 14:23:00 -0700
Subject: [PATCH 21/28] Moves AXOM_FORCE_INLINE to core's Macros.hpp

---
 src/axom/core/FlatMap.hpp          |  9 ++++-----
 src/axom/core/Macros.hpp           | 18 ++++++++++++++++++
 src/axom/core/detail/FlatTable.hpp | 23 ++++++-----------------
 3 files changed, 28 insertions(+), 22 deletions(-)

diff --git a/src/axom/core/FlatMap.hpp b/src/axom/core/FlatMap.hpp
index c51d246414..8f92292ab4 100644
--- a/src/axom/core/FlatMap.hpp
+++ b/src/axom/core/FlatMap.hpp
@@ -290,11 +290,10 @@ class FlatMap : detail::flat_map::SequentialLookupPolicy<typename Hash::result_t
    *  if the key wasn't found.
    */
   /// @{
-  AXOM_FLATMAP_FORCE_INLINE iterator find(const KeyType& key);
-  AXOM_FLATMAP_FORCE_INLINE const_iterator find(const KeyType& key) const;
-  AXOM_FLATMAP_FORCE_INLINE iterator find_with_hash(const KeyType& key, hash_result_type hash);
-  AXOM_FLATMAP_FORCE_INLINE const_iterator find_with_hash(const KeyType& key,
-                                                          hash_result_type hash) const;
+  AXOM_FORCE_INLINE iterator find(const KeyType& key);
+  AXOM_FORCE_INLINE const_iterator find(const KeyType& key) const;
+  AXOM_FORCE_INLINE iterator find_with_hash(const KeyType& key, hash_result_type hash);
+  AXOM_FORCE_INLINE const_iterator find_with_hash(const KeyType& key, hash_result_type hash) const;
   /// @}
 
   /*!
diff --git a/src/axom/core/Macros.hpp b/src/axom/core/Macros.hpp
index fa7e9c1d3b..53cfa646e1 100644
--- a/src/axom/core/Macros.hpp
+++ b/src/axom/core/Macros.hpp
@@ -90,6 +90,24 @@
 #endif
 // _decorating_macros_end
 
+/*!
+ * \def AXOM_FORCE_INLINE
+ *
+ * \brief Force-inline annotation for hot-path functions
+ *
+ * \note Prefer using this sparingly on true hot paths since overuse can increase
+ *  code size and hurt instruction cache behavior
+ */
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  #define AXOM_FORCE_INLINE __forceinline__
+#elif defined(__GNUC__) || defined(__clang__)
+  #define AXOM_FORCE_INLINE inline __attribute__((always_inline))
+#elif defined(_MSC_VER)
+  #define AXOM_FORCE_INLINE __forceinline
+#else
+  #define AXOM_FORCE_INLINE inline
+#endif
+
 /*
  * \def AXOM_STRINGIFY
  *
diff --git a/src/axom/core/detail/FlatTable.hpp b/src/axom/core/detail/FlatTable.hpp
index 2a9307bd73..dafef72b9f 100644
--- a/src/axom/core/detail/FlatTable.hpp
+++ b/src/axom/core/detail/FlatTable.hpp
@@ -28,17 +28,6 @@
   #include <emmintrin.h>
 #endif
 
-// Force-inline annotation for the FlatMap/FlatTable lookup hot path.
-#if defined(__CUDACC__) || defined(__HIPCC__)
-  #define AXOM_FLATMAP_FORCE_INLINE __forceinline__
-#elif defined(__GNUC__) || defined(__clang__)
-  #define AXOM_FLATMAP_FORCE_INLINE inline __attribute__((always_inline))
-#elif defined(_MSC_VER)
-  #define AXOM_FLATMAP_FORCE_INLINE __forceinline
-#else
-  #define AXOM_FLATMAP_FORCE_INLINE inline
-#endif
-
 namespace axom
 {
 namespace detail
@@ -173,7 +162,7 @@ struct GroupBucket
   }
 
   template <typename Func>
-  AXOM_FLATMAP_FORCE_INLINE AXOM_HOST_DEVICE int visitHashBucket(std::uint8_t hash, Func&& visitor) const
+  AXOM_FORCE_INLINE AXOM_HOST_DEVICE int visitHashBucket(std::uint8_t hash, Func&& visitor) const
   {
     std::uint8_t reducedHash = reduceHash(hash);
 #if !defined(AXOM_DEVICE_CODE) && defined(_AXOM_CORE_HAVE_SSE2)
@@ -288,7 +277,7 @@ struct GroupBucket
   }
 
   template <bool Atomic = false>
-  AXOM_FLATMAP_FORCE_INLINE AXOM_HOST_DEVICE bool getMaybeOverflowed(std::uint8_t hash) const
+  AXOM_FORCE_INLINE AXOM_HOST_DEVICE bool getMaybeOverflowed(std::uint8_t hash) const
   {
     std::uint8_t hashOfwBit = 1 << (hash % 8);
     std::uint8_t curr_ofw;
@@ -488,10 +477,10 @@ struct SequentialLookupPolicy : ProbePolicy
    *  matching hash
    */
   template <typename FoundIndex>
-  AXOM_FLATMAP_FORCE_INLINE AXOM_HOST_DEVICE void probeIndex(int ngroups_pow_2,
-                                                             ArrayView<const GroupBucket> metadata,
-                                                             HashType hash,
-                                                             FoundIndex&& on_hash_found) const
+  AXOM_FORCE_INLINE AXOM_HOST_DEVICE void probeIndex(int ngroups_pow_2,
+                                                     ArrayView<const GroupBucket> metadata,
+                                                     HashType hash,
+                                                     FoundIndex&& on_hash_found) const
   {
     // We use the k MSBs of the hash as the initial group probe point,
     // where ngroups = 2^k. Since the group count is always a power of two,

From 4f34f61d46bf1c3356bb24201d7d3a1d7cd8f432 Mon Sep 17 00:00:00 2001
From: Kenneth Weiss <weiss27@llnl.gov>
Date: Thu, 11 Jun 2026 14:32:35 -0700
Subject: [PATCH 22/28] Adds utility function for initializing initial probe
 group via bitshift and masking

---
 src/axom/core/FlatMapUtil.hpp      | 12 ++++-----
 src/axom/core/detail/FlatTable.hpp | 41 +++++++++++++++++++-----------
 2 files changed, 32 insertions(+), 21 deletions(-)

diff --git a/src/axom/core/FlatMapUtil.hpp b/src/axom/core/FlatMapUtil.hpp
index 4eb1c41a05..da0b363631 100644
--- a/src/axom/core/FlatMapUtil.hpp
+++ b/src/axom/core/FlatMapUtil.hpp
@@ -359,11 +359,11 @@ void FlatMap<KeyType, ValueType, Hash>::insert(InputIt kv_begin, InputIt kv_end)
       // Hash keys.
       auto hash = Hash {}(key);
 
-      // We use the k MSBs of the hash as the initial group probe point,
-      // where ngroups = 2^k.
-      int bitshift_right = ((CHAR_BIT * sizeof(HashResult)) - ngroups_pow_2);
-      HashResult curr_group = hash >> bitshift_right;
-      curr_group &= ((1 << ngroups_pow_2) - 1);
+      // We use the k MSBs of the hash as the initial group probe point, where ngroups = 2^k.
+      const auto init =
+        detail::flat_map::SequentialLookupPolicy<HashResult>::initGroupProbe(hash, ngroups_pow_2);
+      const HashResult group_mask = init.group_mask;
+      HashResult curr_group = init.curr_group;
 
       std::uint8_t hash_8 = static_cast<std::uint8_t>(hash);
 
@@ -469,7 +469,7 @@ void FlatMap<KeyType, ValueType, Hash>::insert(InputIt kv_begin, InputIt kv_end)
           else
           {
             // Move to next group.
-            curr_group = (curr_group + LookupPolicy {}.getNext(iteration)) % meta_group.size();
+            curr_group = (curr_group + LookupPolicy {}.getNext(iteration)) & group_mask;
             iteration++;
           }
         }
diff --git a/src/axom/core/detail/FlatTable.hpp b/src/axom/core/detail/FlatTable.hpp
index dafef72b9f..c033ef7795 100644
--- a/src/axom/core/detail/FlatTable.hpp
+++ b/src/axom/core/detail/FlatTable.hpp
@@ -333,6 +333,21 @@ struct SequentialLookupPolicy : ProbePolicy
 {
   constexpr static int NO_MATCH = -1;
 
+  struct GroupProbeInit
+  {
+    HashType group_mask;
+    HashType curr_group;
+  };
+
+  AXOM_FORCE_INLINE AXOM_HOST_DEVICE static GroupProbeInit initGroupProbe(HashType hash,
+                                                                          int ngroups_pow_2)
+  {
+    const int bitshift_right = (CHAR_BIT * sizeof(HashType)) - ngroups_pow_2;
+    const HashType group_mask = (HashType {1} << ngroups_pow_2) - 1;
+    const HashType curr_group = (hash >> bitshift_right) & group_mask;
+    return {group_mask, curr_group};
+  }
+
   /*!
    * \brief Inserts a hash into the first empty bucket in an array of groups
    *  for an open-addressing hash map.
@@ -343,12 +358,10 @@ struct SequentialLookupPolicy : ProbePolicy
    */
   IndexType probeEmptyIndex(int ngroups_pow_2, ArrayView<GroupBucket> metadata, HashType hash) const
   {
-    // We use the k MSBs of the hash as the initial group probe point,
-    // where ngroups = 2^k. Since the group count is always a power of two,
-    // wrapping a group index is a bitwise AND with this mask.
-    const int bitshift_right = ((CHAR_BIT * sizeof(HashType)) - ngroups_pow_2);
-    const HashType group_mask = (HashType {1} << ngroups_pow_2) - 1;
-    HashType curr_group = (hash >> bitshift_right) & group_mask;
+    // We use the k MSBs of the hash as the initial group probe point, where ngroups = 2^k.
+    const auto init = initGroupProbe(hash, ngroups_pow_2);
+    const HashType group_mask = init.group_mask;
+    HashType curr_group = init.curr_group;
     int empty_group = NO_MATCH;
     int empty_bucket = NO_MATCH;
 
@@ -405,9 +418,9 @@ struct SequentialLookupPolicy : ProbePolicy
                               HashType hash,
                               FoundIndex&& on_hash_found) const
   {
-    const int bitshift_right = ((CHAR_BIT * sizeof(HashType)) - ngroups_pow_2);
-    const HashType group_mask = (HashType {1} << ngroups_pow_2) - 1;
-    HashType curr_group = (hash >> bitshift_right) & group_mask;
+    const auto init = initGroupProbe(hash, ngroups_pow_2);
+    const HashType group_mask = init.group_mask;
+    HashType curr_group = init.curr_group;
     int empty_group = NO_MATCH;
     int empty_bucket = NO_MATCH;
 
@@ -482,12 +495,10 @@ struct SequentialLookupPolicy : ProbePolicy
                                                      HashType hash,
                                                      FoundIndex&& on_hash_found) const
   {
-    // We use the k MSBs of the hash as the initial group probe point,
-    // where ngroups = 2^k. Since the group count is always a power of two,
-    // wrapping a group index is a bitwise AND with this mask.
-    const int bitshift_right = ((CHAR_BIT * sizeof(HashType)) - ngroups_pow_2);
-    const HashType group_mask = (HashType {1} << ngroups_pow_2) - 1;
-    HashType curr_group = (hash >> bitshift_right) & group_mask;
+    // We use the k MSBs of the hash as the initial group probe point, where ngroups = 2^k.
+    const auto init = initGroupProbe(hash, ngroups_pow_2);
+    const HashType group_mask = init.group_mask;
+    HashType curr_group = init.curr_group;
 
     std::uint8_t hash_8 = static_cast<std::uint8_t>(hash);
     bool keep_going = true;

From d409eced2bade2816998bfa34be91eeee813ef87 Mon Sep 17 00:00:00 2001
From: Kenneth Weiss <weiss27@llnl.gov>
Date: Thu, 11 Jun 2026 14:58:49 -0700
Subject: [PATCH 23/28] FlatMap: Improves documentation and testing of
 find_with_hash

Also improves device hashing of floating point types (float and long double).
---
 src/axom/core/DeviceHash.hpp                  | 22 ++++++-------
 src/axom/core/FlatMap.hpp                     | 15 +++++++++
 src/axom/core/detail/FlatTable.hpp            |  4 +--
 .../core/tests/core_benchmark_flatmap.cpp     |  9 +++---
 src/axom/core/tests/core_device_hash.hpp      | 13 ++++++++
 src/axom/core/tests/core_flatmap.hpp          | 32 +++++++++++++++++++
 6 files changed, 77 insertions(+), 18 deletions(-)

diff --git a/src/axom/core/DeviceHash.hpp b/src/axom/core/DeviceHash.hpp
index d1ac31ced2..0dc1bb4caa 100644
--- a/src/axom/core/DeviceHash.hpp
+++ b/src/axom/core/DeviceHash.hpp
@@ -52,21 +52,19 @@ struct DeviceHashHelper<T, std::enable_if_t<std::is_floating_point<T>::value>>
     // A float-to-integer value conversion collapses every key sharing an integer part,
     // e.g. all numbers between -1 and 1 converts to integer 0
 
-    // NUM_WORDS is 1 for float or double, possibly 2 for long double
-    constexpr std::size_t NUM_WORDS = (sizeof(T) + sizeof(std::uint64_t) - 1) / sizeof(std::uint64_t);
-    // zero out words since we might only copy 4 bytes in for floats
-    std::uint64_t words[NUM_WORDS] = {0};
-    memcpy(words, &value, sizeof(T));
-
-    std::uint64_t result = words[0];
-    // Extra processing fortypes wider than 64 bits (long double).
-    // Use an odd multiplier (2^64/golden-ratio-phi),
-    // so the halves cannot cancel under a later XOR-style mixer
-    for(std::size_t i = 1; i < NUM_WORDS; i++)
+    if constexpr(sizeof(T) <= sizeof(std::uint64_t))
     {
-      result = result * std::uint64_t {0x9e3779b97f4a7c15} + words[i];
+      // Zero-initialize first since we only copy 4 bytes for floats.
+      std::uint64_t result = 0;
+      memcpy(&result, &value, sizeof(T));
+      return result;
     }
 
+    // Avoid hashing padding bytes for wider floating types such as x86 long double.
+    // Collisions are acceptable for a hash; equal values must hash identically.
+    double narrowed_value = static_cast<double>(value);
+    std::uint64_t result = 0;
+    memcpy(&result, &narrowed_value, sizeof(narrowed_value));
     return result;
   }
 };
diff --git a/src/axom/core/FlatMap.hpp b/src/axom/core/FlatMap.hpp
index 8f92292ab4..be1958a9bf 100644
--- a/src/axom/core/FlatMap.hpp
+++ b/src/axom/core/FlatMap.hpp
@@ -292,6 +292,21 @@ class FlatMap : detail::flat_map::SequentialLookupPolicy<typename Hash::result_t
   /// @{
   AXOM_FORCE_INLINE iterator find(const KeyType& key);
   AXOM_FORCE_INLINE const_iterator find(const KeyType& key) const;
+  /// @}
+
+  /*!
+   * \brief Try to find an entry with a given key and a precomputed hash.
+   *
+   * \param [in] key the key to search for
+   * \param [in] hash the precomputed hash for \a key
+   *
+   * \return An iterator pointing to the corresponding key-value pair, or end()
+   *  if the key wasn't found.
+   *
+   * \pre hash must be equivalent to hasher{}(key) for this FlatMap's Hash policy.
+   *  Supplying a hash computed for a different key or Hash policy can miss an existing key
+   */
+  /// @{
   AXOM_FORCE_INLINE iterator find_with_hash(const KeyType& key, hash_result_type hash);
   AXOM_FORCE_INLINE const_iterator find_with_hash(const KeyType& key, hash_result_type hash) const;
   /// @}
diff --git a/src/axom/core/detail/FlatTable.hpp b/src/axom/core/detail/FlatTable.hpp
index c033ef7795..34ec9c4167 100644
--- a/src/axom/core/detail/FlatTable.hpp
+++ b/src/axom/core/detail/FlatTable.hpp
@@ -400,8 +400,8 @@ struct SequentialLookupPolicy : ProbePolicy
    * \brief Fused find-or-locate-empty probe for single-key emplacement.
    *
    *  Walks the probe sequence once, simultaneously visiting key matches and
-   *  tracking the first empty slot. Overflow bits are maintained in the 
-   *  same way as probeEmptyIndex(): a full group that hasn't yielded an insertion 
+   *  tracking the first empty slot. Overflow bits are maintained in the
+   *  same way as probeEmptyIndex(): a full group that hasn't yielded an insertion
    *  slot is marked overflowed for this hash before moving on.
    *
    * \param [in] ngroups_pow_2 the number of groups, expressed as a power of 2
diff --git a/src/axom/core/tests/core_benchmark_flatmap.cpp b/src/axom/core/tests/core_benchmark_flatmap.cpp
index 2f013e35df..ccd79a9408 100644
--- a/src/axom/core/tests/core_benchmark_flatmap.cpp
+++ b/src/axom/core/tests/core_benchmark_flatmap.cpp
@@ -455,10 +455,11 @@ void BM_FlatMap_Find_Hit_Prehashed(benchmark::State& state)
   const auto keys = make_shuffled_keys(n, 0xC0FFEEULL);
   const auto pairs = make_pairs(keys);
   const MapType map = make_filled_map<MapType>(pairs);
+  const auto lookup_keys = make_lookup_order(keys, 0xBADC0DE5ULL);
 
   std::vector<HashResult> hashes;
-  hashes.reserve(keys.size());
-  for(KeyType k : keys)
+  hashes.reserve(lookup_keys.size());
+  for(KeyType k : lookup_keys)
   {
     hashes.push_back(typename MapType::hasher {}(k));
   }
@@ -466,9 +467,9 @@ void BM_FlatMap_Find_Hit_Prehashed(benchmark::State& state)
   for(auto _ : state)
   {
     ValueType sum = 0;
-    for(std::size_t i = 0; i < keys.size(); ++i)
+    for(std::size_t i = 0; i < lookup_keys.size(); ++i)
     {
-      auto it = map.find_with_hash(keys[i], hashes[i]);
+      auto it = map.find_with_hash(lookup_keys[i], hashes[i]);
       if(it != map.end())
       {
         sum += it->second;
diff --git a/src/axom/core/tests/core_device_hash.hpp b/src/axom/core/tests/core_device_hash.hpp
index 6cab7708b2..dcdb278c37 100644
--- a/src/axom/core/tests/core_device_hash.hpp
+++ b/src/axom/core/tests/core_device_hash.hpp
@@ -14,6 +14,7 @@
 // C++ includes
 #include <cstdint>
 #include <set>
+#include <type_traits>
 
 template <typename TheExecSpace>
 class core_device_hash : public ::testing::Test
@@ -427,3 +428,15 @@ TEST(core_device_hash, hash_float_bit_pattern)
   EXPECT_NE(double_hasher(1e300), double_hasher(2e300));
   EXPECT_NE(float_hasher(-0.5f), float_hasher(0.5f));
 }
+
+TEST(core_device_hash, hash_long_double_has_stable_equal_value_hash)
+{
+  axom::DeviceHash<long double> long_double_hasher;
+
+  static_assert(std::is_same<axom::DeviceHash<long double>::result_type, std::uint64_t>::value,
+                "long double hash result must be std::uint64_t");
+
+  EXPECT_EQ(long_double_hasher(0.0L), long_double_hasher(-0.0L));
+  EXPECT_EQ(long_double_hasher(0.25L), long_double_hasher(static_cast<long double>(0.25)));
+  EXPECT_NE(long_double_hasher(0.25L), long_double_hasher(0.75L));
+}
diff --git a/src/axom/core/tests/core_flatmap.hpp b/src/axom/core/tests/core_flatmap.hpp
index 021bcd741d..c0665ce21b 100644
--- a/src/axom/core/tests/core_flatmap.hpp
+++ b/src/axom/core/tests/core_flatmap.hpp
@@ -339,6 +339,38 @@ TEST(core_flatmap_unit, fused_emplace_probe_recomputes_slot_after_rehash)
   EXPECT_EQ(test_map.count(0), 0);
 }
 
+TEST(core_flatmap_unit, find_with_hash_uses_precomputed_hash)
+{
+  using MapType = axom::FlatMap<int, int>;
+  MapType test_map;
+
+  const int NUM_ELEMS = 64;
+  for(int i = 0; i < NUM_ELEMS; i++)
+  {
+    test_map.insert_or_assign(i, i * 10);
+  }
+
+  const int key = 37;
+  const auto hash = MapType::hasher {}(key);
+
+  auto it = test_map.find_with_hash(key, hash);
+  ASSERT_NE(it, test_map.end());
+  EXPECT_EQ(it->first, key);
+  EXPECT_EQ(it->second, key * 10);
+
+  const MapType& const_map = test_map;
+  auto const_it = const_map.find_with_hash(key, hash);
+  ASSERT_NE(const_it, const_map.end());
+  EXPECT_EQ(const_it->first, key);
+  EXPECT_EQ(const_it->second, key * 10);
+
+  // A precomputed hash is part of the lookup key. Supplying a mismatched hash
+  // may miss an existing key; this guards the documented precondition.
+  const auto mismatched_hash = hash ^ MapType::hash_result_type {0x80};
+  EXPECT_EQ(test_map.find_with_hash(key, mismatched_hash), test_map.end());
+  EXPECT_EQ(const_map.find_with_hash(key, mismatched_hash), const_map.end());
+}
+
 AXOM_TYPED_TEST(core_flatmap, default_init)
 {
   using MapType = typename TestFixture::MapType;

From 9b14075602dfade3bd9ed8b8569aa55a09247b88 Mon Sep 17 00:00:00 2001
From: Kenneth Weiss <weiss27@llnl.gov>
Date: Thu, 11 Jun 2026 16:07:54 -0700
Subject: [PATCH 24/28] Adds benchmarks for device contruction and lookup

---
 .../core/tests/core_benchmark_flatmap.cpp     | 291 +++++++++++++++++-
 1 file changed, 287 insertions(+), 4 deletions(-)

diff --git a/src/axom/core/tests/core_benchmark_flatmap.cpp b/src/axom/core/tests/core_benchmark_flatmap.cpp
index ccd79a9408..edb8a12cdb 100644
--- a/src/axom/core/tests/core_benchmark_flatmap.cpp
+++ b/src/axom/core/tests/core_benchmark_flatmap.cpp
@@ -13,10 +13,6 @@
 #include "axom/CLI11.hpp"
 #include "axom/fmt.hpp"
 
-#include "axom/core/FlatMap.hpp"
-#include "axom/core/FlatMapUtil.hpp"
-#include "axom/core/detail/FlatTable.hpp"
-
 #if defined(AXOM_USE_SPARSEHASH)
   #include "axom/sparsehash/sparse_hash_map"
 #endif
@@ -553,6 +549,269 @@ void BM_BatchedInsert_Reserved(benchmark::State& state)
   }
 }
 
+#if defined(AXOM_USE_RAJA) && defined(AXOM_USE_UMPIRE)
+
+  // Device execution policy for benchmarking
+  #if defined(AXOM_USE_HIP)
+using DeviceExec = axom::HIP_EXEC<256>;
+  #elif defined(AXOM_USE_CUDA)
+using DeviceExec = axom::CUDA_EXEC<256>;
+  #endif
+
+  #if defined(AXOM_USE_CUDA) || defined(AXOM_USE_HIP)
+
+/*!
+ * \brief Device/parallel benchmarks for FlatMap
+ *
+ * These benchmarks measure GPU kernel execution + data transfer overhead.
+ * Google Benchmark measures wall-clock time, which includes:
+ * - Host-to-device memory transfers
+ * - Kernel launch overhead
+ * - GPU execution time
+ * - Device-to-host synchronization
+ *
+ * For pure kernel performance, profile with nsys/rocprof separately.
+ * These benchmarks characterize end-to-end device operation cost.
+ */
+
+/*!
+ * \brief Simple device sanity check
+ *
+ * Minimal test to verify device operations work before trying FlatMap.
+ */
+void BM_Device_Sanity_Check(benchmark::State& state)
+{
+  const int n = state.range(0);
+
+  const int device_allocator_id = axom::execution_space<DeviceExec>::allocatorID();
+  if(device_allocator_id == axom::INVALID_ALLOCATOR_ID)
+  {
+    state.SkipWithError("Device allocator not available");
+    return;
+  }
+
+  // Allocate simple arrays on device
+  int* device_input = axom::allocate<int>(n, device_allocator_id);
+  int* device_output = axom::allocate<int>(n, device_allocator_id);
+
+  // Initialize on host
+  std::vector<int> host_data(n, 42);
+  axom::copy(device_input, host_data.data(), sizeof(int) * n);
+
+  for(auto _ : state)
+  {
+    // Simple kernel: copy input to output
+    axom::for_all<DeviceExec>(n, [=] AXOM_HOST_DEVICE(int i) {
+      device_output[i] = device_input[i] + 1;
+    });
+
+    axom::synchronize<DeviceExec>();
+    benchmark::DoNotOptimize(device_output);
+  }
+
+  axom::deallocate(device_input);
+  axom::deallocate(device_output);
+}
+
+/*!
+ * \brief Benchmark parallel batched insertion on device
+ */
+void BM_FlatMap_Insert_Device_Reserved(benchmark::State& state)
+{
+  using MapType = axom::FlatMap<KeyType, ValueType>;
+
+  const int n = state.range(0);
+  const auto keys = make_shuffled_keys(n, 0x1CEB00DAULL);
+  const auto pairs = make_pairs(keys);
+
+  // Check if device allocator is available
+  const int device_allocator_id = axom::execution_space<DeviceExec>::allocatorID();
+  if(device_allocator_id == axom::INVALID_ALLOCATOR_ID)
+  {
+    state.SkipWithError("Device allocator not available");
+    return;
+  }
+
+  // Use axom::Array for host data
+  using PairType = std::pair<KeyType, ValueType>;
+  axom::Array<PairType> host_pairs(pairs.size(), pairs.size());
+  std::copy(pairs.begin(), pairs.end(), host_pairs.data());
+
+  // Copy to device using axom::Array with device allocator
+  axom::Array<PairType> device_pairs(pairs.size(), pairs.size(), device_allocator_id);
+  axom::copy(device_pairs.data(), host_pairs.data(), sizeof(PairType) * pairs.size());
+
+  const std::size_t bs = static_cast<std::size_t>(std::max(1, ::args_batch_size));
+
+  // Get device-safe ArrayView and extract raw pointer for template instantiation
+  auto pairs_view = device_pairs.view();
+  PairType* device_pairs_ptr = pairs_view.data();
+  const std::size_t total_size = pairs.size();
+
+  for(auto _ : state)
+  {
+    // Create map with device allocator and reserve capacity
+    MapType map(axom::Allocator {device_allocator_id});
+    map.reserve(static_cast<axom::IndexType>(pairs.size()));
+
+    // Benchmark parallel batched insertion using raw pointers from ArrayView
+    for(std::size_t offset = 0; offset < total_size; offset += bs)
+    {
+      const std::size_t count = std::min(bs, total_size - offset);
+      map.template insert<DeviceExec>(device_pairs_ptr + offset, device_pairs_ptr + offset + count);
+    }
+
+    // Synchronize to ensure device operations complete
+    axom::synchronize<DeviceExec>();
+
+    benchmark::DoNotOptimize(map);
+  }
+}
+
+/*!
+ * \brief Benchmark parallel lookup on device
+ */
+void BM_FlatMap_Find_Hit_Device(benchmark::State& state)
+{
+  using MapType = axom::FlatMap<KeyType, ValueType>;
+
+  const int n = state.range(0);
+  const auto keys = make_shuffled_keys(n, 0xC0FFEEULL);
+  const auto pairs = make_pairs(keys);
+  const auto lookup_keys = make_lookup_order(keys, 0xBADC0DE5ULL);
+
+  // Check if device allocator is available
+  const int device_allocator_id = axom::execution_space<DeviceExec>::allocatorID();
+  if(device_allocator_id == axom::INVALID_ALLOCATOR_ID)
+  {
+    state.SkipWithError("Device allocator not available");
+    return;
+  }
+
+  // Use axom::Array for host data
+  using PairType = std::pair<KeyType, ValueType>;
+  axom::Array<PairType> host_pairs(pairs.size(), pairs.size());
+  std::copy(pairs.begin(), pairs.end(), host_pairs.data());
+
+  // Copy to device using axom::Array with device allocator
+  axom::Array<PairType> device_pairs(pairs.size(), pairs.size(), device_allocator_id);
+  axom::copy(device_pairs.data(), host_pairs.data(), sizeof(PairType) * pairs.size());
+
+  // Create and populate map on device
+  MapType map(axom::Allocator {device_allocator_id});
+  map.reserve(static_cast<axom::IndexType>(pairs.size()));
+
+  // Use raw pointer from ArrayView for template instantiation
+  auto pairs_view = device_pairs.view();
+  map.template insert<DeviceExec>(pairs_view.data(), pairs_view.data() + pairs_view.size());
+
+  // Copy lookup keys to device using axom::Array
+  axom::Array<KeyType> host_lookup_keys(lookup_keys.size(), lookup_keys.size());
+  std::copy(lookup_keys.begin(), lookup_keys.end(), host_lookup_keys.data());
+
+  axom::Array<KeyType> device_lookup_keys(lookup_keys.size(), lookup_keys.size(), device_allocator_id);
+  axom::copy(device_lookup_keys.data(), host_lookup_keys.data(), sizeof(KeyType) * lookup_keys.size());
+
+  // Allocate result array on device using axom::Array
+  axom::Array<ValueType> device_results(lookup_keys.size(), lookup_keys.size(), device_allocator_id);
+
+  // Get device-safe views for kernel capture
+  // Note: Using explicit [=] AXOM_HOST_DEVICE instead of AXOM_LAMBDA to avoid
+  // RAJA privatizer issues on HIP with non-trivial types in capture
+  auto map_view = map.view();
+  auto lookup_keys_view = device_lookup_keys.view();
+  auto results_view = device_results.view();
+
+  for(auto _ : state)
+  {
+    // Perform lookups in parallel using ArrayViews
+    axom::for_all<DeviceExec>(static_cast<axom::IndexType>(lookup_keys.size()),
+                              [=] AXOM_HOST_DEVICE(axom::IndexType i) {
+                                auto it = map_view.find(lookup_keys_view[i]);
+                                results_view[i] =
+                                  (it != map_view.end()) ? it->second : ValueType {-1};
+                              });
+
+    // Synchronize to ensure device operations complete
+    axom::synchronize<DeviceExec>();
+
+    benchmark::DoNotOptimize(device_results.data());
+  }
+}
+
+/*!
+ * \brief Benchmark parallel lookup misses on device
+ */
+void BM_FlatMap_Find_Miss_Device(benchmark::State& state)
+{
+  using MapType = axom::FlatMap<KeyType, ValueType>;
+
+  const int n = state.range(0);
+  const auto keys = make_shuffled_keys(n, 0xC0FFEEULL);
+  const auto pairs = make_pairs(keys);
+  const auto miss_keys = make_miss_keys(keys, static_cast<KeyType>(n) + 11);
+
+  // Check if device allocator is available
+  const int device_allocator_id = axom::execution_space<DeviceExec>::allocatorID();
+  if(device_allocator_id == axom::INVALID_ALLOCATOR_ID)
+  {
+    state.SkipWithError("Device allocator not available");
+    return;
+  }
+
+  // Use axom::Array for host data
+  using PairType = std::pair<KeyType, ValueType>;
+  axom::Array<PairType> host_pairs(pairs.size(), pairs.size());
+  std::copy(pairs.begin(), pairs.end(), host_pairs.data());
+
+  // Copy to device using axom::Array with device allocator
+  axom::Array<PairType> device_pairs(pairs.size(), pairs.size(), device_allocator_id);
+  axom::copy(device_pairs.data(), host_pairs.data(), sizeof(PairType) * pairs.size());
+
+  // Create and populate map on device
+  MapType map(axom::Allocator {device_allocator_id});
+  map.reserve(static_cast<axom::IndexType>(pairs.size()));
+
+  // Use raw pointer from ArrayView for template instantiation
+  auto pairs_view = device_pairs.view();
+  map.template insert<DeviceExec>(pairs_view.data(), pairs_view.data() + pairs_view.size());
+
+  // Copy miss keys to device using axom::Array
+  axom::Array<KeyType> host_miss_keys(miss_keys.size(), miss_keys.size());
+  std::copy(miss_keys.begin(), miss_keys.end(), host_miss_keys.data());
+
+  axom::Array<KeyType> device_miss_keys(miss_keys.size(), miss_keys.size(), device_allocator_id);
+  axom::copy(device_miss_keys.data(), host_miss_keys.data(), sizeof(KeyType) * miss_keys.size());
+
+  // Allocate result array on device using axom::Array
+  axom::Array<int> device_misses(miss_keys.size(), miss_keys.size(), device_allocator_id);
+
+  // Get device-safe views for kernel capture
+  // Note: Using explicit [=] AXOM_HOST_DEVICE instead of AXOM_LAMBDA to avoid
+  // RAJA privatizer issues on HIP with non-trivial types in capture
+  auto map_view = map.view();
+  auto miss_keys_view = device_miss_keys.view();
+  auto misses_view = device_misses.view();
+
+  for(auto _ : state)
+  {
+    // Perform lookups in parallel using ArrayViews
+    axom::for_all<DeviceExec>(static_cast<axom::IndexType>(miss_keys.size()),
+                              [=] AXOM_HOST_DEVICE(axom::IndexType i) {
+                                misses_view[i] =
+                                  (map_view.find(miss_keys_view[i]) == map_view.end()) ? 1 : 0;
+                              });
+
+    // Synchronize to ensure device operations complete
+    axom::synchronize<DeviceExec>();
+
+    benchmark::DoNotOptimize(device_misses.data());
+  }
+}
+
+  #endif  // AXOM_USE_CUDA || AXOM_USE_HIP
+#endif    // AXOM_USE_RAJA && AXOM_USE_UMPIRE
+
 }  // namespace
 
 //-----------------------------------------------------------------------------
@@ -725,6 +984,30 @@ int main(int argc, char* argv[])
     "axom::google::sparse_hash_map");
 #endif
 
+  // Device/parallel benchmarks for debugging
+#if defined(AXOM_USE_RAJA) && defined(AXOM_USE_UMPIRE) && \
+  (defined(AXOM_USE_CUDA) || defined(AXOM_USE_HIP))
+
+  // Device benchmarks enabled with raw pointers (iterators cause host stack address faults)
+  benchmark::RegisterBenchmark("Device::sanity_check", &BM_Device_Sanity_Check)->Apply(CustomArgs);
+
+  if((::args_benchmark_features & FlatMapFeatureBenchmarks::BatchedInsertion) !=
+     FlatMapFeatureBenchmarks::None)
+  {
+    benchmark::RegisterBenchmark("axom::FlatMap::insert_device_reserved",
+                                 &BM_FlatMap_Insert_Device_Reserved)
+      ->Apply(CustomArgs);
+  }
+
+  if((::args_benchmark_features & FlatMapFeatureBenchmarks::Lookup) != FlatMapFeatureBenchmarks::None)
+  {
+    benchmark::RegisterBenchmark("axom::FlatMap::find_hit_device", &BM_FlatMap_Find_Hit_Device)
+      ->Apply(CustomArgs);
+    benchmark::RegisterBenchmark("axom::FlatMap::find_miss_device", &BM_FlatMap_Find_Miss_Device)
+      ->Apply(CustomArgs);
+  }
+#endif  // AXOM_USE_RAJA && AXOM_USE_UMPIRE && (AXOM_USE_CUDA || AXOM_USE_HIP)
+
   ::benchmark::RunSpecifiedBenchmarks();
   return 0;
 }

From 489d9ff20694823c614918bc78f8c17324809b7d Mon Sep 17 00:00:00 2001
From: Kenneth Weiss <weiss27@llnl.gov>
Date: Thu, 11 Jun 2026 16:46:07 -0700
Subject: [PATCH 25/28] FlatMap: Generalizes the device benchmarks to other
 execution spaces, including omp

---
 src/axom/core/tests/CMakeLists.txt            |   8 +-
 .../core/tests/core_benchmark_flatmap.cpp     | 307 ++++++++----------
 2 files changed, 139 insertions(+), 176 deletions(-)

diff --git a/src/axom/core/tests/CMakeLists.txt b/src/axom/core/tests/CMakeLists.txt
index b30ebb25fc..7322f589f6 100644
--- a/src/axom/core/tests/CMakeLists.txt
+++ b/src/axom/core/tests/CMakeLists.txt
@@ -218,6 +218,11 @@ if (ENABLE_BENCHMARKS)
   foreach(test ${core_benchmarks})
     get_filename_component(test_name ${test} NAME_WE)
 
+    set(_num_threads)
+    if(test STREQUAL "core_benchmark_flatmap.cpp" AND AXOM_ENABLE_OPENMP)
+        set(_num_threads ${AXOM_TEST_NUM_OMP_THREADS})
+    endif()
+
     axom_add_executable(NAME       ${test_name}
                         SOURCES    ${test}
                         OUTPUT_DIR ${TEST_OUTPUT_DIRECTORY}
@@ -225,6 +230,7 @@ if (ENABLE_BENCHMARKS)
                         FOLDER     axom/core/benchmarks)
 
     blt_add_benchmark(NAME    ${test_name}
-                      COMMAND ${test_name} --benchmark_min_time=0.0001s)
+                      COMMAND ${test_name} --benchmark_min_time=0.0001s
+                      NUM_OMP_THREADS ${_num_threads})
   endforeach()
 endif()
diff --git a/src/axom/core/tests/core_benchmark_flatmap.cpp b/src/axom/core/tests/core_benchmark_flatmap.cpp
index edb8a12cdb..c99dc61e20 100644
--- a/src/axom/core/tests/core_benchmark_flatmap.cpp
+++ b/src/axom/core/tests/core_benchmark_flatmap.cpp
@@ -549,269 +549,211 @@ void BM_BatchedInsert_Reserved(benchmark::State& state)
   }
 }
 
-#if defined(AXOM_USE_RAJA) && defined(AXOM_USE_UMPIRE)
-
-  // Device execution policy for benchmarking
-  #if defined(AXOM_USE_HIP)
-using DeviceExec = axom::HIP_EXEC<256>;
-  #elif defined(AXOM_USE_CUDA)
-using DeviceExec = axom::CUDA_EXEC<256>;
-  #endif
-
-  #if defined(AXOM_USE_CUDA) || defined(AXOM_USE_HIP)
-
 /*!
- * \brief Device/parallel benchmarks for FlatMap
+ * \brief Execution-space benchmarks for FlatMap.
  *
- * These benchmarks measure GPU kernel execution + data transfer overhead.
- * Google Benchmark measures wall-clock time, which includes:
- * - Host-to-device memory transfers
- * - Kernel launch overhead
- * - GPU execution time
- * - Device-to-host synchronization
- *
- * For pure kernel performance, profile with nsys/rocprof separately.
- * These benchmarks characterize end-to-end device operation cost.
+ * These benchmarks measure execution-space operation cost. Host-to-exec-space
+ * data setup is outside the timed loop; kernel launch/execution and
+ * synchronization are included.
  */
 
-/*!
- * \brief Simple device sanity check
- *
- * Minimal test to verify device operations work before trying FlatMap.
- */
-void BM_Device_Sanity_Check(benchmark::State& state)
+template <typename ExecSpace>
+bool get_allocator_or_skip(benchmark::State& state, int& allocator_id)
+{
+  allocator_id = axom::execution_space<ExecSpace>::allocatorID();
+  if(allocator_id == axom::INVALID_ALLOCATOR_ID)
+  {
+    state.SkipWithError("Execution-space allocator not available");
+    return false;
+  }
+
+  return true;
+}
+
+template <typename T>
+axom::Array<T> copy_to_allocator(const std::vector<T>& values, int allocator_id)
+{
+  axom::Array<T> copied_values(values.size(), values.size(), allocator_id);
+  if(!values.empty())
+  {
+    axom::copy(copied_values.data(), values.data(), sizeof(T) * values.size());
+  }
+  return copied_values;
+}
+
+template <typename ExecSpace>
+void BM_ExecSpace_Sanity_Check(benchmark::State& state)
 {
   const int n = state.range(0);
 
-  const int device_allocator_id = axom::execution_space<DeviceExec>::allocatorID();
-  if(device_allocator_id == axom::INVALID_ALLOCATOR_ID)
+  int allocator_id = axom::INVALID_ALLOCATOR_ID;
+  if(!get_allocator_or_skip<ExecSpace>(state, allocator_id))
   {
-    state.SkipWithError("Device allocator not available");
     return;
   }
 
-  // Allocate simple arrays on device
-  int* device_input = axom::allocate<int>(n, device_allocator_id);
-  int* device_output = axom::allocate<int>(n, device_allocator_id);
+  int* input = axom::allocate<int>(n, allocator_id);
+  int* output = axom::allocate<int>(n, allocator_id);
 
-  // Initialize on host
   std::vector<int> host_data(n, 42);
-  axom::copy(device_input, host_data.data(), sizeof(int) * n);
+  axom::copy(input, host_data.data(), sizeof(int) * n);
 
   for(auto _ : state)
   {
-    // Simple kernel: copy input to output
-    axom::for_all<DeviceExec>(n, [=] AXOM_HOST_DEVICE(int i) {
-      device_output[i] = device_input[i] + 1;
-    });
+    axom::for_all<ExecSpace>(n, [=] AXOM_HOST_DEVICE(int i) { output[i] = input[i] + 1; });
 
-    axom::synchronize<DeviceExec>();
-    benchmark::DoNotOptimize(device_output);
+    axom::synchronize<ExecSpace>();
+    benchmark::DoNotOptimize(output);
   }
 
-  axom::deallocate(device_input);
-  axom::deallocate(device_output);
+  axom::deallocate(input);
+  axom::deallocate(output);
 }
 
 /*!
- * \brief Benchmark parallel batched insertion on device
+ * \brief Benchmark parallel batched insertion using an execution space.
  */
-void BM_FlatMap_Insert_Device_Reserved(benchmark::State& state)
+template <typename ExecSpace>
+void BM_FlatMap_Insert_ExecSpace_Reserved(benchmark::State& state)
 {
   using MapType = axom::FlatMap<KeyType, ValueType>;
+  using PairType = std::pair<KeyType, ValueType>;
 
   const int n = state.range(0);
   const auto keys = make_shuffled_keys(n, 0x1CEB00DAULL);
   const auto pairs = make_pairs(keys);
 
-  // Check if device allocator is available
-  const int device_allocator_id = axom::execution_space<DeviceExec>::allocatorID();
-  if(device_allocator_id == axom::INVALID_ALLOCATOR_ID)
+  int allocator_id = axom::INVALID_ALLOCATOR_ID;
+  if(!get_allocator_or_skip<ExecSpace>(state, allocator_id))
   {
-    state.SkipWithError("Device allocator not available");
     return;
   }
 
-  // Use axom::Array for host data
-  using PairType = std::pair<KeyType, ValueType>;
-  axom::Array<PairType> host_pairs(pairs.size(), pairs.size());
-  std::copy(pairs.begin(), pairs.end(), host_pairs.data());
-
-  // Copy to device using axom::Array with device allocator
-  axom::Array<PairType> device_pairs(pairs.size(), pairs.size(), device_allocator_id);
-  axom::copy(device_pairs.data(), host_pairs.data(), sizeof(PairType) * pairs.size());
-
   const std::size_t bs = static_cast<std::size_t>(std::max(1, ::args_batch_size));
-
-  // Get device-safe ArrayView and extract raw pointer for template instantiation
-  auto pairs_view = device_pairs.view();
-  PairType* device_pairs_ptr = pairs_view.data();
+  axom::Array<PairType> exec_pairs = copy_to_allocator(pairs, allocator_id);
+  auto pairs_view = exec_pairs.view();
+  PairType* pairs_ptr = pairs_view.data();
   const std::size_t total_size = pairs.size();
 
   for(auto _ : state)
   {
-    // Create map with device allocator and reserve capacity
-    MapType map(axom::Allocator {device_allocator_id});
+    MapType map(axom::Allocator {allocator_id});
     map.reserve(static_cast<axom::IndexType>(pairs.size()));
 
-    // Benchmark parallel batched insertion using raw pointers from ArrayView
     for(std::size_t offset = 0; offset < total_size; offset += bs)
     {
       const std::size_t count = std::min(bs, total_size - offset);
-      map.template insert<DeviceExec>(device_pairs_ptr + offset, device_pairs_ptr + offset + count);
+      map.template insert<ExecSpace>(pairs_ptr + offset, pairs_ptr + offset + count);
     }
 
-    // Synchronize to ensure device operations complete
-    axom::synchronize<DeviceExec>();
+    axom::synchronize<ExecSpace>();
 
     benchmark::DoNotOptimize(map);
   }
 }
 
 /*!
- * \brief Benchmark parallel lookup on device
+ * \brief Benchmark parallel successful lookup using an execution space.
  */
-void BM_FlatMap_Find_Hit_Device(benchmark::State& state)
+template <typename ExecSpace>
+void BM_FlatMap_Find_Hit_ExecSpace(benchmark::State& state)
 {
   using MapType = axom::FlatMap<KeyType, ValueType>;
+  using PairType = std::pair<KeyType, ValueType>;
 
   const int n = state.range(0);
   const auto keys = make_shuffled_keys(n, 0xC0FFEEULL);
   const auto pairs = make_pairs(keys);
   const auto lookup_keys = make_lookup_order(keys, 0xBADC0DE5ULL);
 
-  // Check if device allocator is available
-  const int device_allocator_id = axom::execution_space<DeviceExec>::allocatorID();
-  if(device_allocator_id == axom::INVALID_ALLOCATOR_ID)
+  int allocator_id = axom::INVALID_ALLOCATOR_ID;
+  if(!get_allocator_or_skip<ExecSpace>(state, allocator_id))
   {
-    state.SkipWithError("Device allocator not available");
     return;
   }
 
-  // Use axom::Array for host data
-  using PairType = std::pair<KeyType, ValueType>;
-  axom::Array<PairType> host_pairs(pairs.size(), pairs.size());
-  std::copy(pairs.begin(), pairs.end(), host_pairs.data());
-
-  // Copy to device using axom::Array with device allocator
-  axom::Array<PairType> device_pairs(pairs.size(), pairs.size(), device_allocator_id);
-  axom::copy(device_pairs.data(), host_pairs.data(), sizeof(PairType) * pairs.size());
-
-  // Create and populate map on device
-  MapType map(axom::Allocator {device_allocator_id});
+  axom::Array<PairType> exec_pairs = copy_to_allocator(pairs, allocator_id);
+  MapType map(axom::Allocator {allocator_id});
   map.reserve(static_cast<axom::IndexType>(pairs.size()));
 
-  // Use raw pointer from ArrayView for template instantiation
-  auto pairs_view = device_pairs.view();
-  map.template insert<DeviceExec>(pairs_view.data(), pairs_view.data() + pairs_view.size());
-
-  // Copy lookup keys to device using axom::Array
-  axom::Array<KeyType> host_lookup_keys(lookup_keys.size(), lookup_keys.size());
-  std::copy(lookup_keys.begin(), lookup_keys.end(), host_lookup_keys.data());
+  auto pairs_view = exec_pairs.view();
+  map.template insert<ExecSpace>(pairs_view.data(), pairs_view.data() + pairs_view.size());
+  axom::synchronize<ExecSpace>();
 
-  axom::Array<KeyType> device_lookup_keys(lookup_keys.size(), lookup_keys.size(), device_allocator_id);
-  axom::copy(device_lookup_keys.data(), host_lookup_keys.data(), sizeof(KeyType) * lookup_keys.size());
+  axom::Array<KeyType> exec_lookup_keys = copy_to_allocator(lookup_keys, allocator_id);
+  axom::Array<ValueType> exec_results(lookup_keys.size(), lookup_keys.size(), allocator_id);
 
-  // Allocate result array on device using axom::Array
-  axom::Array<ValueType> device_results(lookup_keys.size(), lookup_keys.size(), device_allocator_id);
-
-  // Get device-safe views for kernel capture
   // Note: Using explicit [=] AXOM_HOST_DEVICE instead of AXOM_LAMBDA to avoid
   // RAJA privatizer issues on HIP with non-trivial types in capture
   auto map_view = map.view();
-  auto lookup_keys_view = device_lookup_keys.view();
-  auto results_view = device_results.view();
+  auto lookup_keys_view = exec_lookup_keys.view();
+  auto results_view = exec_results.view();
 
   for(auto _ : state)
   {
-    // Perform lookups in parallel using ArrayViews
-    axom::for_all<DeviceExec>(static_cast<axom::IndexType>(lookup_keys.size()),
-                              [=] AXOM_HOST_DEVICE(axom::IndexType i) {
-                                auto it = map_view.find(lookup_keys_view[i]);
-                                results_view[i] =
-                                  (it != map_view.end()) ? it->second : ValueType {-1};
-                              });
+    axom::for_all<ExecSpace>(static_cast<axom::IndexType>(lookup_keys.size()),
+                             [=] AXOM_HOST_DEVICE(axom::IndexType i) {
+                               auto it = map_view.find(lookup_keys_view[i]);
+                               results_view[i] = (it != map_view.end()) ? it->second : ValueType {-1};
+                             });
 
-    // Synchronize to ensure device operations complete
-    axom::synchronize<DeviceExec>();
+    axom::synchronize<ExecSpace>();
 
-    benchmark::DoNotOptimize(device_results.data());
+    benchmark::DoNotOptimize(exec_results.data());
   }
 }
 
 /*!
- * \brief Benchmark parallel lookup misses on device
+ * \brief Benchmark parallel missed lookup using an execution space.
  */
-void BM_FlatMap_Find_Miss_Device(benchmark::State& state)
+template <typename ExecSpace>
+void BM_FlatMap_Find_Miss_ExecSpace(benchmark::State& state)
 {
   using MapType = axom::FlatMap<KeyType, ValueType>;
+  using PairType = std::pair<KeyType, ValueType>;
 
   const int n = state.range(0);
   const auto keys = make_shuffled_keys(n, 0xC0FFEEULL);
   const auto pairs = make_pairs(keys);
   const auto miss_keys = make_miss_keys(keys, static_cast<KeyType>(n) + 11);
 
-  // Check if device allocator is available
-  const int device_allocator_id = axom::execution_space<DeviceExec>::allocatorID();
-  if(device_allocator_id == axom::INVALID_ALLOCATOR_ID)
+  int allocator_id = axom::INVALID_ALLOCATOR_ID;
+  if(!get_allocator_or_skip<ExecSpace>(state, allocator_id))
   {
-    state.SkipWithError("Device allocator not available");
     return;
   }
 
-  // Use axom::Array for host data
-  using PairType = std::pair<KeyType, ValueType>;
-  axom::Array<PairType> host_pairs(pairs.size(), pairs.size());
-  std::copy(pairs.begin(), pairs.end(), host_pairs.data());
-
-  // Copy to device using axom::Array with device allocator
-  axom::Array<PairType> device_pairs(pairs.size(), pairs.size(), device_allocator_id);
-  axom::copy(device_pairs.data(), host_pairs.data(), sizeof(PairType) * pairs.size());
-
-  // Create and populate map on device
-  MapType map(axom::Allocator {device_allocator_id});
+  axom::Array<PairType> exec_pairs = copy_to_allocator(pairs, allocator_id);
+  MapType map(axom::Allocator {allocator_id});
   map.reserve(static_cast<axom::IndexType>(pairs.size()));
 
-  // Use raw pointer from ArrayView for template instantiation
-  auto pairs_view = device_pairs.view();
-  map.template insert<DeviceExec>(pairs_view.data(), pairs_view.data() + pairs_view.size());
-
-  // Copy miss keys to device using axom::Array
-  axom::Array<KeyType> host_miss_keys(miss_keys.size(), miss_keys.size());
-  std::copy(miss_keys.begin(), miss_keys.end(), host_miss_keys.data());
+  auto pairs_view = exec_pairs.view();
+  map.template insert<ExecSpace>(pairs_view.data(), pairs_view.data() + pairs_view.size());
+  axom::synchronize<ExecSpace>();
 
-  axom::Array<KeyType> device_miss_keys(miss_keys.size(), miss_keys.size(), device_allocator_id);
-  axom::copy(device_miss_keys.data(), host_miss_keys.data(), sizeof(KeyType) * miss_keys.size());
+  axom::Array<KeyType> exec_miss_keys = copy_to_allocator(miss_keys, allocator_id);
+  axom::Array<int> exec_misses(miss_keys.size(), miss_keys.size(), allocator_id);
 
-  // Allocate result array on device using axom::Array
-  axom::Array<int> device_misses(miss_keys.size(), miss_keys.size(), device_allocator_id);
-
-  // Get device-safe views for kernel capture
   // Note: Using explicit [=] AXOM_HOST_DEVICE instead of AXOM_LAMBDA to avoid
   // RAJA privatizer issues on HIP with non-trivial types in capture
   auto map_view = map.view();
-  auto miss_keys_view = device_miss_keys.view();
-  auto misses_view = device_misses.view();
+  auto miss_keys_view = exec_miss_keys.view();
+  auto misses_view = exec_misses.view();
 
   for(auto _ : state)
   {
-    // Perform lookups in parallel using ArrayViews
-    axom::for_all<DeviceExec>(static_cast<axom::IndexType>(miss_keys.size()),
-                              [=] AXOM_HOST_DEVICE(axom::IndexType i) {
-                                misses_view[i] =
-                                  (map_view.find(miss_keys_view[i]) == map_view.end()) ? 1 : 0;
-                              });
+    axom::for_all<ExecSpace>(static_cast<axom::IndexType>(miss_keys.size()),
+                             [=] AXOM_HOST_DEVICE(axom::IndexType i) {
+                               misses_view[i] =
+                                 (map_view.find(miss_keys_view[i]) == map_view.end()) ? 1 : 0;
+                             });
 
-    // Synchronize to ensure device operations complete
-    axom::synchronize<DeviceExec>();
+    axom::synchronize<ExecSpace>();
 
-    benchmark::DoNotOptimize(device_misses.data());
+    benchmark::DoNotOptimize(exec_misses.data());
   }
 }
 
-  #endif  // AXOM_USE_CUDA || AXOM_USE_HIP
-#endif    // AXOM_USE_RAJA && AXOM_USE_UMPIRE
-
 }  // namespace
 
 //-----------------------------------------------------------------------------
@@ -863,6 +805,33 @@ void RegisterFlatMapPrehashedBenchmarks()
   }
 }
 
+template <typename ExecSpace>
+void RegisterFlatMapExecSpaceBenchmarks(const std::string& exec_suffix,
+                                        const std::string& sanity_prefix)
+{
+  benchmark::RegisterBenchmark(axom::fmt::format("{}::sanity_check", sanity_prefix),
+                               &BM_ExecSpace_Sanity_Check<ExecSpace>)
+    ->Apply(CustomArgs);
+
+  if((::args_benchmark_features & FlatMapFeatureBenchmarks::BatchedInsertion) !=
+     FlatMapFeatureBenchmarks::None)
+  {
+    benchmark::RegisterBenchmark(axom::fmt::format("axom::FlatMap::insert_{}_reserved", exec_suffix),
+                                 &BM_FlatMap_Insert_ExecSpace_Reserved<ExecSpace>)
+      ->Apply(CustomArgs);
+  }
+
+  if((::args_benchmark_features & FlatMapFeatureBenchmarks::Lookup) != FlatMapFeatureBenchmarks::None)
+  {
+    benchmark::RegisterBenchmark(axom::fmt::format("axom::FlatMap::find_hit_{}", exec_suffix),
+                                 &BM_FlatMap_Find_Hit_ExecSpace<ExecSpace>)
+      ->Apply(CustomArgs);
+    benchmark::RegisterBenchmark(axom::fmt::format("axom::FlatMap::find_miss_{}", exec_suffix),
+                                 &BM_FlatMap_Find_Miss_ExecSpace<ExecSpace>)
+      ->Apply(CustomArgs);
+  }
+}
+
 int main(int argc, char* argv[])
 {
   std::vector<int> local_test_sizes;
@@ -984,29 +953,17 @@ int main(int argc, char* argv[])
     "axom::google::sparse_hash_map");
 #endif
 
-  // Device/parallel benchmarks for debugging
-#if defined(AXOM_USE_RAJA) && defined(AXOM_USE_UMPIRE) && \
-  (defined(AXOM_USE_CUDA) || defined(AXOM_USE_HIP))
-
-  // Device benchmarks enabled with raw pointers (iterators cause host stack address faults)
-  benchmark::RegisterBenchmark("Device::sanity_check", &BM_Device_Sanity_Check)->Apply(CustomArgs);
+  RegisterFlatMapExecSpaceBenchmarks<axom::SEQ_EXEC>("seq", "SEQ");
 
-  if((::args_benchmark_features & FlatMapFeatureBenchmarks::BatchedInsertion) !=
-     FlatMapFeatureBenchmarks::None)
-  {
-    benchmark::RegisterBenchmark("axom::FlatMap::insert_device_reserved",
-                                 &BM_FlatMap_Insert_Device_Reserved)
-      ->Apply(CustomArgs);
-  }
+#if defined(AXOM_USE_OPENMP) && defined(AXOM_USE_RAJA)
+  RegisterFlatMapExecSpaceBenchmarks<axom::OMP_EXEC>("omp", "OMP");
+#endif
 
-  if((::args_benchmark_features & FlatMapFeatureBenchmarks::Lookup) != FlatMapFeatureBenchmarks::None)
-  {
-    benchmark::RegisterBenchmark("axom::FlatMap::find_hit_device", &BM_FlatMap_Find_Hit_Device)
-      ->Apply(CustomArgs);
-    benchmark::RegisterBenchmark("axom::FlatMap::find_miss_device", &BM_FlatMap_Find_Miss_Device)
-      ->Apply(CustomArgs);
-  }
-#endif  // AXOM_USE_RAJA && AXOM_USE_UMPIRE && (AXOM_USE_CUDA || AXOM_USE_HIP)
+#if defined(AXOM_USE_RAJA) && defined(AXOM_USE_UMPIRE) && defined(AXOM_USE_HIP)
+  RegisterFlatMapExecSpaceBenchmarks<axom::HIP_EXEC<256>>("device", "Device");
+#elif defined(AXOM_USE_RAJA) && defined(AXOM_USE_UMPIRE) && defined(AXOM_USE_CUDA)
+  RegisterFlatMapExecSpaceBenchmarks<axom::CUDA_EXEC<256>>("device", "Device");
+#endif
 
   ::benchmark::RunSpecifiedBenchmarks();
   return 0;

From fd31c5e5d63f7d0bad89ed2b6506abb1582ede7f Mon Sep 17 00:00:00 2001
From: Kenneth Weiss <weiss27@llnl.gov>
Date: Thu, 11 Jun 2026 17:01:01 -0700
Subject: [PATCH 26/28] Add number of threads to omp benchmarks

---
 .../core/tests/core_benchmark_flatmap.cpp     | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/src/axom/core/tests/core_benchmark_flatmap.cpp b/src/axom/core/tests/core_benchmark_flatmap.cpp
index c99dc61e20..35119b1a7a 100644
--- a/src/axom/core/tests/core_benchmark_flatmap.cpp
+++ b/src/axom/core/tests/core_benchmark_flatmap.cpp
@@ -13,6 +13,10 @@
 #include "axom/CLI11.hpp"
 #include "axom/fmt.hpp"
 
+#if defined(AXOM_USE_OPENMP)
+  #include <omp.h>
+#endif
+
 #if defined(AXOM_USE_SPARSEHASH)
   #include "axom/sparsehash/sparse_hash_map"
 #endif
@@ -805,6 +809,18 @@ void RegisterFlatMapPrehashedBenchmarks()
   }
 }
 
+#if defined(AXOM_USE_OPENMP) && defined(AXOM_USE_RAJA)
+std::string make_openmp_exec_suffix()
+{
+  return axom::fmt::format("omp_{}t", omp_get_max_threads());
+}
+
+std::string make_openmp_sanity_prefix()
+{
+  return axom::fmt::format("OMP_{}t", omp_get_max_threads());
+}
+#endif
+
 template <typename ExecSpace>
 void RegisterFlatMapExecSpaceBenchmarks(const std::string& exec_suffix,
                                         const std::string& sanity_prefix)
@@ -956,7 +972,8 @@ int main(int argc, char* argv[])
   RegisterFlatMapExecSpaceBenchmarks<axom::SEQ_EXEC>("seq", "SEQ");
 
 #if defined(AXOM_USE_OPENMP) && defined(AXOM_USE_RAJA)
-  RegisterFlatMapExecSpaceBenchmarks<axom::OMP_EXEC>("omp", "OMP");
+  RegisterFlatMapExecSpaceBenchmarks<axom::OMP_EXEC>(make_openmp_exec_suffix(),
+                                                     make_openmp_sanity_prefix());
 #endif
 
 #if defined(AXOM_USE_RAJA) && defined(AXOM_USE_UMPIRE) && defined(AXOM_USE_HIP)

From a85db8211547ab4724d1f8c916ddd89f0e223457 Mon Sep 17 00:00:00 2001
From: Kenneth Weiss <weiss27@llnl.gov>
Date: Thu, 11 Jun 2026 17:05:21 -0700
Subject: [PATCH 27/28] Updates RELEASE-NOTES

---
 RELEASE-NOTES.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md
index a4bd67cc64..059766fcb2 100644
--- a/RELEASE-NOTES.md
+++ b/RELEASE-NOTES.md
@@ -59,6 +59,8 @@ The Axom project release numbers follow [Semantic Versioning](http://semver.org/
 - Primal: Improves reproducibility of 3D GWN methods by removing some sources of randomness
 - Core: ArrayView assigments/copies now copy the stride
 - Core: Array construction from strided ArrayView now correctly copies the strided elements
+- Core: Improved `axom::FlatMap` insertion performance by fusing duplicate-key lookup with empty-slot probing.
+- Core: Updated DeviceHash to use 64-bit hash results and improved coverage for integer and floating-point hashing.
 
 ## [Version 0.14.0] - Release date 2026-03-31
 

From d8bb8e9d3ae99fddfa806581d6257aae8e31b5c8 Mon Sep 17 00:00:00 2001
From: Kenneth Weiss <weiss27@llnl.gov>
Date: Fri, 12 Jun 2026 18:55:32 -0700
Subject: [PATCH 28/28] Bugfix for rzvector -- `if constexpr` needs an `else`

---
 src/CMakeLists.txt            |   2 +-
 src/axom/core/FlatMapUtil.hpp | 393 +++++++++++++++++-----------------
 2 files changed, 198 insertions(+), 197 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 362e28914d..08eabf44f7 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -42,7 +42,7 @@ else()
     endif()
 endif()
 
-if (“${PROJECT_SOURCE_DIR}” STREQUAL “${CMAKE_SOURCE_DIR}”)
+if ("${PROJECT_SOURCE_DIR}" STREQUAL "${CMAKE_SOURCE_DIR}")
     # Set some default BLT options before loading BLT only if not included in
     # another project
     if (NOT BLT_CXX_STD)
diff --git a/src/axom/core/FlatMapUtil.hpp b/src/axom/core/FlatMapUtil.hpp
index da0b363631..24cb9e38d6 100644
--- a/src/axom/core/FlatMapUtil.hpp
+++ b/src/axom/core/FlatMapUtil.hpp
@@ -285,233 +285,234 @@ void FlatMap<KeyType, ValueType, Hash>::insert(InputIt kv_begin, InputIt kv_end)
       this->insert_or_assign(std::forward<decltype(kv)>(kv).first,
                              std::forward<decltype(kv)>(kv).second);
     }
-    return;
   }
-
-  using HashResult = typename Hash::result_type;
-  using GroupBucket = detail::flat_map::GroupBucket;
-
-  IndexType num_elems = std::distance(kv_begin, kv_end);
-
-  // Batched insertion assumes probing sequences are gap-free
-  // (i.e., there are no tombstones from prior erase() operations).
-  // When tombstones exist, the parallel insertion logic can mishandle duplicates
-  // under contention (e.g. OpenMP) and produce incorrect size/value results.
-  //
-  // If tombstones exist, rehash to compact the table and restore the invariants required by this algorithm.
-  if(this->m_loadCount != static_cast<std::uint64_t>(this->m_size))
+  else
   {
-    this->rehash(this->m_size + num_elems);
-  }
+    using HashResult = typename Hash::result_type;
+    using GroupBucket = detail::flat_map::GroupBucket;
+
+    IndexType num_elems = std::distance(kv_begin, kv_end);
+
+    // Batched insertion assumes probing sequences are gap-free
+    // (i.e., there are no tombstones from prior erase() operations).
+    // When tombstones exist, the parallel insertion logic can mishandle duplicates
+    // under contention (e.g. OpenMP) and produce incorrect size/value results.
+    //
+    // If tombstones exist, rehash to compact the table and restore the invariants required by this algorithm.
+    if(this->m_loadCount != static_cast<std::uint64_t>(this->m_size))
+    {
+      this->rehash(this->m_size + num_elems);
+    }
 
-  const bool is_gap_free = (this->m_loadCount == static_cast<std::uint64_t>(this->m_size));
+    const bool is_gap_free = (this->m_loadCount == static_cast<std::uint64_t>(this->m_size));
 
-  // Assume that all elements will be inserted into an empty slot.
-  this->reserve(this->size() + num_elems);
+    // Assume that all elements will be inserted into an empty slot.
+    this->reserve(this->size() + num_elems);
 
-  FlatMap<KeyType, ValueType, Hash> temp;
-  bool allocate_temp_map = false;
+    FlatMap<KeyType, ValueType, Hash> temp;
+    bool allocate_temp_map = false;
 #if defined(AXOM_USE_CUDA) && defined(AXOM_USE_UMPIRE)
-  if(this->m_allocator.getSpace() == MemorySpace::Pinned)
-  {
-    // Pinned memory is allocated on the CPU, and is not always coherent with respect to the GPU.
-    // Instead of using system-scope atomics, we just construct a temporary map in device memory
-    // and copy it back to the pinned space.
-    axom::Allocator device_allocator {axom::detail::getAllocatorID<MemorySpace::Device>()};
-    temp = FlatMap(*this, device_allocator);
-    allocate_temp_map = true;
-  }
+    if(this->m_allocator.getSpace() == MemorySpace::Pinned)
+    {
+      // Pinned memory is allocated on the CPU, and is not always coherent with respect to the GPU.
+      // Instead of using system-scope atomics, we just construct a temporary map in device memory
+      // and copy it back to the pinned space.
+      axom::Allocator device_allocator {axom::detail::getAllocatorID<MemorySpace::Device>()};
+      temp = FlatMap(*this, device_allocator);
+      allocate_temp_map = true;
+    }
 #endif
-  FlatMap<KeyType, ValueType, Hash>& map = allocate_temp_map ? temp : *this;
-
-  // Grab some needed internal fields from the flat map.
-  // We're going to be constructing metadata and the K-V pairs directly
-  // in-place.
-  const int ngroups_pow_2 = map.m_numGroups2;
-  const auto meta_group = map.m_metadata.view();
-  const auto buckets = map.m_buckets.view();
-
-  // Construct an array of locks per-group. This guards metadata updates for
-  // each insertion.
-  const IndexType num_groups = 1 << ngroups_pow_2;
-  Array<detail::SpinLock> lock_vec(num_groups, num_groups, map.m_allocator.getID());
-  const auto group_locks = lock_vec.view();
-
-  // Map bucket slots to k-v pair indices. This is used to deduplicate pairs
-  // with the same key value.
-  Array<IndexType> key_index_dedup_vec(0, 0, map.m_allocator.getID());
-  key_index_dedup_vec.resize(num_groups * GroupBucket::Size, -1);
-  const auto key_index_dedup = key_index_dedup_vec.view();
-
-  // Map k-v pair indices to bucket slots. This is essentially the inverse of
-  // the above mapping.
-  Array<IndexType> key_index_to_bucket_vec(num_elems, num_elems, map.m_allocator.getID());
-  const auto key_index_to_bucket = key_index_to_bucket_vec.view();
-
-  axom::ReduceSum<ExecSpace, IndexType> total_overwrites(0);
-
-  for_all<ExecSpace>(
-    num_elems,
-    AXOM_LAMBDA(IndexType idx) {
-      // Construct key.
-      KeyType key = (*(kv_begin + idx)).first;
-
-      // Hash keys.
-      auto hash = Hash {}(key);
-
-      // We use the k MSBs of the hash as the initial group probe point, where ngroups = 2^k.
-      const auto init =
-        detail::flat_map::SequentialLookupPolicy<HashResult>::initGroupProbe(hash, ngroups_pow_2);
-      const HashResult group_mask = init.group_mask;
-      HashResult curr_group = init.curr_group;
-
-      std::uint8_t hash_8 = static_cast<std::uint8_t>(hash);
-
-      IndexType duplicate_bucket_index = -1;
-      IndexType empty_bucket_index = -1;
-      int iteration = 0;
-      while(iteration < meta_group.size())
-      {
-        // Try to lock the group. We do this in a non-blocking manner to avoid
-        // intra-warp progress hazards.
-        bool group_locked = group_locks[curr_group].tryLock();
-
-        if(group_locked)
+    FlatMap<KeyType, ValueType, Hash>& map = allocate_temp_map ? temp : *this;
+
+    // Grab some needed internal fields from the flat map.
+    // We're going to be constructing metadata and the K-V pairs directly
+    // in-place.
+    const int ngroups_pow_2 = map.m_numGroups2;
+    const auto meta_group = map.m_metadata.view();
+    const auto buckets = map.m_buckets.view();
+
+    // Construct an array of locks per-group. This guards metadata updates for
+    // each insertion.
+    const IndexType num_groups = 1 << ngroups_pow_2;
+    Array<detail::SpinLock> lock_vec(num_groups, num_groups, map.m_allocator.getID());
+    const auto group_locks = lock_vec.view();
+
+    // Map bucket slots to k-v pair indices. This is used to deduplicate pairs
+    // with the same key value.
+    Array<IndexType> key_index_dedup_vec(0, 0, map.m_allocator.getID());
+    key_index_dedup_vec.resize(num_groups * GroupBucket::Size, -1);
+    const auto key_index_dedup = key_index_dedup_vec.view();
+
+    // Map k-v pair indices to bucket slots. This is essentially the inverse of
+    // the above mapping.
+    Array<IndexType> key_index_to_bucket_vec(num_elems, num_elems, map.m_allocator.getID());
+    const auto key_index_to_bucket = key_index_to_bucket_vec.view();
+
+    axom::ReduceSum<ExecSpace, IndexType> total_overwrites(0);
+
+    for_all<ExecSpace>(
+      num_elems,
+      AXOM_LAMBDA(IndexType idx) {
+        // Construct key.
+        KeyType key = (*(kv_begin + idx)).first;
+
+        // Hash keys.
+        auto hash = Hash {}(key);
+
+        // We use the k MSBs of the hash as the initial group probe point, where ngroups = 2^k.
+        const auto init =
+          detail::flat_map::SequentialLookupPolicy<HashResult>::initGroupProbe(hash, ngroups_pow_2);
+        const HashResult group_mask = init.group_mask;
+        HashResult curr_group = init.curr_group;
+
+        std::uint8_t hash_8 = static_cast<std::uint8_t>(hash);
+
+        IndexType duplicate_bucket_index = -1;
+        IndexType empty_bucket_index = -1;
+        int iteration = 0;
+        while(iteration < meta_group.size())
         {
-          // Every bucket visit - check prior filled buckets for duplicate
-          // keys.
-          meta_group[curr_group].visitHashBucket(hash_8, [&](int matching_slot) -> bool {
-            IndexType bucket_index = curr_group * GroupBucket::Size + matching_slot;
+          // Try to lock the group. We do this in a non-blocking manner to avoid
+          // intra-warp progress hazards.
+          bool group_locked = group_locks[curr_group].tryLock();
 
-            if(buckets[bucket_index].get().first == key)
-            {
-              duplicate_bucket_index = bucket_index;
-              return false;  // Don't need to search other buckets.
-            }
-            return true;
-          });
-          int empty_slot_index = meta_group[curr_group].getEmptyBucket();
-
-          if(duplicate_bucket_index == -1 && empty_bucket_index == -1)
+          if(group_locked)
           {
-            // Default probing behavior: no duplicate found yet, and no empty
-            // bucket found prior.
-            if(empty_slot_index == GroupBucket::InvalidSlot)
-            {
-              // Group is full. Set overflow bit for the group.
-              meta_group[curr_group].template setOverflow<true>(hash_8);
-            }
-            else
+            // Every bucket visit - check prior filled buckets for duplicate
+            // keys.
+            meta_group[curr_group].visitHashBucket(hash_8, [&](int matching_slot) -> bool {
+              IndexType bucket_index = curr_group * GroupBucket::Size + matching_slot;
+
+              if(buckets[bucket_index].get().first == key)
+              {
+                duplicate_bucket_index = bucket_index;
+                return false;  // Don't need to search other buckets.
+              }
+              return true;
+            });
+            int empty_slot_index = meta_group[curr_group].getEmptyBucket();
+
+            if(duplicate_bucket_index == -1 && empty_bucket_index == -1)
             {
-              // Update empty bucket index with first empty slot we encounter.
-              empty_bucket_index = curr_group * GroupBucket::Size + empty_slot_index;
-              key_index_dedup[empty_bucket_index] = idx;
-              key_index_to_bucket[idx] = empty_bucket_index;
-
-              // Insert initial element, this will be updated with the value of
-              // the "winning" key-value pair.
-              meta_group[curr_group].template setBucket<true>(empty_slot_index, hash_8);
+              // Default probing behavior: no duplicate found yet, and no empty
+              // bucket found prior.
+              if(empty_slot_index == GroupBucket::InvalidSlot)
+              {
+                // Group is full. Set overflow bit for the group.
+                meta_group[curr_group].template setOverflow<true>(hash_8);
+              }
+              else
+              {
+                // Update empty bucket index with first empty slot we encounter.
+                empty_bucket_index = curr_group * GroupBucket::Size + empty_slot_index;
+                key_index_dedup[empty_bucket_index] = idx;
+                key_index_to_bucket[idx] = empty_bucket_index;
+
+                // Insert initial element, this will be updated with the value of
+                // the "winning" key-value pair.
+                meta_group[curr_group].template setBucket<true>(empty_slot_index, hash_8);
 #if defined(__CUDA_ARCH__)
-              detail::constructPairInPlace(buckets[empty_bucket_index].get(),
-                                           key,
-                                           (*(kv_begin + idx)).second);
+                detail::constructPairInPlace(buckets[empty_bucket_index].get(),
+                                             key,
+                                             (*(kv_begin + idx)).second);
 #else
-              new(&buckets[empty_bucket_index]) KeyValuePair(*(kv_begin + idx));
+                new(&buckets[empty_bucket_index]) KeyValuePair(*(kv_begin + idx));
 #endif
+              }
             }
-          }
-          else if(duplicate_bucket_index != -1)
-          {
-            // Found a duplicate bucket.
-            if(!is_gap_free && empty_bucket_index != -1)
-            {
-              // We've already encountered an empty bucket earlier to place a
-              // k-v pair. This may occur if a probing sequence contains gaps
-              // (insertions followed by erasures).
-              //
-              // Just erase this element.
-              total_overwrites += 1;
-
-              int slot_index = duplicate_bucket_index - curr_group * GroupBucket::Size;
-              buckets[duplicate_bucket_index].get().~KeyValuePair();
-              meta_group[curr_group].clearBucket(slot_index);
-            }
-            else
+            else if(duplicate_bucket_index != -1)
             {
-              if(key_index_dedup[duplicate_bucket_index] == -1)
+              // Found a duplicate bucket.
+              if(!is_gap_free && empty_bucket_index != -1)
               {
-                // The k-v pair matches an already-existing pair in the map.
-                // Keep track of the number of overwrites so that we don't
-                // double-count them when incrementing the size.
+                // We've already encountered an empty bucket earlier to place a
+                // k-v pair. This may occur if a probing sequence contains gaps
+                // (insertions followed by erasures).
+                //
+                // Just erase this element.
                 total_overwrites += 1;
+
+                int slot_index = duplicate_bucket_index - curr_group * GroupBucket::Size;
+                buckets[duplicate_bucket_index].get().~KeyValuePair();
+                meta_group[curr_group].clearBucket(slot_index);
+              }
+              else
+              {
+                if(key_index_dedup[duplicate_bucket_index] == -1)
+                {
+                  // The k-v pair matches an already-existing pair in the map.
+                  // Keep track of the number of overwrites so that we don't
+                  // double-count them when incrementing the size.
+                  total_overwrites += 1;
+                }
+                // Highest-indexed kv pair wins.
+                axom::atomicMax<ExecSpace>(&key_index_dedup[duplicate_bucket_index], idx);
+                key_index_to_bucket[idx] = duplicate_bucket_index;
               }
-              // Highest-indexed kv pair wins.
-              axom::atomicMax<ExecSpace>(&key_index_dedup[duplicate_bucket_index], idx);
-              key_index_to_bucket[idx] = duplicate_bucket_index;
             }
-          }
-          // Unlock group once we're done.
-          group_locks[curr_group].unlock();
+            // Unlock group once we're done.
+            group_locks[curr_group].unlock();
 
-          if(duplicate_bucket_index != -1)
-          {
-            // We've found a duplicate key to overwrite.
-            break;
-          }
-          else if(empty_bucket_index != -1 &&
-                  (is_gap_free || !meta_group[curr_group].getMaybeOverflowed(hash_8)))
-          {
-            // If we're inserting into a gap-free map, empty bucket signals the
-            // end of the probing sequence.
-            // Otherwise, we need to check the overflow mask to continue probing.
-            break;
-          }
-          else
-          {
-            // Move to next group.
-            curr_group = (curr_group + LookupPolicy {}.getNext(iteration)) & group_mask;
-            iteration++;
+            if(duplicate_bucket_index != -1)
+            {
+              // We've found a duplicate key to overwrite.
+              break;
+            }
+            else if(empty_bucket_index != -1 &&
+                    (is_gap_free || !meta_group[curr_group].getMaybeOverflowed(hash_8)))
+            {
+              // If we're inserting into a gap-free map, empty bucket signals the
+              // end of the probing sequence.
+              // Otherwise, we need to check the overflow mask to continue probing.
+              break;
+            }
+            else
+            {
+              // Move to next group.
+              curr_group = (curr_group + LookupPolicy {}.getNext(iteration)) & group_mask;
+              iteration++;
+            }
           }
         }
-      }
-    });
-
-  // Add a counter for duplicated inserts.
-  axom::ReduceSum<ExecSpace, IndexType> total_inserts(0);
-
-  // Using key-deduplication map, assign unique k-v pairs to buckets.
-  for_all<ExecSpace>(
-    num_elems,
-    AXOM_LAMBDA(IndexType kv_idx) {
-      IndexType bucket_idx = key_index_to_bucket[kv_idx];
-      IndexType winning_idx = key_index_dedup[bucket_idx];
-      // Place k-v pair at bucket_idx.
-      if(kv_idx == winning_idx)
-      {
+      });
+
+    // Add a counter for duplicated inserts.
+    axom::ReduceSum<ExecSpace, IndexType> total_inserts(0);
+
+    // Using key-deduplication map, assign unique k-v pairs to buckets.
+    for_all<ExecSpace>(
+      num_elems,
+      AXOM_LAMBDA(IndexType kv_idx) {
+        IndexType bucket_idx = key_index_to_bucket[kv_idx];
+        IndexType winning_idx = key_index_dedup[bucket_idx];
+        // Place k-v pair at bucket_idx.
+        if(kv_idx == winning_idx)
+        {
 #if defined(__CUDA_ARCH__)
-        detail::constructPairInPlace(buckets[bucket_idx].get(),
-                                     (*(kv_begin + kv_idx)).first,
-                                     (*(kv_begin + kv_idx)).second);
+          detail::constructPairInPlace(buckets[bucket_idx].get(),
+                                       (*(kv_begin + kv_idx)).first,
+                                       (*(kv_begin + kv_idx)).second);
 #else
-        new(&buckets[bucket_idx]) KeyValuePair(*(kv_begin + kv_idx));
+          new(&buckets[bucket_idx]) KeyValuePair(*(kv_begin + kv_idx));
 #endif
-        total_inserts += 1;
-      }
-    });
+          total_inserts += 1;
+        }
+      });
 
-  map.m_size += total_inserts.get() - total_overwrites.get();
-  map.m_loadCount += total_inserts.get() - total_overwrites.get();
+    map.m_size += total_inserts.get() - total_overwrites.get();
+    map.m_loadCount += total_inserts.get() - total_overwrites.get();
 
 #if defined(AXOM_USE_CUDA) && defined(AXOM_USE_UMPIRE)
-  if(allocate_temp_map)
-  {
-    // Original pinned map is in temp.
-    axom::Allocator pinned_allocator = temp.getAllocator();
+    if(allocate_temp_map)
+    {
+      // Original pinned map is in temp.
+      axom::Allocator pinned_allocator = temp.getAllocator();
 
-    // Move new FlatMap to pinned memory.
-    *this = FlatMap(map, pinned_allocator);
-  }
+      // Move new FlatMap to pinned memory.
+      *this = FlatMap(map, pinned_allocator);
+    }
 #endif
+  }
 }
 
 }  // namespace axom