From bc15c1d5b20f5fbd3d5b7f28faa41fd8b93a4fb1 Mon Sep 17 00:00:00 2001
From: Graydon Hoare <graydon@pobox.com>
Date: Thu, 2 Jul 2026 18:07:03 -0700
Subject: [PATCH 1/3] bump medida to take new optimizations

---
 lib/libmedida | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lib/libmedida b/lib/libmedida
index b2caac89d..2bf1afac2 160000
--- a/lib/libmedida
+++ b/lib/libmedida
@@ -1 +1 @@
-Subproject commit b2caac89d54c9ec2b6be7fa7c020aa1b0859206b
+Subproject commit 2bf1afac2911c9aca9d1ba06e3d883fa150c9baf

From 35dc8d4b5b81f4ed5f9e5d11685702833c1a1330 Mon Sep 17 00:00:00 2001
From: dmkozh <dmytro@stellar.org>
Date: Thu, 11 Jun 2026 17:32:13 -0400
Subject: [PATCH 2/3] track SimpleTimer max with an atomic CAS instead of a
 mutex

SimpleTimer::Update is called from the parallel apply threads (notably the
bucket point-load timers), where the per-update mutex around max tracking
was a cross-thread contention point. The sum/count counters were already
atomic; with a compare-exchange max the whole timer is now lock-free.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/util/SimpleTimer.cpp | 11 ++++++-----
 src/util/SimpleTimer.h   |  7 +++----
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/util/SimpleTimer.cpp b/src/util/SimpleTimer.cpp
index 959a48985..924a26b1b 100644
--- a/src/util/SimpleTimer.cpp
+++ b/src/util/SimpleTimer.cpp
@@ -26,9 +26,7 @@ SimpleTimer::SimpleTimer(MetricsRegistry& registry, SimpleTimerName const& name,
 void
 SimpleTimer::syncMax()
 {
-    MutexLocker lock{mLock};
-    mMaxSampleValue.set_count(mMax);
-    mMax = 0;
+    mMaxSampleValue.set_count(mMax.exchange(0, std::memory_order_relaxed));
 }
 
 std::int64_t
@@ -43,9 +41,12 @@ SimpleTimer::Update(std::chrono::nanoseconds d)
     int64_t converted = d / mDurationUnit;
     mSum.inc(converted);
     mSampleCount.inc(1);
+    // Lock-free max tracking.
+    int64_t prev = mMax.load(std::memory_order_relaxed);
+    while (prev < converted &&
+           !mMax.compare_exchange_weak(prev, converted,
+                                       std::memory_order_relaxed))
     {
-        MutexLocker lock{mLock};
-        mMax = std::max(mMax, converted);
     }
 }
 
diff --git a/src/util/SimpleTimer.h b/src/util/SimpleTimer.h
index 91d0cc469..d1058191f 100644
--- a/src/util/SimpleTimer.h
+++ b/src/util/SimpleTimer.h
@@ -1,6 +1,7 @@
 #pragma once
 
-#include "util/ThreadAnnotations.h"
+#include <atomic>
+#include <chrono>
 #include <medida/counter.h>
 #include <medida/metric_name.h>
 
@@ -41,9 +42,7 @@ class SimpleTimer
     // metrics, but it is only synced on `syncMax()` so that the value of the
     // counter is clear.
     medida::Counter& mMaxSampleValue;
-    std::int64_t mMax GUARDED_BY(mLock);
-
-    ANNOTATED_MUTEX(mLock);
+    std::atomic<std::int64_t> mMax;
 
     std::chrono::nanoseconds const mDurationUnit;
 

From 83885c25044843929b046da8b50fb8dde031ced7 Mon Sep 17 00:00:00 2001
From: dmkozh <dmytro@stellar.org>
Date: Thu, 11 Jun 2026 17:32:28 -0400
Subject: [PATCH 3/3] batch apply-path soroban metrics per thread, publish at
 ledger close

With metrics enabled, every InvokeHostFunction op marked ~20 process-wide
meters and updated ~7 histograms/timers from the apply threads (~25
contended lock/cache-line operations per op), and the parallel path also
performed a registry lookup of the ledger.operation.apply timer per
transaction. On the apply-load benchmark (sac, TX=6000, T=8) this made
close time 282ms with metrics vs 107ms without, with p99 spikes from CKMS
compaction; DISABLE_SOROBAN_METRICS_FOR_TESTING existed to hide this.

Instead, record all per-op/per-tx metric updates into a per-thread
SorobanMetrics::ApplyMetricsBatch (meter increments as plain sums, sample
streams for percentile-bearing histograms/timers as raw value vectors,
one brief uncontended lock per record) and drain all batches into the
underlying medida metrics once per ledger in
publishAndResetLedgerWideMetrics(), using the new UpdateMany bulk APIs.
Observable metric values are preserved: meter counts are identical sums,
EWMA rates tick at 5s granularity either way, and histograms receive the
exact same sample stream, just at close time.

The sequential apply path still updates medida directly (main thread,
cheap after the medida fixes). Combined with those fixes, metrics-on
close time drops to ~106.5ms vs ~104.1ms metrics-off (200-ledger run),
i.e. metric overhead goes from ~164% to ~2%.

Also adds a regression test asserting the batched metrics are visible
after the ledger close that applied the transaction.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/ledger/LedgerManagerImpl.cpp              |   9 +-
 src/ledger/SorobanMetrics.cpp                 | 153 ++++++++++++++++++
 src/ledger/SorobanMetrics.h                   | 134 +++++++++++++++
 .../ExtendFootprintTTLOpFrame.cpp             |  21 ++-
 .../InvokeHostFunctionOpFrame.cpp             | 101 +++++++-----
 src/transactions/RestoreFootprintOpFrame.cpp  |  23 ++-
 src/transactions/TransactionFrame.cpp         |  16 +-
 .../test/InvokeHostFunctionTests.cpp          |  36 +++++
 src/util/SimpleTimer.cpp                      |   5 +-
 9 files changed, 437 insertions(+), 61 deletions(-)

diff --git a/src/ledger/LedgerManagerImpl.cpp b/src/ledger/LedgerManagerImpl.cpp
index 513642590..2ca2d75e1 100644
--- a/src/ledger/LedgerManagerImpl.cpp
+++ b/src/ledger/LedgerManagerImpl.cpp
@@ -2415,12 +2415,13 @@ LedgerManagerImpl::applyThread(
 {
     for (auto const& txBundle : cluster)
     {
-        // Apply timer
-        std::optional<medida::TimerContext> txTime;
+        // Apply timer; samples go into the thread's metrics batch and are
+        // published at ledger close.
+        std::optional<BatchedTimerScope> txTime;
         if (!mApp.getConfig().DISABLE_SOROBAN_METRICS_FOR_TESTING)
         {
-            txTime.emplace(
-                mApplyState.getMetrics().mTransactionApply.TimeScope());
+            txTime.emplace(getSorobanMetrics(),
+                           &SorobanMetrics::ApplyMetricsBatch::mTxApplyNsecs);
         }
 
         Hash txSubSeed = subSha256(sorobanBasePrngSeed, txBundle.getTxNum());
diff --git a/src/ledger/SorobanMetrics.cpp b/src/ledger/SorobanMetrics.cpp
index 4e04c24c9..3aa72b928 100644
--- a/src/ledger/SorobanMetrics.cpp
+++ b/src/ledger/SorobanMetrics.cpp
@@ -1,7 +1,11 @@
 #include "ledger/SorobanMetrics.h"
 #include "util/MetricsRegistry.h"
 
+#include <medida/histogram.h>
+#include <medida/meter.h>
 #include <medida/metrics_registry.h>
+#include <medida/timer.h>
+#include <unordered_map>
 
 namespace stellar
 {
@@ -26,6 +30,8 @@ SorobanMetrics::SorobanMetrics(MetricsRegistry& metrics)
 
     /* tx-wide metrics */
     , mTxSizeByte(metrics.NewHistogram({"soroban", "tx", "size-byte"}))
+    , mTransactionApply(metrics.NewTimer({"ledger", "transaction", "apply"}))
+    , mOperationApply(metrics.NewTimer({"ledger", "operation", "apply"}))
     /* InvokeHostFunctionOp metrics */
     , mHostFnOpReadEntry(
           metrics.NewMeter({"soroban", "host-fn-op", "read-entry"}, "entry"))
@@ -202,9 +208,156 @@ SorobanMetrics::accumulateLedgerWriteByte(uint64_t writeByte)
     mCounterLedgerWriteByte += writeByte;
 }
 
+SorobanMetrics::ApplyMetricsBatch&
+SorobanMetrics::getApplyThreadBatch()
+{
+    // One entry per (thread, SorobanMetrics instance): tests can run several
+    // applications in one process and threads outlive applications, so the
+    // weak_ptr is re-validated on every first-use (the registry owns the
+    // batches; a stale entry from a destroyed instance whose address got
+    // reused expires together with its registry).
+    static thread_local std::unordered_map<SorobanMetrics const*,
+                                           std::weak_ptr<ApplyMetricsBatch>>
+        tlBatches;
+    auto& weak = tlBatches[this];
+    if (auto existing = weak.lock())
+    {
+        return *existing;
+    }
+    auto batch = std::make_shared<ApplyMetricsBatch>();
+    {
+        std::lock_guard<std::mutex> lock(mApplyBatchesMutex);
+        mApplyBatches.push_back(batch);
+    }
+    weak = batch;
+    return *batch;
+}
+
+void
+SorobanMetrics::flushApplyMetricsBatches()
+{
+    std::vector<std::shared_ptr<ApplyMetricsBatch>> batches;
+    {
+        std::lock_guard<std::mutex> lock(mApplyBatchesMutex);
+        batches = mApplyBatches;
+    }
+    if (batches.empty())
+    {
+        return;
+    }
+
+    ApplyMetricsBatch total;
+    auto take = [](uint64_t& v) {
+        auto res = v;
+        v = 0;
+        return res;
+    };
+    auto drain = [](std::vector<int64_t>& dst, std::vector<int64_t>& src) {
+        dst.insert(dst.end(), src.begin(), src.end());
+        src.clear();
+    };
+    for (auto const& b : batches)
+    {
+        std::lock_guard<std::mutex> lock(b->mMutex);
+        total.mHostFnOpReadEntry += take(b->mHostFnOpReadEntry);
+        total.mHostFnOpWriteEntry += take(b->mHostFnOpWriteEntry);
+        total.mHostFnOpReadKeyByte += take(b->mHostFnOpReadKeyByte);
+        total.mHostFnOpWriteKeyByte += take(b->mHostFnOpWriteKeyByte);
+        total.mHostFnOpReadLedgerByte += take(b->mHostFnOpReadLedgerByte);
+        total.mHostFnOpReadDataByte += take(b->mHostFnOpReadDataByte);
+        total.mHostFnOpReadCodeByte += take(b->mHostFnOpReadCodeByte);
+        total.mHostFnOpWriteLedgerByte += take(b->mHostFnOpWriteLedgerByte);
+        total.mHostFnOpWriteDataByte += take(b->mHostFnOpWriteDataByte);
+        total.mHostFnOpWriteCodeByte += take(b->mHostFnOpWriteCodeByte);
+        total.mHostFnOpEmitEvent += take(b->mHostFnOpEmitEvent);
+        total.mHostFnOpEmitEventByte += take(b->mHostFnOpEmitEventByte);
+        total.mHostFnOpCpuInsn += take(b->mHostFnOpCpuInsn);
+        total.mHostFnOpMemByte += take(b->mHostFnOpMemByte);
+        total.mHostFnOpCpuInsnExclVm += take(b->mHostFnOpCpuInsnExclVm);
+        total.mHostFnOpMaxRwKeyByte += take(b->mHostFnOpMaxRwKeyByte);
+        total.mHostFnOpMaxRwDataByte += take(b->mHostFnOpMaxRwDataByte);
+        total.mHostFnOpMaxRwCodeByte += take(b->mHostFnOpMaxRwCodeByte);
+        total.mHostFnOpMaxEmitEventByte += take(b->mHostFnOpMaxEmitEventByte);
+        total.mHostFnOpSuccess += take(b->mHostFnOpSuccess);
+        total.mHostFnOpFailure += take(b->mHostFnOpFailure);
+        total.mExtFpTtlOpReadLedgerByte += take(b->mExtFpTtlOpReadLedgerByte);
+        total.mRestoreFpOpReadLedgerByte += take(b->mRestoreFpOpReadLedgerByte);
+        total.mRestoreFpOpWriteLedgerByte +=
+            take(b->mRestoreFpOpWriteLedgerByte);
+
+        drain(total.mHostFnOpInvokeTimeNsecs, b->mHostFnOpInvokeTimeNsecs);
+        drain(total.mHostFnOpInvokeTimeNsecsExclVm,
+              b->mHostFnOpInvokeTimeNsecsExclVm);
+        drain(total.mHostFnOpInvokeTimeFsecsCpuInsnRatio,
+              b->mHostFnOpInvokeTimeFsecsCpuInsnRatio);
+        drain(total.mHostFnOpInvokeTimeFsecsCpuInsnRatioExclVm,
+              b->mHostFnOpInvokeTimeFsecsCpuInsnRatioExclVm);
+        drain(total.mHostFnOpDeclaredInsnsUsageRatio,
+              b->mHostFnOpDeclaredInsnsUsageRatio);
+        drain(total.mHostFnOpExecNsecs, b->mHostFnOpExecNsecs);
+        drain(total.mExtFpTtlOpExecNsecs, b->mExtFpTtlOpExecNsecs);
+        drain(total.mRestoreFpOpExecNsecs, b->mRestoreFpOpExecNsecs);
+        drain(total.mTxSizeByte, b->mTxSizeByte);
+        drain(total.mTxApplyNsecs, b->mTxApplyNsecs);
+        drain(total.mOpApplyNsecs, b->mOpApplyNsecs);
+    }
+
+    // Publish into the underlying medida metrics, one bulk call per metric.
+    // Zero meter increments are skipped (a Mark(0) does not change any
+    // observable value); empty sample batches are no-ops in UpdateMany.
+    auto markIf = [](medida::Meter& meter, uint64_t value) {
+        if (value != 0)
+        {
+            meter.Mark(value);
+        }
+    };
+    markIf(mHostFnOpReadEntry, total.mHostFnOpReadEntry);
+    markIf(mHostFnOpWriteEntry, total.mHostFnOpWriteEntry);
+    markIf(mHostFnOpReadKeyByte, total.mHostFnOpReadKeyByte);
+    markIf(mHostFnOpWriteKeyByte, total.mHostFnOpWriteKeyByte);
+    markIf(mHostFnOpReadLedgerByte, total.mHostFnOpReadLedgerByte);
+    markIf(mHostFnOpReadDataByte, total.mHostFnOpReadDataByte);
+    markIf(mHostFnOpReadCodeByte, total.mHostFnOpReadCodeByte);
+    markIf(mHostFnOpWriteLedgerByte, total.mHostFnOpWriteLedgerByte);
+    markIf(mHostFnOpWriteDataByte, total.mHostFnOpWriteDataByte);
+    markIf(mHostFnOpWriteCodeByte, total.mHostFnOpWriteCodeByte);
+    markIf(mHostFnOpEmitEvent, total.mHostFnOpEmitEvent);
+    markIf(mHostFnOpEmitEventByte, total.mHostFnOpEmitEventByte);
+    markIf(mHostFnOpCpuInsn, total.mHostFnOpCpuInsn);
+    markIf(mHostFnOpMemByte, total.mHostFnOpMemByte);
+    markIf(mHostFnOpCpuInsnExclVm, total.mHostFnOpCpuInsnExclVm);
+    markIf(mHostFnOpMaxRwKeyByte, total.mHostFnOpMaxRwKeyByte);
+    markIf(mHostFnOpMaxRwDataByte, total.mHostFnOpMaxRwDataByte);
+    markIf(mHostFnOpMaxRwCodeByte, total.mHostFnOpMaxRwCodeByte);
+    markIf(mHostFnOpMaxEmitEventByte, total.mHostFnOpMaxEmitEventByte);
+    markIf(mHostFnOpSuccess, total.mHostFnOpSuccess);
+    markIf(mHostFnOpFailure, total.mHostFnOpFailure);
+    markIf(mExtFpTtlOpReadLedgerByte, total.mExtFpTtlOpReadLedgerByte);
+    markIf(mRestoreFpOpReadLedgerByte, total.mRestoreFpOpReadLedgerByte);
+    markIf(mRestoreFpOpWriteLedgerByte, total.mRestoreFpOpWriteLedgerByte);
+
+    mHostFnOpInvokeTimeNsecs.UpdateMany(total.mHostFnOpInvokeTimeNsecs);
+    mHostFnOpInvokeTimeNsecsExclVm.UpdateMany(
+        total.mHostFnOpInvokeTimeNsecsExclVm);
+    mHostFnOpInvokeTimeFsecsCpuInsnRatio.UpdateMany(
+        total.mHostFnOpInvokeTimeFsecsCpuInsnRatio);
+    mHostFnOpInvokeTimeFsecsCpuInsnRatioExclVm.UpdateMany(
+        total.mHostFnOpInvokeTimeFsecsCpuInsnRatioExclVm);
+    mHostFnOpDeclaredInsnsUsageRatio.UpdateMany(
+        total.mHostFnOpDeclaredInsnsUsageRatio);
+    mHostFnOpExec.UpdateMany(total.mHostFnOpExecNsecs);
+    mExtFpTtlOpExec.UpdateMany(total.mExtFpTtlOpExecNsecs);
+    mRestoreFpOpExec.UpdateMany(total.mRestoreFpOpExecNsecs);
+    mTxSizeByte.UpdateMany(total.mTxSizeByte);
+    mTransactionApply.UpdateMany(total.mTxApplyNsecs);
+    mOperationApply.UpdateMany(total.mOpApplyNsecs);
+}
+
 void
 SorobanMetrics::publishAndResetLedgerWideMetrics()
 {
+    flushApplyMetricsBatches();
+
     mLedgerTxCount.Update(mCounterLedgerTxCount);
     mLedgerCpuInsn.Update(mCounterLedgerCpuInsn);
     mLedgerTxsSizeByte.Update(mCounterLedgerTxsSizeByte);
diff --git a/src/ledger/SorobanMetrics.h b/src/ledger/SorobanMetrics.h
index 7266add08..62146673c 100644
--- a/src/ledger/SorobanMetrics.h
+++ b/src/ledger/SorobanMetrics.h
@@ -8,7 +8,11 @@
 // limits. It also performs aggregation of ledger-wide resource usage across
 // different operations.
 #include <atomic>
+#include <chrono>
 #include <cstdint>
+#include <memory>
+#include <mutex>
+#include <vector>
 
 namespace medida
 {
@@ -24,6 +28,58 @@ class MetricsRegistry;
 
 class SorobanMetrics
 {
+  public:
+    // Accumulates apply-path metric updates from a single thread. Hot apply
+    // code records into its own thread's batch (brief, uncontended lock
+    // acquisition), and the batches are drained into the underlying
+    // process-wide medida metrics once per ledger on the main thread via
+    // publishAndResetLedgerWideMetrics(). This keeps shared metric state (its
+    // locks and cache lines) off the parallel apply threads.
+    struct ApplyMetricsBatch
+    {
+        std::mutex mMutex;
+
+        // Pending Meter increments (Marks summed since the last publish).
+        uint64_t mHostFnOpReadEntry{0};
+        uint64_t mHostFnOpWriteEntry{0};
+        uint64_t mHostFnOpReadKeyByte{0};
+        uint64_t mHostFnOpWriteKeyByte{0};
+        uint64_t mHostFnOpReadLedgerByte{0};
+        uint64_t mHostFnOpReadDataByte{0};
+        uint64_t mHostFnOpReadCodeByte{0};
+        uint64_t mHostFnOpWriteLedgerByte{0};
+        uint64_t mHostFnOpWriteDataByte{0};
+        uint64_t mHostFnOpWriteCodeByte{0};
+        uint64_t mHostFnOpEmitEvent{0};
+        uint64_t mHostFnOpEmitEventByte{0};
+        uint64_t mHostFnOpCpuInsn{0};
+        uint64_t mHostFnOpMemByte{0};
+        uint64_t mHostFnOpCpuInsnExclVm{0};
+        uint64_t mHostFnOpMaxRwKeyByte{0};
+        uint64_t mHostFnOpMaxRwDataByte{0};
+        uint64_t mHostFnOpMaxRwCodeByte{0};
+        uint64_t mHostFnOpMaxEmitEventByte{0};
+        uint64_t mHostFnOpSuccess{0};
+        uint64_t mHostFnOpFailure{0};
+        uint64_t mExtFpTtlOpReadLedgerByte{0};
+        uint64_t mRestoreFpOpReadLedgerByte{0};
+        uint64_t mRestoreFpOpWriteLedgerByte{0};
+
+        // Pending sample streams for percentile-bearing histograms/timers
+        // (timer samples are in nanoseconds).
+        std::vector<int64_t> mHostFnOpInvokeTimeNsecs;
+        std::vector<int64_t> mHostFnOpInvokeTimeNsecsExclVm;
+        std::vector<int64_t> mHostFnOpInvokeTimeFsecsCpuInsnRatio;
+        std::vector<int64_t> mHostFnOpInvokeTimeFsecsCpuInsnRatioExclVm;
+        std::vector<int64_t> mHostFnOpDeclaredInsnsUsageRatio;
+        std::vector<int64_t> mHostFnOpExecNsecs;
+        std::vector<int64_t> mExtFpTtlOpExecNsecs;
+        std::vector<int64_t> mRestoreFpOpExecNsecs;
+        std::vector<int64_t> mTxSizeByte;
+        std::vector<int64_t> mTxApplyNsecs;
+        std::vector<int64_t> mOpApplyNsecs;
+    };
+
   private:
     std::atomic<uint64_t> mCounterLedgerTxCount{0};
     std::atomic<uint64_t> mCounterLedgerCpuInsn{0};
@@ -38,6 +94,13 @@ class SorobanMetrics
     std::atomic<uint64_t> mLedgerInsnsExclVmCount{0};
     std::atomic<uint64_t> mLedgerHostFnExecTimeNsecs{0};
 
+    // All per-thread batches handed out by getApplyThreadBatch(), drained on
+    // each publishAndResetLedgerWideMetrics() call.
+    std::mutex mApplyBatchesMutex;
+    std::vector<std::shared_ptr<ApplyMetricsBatch>> mApplyBatches;
+
+    void flushApplyMetricsBatches();
+
   public:
     // ledger-wide metrics
     medida::Histogram& mLedgerTxCount;
@@ -53,6 +116,14 @@ class SorobanMetrics
     // tx-wide metrics
     medida::Histogram& mTxSizeByte;
 
+    // Cached references to the (op-kind-agnostic) "ledger.transaction.apply"
+    // and "ledger.operation.apply" timers: the parallel apply path records
+    // per-tx/per-op samples into its ApplyMetricsBatch and they are published
+    // into these at ledger close. These are the same timer instances the
+    // sequential apply path updates directly via registry lookups.
+    medida::Timer& mTransactionApply;
+    medida::Timer& mOperationApply;
+
     // `InvokeHostFunctionOp` metrics
     medida::Meter& mHostFnOpReadEntry;
     medida::Meter& mHostFnOpWriteEntry;
@@ -127,6 +198,10 @@ class SorobanMetrics
 
     SorobanMetrics(MetricsRegistry& metrics);
 
+    // Returns the calling thread's metrics batch for this SorobanMetrics
+    // instance, creating and registering it on first use.
+    ApplyMetricsBatch& getApplyThreadBatch();
+
     void accumulateModelledCpuInsns(uint64_t insnsCount,
                                     uint64_t insnsExclVmCount,
                                     uint64_t execTimeNsecs);
@@ -140,4 +215,63 @@ class SorobanMetrics
 
     void publishAndResetLedgerWideMetrics();
 };
+
+// Adds the wall-clock duration of its lifetime (in nanoseconds) to a
+// caller-provided accumulator on destruction.
+class ScopedNsecsTimer
+{
+  public:
+    explicit ScopedNsecsTimer(uint64_t& target)
+        : mTarget(target), mStart(std::chrono::steady_clock::now())
+    {
+    }
+
+    ScopedNsecsTimer(ScopedNsecsTimer const&) = delete;
+    ScopedNsecsTimer& operator=(ScopedNsecsTimer const&) = delete;
+
+    ~ScopedNsecsTimer()
+    {
+        mTarget += std::chrono::duration_cast<std::chrono::nanoseconds>(
+                       std::chrono::steady_clock::now() - mStart)
+                       .count();
+    }
+
+  private:
+    uint64_t& mTarget;
+    std::chrono::steady_clock::time_point mStart;
+};
+
+// Times its lifetime and records the elapsed nanoseconds as a sample in the
+// given vector of the calling thread's ApplyMetricsBatch.
+class BatchedTimerScope
+{
+  public:
+    using SampleField =
+        std::vector<int64_t> SorobanMetrics::ApplyMetricsBatch::*;
+
+    BatchedTimerScope(SorobanMetrics& metrics, SampleField field)
+        : mMetrics(metrics)
+        , mField(field)
+        , mStart(std::chrono::steady_clock::now())
+    {
+    }
+
+    BatchedTimerScope(BatchedTimerScope const&) = delete;
+    BatchedTimerScope& operator=(BatchedTimerScope const&) = delete;
+
+    ~BatchedTimerScope()
+    {
+        auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(
+                           std::chrono::steady_clock::now() - mStart)
+                           .count();
+        auto& batch = mMetrics.getApplyThreadBatch();
+        std::lock_guard<std::mutex> lock(batch.mMutex);
+        (batch.*mField).push_back(elapsed);
+    }
+
+  private:
+    SorobanMetrics& mMetrics;
+    SampleField mField;
+    std::chrono::steady_clock::time_point mStart;
+};
 }
diff --git a/src/transactions/ExtendFootprintTTLOpFrame.cpp b/src/transactions/ExtendFootprintTTLOpFrame.cpp
index a1e960b0d..56d6cee6d 100644
--- a/src/transactions/ExtendFootprintTTLOpFrame.cpp
+++ b/src/transactions/ExtendFootprintTTLOpFrame.cpp
@@ -30,18 +30,33 @@ struct ExtendFootprintTTLMetrics
 
     uint32 mLedgerReadByte{0};
 
+    // op execution wall time, captured via getExecTimer()
+    uint64_t mExecTimeNsecs{0};
+    bool mExecTimed{false};
+
     ExtendFootprintTTLMetrics(SorobanMetrics& metrics) : mMetrics(metrics)
     {
     }
 
     ~ExtendFootprintTTLMetrics()
     {
-        mMetrics.mExtFpTtlOpReadLedgerByte.Mark(mLedgerReadByte);
+        // Record into the calling thread's batch (published once per ledger)
+        // rather than updating process-wide metrics from every (possibly
+        // concurrent) operation.
+        auto& batch = mMetrics.getApplyThreadBatch();
+        std::lock_guard<std::mutex> lock(batch.mMutex);
+        batch.mExtFpTtlOpReadLedgerByte += mLedgerReadByte;
+        if (mExecTimed)
+        {
+            batch.mExtFpTtlOpExecNsecs.push_back(
+                static_cast<int64_t>(mExecTimeNsecs));
+        }
     }
-    medida::TimerContext
+    ScopedNsecsTimer
     getExecTimer()
     {
-        return mMetrics.mExtFpTtlOpExec.TimeScope();
+        mExecTimed = true;
+        return ScopedNsecsTimer(mExecTimeNsecs);
     }
 };
 
diff --git a/src/transactions/InvokeHostFunctionOpFrame.cpp b/src/transactions/InvokeHostFunctionOpFrame.cpp
index 4be31b69e..e5f817ad9 100644
--- a/src/transactions/InvokeHostFunctionOpFrame.cpp
+++ b/src/transactions/InvokeHostFunctionOpFrame.cpp
@@ -141,6 +141,10 @@ struct HostFunctionMetrics
     uint32_t mMaxReadWriteCodeByte{0};
     uint32_t mMaxEmitEventByte{0};
 
+    // host-fn execution wall time, captured via getExecTimer()
+    uint64_t mExecTimeNsecs{0};
+    bool mExecTimed{false};
+
     bool mSuccess{false};
 
     HostFunctionMetrics(SorobanMetrics& metrics, bool disableMetrics)
@@ -155,53 +159,66 @@ struct HostFunctionMetrics
             return;
         }
 
-        mMetrics.mHostFnOpReadEntry.Mark(mReadEntry);
-        mMetrics.mHostFnOpWriteEntry.Mark(mWriteEntry);
-
-        mMetrics.mHostFnOpReadKeyByte.Mark(mReadKeyByte);
-        mMetrics.mHostFnOpWriteKeyByte.Mark(mWriteKeyByte);
-
-        mMetrics.mHostFnOpReadLedgerByte.Mark(mLedgerReadByte);
-        mMetrics.mHostFnOpReadDataByte.Mark(mReadDataByte);
-        mMetrics.mHostFnOpReadCodeByte.Mark(mReadCodeByte);
-
-        mMetrics.mHostFnOpWriteLedgerByte.Mark(mLedgerWriteByte);
-        mMetrics.mHostFnOpWriteDataByte.Mark(mWriteDataByte);
-        mMetrics.mHostFnOpWriteCodeByte.Mark(mWriteCodeByte);
-
-        mMetrics.mHostFnOpEmitEvent.Mark(mEmitEvent);
-        mMetrics.mHostFnOpEmitEventByte.Mark(mEmitEventByte);
-
-        mMetrics.mHostFnOpCpuInsn.Mark(mCpuInsn);
-        mMetrics.mHostFnOpMemByte.Mark(mMemByte);
-        mMetrics.mHostFnOpInvokeTimeNsecs.Update(
-            std::chrono::nanoseconds(mInvokeTimeNsecs));
-        mMetrics.mHostFnOpCpuInsnExclVm.Mark(mCpuInsnExclVm);
-        mMetrics.mHostFnOpInvokeTimeNsecsExclVm.Update(
-            std::chrono::nanoseconds(mInvokeTimeNsecsExclVm));
-        mMetrics.mHostFnOpInvokeTimeFsecsCpuInsnRatio.Update(
-            mInvokeTimeNsecs * 1000000 / std::max(mCpuInsn, uint64_t(1)));
-        mMetrics.mHostFnOpInvokeTimeFsecsCpuInsnRatioExclVm.Update(
-            mInvokeTimeNsecsExclVm * 1000000 /
-            std::max(mCpuInsnExclVm, uint64_t(1)));
-        mMetrics.mHostFnOpDeclaredInsnsUsageRatio.Update(
-            mCpuInsn * 1000000 / std::max(mDeclaredCpuInsn, uint64_t(1)));
-
-        mMetrics.mHostFnOpMaxRwKeyByte.Mark(mMaxReadWriteKeyByte);
-        mMetrics.mHostFnOpMaxRwDataByte.Mark(mMaxReadWriteDataByte);
-        mMetrics.mHostFnOpMaxRwCodeByte.Mark(mMaxReadWriteCodeByte);
-        mMetrics.mHostFnOpMaxEmitEventByte.Mark(mMaxEmitEventByte);
-
         mMetrics.accumulateModelledCpuInsns(mCpuInsn, mCpuInsnExclVm,
                                             mInvokeTimeNsecs);
 
+        // Record everything into the calling thread's batch (published once
+        // per ledger) instead of updating ~25 process-wide metrics from
+        // every (possibly concurrent) operation.
+        auto& batch = mMetrics.getApplyThreadBatch();
+        std::lock_guard<std::mutex> lock(batch.mMutex);
+
+        batch.mHostFnOpReadEntry += mReadEntry;
+        batch.mHostFnOpWriteEntry += mWriteEntry;
+
+        batch.mHostFnOpReadKeyByte += mReadKeyByte;
+        batch.mHostFnOpWriteKeyByte += mWriteKeyByte;
+
+        batch.mHostFnOpReadLedgerByte += mLedgerReadByte;
+        batch.mHostFnOpReadDataByte += mReadDataByte;
+        batch.mHostFnOpReadCodeByte += mReadCodeByte;
+
+        batch.mHostFnOpWriteLedgerByte += mLedgerWriteByte;
+        batch.mHostFnOpWriteDataByte += mWriteDataByte;
+        batch.mHostFnOpWriteCodeByte += mWriteCodeByte;
+
+        batch.mHostFnOpEmitEvent += mEmitEvent;
+        batch.mHostFnOpEmitEventByte += mEmitEventByte;
+
+        batch.mHostFnOpCpuInsn += mCpuInsn;
+        batch.mHostFnOpMemByte += mMemByte;
+        batch.mHostFnOpInvokeTimeNsecs.push_back(
+            static_cast<int64_t>(mInvokeTimeNsecs));
+        batch.mHostFnOpCpuInsnExclVm += mCpuInsnExclVm;
+        batch.mHostFnOpInvokeTimeNsecsExclVm.push_back(
+            static_cast<int64_t>(mInvokeTimeNsecsExclVm));
+        batch.mHostFnOpInvokeTimeFsecsCpuInsnRatio.push_back(
+            static_cast<int64_t>(mInvokeTimeNsecs * 1000000 /
+                                 std::max(mCpuInsn, uint64_t(1))));
+        batch.mHostFnOpInvokeTimeFsecsCpuInsnRatioExclVm.push_back(
+            static_cast<int64_t>(mInvokeTimeNsecsExclVm * 1000000 /
+                                 std::max(mCpuInsnExclVm, uint64_t(1))));
+        batch.mHostFnOpDeclaredInsnsUsageRatio.push_back(static_cast<int64_t>(
+            mCpuInsn * 1000000 / std::max(mDeclaredCpuInsn, uint64_t(1))));
+
+        batch.mHostFnOpMaxRwKeyByte += mMaxReadWriteKeyByte;
+        batch.mHostFnOpMaxRwDataByte += mMaxReadWriteDataByte;
+        batch.mHostFnOpMaxRwCodeByte += mMaxReadWriteCodeByte;
+        batch.mHostFnOpMaxEmitEventByte += mMaxEmitEventByte;
+
+        if (mExecTimed)
+        {
+            batch.mHostFnOpExecNsecs.push_back(
+                static_cast<int64_t>(mExecTimeNsecs));
+        }
+
         if (mSuccess)
         {
-            mMetrics.mHostFnOpSuccess.Mark();
+            ++batch.mHostFnOpSuccess;
         }
         else
         {
-            mMetrics.mHostFnOpFailure.Mark();
+            ++batch.mHostFnOpFailure;
         }
     }
 
@@ -242,12 +259,14 @@ struct HostFunctionMetrics
         }
     }
 
-    std::optional<medida::TimerContext>
+    std::optional<ScopedNsecsTimer>
     getExecTimer()
     {
         if (!mDisableMetrics)
         {
-            return mMetrics.mHostFnOpExec.TimeScope();
+            mExecTimed = true;
+            return std::optional<ScopedNsecsTimer>(std::in_place,
+                                                   mExecTimeNsecs);
         }
         return std::nullopt;
     }
diff --git a/src/transactions/RestoreFootprintOpFrame.cpp b/src/transactions/RestoreFootprintOpFrame.cpp
index 517a1473b..38780fa19 100644
--- a/src/transactions/RestoreFootprintOpFrame.cpp
+++ b/src/transactions/RestoreFootprintOpFrame.cpp
@@ -32,19 +32,34 @@ struct RestoreFootprintMetrics
     uint32_t mLedgerReadByte{0};
     uint32_t mLedgerWriteByte{0};
 
+    // op execution wall time, captured via getExecTimer()
+    uint64_t mExecTimeNsecs{0};
+    bool mExecTimed{false};
+
     RestoreFootprintMetrics(SorobanMetrics& metrics) : mMetrics(metrics)
     {
     }
 
     ~RestoreFootprintMetrics()
     {
-        mMetrics.mRestoreFpOpReadLedgerByte.Mark(mLedgerReadByte);
-        mMetrics.mRestoreFpOpWriteLedgerByte.Mark(mLedgerWriteByte);
+        // Record into the calling thread's batch (published once per ledger)
+        // rather than updating process-wide metrics from every (possibly
+        // concurrent) operation.
+        auto& batch = mMetrics.getApplyThreadBatch();
+        std::lock_guard<std::mutex> lock(batch.mMutex);
+        batch.mRestoreFpOpReadLedgerByte += mLedgerReadByte;
+        batch.mRestoreFpOpWriteLedgerByte += mLedgerWriteByte;
+        if (mExecTimed)
+        {
+            batch.mRestoreFpOpExecNsecs.push_back(
+                static_cast<int64_t>(mExecTimeNsecs));
+        }
     }
-    medida::TimerContext
+    ScopedNsecsTimer
     getExecTimer()
     {
-        return mMetrics.mRestoreFpOpExec.TimeScope();
+        mExecTimed = true;
+        return ScopedNsecsTimer(mExecTimeNsecs);
     }
 };
 
diff --git a/src/transactions/TransactionFrame.cpp b/src/transactions/TransactionFrame.cpp
index 63a9b72ba..7670cc6b0 100644
--- a/src/transactions/TransactionFrame.cpp
+++ b/src/transactions/TransactionFrame.cpp
@@ -1095,8 +1095,13 @@ TransactionFrame::updateSorobanMetrics(AppConnector& app) const
     SorobanMetrics& metrics = app.getSorobanMetrics();
     auto txSize = static_cast<int64_t>(this->getSize());
     auto const& r = sorobanResources();
-    // update the tx metrics
-    metrics.mTxSizeByte.Update(txSize);
+    // record the tx metrics into the per-thread batch (published once per
+    // ledger)
+    {
+        auto& batch = metrics.getApplyThreadBatch();
+        std::lock_guard<std::mutex> lock(batch.mMutex);
+        batch.mTxSizeByte.push_back(txSize);
+    }
     // accumulate the ledger-wide metrics, which will get emitted at the ledger
     // close
     metrics.accumulateLedgerTxCount(getNumOperations());
@@ -2253,12 +2258,11 @@ TransactionFrame::parallelApply(
             ledgerInfo.getLedgerVersion() >=
             config.LEDGER_PROTOCOL_MIN_VERSION_INTERNAL_ERROR_REPORT;
 
-        std::optional<medida::TimerContext> opTimer;
+        std::optional<BatchedTimerScope> opTimer;
         if (!config.DISABLE_SOROBAN_METRICS_FOR_TESTING)
         {
-            opTimer.emplace(app.getMetrics()
-                                .NewTimer({"ledger", "operation", "apply"})
-                                .TimeScope());
+            opTimer.emplace(sorobanMetrics,
+                            &SorobanMetrics::ApplyMetricsBatch::mOpApplyNsecs);
         }
 
         releaseAssertOrThrow(mOperations.size() == 1);
diff --git a/src/transactions/test/InvokeHostFunctionTests.cpp b/src/transactions/test/InvokeHostFunctionTests.cpp
index 13166c0c8..704e47b81 100644
--- a/src/transactions/test/InvokeHostFunctionTests.cpp
+++ b/src/transactions/test/InvokeHostFunctionTests.cpp
@@ -3,12 +3,16 @@
 // of this distribution or at http://www.apache.org/licenses/LICENSE-2.0
 
 #include "test/test.h"
+#include "transactions/ParallelApplyUtils.h"
 #include "transactions/TransactionFrameBase.h"
 #include "util/Logging.h"
 #include "util/MetricsRegistry.h"
 #include "util/ProtocolVersion.h"
 #include "util/UnorderedSet.h"
 #include "xdr/Stellar-transaction.h"
+#include <medida/histogram.h>
+#include <medida/meter.h>
+#include <medida/timer.h>
 #include <numeric>
 #include <stdexcept>
 #include <xdrpp/printer.h>
@@ -21,6 +25,7 @@
 #include "ledger/LedgerManager.h"
 #include "ledger/LedgerTxn.h"
 #include "ledger/LedgerTypeUtils.h"
+#include "ledger/SorobanMetrics.h"
 #include "ledger/test/LedgerTestUtils.h"
 #include "main/Application.h"
 #include "main/CommandHandler.h"
@@ -7636,6 +7641,37 @@ TEST_CASE("module cache rebuild on incremental wasm uploads",
             static_cast<uint64_t>(rebuildBytesAtStartup));
 }
 
+TEST_CASE("soroban metrics published at ledger close", "[tx][soroban]")
+{
+    SorobanTest test;
+    auto const& contract =
+        test.deployWasmContract(rust_bridge::get_test_wasm_add_i32());
+
+    auto& metrics = test.getApp().getLedgerManager().getSorobanMetrics();
+    auto preSuccess = metrics.mHostFnOpSuccess.count();
+    auto preReadEntry = metrics.mHostFnOpReadEntry.count();
+    auto preTxSize = metrics.mTxSizeByte.count();
+    auto preInvokeTime = metrics.mHostFnOpInvokeTimeNsecs.count();
+    auto preExec = metrics.mHostFnOpExec.count();
+    auto preTxApply = metrics.mTransactionApply.count();
+    auto preOpApply = metrics.mOperationApply.count();
+
+    auto tx =
+        makeAddTx(contract, INVOKE_ADD_UNCACHED_COST_PASS, test.getRoot());
+    REQUIRE(isSuccessResult(test.invokeTx(tx)));
+
+    // Per-op/per-tx metrics recorded during apply are batched per thread and
+    // published by the ledger close that applied the tx, so they must all be
+    // visible here.
+    REQUIRE(metrics.mHostFnOpSuccess.count() == preSuccess + 1);
+    REQUIRE(metrics.mHostFnOpReadEntry.count() > preReadEntry);
+    REQUIRE(metrics.mTxSizeByte.count() == preTxSize + 1);
+    REQUIRE(metrics.mHostFnOpInvokeTimeNsecs.count() == preInvokeTime + 1);
+    REQUIRE(metrics.mHostFnOpExec.count() == preExec + 1);
+    REQUIRE(metrics.mTransactionApply.count() >= preTxApply + 1);
+    REQUIRE(metrics.mOperationApply.count() >= preOpApply + 1);
+}
+
 TEST_CASE("Module cache across protocol versions", "[tx][soroban][modulecache]")
 {
     VirtualClock clock;
diff --git a/src/util/SimpleTimer.cpp b/src/util/SimpleTimer.cpp
index 924a26b1b..846c89c28 100644
--- a/src/util/SimpleTimer.cpp
+++ b/src/util/SimpleTimer.cpp
@@ -43,9 +43,8 @@ SimpleTimer::Update(std::chrono::nanoseconds d)
     mSampleCount.inc(1);
     // Lock-free max tracking.
     int64_t prev = mMax.load(std::memory_order_relaxed);
-    while (prev < converted &&
-           !mMax.compare_exchange_weak(prev, converted,
-                                       std::memory_order_relaxed))
+    while (prev < converted && !mMax.compare_exchange_weak(
+                                   prev, converted, std::memory_order_relaxed))
     {
     }
 }