@@ -170,11 +170,54 @@ class HSHomeObject : public HomeObjectImpl {
170170 char data[1 ];
171171 };
172172
/**
 * Subset of the snapshot progress counters that is kept in persistent form.
 *
 * Every member is a zero-initialised 64-bit counter. Member order and count are left untouched on purpose:
 * the struct is embedded by value in another struct, so its layout must stay stable.
 */
struct durable_snapshot_progress {
    // Start of the snapshot transfer. NOTE(review): unit is presumably seconds — confirm against the writer.
    uint64_t start_time = 0;

    // Expected totals for the whole transfer.
    uint64_t total_blobs = 0;
    uint64_t total_bytes = 0;
    uint64_t total_shards = 0;

    // Amount of work finished so far.
    uint64_t complete_blobs = 0;
    uint64_t complete_bytes = 0;
    uint64_t complete_shards = 0;

    // Number of blobs reported as corrupted.
    uint64_t corrupted_blobs = 0;
};
183+
184+ struct snapshot_progress {
185+ uint64_t start_time{0 };
186+ uint64_t total_blobs{0 };
187+ uint64_t total_bytes{0 };
188+ uint64_t total_shards{0 };
189+ uint64_t complete_blobs{0 };
190+ uint64_t complete_bytes{0 };
191+ uint64_t complete_shards{0 };
192+ // The count of the blobs which have been corrupted on the leader side.
193+ uint64_t corrupted_blobs{0 };
194+ // Record the stats of the current batch to avoid double counting.
195+ uint64_t cur_shard_total_blobs{0 };
196+ uint64_t cur_shard_complete_blobs{0 };
197+ // Used to handle the retried batch message.
198+ uint64_t cur_batch_blobs{0 };
199+ uint64_t cur_batch_bytes{0 };
200+ uint64_t error_count{0 };
201+
202+ snapshot_progress () = default ;
203+ explicit snapshot_progress (durable_snapshot_progress p) {
204+ start_time = p.start_time ;
205+ total_blobs = p.total_blobs ;
206+ total_bytes = p.total_bytes ;
207+ total_shards = p.total_shards ;
208+ complete_blobs = p.complete_blobs ;
209+ complete_bytes = p.complete_bytes ;
210+ complete_shards = p.complete_shards ;
211+ corrupted_blobs = p.corrupted_blobs ;
212+ }
213+ };
214+
173215 // Since shard list can be quite large and only need to be persisted once, we store it in a separate superblk
174216 struct snapshot_rcvr_info_superblk {
175217 shard_id_t shard_cursor;
176218 int64_t snp_lsn;
177219 pg_id_t pg_id;
220+ durable_snapshot_progress progress;
178221
179222 uint32_t size () const { return sizeof (snapshot_rcvr_info_superblk); }
180223 static auto name () -> string { return _snp_rcvr_meta_name; }
@@ -286,6 +329,11 @@ class HSHomeObject : public HomeObjectImpl {
286329 * Returns the number of open shards on this PG.
287330 */
288331 uint32_t open_shards () const ;
332+
333+ /* *
334+ * Returns the progress of the baseline resync.
335+ */
336+ uint32_t get_snp_progress () const ;
289337 };
290338
291339 struct HS_Shard : public Shard {
@@ -400,6 +448,25 @@ class HSHomeObject : public HomeObjectImpl {
400448 void pack_resync_message (sisl::io_blob_safe& dest_blob, SyncMessageType type);
401449 bool end_of_scan () const ;
402450
451+ // All of the leader's metrics are in-memory
452+ struct DonerSnapshotMetrics : sisl::MetricsGroup {
453+ explicit DonerSnapshotMetrics (pg_id_t pg_id) : sisl::MetricsGroup(" snapshot_doner" , std::to_string(pg_id)) {
454+ REGISTER_COUNTER (snp_dnr_load_blob, " Loaded blobs in baseline resync" );
455+ REGISTER_COUNTER (snp_dnr_load_bytes, " Loaded bytes in baseline resync" );
456+ REGISTER_COUNTER (snp_dnr_resend_count, " Mesg resend times in baseline resync" );
457+ REGISTER_COUNTER (snp_dnr_error_count, " Error times when reading blobs in baseline resync" );
458+ REGISTER_HISTOGRAM (snp_dnr_blob_process_time, " Time cost of successfully process a blob in baseline resync" ,
459+ HistogramBucketsType (DefaultBuckets));
460+ register_me_to_farm ();
461+ }
462+
463+ ~DonerSnapshotMetrics () { deregister_me_from_farm (); }
464+ DonerSnapshotMetrics (const DonerSnapshotMetrics&) = delete ;
465+ DonerSnapshotMetrics (DonerSnapshotMetrics&&) noexcept = delete ;
466+ DonerSnapshotMetrics& operator =(const DonerSnapshotMetrics&) = delete ;
467+ DonerSnapshotMetrics& operator =(DonerSnapshotMetrics&&) noexcept = delete ;
468+ };
469+
403470 struct ShardEntry {
404471 ShardInfo info;
405472 homestore::chunk_num_t v_chunk_num;
@@ -420,6 +487,7 @@ class HSHomeObject : public HomeObjectImpl {
420487 pg_id_t pg_id_;
421488 shared< homestore::ReplDev > repl_dev_;
422489 uint64_t max_batch_size_;
490+ std::unique_ptr<DonerSnapshotMetrics> metrics_;
423491 };
424492
425493 class SnapshotReceiveHandler {
@@ -448,11 +516,11 @@ class HSHomeObject : public HomeObjectImpl {
448516 pg_id_t get_context_pg_id () const ;
449517
450518 // Try to load existing snapshot context info
451- bool load_prev_context ();
519+ bool load_prev_context_and_metrics ();
452520
453521 // Reset the context for a new snapshot, should be called before each new snapshot transmission
454- void reset_context (int64_t lsn, pg_id_t pg_id);
455- void destroy_context ();
522+ void reset_context_and_metrics (int64_t lsn, pg_id_t pg_id);
523+ void destroy_context_and_metrics ();
456524
457525 shard_id_t get_shard_cursor () const ;
458526 shard_id_t get_next_shard () const ;
@@ -466,14 +534,66 @@ class HSHomeObject : public HomeObjectImpl {
466534 const int64_t snp_lsn;
467535 const pg_id_t pg_id;
468536 shared< BlobIndexTable > index_table;
469-
537+ std::shared_mutex progress_lock;
538+ snapshot_progress progress;
470539 SnapshotContext (int64_t lsn, pg_id_t pg_id) : snp_lsn{lsn}, pg_id{pg_id} {}
471540 };
472541
542+ struct ReceiverSnapshotMetrics : sisl::MetricsGroup {
543+ ReceiverSnapshotMetrics (std::shared_ptr<SnapshotContext> ctx) : sisl::MetricsGroup(" snapshot_receiver" , std::to_string(ctx->pg_id)),
544+ ctx_{ctx} {
545+ REGISTER_GAUGE (snp_rcvr_total_blob, " Total blobs in baseline resync" );
546+ REGISTER_GAUGE (snp_rcvr_total_bytes, " Total bytes in baseline resync" )
547+ REGISTER_GAUGE (snp_rcvr_total_shards, " Total shards in baseline resync" )
548+ REGISTER_GAUGE (snp_rcvr_complete_blob, " Complete blob in baseline resync" )
549+ REGISTER_GAUGE (snp_rcvr_complete_bytes, " Complete bytes in baseline resync" )
550+ REGISTER_GAUGE (snp_rcvr_complete_shards, " Complete shards in baseline resync" )
551+ REGISTER_GAUGE (snp_rcvr_current_shard_total_blobs,
552+ " Total blob of the current shard in baseline resync" )
553+ REGISTER_GAUGE (snp_rcvr_current_shard_complete_blobs,
554+ " Compelete blob of the current blob in baseline resync" );
555+ REGISTER_GAUGE (snp_rcvr_corrupted_blobs, " Corrupted blobs in baseline resync" );
556+ REGISTER_GAUGE (snp_rcvr_elapsed_time_sec, " Time cost(seconds) of baseline resync" );
557+ REGISTER_GAUGE (snp_rcvr_error_count, " Error count in baseline resync" );
558+ REGISTER_HISTOGRAM (snp_rcvr_blob_process_time, " Time cost of successfully process a blob in baseline resync" ,
559+ HistogramBucketsType (DefaultBuckets));
560+
561+
562+ attach_gather_cb (std::bind (&ReceiverSnapshotMetrics::on_gather, this ));
563+ register_me_to_farm ();
564+ }
565+ ~ReceiverSnapshotMetrics () { deregister_me_from_farm (); }
566+ ReceiverSnapshotMetrics (const ReceiverSnapshotMetrics&) = delete ;
567+ ReceiverSnapshotMetrics (ReceiverSnapshotMetrics&&) noexcept = delete ;
568+ ReceiverSnapshotMetrics& operator =(const ReceiverSnapshotMetrics&) = delete ;
569+ ReceiverSnapshotMetrics& operator =(ReceiverSnapshotMetrics&&) noexcept = delete ;
570+
571+ void on_gather () {
572+ if (ctx_) {
573+ std::shared_lock<std::shared_mutex> lock (ctx_->progress_lock );
574+ GAUGE_UPDATE (*this , snp_rcvr_total_blob, ctx_->progress .total_blobs );
575+ GAUGE_UPDATE (*this , snp_rcvr_total_bytes, ctx_->progress .total_bytes );
576+ GAUGE_UPDATE (*this , snp_rcvr_total_shards, ctx_->progress .total_shards );
577+ GAUGE_UPDATE (*this , snp_rcvr_complete_blob, ctx_->progress .complete_blobs );
578+ GAUGE_UPDATE (*this , snp_rcvr_complete_bytes, ctx_->progress .complete_bytes );
579+ GAUGE_UPDATE (*this , snp_rcvr_complete_shards, ctx_->progress .complete_shards );
580+ GAUGE_UPDATE (*this , snp_rcvr_current_shard_total_blobs, ctx_->progress .cur_shard_total_blobs );
581+ GAUGE_UPDATE (*this , snp_rcvr_current_shard_complete_blobs, ctx_->progress .cur_shard_complete_blobs );
582+ GAUGE_UPDATE (*this , snp_rcvr_corrupted_blobs, ctx_->progress .corrupted_blobs );
583+ GAUGE_UPDATE (*this , snp_rcvr_error_count, ctx_->progress .error_count );
584+ auto duration = get_elapsed_time_ms (ctx_->progress .start_time * 1000 ) / 1000 ;
585+ GAUGE_UPDATE (*this , snp_rcvr_elapsed_time_sec, duration);
586+ }
587+ }
588+ private:
589+ std::shared_ptr<SnapshotContext> ctx_;
590+ };
591+
473592 HSHomeObject& home_obj_;
474593 const shared< homestore::ReplDev > repl_dev_;
475594
476- std::unique_ptr< SnapshotContext > ctx_;
595+ std::shared_ptr< SnapshotContext > ctx_;
596+ std::unique_ptr< ReceiverSnapshotMetrics > metrics_;
477597
478598 // Update the snp_info superblock
479599 void update_snp_info_sb (bool init = false );
0 commit comments