Skip to content

Commit f3ce828

Browse files
SDSTOR-21763 Fix GC job status update for multi-PG scenarios (#405)
Refactor trigger_gc_for_pg to return folly::Future and defer job status updates to callers. Previously, when triggering GC for all PGs, the job status was incorrectly set to COMPLETED after the first PG finished. Now the job status is only set to COMPLETED after all PGs have completed their GC operations. Changes: - trigger_gc_for_pg now returns folly::Future<folly::Unit> - Removed set_job_status parameter and internal status updates - Callers use thenValue to set job status after completion - Single PG GC: sets status after that PG completes - Multi PG GC: collects all futures and sets status after all complete
1 parent 77ba5c6 commit f3ce828

3 files changed

Lines changed: 39 additions & 26 deletions

File tree

conanfile.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
class HomeObjectConan(ConanFile):
1212
name = "homeobject"
13-
version = "4.1.6"
13+
version = "4.1.7"
1414

1515
homepage = "https://github.com/eBay/HomeObject"
1616
description = "Blob Store built on HomeStore"

src/lib/homestore_backend/hs_http_manager.cpp

Lines changed: 37 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -595,39 +595,57 @@ void HttpManager::trigger_gc(const Pistache::Rest::Request& request, Pistache::H
595595
LOGINFO("GC job {} stopping GC scan timer", job_id);
596596
gc_mgr->stop_gc_scan_timer();
597597

598-
// we block here until all gc tasks for the pg are done
599-
trigger_gc_for_pg(pg_id, job_id);
600-
601-
LOGINFO("GC job {} restarting GC scan timer", job_id);
602-
gc_mgr->start_gc_scan_timer();
598+
trigger_gc_for_pg(pg_id, job_id).thenValue([job_info, gc_mgr, job_id](auto&&) {
599+
job_info->status = job_info->failed_count ? GCJobStatus::FAILED : GCJobStatus::COMPLETED;
600+
LOGINFO("GC job {} completed: total={}, success={}, failed={}", job_info->job_id, job_info->total_chunks,
601+
job_info->success_count, job_info->failed_count);
602+
LOGINFO("GC job {} restarting GC scan timer", job_id);
603+
gc_mgr->start_gc_scan_timer();
604+
});
603605
} else {
604606
LOGINFO("Received trigger_gc request for all chunks");
605607
nlohmann::json result;
608+
std::vector< pg_id_t > pg_ids;
609+
ho_.get_pg_ids(pg_ids);
610+
606611
const auto job_id = generate_job_id();
607612
result["job_id"] = job_id;
608-
result["message"] = "GC triggered for all chunks, pls query job status using gc_job_status API";
609-
// return response before starting the GC so that we don't block the client.
610-
response.send(Pistache::Http::Code::Accepted, result.dump());
611613

612614
auto job_info = std::make_shared< GCJobInfo >(job_id);
613615
{
614616
std::lock_guard lock(gc_job_mutex_);
615617
gc_jobs_map_.set(job_id, job_info);
616618
}
617619

618-
std::vector< pg_id_t > pg_ids;
619-
ho_.get_pg_ids(pg_ids);
620+
if (pg_ids.empty()) {
621+
LOGINFO("GC job {} no PGs found, marking as completed", job_id);
622+
job_info->status = GCJobStatus::COMPLETED;
623+
result["message"] = "No PGs found, GC completed";
624+
response.send(Pistache::Http::Code::Ok, result.dump());
625+
return;
626+
}
627+
628+
result["message"] = "GC triggered for all chunks, pls query job status using gc_job_status API";
629+
// return response before starting the GC so that we don't block the client.
630+
response.send(Pistache::Http::Code::Accepted, result.dump());
631+
620632
LOGINFO("GC job {} will process {} PGs", job_id, pg_ids.size());
621633
LOGINFO("GC job {} stopping GC scan timer", job_id);
622634
gc_mgr->stop_gc_scan_timer();
623635

624-
// we block here until all gc tasks for the pg are done
636+
std::vector< folly::Future< folly::Unit > > pg_futures;
625637
for (const auto& pg_id : pg_ids) {
626-
trigger_gc_for_pg(pg_id, job_id);
638+
pg_futures.push_back(trigger_gc_for_pg(pg_id, job_id));
627639
}
628640

629-
LOGINFO("GC job {} restarting GC scan timer", job_id);
630-
gc_mgr->start_gc_scan_timer();
641+
// Set job status after all PGs are processed
642+
folly::collectAllUnsafe(pg_futures).thenValue([job_info, gc_mgr, job_id](auto&&) {
643+
job_info->status = job_info->failed_count ? GCJobStatus::FAILED : GCJobStatus::COMPLETED;
644+
LOGINFO("GC job {} completed: total={}, success={}, failed={}", job_info->job_id, job_info->total_chunks,
645+
job_info->success_count, job_info->failed_count);
646+
LOGINFO("GC job {} restarting GC scan timer", job_id);
647+
gc_mgr->start_gc_scan_timer();
648+
});
631649
}
632650
}
633651

@@ -703,7 +721,7 @@ void HttpManager::get_gc_job_status(const Pistache::Rest::Request& request, Pist
703721
response.send(Pistache::Http::Code::Ok, result.dump());
704722
}
705723

706-
void HttpManager::trigger_gc_for_pg(uint16_t pg_id, const std::string& job_id) {
724+
folly::Future< folly::Unit > HttpManager::trigger_gc_for_pg(uint16_t pg_id, const std::string& job_id) {
707725
auto hs_pg = const_cast< HSHomeObject::HS_PG* >(ho_.get_hs_pg(pg_id));
708726
RELEASE_ASSERT(hs_pg, "HS PG {} not found during GC job {}", pg_id, job_id);
709727

@@ -742,7 +760,7 @@ void HttpManager::trigger_gc_for_pg(uint16_t pg_id, const std::string& job_id) {
742760
(priority == task_priority::emergent) ? "emergent" : "normal");
743761
}
744762

745-
folly::collectAllUnsafe(gc_task_futures)
763+
return folly::collectAllUnsafe(gc_task_futures)
746764
.thenValue([job_info](auto&& results) {
747765
for (auto const& ok : results) {
748766
RELEASE_ASSERT(ok.hasValue(), "we never throw any exception when copying data");
@@ -753,21 +771,16 @@ void HttpManager::trigger_gc_for_pg(uint16_t pg_id, const std::string& job_id) {
753771
}
754772
}
755773
})
756-
.thenValue([this, pg_id, job_info, gc_mgr](auto&& rets) {
757-
LOGINFO("All GC tasks have been processed");
774+
.thenValue([this, pg_id, job_info](auto&& rets) {
775+
LOGINFO("All GC tasks for PG {} have been processed", pg_id);
758776
const auto& job_id = job_info->job_id;
759777

760778
auto hs_pg = const_cast< HSHomeObject::HS_PG* >(ho_.get_hs_pg(pg_id));
761779
RELEASE_ASSERT(hs_pg, "HS PG {} not found during GC job {}", pg_id, job_id);
762780
// Resume accepting new requests for this pg
763781
hs_pg->repl_dev_->resume_accepting_reqs();
764782
LOGINFO("GC job {} resumed accepting requests for PG {}", job_id, pg_id);
765-
766-
job_info->status = job_info->failed_count ? GCJobStatus::FAILED : GCJobStatus::COMPLETED;
767-
LOGINFO("GC job {} completed: total={}, success={}, failed={}", job_id, job_info->total_chunks,
768-
job_info->success_count, job_info->failed_count);
769-
})
770-
.get();
783+
});
771784
}
772785

773786
#ifdef _PRERELEASE

src/lib/homestore_backend/hs_http_manager.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ class HttpManager {
4848
void exit_pg(const Pistache::Rest::Request& request, Pistache::Http::ResponseWriter response);
4949
void trigger_gc(const Pistache::Rest::Request& request, Pistache::Http::ResponseWriter response);
5050
void get_gc_job_status(const Pistache::Rest::Request& request, Pistache::Http::ResponseWriter response);
51-
void trigger_gc_for_pg(uint16_t pg_id, const std::string& job_id);
51+
folly::Future< folly::Unit > trigger_gc_for_pg(uint16_t pg_id, const std::string& job_id);
5252
void get_job_status(const std::string& job_id, nlohmann::json& result);
5353

5454
#ifdef _PRERELEASE

0 commit comments

Comments
 (0)