-
Notifications
You must be signed in to change notification settings - Fork 766
pkg/unsaferecovery: add unsafe recovery abort #10641
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,6 +21,7 @@ import ( | |
| "sort" | ||
| "strconv" | ||
| "strings" | ||
| "sync/atomic" | ||
| "time" | ||
|
|
||
| "go.uber.org/zap" | ||
|
|
@@ -47,6 +48,12 @@ const ( | |
| storeRequestInterval = time.Second * 40 | ||
| ) | ||
|
|
||
| var globalRecoveryStep = uint64(time.Now().UnixNano()) | ||
|
|
||
| func nextRecoveryStep() uint64 { | ||
| return atomic.AddUint64(&globalRecoveryStep, 1) | ||
| } | ||
|
|
||
| // Stage transition graph: for more details, please check `Controller.HandleStoreHeartbeat()` | ||
| // | ||
| // +-----------+ +-----------+ | ||
|
|
@@ -122,10 +129,11 @@ type Controller struct { | |
| cluster cluster | ||
| stage stage | ||
| // the round of recovery, which is an increasing number to identify the reports of each round | ||
| step uint64 | ||
| failedStores map[uint64]struct{} | ||
| timeout time.Time | ||
| autoDetect bool | ||
| step uint64 | ||
| recoveryStartStep uint64 | ||
| failedStores map[uint64]struct{} | ||
| timeout time.Time | ||
| autoDetect bool | ||
|
|
||
| // collected reports from store, if not reported yet, it would be nil | ||
| storeReports map[uint64]*pdpb.StoreReport | ||
|
|
@@ -169,6 +177,7 @@ func NewController(cluster cluster) *Controller { | |
| func (u *Controller) reset() { | ||
| u.stage = Idle | ||
| u.step = 0 | ||
| u.recoveryStartStep = 0 | ||
| u.failedStores = make(map[uint64]struct{}) | ||
| u.storeReports = make(map[uint64]*pdpb.StoreReport) | ||
| u.numStoresReported = 0 | ||
|
|
@@ -237,12 +246,29 @@ func (u *Controller) RemoveFailedStores(failedStores map[uint64]struct{}, timeou | |
| } | ||
|
|
||
| u.timeout = time.Now().Add(time.Duration(timeout) * time.Second) | ||
| u.step = nextRecoveryStep() | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we allocate the step from the global monotonic counter on every stage transition, or add a recovery epoch, so delayed reports from a previous run cannot pass the new run's step check? |
||
| u.recoveryStartStep = u.step | ||
| u.failedStores = failedStores | ||
| u.autoDetect = autoDetect | ||
| u.changeStage(CollectReport) | ||
| return nil | ||
| } | ||
|
|
||
| // AbortFailedStoresRemoval aborts the current unsafe recovery process in a best-effort way. | ||
| // It asks TiKV to exit force leader by dispatching empty recovery plans, but any plan that | ||
| // has already been delivered to TiKV may keep running until TiKV finishes or times it out. | ||
| func (u *Controller) AbortFailedStoresRemoval() error { | ||
| u.Lock() | ||
| defer u.Unlock() | ||
|
|
||
| if !isRunning(u.stage) { | ||
| return errs.ErrUnsafeRecoveryInvalidInput.FastGenByArgs("no ongoing unsafe recovery") | ||
| } | ||
|
|
||
| u.handleErr(errors.New("aborted by operator")) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could we make abort a no-op when stage == ExitForceLeader, or keep the controller in ExitForceLeader and add a repeated-abort test? |
||
| return nil | ||
| } | ||
|
|
||
| // Show returns the current status of ongoing unsafe recover operation. | ||
| func (u *Controller) Show() []StageOutput { | ||
| u.Lock() | ||
|
|
@@ -550,8 +576,9 @@ func (u *Controller) changeStage(stage stage) { | |
| output.Details = append(output.Details, fmt.Sprintf("triggered by error: %v", u.err.Error())) | ||
| } | ||
| case Finished: | ||
| if u.step > 1 { | ||
| // == 1 means no operation has done, no need to invalid cache | ||
| if u.step > u.recoveryStartStep+1 { | ||
| // Only CollectReport has finished when step == recoveryStartStep+1, | ||
| // which means no operation has done and no cache invalidation is needed. | ||
| u.cluster.ResetRegionCache() | ||
| } | ||
| output.Info = "Unsafe recovery Finished" | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -82,6 +82,25 @@ func (h *unsafeOperationHandler) RemoveFailedStores(w http.ResponseWriter, r *ht | |
| h.rd.JSON(w, http.StatusOK, "Request has been accepted.") | ||
| } | ||
|
|
||
| // AbortFailedStoresRemoval aborts the current failed stores removal. | ||
| // | ||
| // @Tags unsafe | ||
| // @Summary Abort the current failed stores removal. | ||
| // @Produce json | ||
| // | ||
| // Success 200 {string} string "Request has been accepted." | ||
| // Failure 500 {string} string "PD server failed to proceed the request." | ||
| // | ||
| // @Router /admin/unsafe/remove-failed-stores/abort [post] | ||
| func (h *unsafeOperationHandler) AbortFailedStoresRemoval(w http.ResponseWriter, r *http.Request) { | ||
| rc := getCluster(r) | ||
| if err := rc.GetUnsafeRecoveryController().AbortFailedStoresRemoval(); err != nil { | ||
| h.rd.JSON(w, http.StatusInternalServerError, err.Error()) | ||
| return | ||
| } | ||
| h.rd.JSON(w, http.StatusOK, "Request has been accepted.") | ||
|
Comment on lines
+85
to
+101
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Return a 4xx for “no ongoing unsafe recovery” instead of 500.
As per coding guidelines, 🤖 Prompt for AI Agents |
||
| } | ||
|
|
||
| // GetFailedStoresRemovalStatus gets the current status of failed stores removal. | ||
| // | ||
| // @Tags unsafe | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Allocate recovery steps from the process-wide counter for every stage, not just once per run.
This only reserves a unique value for the run’s initial step. After that, the controller still advances
`u.step` locally, so a later run can reuse step values that an earlier run already emitted in later stages. A delayed `StoreReport` from the previous run can then pass the `GetStep() == u.step` check in the new run, which is exactly the collision this PR is trying to prevent. 🤖 Prompt for AI Agents