-
Notifications
You must be signed in to change notification settings - Fork 766
checker: address split scatter pending follow-ups #10691
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
95b971a
dfa9555
4e5a225
64529c3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -63,7 +63,8 @@ type splitScatterController struct { | |
| // pending maps a pending region ID to its latest split-scatter batch item. | ||
| // The item keeps its batch group so stale snapshots cannot mutate a newer | ||
| // pending entry for the same region. | ||
| pending map[uint64]splitScatterPendingItem | ||
| pending map[uint64]splitScatterPendingItem | ||
| nextDispatchAt time.Time | ||
| } | ||
|
|
||
| func newSplitScatterController( | ||
|
|
@@ -72,6 +73,7 @@ func newSplitScatterController( | |
| opController *operator.Controller, | ||
| addPendingProcessedRegions func(needCheckLen bool, ids ...uint64), | ||
| ) *splitScatterController { | ||
| splitScatterPendingGauge.Set(0) | ||
| return &splitScatterController{ | ||
| cluster: cluster, | ||
| opController: opController, | ||
|
|
@@ -111,17 +113,20 @@ func (c *splitScatterController) collectTopPendingSplitScatter(limit int) []spli | |
| c.pendingMu.RUnlock() | ||
|
|
||
| candidates := make([]splitScatterPendingItem, 0, len(pendingSnapshot)) | ||
| missingSnapshot := make([]splitScatterPendingItem, 0) | ||
| for _, pending := range pendingSnapshot { | ||
| if !pending.retryAt.IsZero() && now.Before(pending.retryAt) { | ||
| continue | ||
| } | ||
| regionID := pending.regionID | ||
| region := c.cluster.GetRegion(regionID) | ||
| if region == nil { | ||
| continue | ||
| } | ||
| if !pending.retryAt.IsZero() && now.Before(pending.retryAt) { | ||
| missingSnapshot = append(missingSnapshot, pending) | ||
| continue | ||
| } | ||
| sourceRegion := c.cluster.GetRegion(pending.sourceRegionID) | ||
| if sourceRegion == nil { | ||
| missingSnapshot = append(missingSnapshot, pending) | ||
| continue | ||
| } | ||
| sourceVersion := uint64(0) | ||
|
|
@@ -162,6 +167,7 @@ func (c *splitScatterController) collectTopPendingSplitScatter(limit int) []spli | |
| candidates = selected | ||
| c.pendingMu.Unlock() | ||
| } | ||
| c.delayMissingPendingSplitScatter(missingSnapshot, now) | ||
|
|
||
| if len(expiredSnapshot) > 0 { | ||
| attemptedExpiredCount := 0 | ||
|
|
@@ -188,6 +194,30 @@ func (c *splitScatterController) collectTopPendingSplitScatter(limit int) []spli | |
| return candidates | ||
| } | ||
|
|
||
| func (c *splitScatterController) delayMissingPendingSplitScatter(missing []splitScatterPendingItem, now time.Time) { | ||
| if len(missing) == 0 { | ||
| return | ||
| } | ||
| missingCount := 0 | ||
| c.pendingMu.Lock() | ||
| for _, expected := range missing { | ||
| pending, ok := c.pending[expected.regionID] | ||
| if !ok || pending.group != expected.group || !pending.expireAt.Equal(expected.expireAt) { | ||
| continue | ||
| } | ||
| if !pending.retryAt.IsZero() && now.Before(pending.retryAt) { | ||
| continue | ||
| } | ||
| pending.retryAt = now.Add(splitScatterRetryBackoff) | ||
| c.pending[expected.regionID] = pending | ||
| missingCount++ | ||
| } | ||
| c.pendingMu.Unlock() | ||
| if missingCount > 0 { | ||
| splitScatterDispatchRegionMissingCounter.Add(float64(missingCount)) | ||
| } | ||
| } | ||
|
|
||
| func (c *splitScatterController) delayPendingSplitScatter(expected splitScatterPendingItem) { | ||
| c.pendingMu.Lock() | ||
| defer c.pendingMu.Unlock() | ||
|
|
@@ -250,6 +280,29 @@ func (c *splitScatterController) cleanupExpiredPendingSplitScatter() int { | |
| return pendingCount | ||
| } | ||
|
|
||
| func (c *splitScatterController) clearPendingSplitScatter() { | ||
| c.pendingMu.Lock() | ||
| defer c.pendingMu.Unlock() | ||
| pendingCount := len(c.pending) | ||
| if pendingCount > 0 { | ||
| c.pending = make(map[uint64]splitScatterPendingItem) | ||
| c.updatePendingGaugeLocked() | ||
| } | ||
| c.nextDispatchAt = time.Time{} | ||
| } | ||
|
|
||
| func (c *splitScatterController) skipDispatchUntil(now time.Time) bool { | ||
| c.pendingMu.RLock() | ||
| defer c.pendingMu.RUnlock() | ||
| return !c.nextDispatchAt.IsZero() && now.Before(c.nextDispatchAt) | ||
| } | ||
|
|
||
| func (c *splitScatterController) delayNextDispatch(now time.Time) { | ||
| c.pendingMu.Lock() | ||
| defer c.pendingMu.Unlock() | ||
| c.nextDispatchAt = now.Add(splitScatterRetryBackoff) | ||
| } | ||
|
|
||
| func makeSplitScatterGroup(sourceRegionID, firstNewRegionID uint64) string { | ||
| return fmt.Sprintf("split-scatter-%d-%d", sourceRegionID, firstNewRegionID) | ||
| } | ||
|
|
@@ -263,6 +316,9 @@ func (c *splitScatterController) recordSplitScatterBatch(sourceRegionID, sourceW | |
| if len(newRegionIDs) == 0 { | ||
| return | ||
| } | ||
| if c.cluster.GetCheckerConfig().GetSplitScatterScheduleLimit() == 0 { | ||
| return | ||
| } | ||
| group := makeSplitScatterGroup(sourceRegionID, newRegionIDs[0]) | ||
| expireAt := time.Now().Add(splitScatterPendingTTL) | ||
| if sourceWaitVersion == 0 { | ||
|
|
@@ -317,27 +373,39 @@ func (c *splitScatterController) recordSplitScatterBatch(sourceRegionID, sourceW | |
| expireAt: expireAt, | ||
| } | ||
| c.updatePendingGaugeLocked() | ||
| c.nextDispatchAt = time.Time{} | ||
| } | ||
|
|
||
| func (c *splitScatterController) dispatchSplitScatterRegions() { | ||
| now := time.Now() | ||
| if c.cleanupExpiredPendingSplitScatter() == 0 { | ||
| return | ||
| } | ||
| limit := c.cluster.GetCheckerConfig().GetSplitScatterScheduleLimit() | ||
| if limit == 0 { | ||
| splitScatterDispatchDisabledCounter.Inc() | ||
| c.clearPendingSplitScatter() | ||
| return | ||
|
coderabbitai[bot] marked this conversation as resolved.
|
||
| } | ||
| if c.skipDispatchUntil(now) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we need to add new metrics recording why the split controller doesn't work?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think the existing checker event metrics already cover the main reasons why split-scatter dispatch does not make progress. They are reported through |
||
| return | ||
| } | ||
| running := c.opController.OperatorCount(operator.OpSplitScatter) | ||
| if running >= limit { | ||
| splitScatterDispatchScheduleLimitCounter.Inc() | ||
| operator.IncOperatorLimitCounter(types.SplitScatterChecker, operator.OpSplitScatter) | ||
| c.delayNextDispatch(now) | ||
| return | ||
| } | ||
| dispatchLimit := int(limit - running) | ||
| // Dispatch sequentially so operators added for earlier pending items in this pass | ||
| // are visible to later ScatterInternal calls through the running-operator delta. | ||
| for _, pending := range c.collectTopPendingSplitScatter(dispatchLimit) { | ||
| pendingItems := c.collectTopPendingSplitScatter(dispatchLimit) | ||
| if len(pendingItems) == 0 { | ||
| c.delayNextDispatch(now) | ||
| return | ||
| } | ||
| for _, pending := range pendingItems { | ||
| region := c.cluster.GetRegion(pending.regionID) | ||
| if region == nil { | ||
| splitScatterDispatchRegionMissingCounter.Inc() | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Will the metrics be reset to zero if the pending count is zero?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated in 64529c3.
`clearPendingSplitScatter` now always refreshes `splitScatterPendingGauge` after clearing pending, so the metric is reset to 0 even when the pending count is already 0. `PatrolRegions` also calls the same cleanup path when it exits.