-
Notifications
You must be signed in to change notification settings - Fork 764
mcs/scheduling: clean primary resources on exit #10645
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 8 commits
9c92cfd
23174e5
e7bea36
b6baff6
8871821
6fff238
53ef0f2
3b30f5c
9c2100b
36b0d6f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,48 @@ | ||
| // Copyright 2026 TiKV Project Authors. | ||
| // | ||
| // Licensed under the Apache License, Version 2.0 (the "License"); | ||
| // you may not use this file except in compliance with the License. | ||
| // You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, software | ||
| // distributed under the License is distributed on an "AS IS" BASIS, | ||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| // See the License for the specific language governing permissions and | ||
| // limitations under the License. | ||
|
|
||
| package apis | ||
|
|
||
| import ( | ||
| "net/http" | ||
| "net/http/httptest" | ||
| "testing" | ||
|
|
||
| "github.com/gin-gonic/gin" | ||
| "github.com/stretchr/testify/require" | ||
| "go.uber.org/goleak" | ||
|
|
||
| scheserver "github.com/tikv/pd/pkg/mcs/scheduling/server" | ||
| "github.com/tikv/pd/pkg/utils/apiutil/multiservicesapi" | ||
| "github.com/tikv/pd/pkg/utils/testutil" | ||
| ) | ||
|
|
||
| // TestMain runs this package's tests through goleak.VerifyTestMain, passing the shared testutil.LeakOptions. | ||
| func TestMain(m *testing.M) { | ||
| goleak.VerifyTestMain(m, testutil.LeakOptions...) | ||
| } | ||
|
|
||
| // TestGetAllStoresReturnsNotBootstrappedWhenBasicClusterMissing calls getAllStores with a zero-value scheserver.Server stored in the gin context and expects HTTP 500 with a body containing "not bootstrapped". | ||
| func TestGetAllStoresReturnsNotBootstrappedWhenBasicClusterMissing(t *testing.T) { | ||
| gin.SetMode(gin.TestMode) | ||
| re := require.New(t) | ||
|
|
||
| resp := httptest.NewRecorder() | ||
| ctx, _ := gin.CreateTestContext(resp) | ||
| ctx.Request = httptest.NewRequest(http.MethodGet, "/stores", nil) | ||
| ctx.Set(multiservicesapi.ServiceContextKey, &scheserver.Server{}) | ||
|
|
||
| getAllStores(ctx) | ||
|
|
||
| re.Equal(http.StatusInternalServerError, resp.Code) | ||
| re.Contains(resp.Body.String(), "not bootstrapped") | ||
| } | ||
|
bufferflies marked this conversation as resolved.
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -38,7 +38,10 @@ import ( | |
| "github.com/tikv/pd/pkg/cluster" | ||
| "github.com/tikv/pd/pkg/core" | ||
| "github.com/tikv/pd/pkg/errs" | ||
| mcsaffinity "github.com/tikv/pd/pkg/mcs/scheduling/server/affinity" | ||
| "github.com/tikv/pd/pkg/mcs/scheduling/server/config" | ||
| "github.com/tikv/pd/pkg/mcs/scheduling/server/meta" | ||
| "github.com/tikv/pd/pkg/mcs/scheduling/server/rule" | ||
| "github.com/tikv/pd/pkg/ratelimit" | ||
| "github.com/tikv/pd/pkg/response" | ||
| "github.com/tikv/pd/pkg/schedule" | ||
|
|
@@ -76,7 +79,13 @@ type Cluster struct { | |
| regionStats *statistics.RegionStatistics | ||
| labelStats *statistics.LabelStatistics | ||
| hotStat *statistics.HotStat | ||
| resourceMu sync.RWMutex | ||
| storage storage.Storage | ||
| hbStreams *hbstream.HeartbeatStreams | ||
| metaWatcher *meta.Watcher | ||
| configWatcher *config.Watcher | ||
| ruleWatcher *rule.Watcher | ||
| affinityWatcher *mcsaffinity.Watcher | ||
| coordinator *schedule.Coordinator | ||
| checkMembershipCh chan struct{} | ||
| pdLeader atomic.Value | ||
|
|
@@ -142,6 +151,7 @@ func NewCluster( | |
| labelStats: statistics.NewLabelStatistics(), | ||
| regionStats: statistics.NewRegionStatistics(basicCluster, persistConfig, ruleManager), | ||
| storage: storage, | ||
| hbStreams: hbStreams, | ||
| checkMembershipCh: checkMembershipCh, | ||
| httpClient: httpClient, | ||
| backendAddress: backendAddress, | ||
|
|
@@ -262,9 +272,75 @@ func (c *Cluster) BucketsStats(degree int, regionIDs ...uint64) map[uint64][]*bu | |
|
|
||
| // GetStorage returns the storage, or nil when the receiver is nil; the field is read while holding resourceMu's read lock. | ||
| func (c *Cluster) GetStorage() storage.Storage { | ||
| if c == nil { | ||
| return nil | ||
| } | ||
| c.resourceMu.RLock() | ||
| defer c.resourceMu.RUnlock() | ||
| return c.storage | ||
| } | ||
|
|
||
| // GetHeartbeatStreams returns the heartbeat streams, or nil when the receiver is nil; the field is read while holding resourceMu's read lock. | ||
| func (c *Cluster) GetHeartbeatStreams() *hbstream.HeartbeatStreams { | ||
| if c == nil { | ||
| return nil | ||
| } | ||
| c.resourceMu.RLock() | ||
| defer c.resourceMu.RUnlock() | ||
| return c.hbStreams | ||
| } | ||
|
|
||
| // GetMetaWatcher returns the meta watcher, or nil when the receiver is nil; the field is read while holding resourceMu's read lock. | ||
| func (c *Cluster) GetMetaWatcher() *meta.Watcher { | ||
| if c == nil { | ||
| return nil | ||
| } | ||
| c.resourceMu.RLock() | ||
| defer c.resourceMu.RUnlock() | ||
| return c.metaWatcher | ||
| } | ||
|
|
||
| // SetRuntimeResources installs the cluster-scoped runtime resources after they are created: it stores the meta, config, rule, and affinity watchers on the cluster while holding resourceMu's write lock. It does not check for a nil receiver. | ||
| func (c *Cluster) SetRuntimeResources( | ||
| metaWatcher *meta.Watcher, | ||
| configWatcher *config.Watcher, | ||
| ruleWatcher *rule.Watcher, | ||
| affinityWatcher *mcsaffinity.Watcher, | ||
| ) { | ||
| c.resourceMu.Lock() | ||
| defer c.resourceMu.Unlock() | ||
| c.metaWatcher = metaWatcher | ||
| c.configWatcher = configWatcher | ||
| c.ruleWatcher = ruleWatcher | ||
| c.affinityWatcher = affinityWatcher | ||
| } | ||
|
|
||
| func (c *Cluster) cleanupRuntimeResources() { | ||
| c.resourceMu.Lock() | ||
| defer c.resourceMu.Unlock() | ||
| if c.affinityWatcher != nil { | ||
| c.affinityWatcher.Close() | ||
| c.affinityWatcher = nil | ||
| } | ||
| if c.ruleWatcher != nil { | ||
| c.ruleWatcher.Close() | ||
| c.ruleWatcher = nil | ||
| } | ||
| if c.metaWatcher != nil { | ||
| c.metaWatcher.Close() | ||
| c.metaWatcher = nil | ||
| } | ||
| if c.configWatcher != nil { | ||
| c.configWatcher.Close() | ||
| c.configWatcher = nil | ||
| } | ||
| if c.hbStreams != nil { | ||
| c.hbStreams.Close() | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Closing hbStreams synchronously here does wait for an in-flight stream.Send to return. Close only cancels the context and then waits on wg; if hbstream.run has already entered the send/keepalive branch, it will not observe ctx.Done until Send returns. In production, heartbeatServer.Send has a 5s timeout, so this is usually not a permanent block, but it can delay primary stepdown/transfer on slow streams. Could we check context cancellation inside the send/keepalive path, or make the close path bounded, so that primary exit latency is not amplified by slow streams?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| c.hbStreams = nil | ||
| } | ||
| c.storage = nil | ||
| } | ||
|
|
||
| // GetCheckerConfig returns the checker config, which is the cluster's persistConfig exposed as an sc.CheckerConfigProvider. | ||
| func (c *Cluster) GetCheckerConfig() sc.CheckerConfigProvider { return c.persistConfig } | ||
|
|
||
|
|
@@ -653,7 +729,7 @@ func (c *Cluster) StartBackgroundJobs() { | |
| c.running.Store(true) | ||
| } | ||
|
|
||
| // StopBackgroundJobs stops background jobs. | ||
| // StopBackgroundJobs stops background jobs; these jobs are created by NewCluster. | ||
| func (c *Cluster) StopBackgroundJobs() { | ||
| if !c.running.Load() { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On startup rollback, |
||
| return | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -281,4 +281,7 @@ func (rw *Watcher) initializeRegionLabelWatcher() error { | |
| func (rw *Watcher) Close() { | ||
| rw.cancel() | ||
| rw.wg.Wait() | ||
| if rw.checkerController != nil { | ||
| rw.checkerController.ClearSuspectKeyRanges() | ||
| } | ||
|
Comment on lines
+283
to
+285
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This cleanup order can still leave stale suspect key ranges. Close cancels the watcher and immediately calls ClearSuspectKeyRanges, but an already-running watcher callback is not preempted by cancellation and may still run postEventsFn/AddSuspectKeyRange afterward, re-adding suspect ranges after they were cleared. Could we move ClearSuspectKeyRanges after rw.wg.Wait(), so cleanup happens after all watcher callbacks have exited?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. how about this |
||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.