-
Notifications
You must be signed in to change notification settings - Fork 764
mcs/scheduling: clean primary resources on exit #10645
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 1 commit
9c92cfd
23174e5
e7bea36
b6baff6
8871821
6fff238
53ef0f2
3b30f5c
9c2100b
36b0d6f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -506,70 +506,104 @@ func (s *Server) startServer() (err error) { | |
| return nil | ||
| } | ||
|
|
||
| func (s *Server) startCluster(context.Context) error { | ||
| s.basicCluster = core.NewBasicCluster() | ||
| s.storage = endpoint.NewStorageEndpoint(kv.NewMemoryKV(), nil) | ||
| err := s.startMetaConfWatcher() | ||
| func (s *Server) startCluster(ctx context.Context) error { | ||
| basicCluster := core.NewBasicCluster() | ||
| storage := endpoint.NewStorageEndpoint(kv.NewMemoryKV(), nil) | ||
| metaWatcher, configWatcher, err := s.startMetaConfWatcher(ctx, basicCluster, storage) | ||
| if err != nil { | ||
| return err | ||
| } | ||
| s.hbStreams = hbstream.NewHeartbeatStreams(s.Context(), constant.SchedulingServiceName, s.basicCluster) | ||
| cluster, err := NewCluster(s.Context(), s.persistConfig, s.storage, s.basicCluster, s.hbStreams, s.checkMembershipCh, s.GetHTTPClient(), s.GetBackendEndpoints()) | ||
| hbStreams := hbstream.NewHeartbeatStreams(ctx, constant.SchedulingServiceName, basicCluster) | ||
| cluster, err := NewCluster(ctx, s.persistConfig, storage, basicCluster, hbStreams, s.checkMembershipCh, s.GetHTTPClient(), s.GetBackendEndpoints()) | ||
| if err != nil { | ||
| hbStreams.Close() | ||
| configWatcher.Close() | ||
| metaWatcher.Close() | ||
| return err | ||
| } | ||
| s.cluster.Store(cluster) | ||
| // Inject the cluster components into the config watcher after the scheduler controller is created. | ||
| s.configWatcher.SetSchedulersController(cluster.GetCoordinator().GetSchedulersController()) | ||
| // Start the rule watcher after the cluster is created. | ||
| s.ruleWatcher, err = rule.NewWatcher(s.Context(), s.GetClient(), s.storage, | ||
| configWatcher.SetSchedulersController(cluster.GetCoordinator().GetSchedulersController()) | ||
| ruleWatcher, err := rule.NewWatcher(ctx, s.GetClient(), storage, | ||
| cluster.GetCoordinator().GetCheckerController(), cluster.GetRuleManager(), cluster.GetRegionLabeler()) | ||
| if err != nil { | ||
| hbStreams.Close() | ||
| configWatcher.Close() | ||
| metaWatcher.Close() | ||
| return err | ||
| } | ||
| // Start the affinity watcher after the cluster is created. | ||
| s.affinityWatcher, err = affinity.NewWatcher(s.Context(), s.GetClient(), cluster.GetAffinityManager()) | ||
| affinityWatcher, err := affinity.NewWatcher(ctx, s.GetClient(), cluster.GetAffinityManager()) | ||
| if err != nil { | ||
| ruleWatcher.Close() | ||
| hbStreams.Close() | ||
| configWatcher.Close() | ||
| metaWatcher.Close() | ||
| return err | ||
|
coderabbitai[bot] marked this conversation as resolved.
Outdated
|
||
| } | ||
|
|
||
| s.basicCluster = basicCluster | ||
|
lhy1024 marked this conversation as resolved.
Outdated
|
||
| s.storage = storage | ||
| s.metaWatcher = metaWatcher | ||
| s.configWatcher = configWatcher | ||
| s.hbStreams = hbStreams | ||
| s.ruleWatcher = ruleWatcher | ||
| s.affinityWatcher = affinityWatcher | ||
| s.cluster.Store(cluster) | ||
| cluster.StartBackgroundJobs() | ||
| return nil | ||
| } | ||
|
|
||
| func (s *Server) stopCluster() { | ||
| cluster := s.GetCluster() | ||
| if cluster != nil { | ||
| s.cluster.Store((*Cluster)(nil)) | ||
| cluster.StopBackgroundJobs() | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This changes the shutdown visibility order in a risky way. The old cluster remains published via [comment truncated in extraction]
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, remove the basicCluster from the service struct; all callers should get the basic cluster from the Cluster field.
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks for moving it. I think the shutdown visibility issue in this thread is still valid, though. Could we unpublish the cluster first, then stop and clean the captured old cluster? For example: capture [comment truncated in extraction]
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I added a regression test to verify this window.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Good catch, fixed. |
||
| } | ||
| s.stopWatcher() | ||
| s.cleanupClusterResources() | ||
|
bufferflies marked this conversation as resolved.
Outdated
|
||
| } | ||
|
|
||
| func (s *Server) startMetaConfWatcher() (err error) { | ||
| s.metaWatcher, err = meta.NewWatcher(s.Context(), s.GetClient(), s.basicCluster) | ||
| func (s *Server) startMetaConfWatcher( | ||
| ctx context.Context, | ||
| basicCluster *core.BasicCluster, | ||
| storage *endpoint.StorageEndpoint, | ||
| ) (metaWatcher *meta.Watcher, configWatcher *config.Watcher, err error) { | ||
| metaWatcher, err = meta.NewWatcher(ctx, s.GetClient(), basicCluster) | ||
| if err != nil { | ||
| return err | ||
| return nil, nil, err | ||
| } | ||
| s.configWatcher, err = config.NewWatcher(s.Context(), s.GetClient(), s.persistConfig, s.storage) | ||
| configWatcher, err = config.NewWatcher(ctx, s.GetClient(), s.persistConfig, storage) | ||
| if err != nil { | ||
| return err | ||
| metaWatcher.Close() | ||
| return nil, nil, err | ||
| } | ||
| return err | ||
| return metaWatcher, configWatcher, nil | ||
| } | ||
|
|
||
| func (s *Server) stopWatcher() { | ||
| if s.affinityWatcher != nil { | ||
| s.affinityWatcher.Close() | ||
| s.affinityWatcher = nil | ||
| } | ||
| if s.ruleWatcher != nil { | ||
| s.ruleWatcher.Close() | ||
| s.ruleWatcher = nil | ||
| } | ||
| if s.metaWatcher != nil { | ||
| s.metaWatcher.Close() | ||
| s.metaWatcher = nil | ||
| } | ||
| if s.configWatcher != nil { | ||
| s.configWatcher.Close() | ||
| s.configWatcher = nil | ||
| } | ||
| } | ||
|
|
||
| func (s *Server) cleanupClusterResources() { | ||
| s.stopWatcher() | ||
| if s.hbStreams != nil { | ||
| s.hbStreams.Close() | ||
| s.hbStreams = nil | ||
| } | ||
| s.cluster.Store((*Cluster)(nil)) | ||
|
bufferflies marked this conversation as resolved.
Outdated
|
||
| s.basicCluster = nil | ||
| s.storage = nil | ||
| } | ||
|
lhy1024 marked this conversation as resolved.
Outdated
|
||
|
|
||
| // GetPersistConfig returns the persist config. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,54 @@ | ||
| // Copyright 2026 TiKV Project Authors. | ||
| // | ||
| // Licensed under the Apache License, Version 2.0 (the "License"); | ||
| // you may not use this file except in compliance with the License. | ||
| // You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, software | ||
| // distributed under the License is distributed on an "AS IS" BASIS, | ||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| // See the License for the specific language governing permissions and | ||
| // limitations under the License. | ||
|
|
||
| package server | ||
|
|
||
| import ( | ||
| "context" | ||
| "testing" | ||
|
|
||
| "github.com/stretchr/testify/require" | ||
|
|
||
| "github.com/tikv/pd/pkg/core" | ||
| "github.com/tikv/pd/pkg/mcs/utils/constant" | ||
| "github.com/tikv/pd/pkg/schedule/hbstream" | ||
| "github.com/tikv/pd/pkg/storage/endpoint" | ||
| "github.com/tikv/pd/pkg/storage/kv" | ||
| ) | ||
|
|
||
| func TestCleanupClusterResources(t *testing.T) { | ||
|
Check failure on line 30 in pkg/mcs/scheduling/server/server_test.go
|
||
| re := require.New(t) | ||
| ctx, cancel := context.WithCancel(context.Background()) | ||
| defer cancel() | ||
|
|
||
| hbStreams := hbstream.NewHeartbeatStreams(ctx, constant.SchedulingServiceName, core.NewBasicCluster()) | ||
| basicCluster := core.NewBasicCluster() | ||
| storage := endpoint.NewStorageEndpoint(kv.NewMemoryKV(), nil) | ||
| cluster := &Cluster{} | ||
|
|
||
| s := &Server{ | ||
| basicCluster: basicCluster, | ||
| hbStreams: hbStreams, | ||
| storage: storage, | ||
| } | ||
| s.cluster.Store(cluster) | ||
|
|
||
| s.cleanupClusterResources() | ||
| s.cleanupClusterResources() | ||
|
|
||
| re.Nil(s.GetCluster()) | ||
| re.Nil(s.basicCluster) | ||
| re.Nil(s.hbStreams) | ||
| re.Nil(s.storage) | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This cleanup order can still leave stale suspect key ranges. Close cancels the watcher and immediately calls ClearSuspectKeyRanges, but an already-running watcher callback is not preempted by cancellation and may still run postEventsFn/AddSuspectKeyRange afterward, re-adding suspect ranges after they were cleared.
Could we move ClearSuspectKeyRanges after rw.wg.Wait(), so cleanup happens after all watcher callbacks have exited?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
how about this