-
Notifications
You must be signed in to change notification settings - Fork 764
mcs/scheduling: clean primary resources on exit #10645
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 4 commits
9c92cfd
23174e5
e7bea36
b6baff6
8871821
6fff238
53ef0f2
3b30f5c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,48 @@ | ||
| // Copyright 2026 TiKV Project Authors. | ||
| // | ||
| // Licensed under the Apache License, Version 2.0 (the "License"); | ||
| // you may not use this file except in compliance with the License. | ||
| // You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, software | ||
| // distributed under the License is distributed on an "AS IS" BASIS, | ||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| // See the License for the specific language governing permissions and | ||
| // limitations under the License. | ||
|
|
||
| package apis | ||
|
|
||
| import ( | ||
| "net/http" | ||
| "net/http/httptest" | ||
| "testing" | ||
|
|
||
| "github.com/gin-gonic/gin" | ||
| "github.com/stretchr/testify/require" | ||
| "go.uber.org/goleak" | ||
|
|
||
| scheserver "github.com/tikv/pd/pkg/mcs/scheduling/server" | ||
| "github.com/tikv/pd/pkg/utils/apiutil/multiservicesapi" | ||
| "github.com/tikv/pd/pkg/utils/testutil" | ||
| ) | ||
|
|
||
| func TestMain(m *testing.M) { | ||
| goleak.VerifyTestMain(m, testutil.LeakOptions...) | ||
| } | ||
|
|
||
| func TestGetAllStoresReturnsNotBootstrappedWhenBasicClusterMissing(t *testing.T) { | ||
| gin.SetMode(gin.TestMode) | ||
| re := require.New(t) | ||
|
|
||
| resp := httptest.NewRecorder() | ||
| ctx, _ := gin.CreateTestContext(resp) | ||
| ctx.Request = httptest.NewRequest(http.MethodGet, "/stores", nil) | ||
| ctx.Set(multiservicesapi.ServiceContextKey, &scheserver.Server{}) | ||
|
|
||
| getAllStores(ctx) | ||
|
|
||
| re.Equal(http.StatusInternalServerError, resp.Code) | ||
| re.Contains(resp.Body.String(), "not bootstrapped") | ||
| } | ||
|
bufferflies marked this conversation as resolved.
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -654,9 +654,10 @@ func (c *Cluster) StartBackgroundJobs() { | |
| } | ||
|
|
||
| // StopBackgroundJobs stops background jobs. | ||
| func (c *Cluster) StopBackgroundJobs() { | ||
| // It will return false if the cluster isn't running, otherwise it will stop the background jobs and return true. | ||
| func (c *Cluster) StopBackgroundJobs() bool { | ||
| if !c.running.Load() { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. On startup rollback, |
||
| return | ||
| return false | ||
| } | ||
| c.running.Store(false) | ||
| c.coordinator.Stop() | ||
|
|
@@ -665,6 +666,7 @@ func (c *Cluster) StopBackgroundJobs() { | |
| c.logRunner.Stop() | ||
| c.cancel() | ||
| c.wg.Wait() | ||
| return true | ||
| } | ||
|
|
||
| // IsBackgroundJobsRunning returns whether the background jobs are running. Only for test purpose. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -92,7 +92,6 @@ type Server struct { | |
|
|
||
| cfg *config.Config | ||
| persistConfig *config.PersistConfig | ||
| basicCluster *core.BasicCluster | ||
|
|
||
| // for the primary election of scheduling | ||
| participant *member.Participant | ||
|
|
@@ -426,7 +425,10 @@ func (s *Server) GetCluster() *Cluster { | |
|
|
||
| // GetBasicCluster returns the basic cluster. | ||
| func (s *Server) GetBasicCluster() *core.BasicCluster { | ||
| return s.basicCluster | ||
| if cluster := s.GetCluster(); cluster != nil { | ||
| return cluster.GetBasicCluster() | ||
| } | ||
| return nil | ||
| } | ||
|
|
||
| // GetCoordinator returns the coordinator. | ||
|
|
@@ -506,70 +508,112 @@ func (s *Server) startServer() (err error) { | |
| return nil | ||
| } | ||
|
|
||
| func (s *Server) startCluster(context.Context) error { | ||
| s.basicCluster = core.NewBasicCluster() | ||
| s.storage = endpoint.NewStorageEndpoint(kv.NewMemoryKV(), nil) | ||
| err := s.startMetaConfWatcher() | ||
| func (s *Server) startCluster(ctx context.Context) (err error) { | ||
| basicCluster := core.NewBasicCluster() | ||
| storage := endpoint.NewStorageEndpoint(kv.NewMemoryKV(), nil) | ||
| metaWatcher, configWatcher, err := s.startMetaConfWatcher(ctx, basicCluster, storage) | ||
| if err != nil { | ||
| return err | ||
| } | ||
| s.hbStreams = hbstream.NewHeartbeatStreams(s.Context(), constant.SchedulingServiceName, s.basicCluster) | ||
| cluster, err := NewCluster(s.Context(), s.persistConfig, s.storage, s.basicCluster, s.hbStreams, s.checkMembershipCh, s.GetHTTPClient(), s.GetBackendEndpoints()) | ||
| hbStreams := hbstream.NewHeartbeatStreams(ctx, constant.SchedulingServiceName, basicCluster) | ||
| cluster, err := NewCluster(ctx, s.persistConfig, storage, basicCluster, hbStreams, s.checkMembershipCh, s.GetHTTPClient(), s.GetBackendEndpoints()) | ||
| defer func() { | ||
| // Make sure the cluster is stopped if any error occurs. | ||
| // If StopBackgroundJobs returns false, the cluster is not running, so we need to cancel its context to make the | ||
| // other goroutines exit. | ||
| if cluster != nil && !cluster.StopBackgroundJobs() { | ||
| cluster.cancel() | ||
| } | ||
| }() | ||
| if err != nil { | ||
| hbStreams.Close() | ||
| configWatcher.Close() | ||
| metaWatcher.Close() | ||
| return err | ||
| } | ||
| s.cluster.Store(cluster) | ||
| // Inject the cluster components into the config watcher after the scheduler controller is created. | ||
| s.configWatcher.SetSchedulersController(cluster.GetCoordinator().GetSchedulersController()) | ||
| // Start the rule watcher after the cluster is created. | ||
| s.ruleWatcher, err = rule.NewWatcher(s.Context(), s.GetClient(), s.storage, | ||
|
|
||
| configWatcher.SetSchedulersController(cluster.GetCoordinator().GetSchedulersController()) | ||
| ruleWatcher, err := rule.NewWatcher(ctx, s.GetClient(), storage, | ||
| cluster.GetCoordinator().GetCheckerController(), cluster.GetRuleManager(), cluster.GetRegionLabeler()) | ||
| if err != nil { | ||
| hbStreams.Close() | ||
| configWatcher.Close() | ||
| metaWatcher.Close() | ||
| return err | ||
| } | ||
| // Start the affinity watcher after the cluster is created. | ||
| s.affinityWatcher, err = affinity.NewWatcher(s.Context(), s.GetClient(), cluster.GetAffinityManager()) | ||
| affinityWatcher, err := affinity.NewWatcher(ctx, s.GetClient(), cluster.GetAffinityManager()) | ||
| if err != nil { | ||
| ruleWatcher.Close() | ||
| hbStreams.Close() | ||
| configWatcher.Close() | ||
| metaWatcher.Close() | ||
| return err | ||
|
coderabbitai[bot] marked this conversation as resolved.
Outdated
|
||
| } | ||
|
|
||
| s.storage = storage | ||
| s.metaWatcher = metaWatcher | ||
| s.configWatcher = configWatcher | ||
| s.hbStreams = hbStreams | ||
| s.ruleWatcher = ruleWatcher | ||
| s.affinityWatcher = affinityWatcher | ||
| s.cluster.Store(cluster) | ||
| cluster.StartBackgroundJobs() | ||
| cluster = nil // defer cleanup no longer needed | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This only skips |
||
| return nil | ||
| } | ||
|
|
||
| func (s *Server) stopCluster() { | ||
| cluster := s.GetCluster() | ||
| if cluster != nil { | ||
| s.cluster.Store((*Cluster)(nil)) | ||
| cluster.StopBackgroundJobs() | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This changes the shutdown visibility order in a risky way. The old cluster remains published via
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, remove the basicCluster from the service struct; all callers should get the basic cluster from the Cluster field.
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks for moving I think the shutdown visibility issue in this thread is still valid, though. Could we unpublish the cluster first, then stop and clean the captured old cluster? For example: capture
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I added a regression test to verify this window.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. good catch, fixed |
||
| } | ||
| s.stopWatcher() | ||
| s.cleanupClusterResources() | ||
|
bufferflies marked this conversation as resolved.
Outdated
|
||
| } | ||
|
|
||
| func (s *Server) startMetaConfWatcher() (err error) { | ||
| s.metaWatcher, err = meta.NewWatcher(s.Context(), s.GetClient(), s.basicCluster) | ||
| func (s *Server) startMetaConfWatcher( | ||
| ctx context.Context, | ||
| basicCluster *core.BasicCluster, | ||
| storage *endpoint.StorageEndpoint, | ||
| ) (metaWatcher *meta.Watcher, configWatcher *config.Watcher, err error) { | ||
| metaWatcher, err = meta.NewWatcher(ctx, s.GetClient(), basicCluster) | ||
| if err != nil { | ||
| return err | ||
| return nil, nil, err | ||
| } | ||
| s.configWatcher, err = config.NewWatcher(s.Context(), s.GetClient(), s.persistConfig, s.storage) | ||
| configWatcher, err = config.NewWatcher(ctx, s.GetClient(), s.persistConfig, storage) | ||
| if err != nil { | ||
| return err | ||
| metaWatcher.Close() | ||
| return nil, nil, err | ||
| } | ||
| return err | ||
| return metaWatcher, configWatcher, nil | ||
| } | ||
|
|
||
| func (s *Server) stopWatcher() { | ||
| if s.affinityWatcher != nil { | ||
| s.affinityWatcher.Close() | ||
| s.affinityWatcher = nil | ||
| } | ||
| if s.ruleWatcher != nil { | ||
| s.ruleWatcher.Close() | ||
| s.ruleWatcher = nil | ||
| } | ||
| if s.metaWatcher != nil { | ||
| s.metaWatcher.Close() | ||
| s.metaWatcher = nil | ||
| } | ||
| if s.configWatcher != nil { | ||
| s.configWatcher.Close() | ||
| s.configWatcher = nil | ||
| } | ||
| } | ||
|
|
||
| func (s *Server) cleanupClusterResources() { | ||
| s.cluster.Store((*Cluster)(nil)) | ||
| s.stopWatcher() | ||
| if s.hbStreams != nil { | ||
| s.hbStreams.Close() | ||
| s.hbStreams = nil | ||
| } | ||
| s.storage = nil | ||
| } | ||
|
|
||
| // GetPersistConfig returns the persist config. | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.