-
Notifications
You must be signed in to change notification settings - Fork 764
server, config: support updating leader lease online #10631
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,6 +16,7 @@ package config | |
|
|
||
| import ( | ||
| "context" | ||
| "encoding/json" | ||
| "reflect" | ||
| "strconv" | ||
| "sync/atomic" | ||
|
|
@@ -55,6 +56,7 @@ type PersistOptions struct { | |
| keyspace atomic.Value | ||
| microservice atomic.Value | ||
| storeConfig atomic.Value | ||
| leaderLease atomic.Int64 | ||
| clusterVersion unsafe.Pointer | ||
| } | ||
|
|
||
|
|
@@ -68,6 +70,7 @@ func NewPersistOptions(cfg *Config) *PersistOptions { | |
| o.labelProperty.Store(cfg.LabelProperty) | ||
| o.keyspace.Store(&cfg.Keyspace) | ||
| o.microservice.Store(&cfg.Microservice) | ||
| o.leaderLease.Store(cfg.LeaderLease) | ||
| // storeConfig will be fetched from TiKV later, | ||
| // set it to an empty config here first. | ||
| o.storeConfig.Store(&sc.StoreConfig{}) | ||
|
|
@@ -156,6 +159,16 @@ func (o *PersistOptions) SetStoreConfig(cfg *sc.StoreConfig) { | |
| o.storeConfig.Store(cfg) | ||
| } | ||
|
|
||
| // GetLeaderLease returns the PD leader lease timeout in seconds. | ||
| func (o *PersistOptions) GetLeaderLease() int64 { | ||
| return o.leaderLease.Load() | ||
| } | ||
|
|
||
| // SetLeaderLease sets the PD leader lease timeout in seconds. | ||
| func (o *PersistOptions) SetLeaderLease(lease int64) { | ||
| o.leaderLease.Store(lease) | ||
| } | ||
|
|
||
| // GetClusterVersion returns the cluster version. | ||
| func (o *PersistOptions) GetClusterVersion() *semver.Version { | ||
| return (*semver.Version)(atomic.LoadPointer(&o.clusterVersion)) | ||
|
|
@@ -761,6 +774,30 @@ type persistedConfig struct { | |
| *Config | ||
| // StoreConfig is injected into Config to avoid breaking the original API. | ||
| StoreConfig sc.StoreConfig `json:"store"` | ||
| // leaseLoaded records whether the persisted etcd blob explicitly included | ||
| // the top-level "lease" field. Older blobs without that field should not | ||
| // silently overwrite the startup value with the default-filled lease. | ||
| leaseLoaded bool | ||
| } | ||
|
|
||
| // UnmarshalJSON records whether the persisted blob explicitly contains the | ||
| // leader lease field while preserving the default Config JSON decoding. | ||
| func (cfg *persistedConfig) UnmarshalJSON(data []byte) error { | ||
| if cfg.Config == nil { | ||
| cfg.Config = &Config{} | ||
| } | ||
| var fields map[string]json.RawMessage | ||
| if err := json.Unmarshal(data, &fields); err != nil { | ||
| return err | ||
| } | ||
| _, cfg.leaseLoaded = fields["lease"] | ||
|
|
||
| type persistedConfigAlias persistedConfig | ||
| return json.Unmarshal(data, (*persistedConfigAlias)(cfg)) | ||
| } | ||
|
|
||
| func (cfg *persistedConfig) hasValidLeaderLease() bool { | ||
| return cfg.leaseLoaded && IsValidLeaderLease(cfg.LeaderLease) | ||
| } | ||
|
|
||
| // SwitchRaftV2 update some config if tikv raft engine switch into partition raft v2 | ||
|
|
@@ -781,6 +818,7 @@ func (o *PersistOptions) Persist(storage endpoint.ConfigStorage) error { | |
| Keyspace: *o.GetKeyspaceConfig(), | ||
| Microservice: *o.GetMicroserviceConfig(), | ||
| ClusterVersion: *o.GetClusterVersion(), | ||
| LeaderLease: o.GetLeaderLease(), | ||
| }, | ||
| StoreConfig: *o.GetStoreConfig(), | ||
| } | ||
|
|
@@ -797,7 +835,6 @@ func (o *PersistOptions) Reload(storage endpoint.ConfigStorage) error { | |
| if err := cfg.Adjust(nil, true); err != nil { | ||
| return err | ||
| } | ||
|
|
||
| isExist, err := storage.LoadConfig(cfg) | ||
| if err != nil { | ||
| return err | ||
|
|
@@ -813,12 +850,55 @@ func (o *PersistOptions) Reload(storage endpoint.ConfigStorage) error { | |
| o.labelProperty.Store(cfg.LabelProperty) | ||
| o.keyspace.Store(&cfg.Keyspace) | ||
| o.microservice.Store(&cfg.Microservice) | ||
| if cfg.hasValidLeaderLease() { | ||
| o.SetLeaderLease(cfg.LeaderLease) | ||
| } | ||
| o.storeConfig.Store(&cfg.StoreConfig) | ||
| o.SetClusterVersion(&cfg.ClusterVersion) | ||
| } | ||
| return nil | ||
| } | ||
|
|
||
| // LoadPersistedLeaderLease returns the leader lease that should be used for the | ||
| // next campaign. It reads only the persisted leader lease instead of reloading | ||
| // the whole configuration, so it is cheap enough to run on the election path. | ||
| // When the persisted blob carries no valid lease (e.g. a blob written before | ||
| // online lease update was supported), the current in-memory value is returned, | ||
| // preserving the previous campaign behavior. Storage and decode errors are | ||
| // returned to the caller, which decides whether to fall back. | ||
| func (o *PersistOptions) LoadPersistedLeaderLease(storage endpoint.ConfigStorage) (int64, error) { | ||
| failpoint.Inject("loadLeaderLeaseFail", func() { | ||
| failpoint.Return(int64(0), errors.New("failpoint: fail to load persisted leader lease")) | ||
| }) | ||
| cfg := &persistedConfig{Config: &Config{}} | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This still unmarshals the full persisted config on the election path. Could we decode only a small |
||
| isExist, err := storage.LoadConfig(cfg) | ||
| if err != nil { | ||
| return 0, err | ||
| } | ||
| if isExist && cfg.hasValidLeaderLease() { | ||
| return cfg.LeaderLease, nil | ||
| } | ||
| return o.GetLeaderLease(), nil | ||
| } | ||
|
|
||
| // IsValidLeaderLease returns whether the given PD leader lease timeout is valid. | ||
| // It only checks positivity, because it also gates whether a persisted lease | ||
| // should be honored on reload. | ||
| func IsValidLeaderLease(lease int64) bool { | ||
| return lease > 0 | ||
| } | ||
|
|
||
| // ValidateLeaderLease validates a leader lease (in seconds) supplied through | ||
| // the online update API. Keep the accepted range aligned with file config | ||
| // behavior: reject explicit non-positive updates, but do not add an online-only | ||
| // upper bound. | ||
| func ValidateLeaderLease(lease int64) error { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could we add an upper bound here? Values larger than |
||
| if !IsValidLeaderLease(lease) { | ||
| return errors.Errorf("leader lease must be positive, got %d", lease) | ||
| } | ||
| return nil | ||
| } | ||
|
|
||
| func adjustScheduleCfg(scheduleCfg *sc.ScheduleConfig) { | ||
| // In case we add new default schedulers. | ||
| for _, ps := range sc.DefaultSchedulers { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please consider version-gating this API until all PD members understand the persisted lease. During rolling upgrade, an old PD can still persist
lease: 0in the config blob, which makes the new code ignore the online value and fall back to local toml on the next campaign.