From af0bdc3862b9a44a77455f4703e92d8a8edc3996 Mon Sep 17 00:00:00 2001 From: patsonluk Date: Mon, 6 Oct 2025 10:46:03 -0700 Subject: [PATCH 01/14] Debug why move does not happen on eu1 --- solrman/smmodel/model.go | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/solrman/smmodel/model.go b/solrman/smmodel/model.go index 192070f..5736e50 100644 --- a/solrman/smmodel/model.go +++ b/solrman/smmodel/model.go @@ -236,7 +236,8 @@ func (m *Model) computeNextMove(immobileCores []bool) *Move { } // Step 2: balance collections next, respecting node max size. - for _, bi := range balanceInfo { + for i, bi := range balanceInfo { + fmt.Printf("%d : coll %s score=%d maxCoresPerNode=%d coresPerNode=%v\n", i, bi.coll.Name, bi.score, bi.maxCoresPerNode, bi.coresPerNode) if bi.score == 0 { continue } @@ -271,6 +272,8 @@ func (m *Model) computeNextMove(immobileCores []bool) *Move { continue } + fmt.Printf("Picked core %+v to move\n", core) + // Find a suitable target node with the least number of collection replicas and the smallest node size targets := make([]*Node, len(m.Nodes)) copy(targets, m.Nodes) @@ -289,15 +292,19 @@ func (m *Model) computeNextMove(immobileCores []bool) *Move { break } if target.MaxSize > 0 && core.Size+target.Size > target.MaxSize { + fmt.Printf("Skipping target %s because it would exceed max size. Current target size=%d, core size=%d, max size=%d\n", target.Name, target.Size, core.Size, target.MaxSize) continue } // Found a good choice. - return &Move{ + move := &Move{ Core: core, FromNode: fromNode, ToNode: target, } + fmt.Printf("Found good move %+v\n", move) + return move + } } From 4536389ec09623cb0845ebc5269304e4c25afff0 Mon Sep 17 00:00:00 2001 From: patsonluk Date: Mon, 6 Oct 2025 11:10:34 -0700 Subject: [PATCH 02/14] Debug why move does not happen on eu1 --- solrman/smmodel/model.go | 1 + 1 file changed, 1 insertion(+) diff --git a/solrman/smmodel/model.go b/solrman/smmodel/model.go index 5736e50..6258374 100644 --- a/solrman/smmodel/model.go +++ b/solrman/smmodel/model.go @@ -199,6 +199,7 @@ func (m *Model) computeNextMove(immobileCores []bool) *Move { } if 9*source.Size < 10*target.Size { + fmt.Printf("Skipping target %s because it's already >=90%% of source %s (source size=%d, target size=%d)\n", target.Name, source.Name, source.Size, target.Size) // if the target node is >=90% of the source node, don't bother return nil } From 69f40f653b223de31d32ef6d1231ed7b58bf8585 Mon Sep 17 00:00:00 2001 From: patsonluk Date: Mon, 6 Oct 2025 11:33:09 -0700 Subject: [PATCH 03/14] Extra debugging --- solrman/smmodel/model.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/solrman/smmodel/model.go b/solrman/smmodel/model.go index 6258374..5107738 100644 --- a/solrman/smmodel/model.go +++ b/solrman/smmodel/model.go @@ -109,6 +109,7 @@ func (m *Model) WithMove(move Move) *Model { } func (m *Model) computeNextMove(immobileCores []bool) *Move { + fmt.Printf("Computing next moves\n") if len(m.Nodes) < 2 || len(m.Cores) < 1 { // can't balance a single-node or empty cluster return nil @@ -238,7 +239,7 @@ func (m *Model) computeNextMove(immobileCores []bool) *Move { // Step 2: balance collections next, respecting node max size. for i, bi := range balanceInfo { - fmt.Printf("%d : coll %s score=%d maxCoresPerNode=%d coresPerNode=%v\n", i, bi.coll.Name, bi.score, bi.maxCoresPerNode, bi.coresPerNode) + fmt.Printf("Balance info %d : coll %s score=%d maxCoresPerNode=%d coresPerNode=%v\n", i, bi.coll.Name, bi.score, bi.maxCoresPerNode, bi.coresPerNode) if bi.score == 0 { continue } From 9b259a8a98b87745f3cd0f8c67448a5d37d9e81c Mon Sep 17 00:00:00 2001 From: patsonluk Date: Mon, 6 Oct 2025 15:16:27 -0700 Subject: [PATCH 04/14] + score if there are many nodes with very few cores --- solrman/smmodel/collection.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/solrman/smmodel/collection.go b/solrman/smmodel/collection.go index 756942f..5580ed6 100644 --- a/solrman/smmodel/collection.go +++ b/solrman/smmodel/collection.go @@ -101,6 +101,8 @@ func (c *Collection) balance(nodeCount int) balanceInfo { for _, v := range coresPerNode { if v > maxCoresPerNode { score += int64((v - maxCoresPerNode) * (v - maxCoresPerNode)) + } else if v < maxCoresPerNode-1 { //+ score if there are many nodes with very few cores + score += int64((v - (maxCoresPerNode - 1)) * (v - (maxCoresPerNode - 1))) } } From 70d64c1d83e940dbae53d1b2d12abe54531dedf7 Mon Sep 17 00:00:00 2001 From: patsonluk Date: Tue, 7 Oct 2025 09:37:07 -0700 Subject: [PATCH 05/14] Step 3 moves only if target is < 98% of source --- solrman/smmodel/model.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/solrman/smmodel/model.go b/solrman/smmodel/model.go index 5107738..2a3214c 100644 --- a/solrman/smmodel/model.go +++ b/solrman/smmodel/model.go @@ -199,8 +199,8 @@ func (m *Model) computeNextMove(immobileCores []bool) *Move { } } - if 9*source.Size < 10*target.Size { - fmt.Printf("Skipping target %s because it's already >=90%% of source %s (source size=%d, target size=%d)\n", target.Name, source.Name, source.Size, target.Size) + if target.Size > int64(float64(source.Size)*0.98) { + fmt.Printf("Skipping step 3 move from %s to %s because target is already >98%% of source\n", source.Name, target.Name) // if the target node is >=90% of the source node, don't bother return nil } From 13d4559c998766664c6a38e4075ad6ce07e0eb29 Mon Sep 17 00:00:00 2001 From: patsonluk Date: Tue, 7 Oct 2025 09:54:49 -0700 Subject: [PATCH 06/14] Fixed comment --- solrman/smmodel/model.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/solrman/smmodel/model.go b/solrman/smmodel/model.go index 2a3214c..eea5799 100644 --- a/solrman/smmodel/model.go +++ b/solrman/smmodel/model.go @@ -201,7 +201,7 @@ func (m *Model) computeNextMove(immobileCores []bool) *Move { if target.Size > int64(float64(source.Size)*0.98) { fmt.Printf("Skipping step 3 move from %s to %s because target is already >98%% of source\n", source.Name, target.Name) - // if the target node is >=90% of the source node, don't bother + // if the target node is > 98% of the source node, don't bother return nil } From 9cbc53b7ea630017b524413d62837bc6a6a95944 Mon Sep 17 00:00:00 2001 From: patsonluk Date: Tue, 7 Oct 2025 10:34:34 -0700 Subject: [PATCH 07/14] Debug for step 3 --- solrman/smmodel/model.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/solrman/smmodel/model.go b/solrman/smmodel/model.go index eea5799..33cf93a 100644 --- a/solrman/smmodel/model.go +++ b/solrman/smmodel/model.go @@ -168,6 +168,7 @@ func (m *Model) computeNextMove(immobileCores []bool) *Move { return nodesBySize[i].Size < nodesBySize[j].Size }) + fmt.Printf("Computing from step 3 move") // Try to move a core from the given node. tryMoveCoreFrom := func(source *Node, force bool) *Move { for _, target := range nodesBySize { @@ -226,6 +227,7 @@ func (m *Model) computeNextMove(immobileCores []bool) *Move { continue } + fmt.Printf("Found good move from step 3 from %s to %s. Source size: %d Target Size: %d on coll %s shard %s\n", source.Name, target.Name, source.Size, target.Size, core.Collection, core.Shard) // Found a good candidate. return &Move{ Core: core, @@ -304,7 +306,7 @@ func (m *Model) computeNextMove(immobileCores []bool) *Move { FromNode: fromNode, ToNode: target, } - fmt.Printf("Found good move %+v\n", move) + fmt.Printf("Found good move from step 2 %+v\n", move) return move } From e22cb88fe2b5e96e87f1acb37428eb8702d9d9e0 Mon Sep 17 00:00:00 2001 From: patsonluk Date: Tue, 7 Oct 2025 11:05:41 -0700 Subject: [PATCH 08/14] Only do one move max from step 3 for now --- solrman/smmodel/model.go | 7 ++++--- solrman/smmodel/move.go | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/solrman/smmodel/model.go b/solrman/smmodel/model.go index 33cf93a..7f40b5b 100644 --- a/solrman/smmodel/model.go +++ b/solrman/smmodel/model.go @@ -168,7 +168,6 @@ func (m *Model) computeNextMove(immobileCores []bool) *Move { return nodesBySize[i].Size < nodesBySize[j].Size }) - fmt.Printf("Computing from step 3 move") // Try to move a core from the given node. tryMoveCoreFrom := func(source *Node, force bool) *Move { for _, target := range nodesBySize { @@ -228,11 +227,12 @@ func (m *Model) computeNextMove(immobileCores []bool) *Move { } fmt.Printf("Found good move from step 3 from %s to %s. Source size: %d Target Size: %d on coll %s shard %s\n", source.Name, target.Name, source.Size, target.Size, core.Collection, core.Shard) - // Found a good candidate. + // Found a good candidate. Only allow one move from step 3 for now to workaround the "back and forth" issue return &Move{ Core: core, FromNode: source, ToNode: target, + lastMove: true, } } } @@ -312,6 +312,7 @@ func (m *Model) computeNextMove(immobileCores []bool) *Move { } } + fmt.Printf("Computing from step 3 move\n") // Step 3: balance nodes next, respecting collection balance. // Take the largest core from the largest node, and move it to the smallest node, provided we don't violate constraints. if len(nodesBySize) > 1 { @@ -332,7 +333,7 @@ func (m *Model) ComputeBestMoves(count int) []Move { immobileCores := make([]bool, len(m.Cores)) // cores that have already moved for i := 0; i < count; i++ { move := curModel.computeNextMove(immobileCores) - if move == nil { + if move == nil || move.lastMove { // no good moves break } diff --git a/solrman/smmodel/move.go b/solrman/smmodel/move.go index bf1108c..e50ab03 100644 --- a/solrman/smmodel/move.go +++ b/solrman/smmodel/move.go @@ -20,6 +20,7 @@ type Move struct { Core *Core FromNode *Node ToNode *Node + lastMove bool } func (m *Move) String() string { From aff73c5434d6b4e19a985e92d388a75d45a665c7 Mon Sep 17 00:00:00 2001 From: patsonluk Date: Tue, 7 Oct 2025 11:16:13 -0700 Subject: [PATCH 09/14] Only do one move max from step 3 for now --- solrman/smmodel/model.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/solrman/smmodel/model.go b/solrman/smmodel/model.go index 7f40b5b..e398285 100644 --- a/solrman/smmodel/model.go +++ b/solrman/smmodel/model.go @@ -333,13 +333,17 @@ func (m *Model) ComputeBestMoves(count int) []Move { immobileCores := make([]bool, len(m.Cores)) // cores that have already moved for i := 0; i < count; i++ { move := curModel.computeNextMove(immobileCores) - if move == nil || move.lastMove { + if move == nil { // no good moves break } immobileCores[move.Core.id] = true moves = append(moves, *move) curModel = curModel.WithMove(*move) + if move.lastMove { + // don't compute more moves + break + } } return moves From a7dfec0f2cfc8bb10d0e1b93c8edc9232e38e8f1 Mon Sep 17 00:00:00 2001 From: patsonluk Date: Tue, 7 Oct 2025 11:48:41 -0700 Subject: [PATCH 10/14] Fixed step 3 causing core imbalance and fight with step 2 --- solrman/smmodel/model.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/solrman/smmodel/model.go b/solrman/smmodel/model.go index e398285..b1802c0 100644 --- a/solrman/smmodel/model.go +++ b/solrman/smmodel/model.go @@ -215,6 +215,11 @@ func (m *Model) computeNextMove(immobileCores []bool) *Move { continue } + //Make sure it would not violate balance per collection ie no nodes will be 2 shards than other after such moves + if coll.balanceInfo.coresPerNode[target.id] >= coll.balanceInfo.coresPerNode[source.id] { + continue + } + // Don't bother moving this core if the target node would become bigger than the source node. if target.Size+core.Size >= source.Size { continue From a600ffa1ff06e3fd3219a0ee41153b4be26e0272 Mon Sep 17 00:00:00 2001 From: patsonluk Date: Tue, 7 Oct 2025 12:12:02 -0700 Subject: [PATCH 11/14] Relaxed step 3 moves as it should no longer conflict with step 2 --- solrman/smmodel/model.go | 5 ----- solrman/smmodel/move.go | 1 - 2 files changed, 6 deletions(-) diff --git a/solrman/smmodel/model.go b/solrman/smmodel/model.go index b1802c0..9924ad3 100644 --- a/solrman/smmodel/model.go +++ b/solrman/smmodel/model.go @@ -237,7 +237,6 @@ func (m *Model) computeNextMove(immobileCores []bool) *Move { Core: core, FromNode: source, ToNode: target, - lastMove: true, } } } @@ -345,10 +344,6 @@ func (m *Model) ComputeBestMoves(count int) []Move { immobileCores[move.Core.id] = true moves = append(moves, *move) curModel = curModel.WithMove(*move) - if move.lastMove { - // don't compute more moves - break - } } return moves diff --git a/solrman/smmodel/move.go b/solrman/smmodel/move.go index e50ab03..bf1108c 100644 --- a/solrman/smmodel/move.go +++ b/solrman/smmodel/move.go @@ -20,7 +20,6 @@ type Move struct { Core *Core FromNode *Node ToNode *Node - lastMove bool } func (m *Move) String() string { From b75097b83a7f9e1477c54d3ce71658ce11ba37fc Mon Sep 17 00:00:00 2001 From: patsonluk Date: Tue, 7 Oct 2025 12:40:10 -0700 Subject: [PATCH 12/14] Step 3 moves only if target is < 99.5% of source --- solrman/smmodel/model.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/solrman/smmodel/model.go b/solrman/smmodel/model.go index 9924ad3..8878af1 100644 --- a/solrman/smmodel/model.go +++ b/solrman/smmodel/model.go @@ -199,9 +199,9 @@ func (m *Model) computeNextMove(immobileCores []bool) *Move { } } - if target.Size > int64(float64(source.Size)*0.98) { - fmt.Printf("Skipping step 3 move from %s to %s because target is already >98%% of source\n", source.Name, target.Name) - // if the target node is > 98% of the source node, don't bother + if target.Size > int64(float64(source.Size)*0.995) { + fmt.Printf("Skipping step 3 move from %s to %s because target is already >99.5%% of source\n", source.Name, target.Name) + // if the target node is > 99.5% of the source node, don't bother return nil } From e3f5c06d6f518f9748eb2d1e59cfb27a6d27a45d Mon Sep 17 00:00:00 2001 From: patsonluk Date: Tue, 7 Oct 2025 13:17:47 -0700 Subject: [PATCH 13/14] Debug for node by size --- solrman/smmodel/model.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/solrman/smmodel/model.go b/solrman/smmodel/model.go index 8878af1..7071ea0 100644 --- a/solrman/smmodel/model.go +++ b/solrman/smmodel/model.go @@ -170,6 +170,11 @@ func (m *Model) computeNextMove(immobileCores []bool) *Move { // Try to move a core from the given node. tryMoveCoreFrom := func(source *Node, force bool) *Move { + fmt.Printf("Nodes by size: \n") + for i, target := range nodesBySize { + fmt.Printf(" #%d: %s %d \n", i, target.Name, target.Size) + } + for _, target := range nodesBySize { if target == source { continue From 5b9ebeef033502f6a008ba3a18c4b39dfe4eca5a Mon Sep 17 00:00:00 2001 From: patsonluk Date: Tue, 7 Oct 2025 14:03:19 -0700 Subject: [PATCH 14/14] Debug shortcircuits in step 3 --- solrman/smmodel/model.go | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/solrman/smmodel/model.go b/solrman/smmodel/model.go index 7071ea0..2139c4e 100644 --- a/solrman/smmodel/model.go +++ b/solrman/smmodel/model.go @@ -170,16 +170,13 @@ func (m *Model) computeNextMove(immobileCores []bool) *Move { // Try to move a core from the given node. tryMoveCoreFrom := func(source *Node, force bool) *Move { - fmt.Printf("Nodes by size: \n") for i, target := range nodesBySize { - fmt.Printf(" #%d: %s %d \n", i, target.Name, target.Size) - } - - for _, target := range nodesBySize { if target == source { continue } + fmt.Printf(" #%d: %s %d \n", i, target.Name, target.Size) + // Move the largest core that doesn't violate constraints. var candidates []*Core for i, c := range m.Cores { @@ -212,27 +209,32 @@ func (m *Model) computeNextMove(immobileCores []bool) *Move { for _, core := range candidates { if target.MaxSize > 0 && core.Size+target.Size > target.MaxSize { + fmt.Printf("Skipping %s as max size would exceed after move core size %d, disk size %d, max size %d", target.Name, target.Size, core.Size, target.MaxSize) continue } // Make sure moving this core won't violate collection balance. coll := m.Collections[core.collectionId] if coll.balanceInfo.coresPerNode[target.id] >= coll.balanceInfo.maxCoresPerNode { + fmt.Printf("Skipping %s as collection %s already has %d cores on it which maxCoresPerNode as %d", target.Name, coll.Name, coll.balanceInfo.coresPerNode[target.id], coll.balanceInfo.maxCoresPerNode) continue } //Make sure it would not violate balance per collection ie no nodes will be 2 shards than other after such moves if coll.balanceInfo.coresPerNode[target.id] >= coll.balanceInfo.coresPerNode[source.id] { + fmt.Printf("Skipping %s as collection %s already has %d cores on source and %d cores on target", target.Name, coll.Name, coll.balanceInfo.coresPerNode[source.id], coll.balanceInfo.coresPerNode[target.id]) continue } // Don't bother moving this core if the target node would become bigger than the source node. if target.Size+core.Size >= source.Size { + fmt.Printf("Skipping %s as after move size on target %d would exceed the size of source %d", target.Name, target.Size+core.Size, source.Size) continue } // If the source is substantially under the maximum size (<10%), only move if the target node is substantially smaller than the source node. // This is to avoid move thrashing in a cluster that is drastically below capacity while nodes are rapidly growing. if source.Size*10 < source.MaxSize && target.Size+5*core.Size >= source.Size { + fmt.Printf("Skipping %s as due to under size", target.Name) continue }