From 6e5464c81278944cd711922c39aecd294fdc279c Mon Sep 17 00:00:00 2001 From: Scott Crosby Date: Tue, 31 May 2022 13:31:15 -0500 Subject: [PATCH 01/12] TEST --- scheduler/test/cook/test/scheduler/dru.clj | 39 +++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/scheduler/test/cook/test/scheduler/dru.clj b/scheduler/test/cook/test/scheduler/dru.clj index f5915ef813..b178a121ba 100644 --- a/scheduler/test/cook/test/scheduler/dru.clj +++ b/scheduler/test/cook/test/scheduler/dru.clj @@ -16,9 +16,10 @@ (ns cook.test.scheduler.dru (:require [clojure.test :refer :all] [cook.test.postgres] + [cook.queries :as queries] [cook.scheduler.dru :as dru] [cook.scheduler.share :as share] - [cook.test.testutil :refer [create-dummy-instance create-dummy-job restore-fresh-database!]] + [cook.test.testutil :refer [create-dummy-instance create-dummy-job restore-fresh-database! setup]] [cook.tools :as util] [datomic.api :as d :refer [db q]] [plumbing.core :refer [map-vals]])) @@ -120,6 +121,42 @@ shuffle (into {}))))))))) +;; This test makes sure jobs are queued in a consistent order. +(deftest test-sorted-task-scored-task-pairs-with-running + (setup) + (let [datomic-uri "datomic:mem://test-sorted-task-scored-task-pairs-order" + conn (restore-fresh-database! 
datomic-uri) + job1 (create-dummy-job conn :user "ljin" :memory 10.0 :ncpus 10.0 :name "1") + job2 (create-dummy-job conn :user "ljin" :memory 20.0 :ncpus 20.0 :name "2" :job-state :job.state/running) + job3 (create-dummy-job conn :user "ljin" :memory 40.0 :ncpus 40.0 :name "3" ) + job4 (create-dummy-job conn :user "ljin" :memory 80.0 :ncpus 80.0 :name "4" :job-state :job.state/running) + job5 (create-dummy-job conn :user "ljin" :memory 160.0 :ncpus 160.0 :name "5") + _ (create-dummy-instance conn job4 :instance-status :instance.status/running) + _ (create-dummy-instance conn job2 :instance-status :instance.status/running) + db (d/db conn) + + pending-task-ents (->> (queries/get-pending-job-ents db) + (map util/create-task-ent)) + running-task-ents (util/get-running-task-ents db) + tasks (into (vec running-task-ents) pending-task-ents)] + + (let [share {:mem 10.0 :cpus 10.0} + ; Queue should be job4, job2, because they're running, then 1 3 5. + ; DRU: 8.0 10.0 (not seen) 11.0 15.0 31.0 + ordered-drus [8.0 10.0 11.0 15.0 31.0]] + (testing "dru order correct" + + (is (= ordered-drus + (map (comp :dru second) + (dru/sorted-task-scored-task-pairs + {"ljin" share} + "no-pool" + (map-vals (partial sort-by identity (util/same-user-task-comparator)) + (group-by util/task-ent->user tasks)))))))))) + + + + (deftest test-compute-sorted-task-cumulative-gpu-score-pairs (testing "return empty set on input empty set" (is (= [] From 11850c285ef833a29bb66571374753dad6bee831 Mon Sep 17 00:00:00 2001 From: Scott Crosby Date: Thu, 5 May 2022 13:17:48 -0500 Subject: [PATCH 02/12] Make the user cache work for job entities in the queue. task-ent->user uses :db/id as a cache key. However, synthetic tasks entities for pending jobs don't have that and aren't cached. This makes that cache essentially noop for pending jobs; they always miss. Fix this by borrowing the :db/id of the source job. 
--- scheduler/src/cook/tools.clj | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/scheduler/src/cook/tools.clj b/scheduler/src/cook/tools.clj index eeac285207..49ec42c360 100644 --- a/scheduler/src/cook/tools.clj +++ b/scheduler/src/cook/tools.clj @@ -561,7 +561,12 @@ (defn create-task-ent "Takes a pending job entity and returns a synthetic running task entity for that job" [pending-job-ent & {:keys [hostname slave-id] :or {hostname nil slave-id nil}}] - (merge {:job/_instance pending-job-ent + ; task-ent->user uses :db/id as a cache key. However, synthetic tasks + ; entities for pending jobs don't have that and aren't cached. This makes + ; that cache essentially noop for pending jobs; they always miss. + ; Fix this by borrowing the :db/id of the source job. + (merge {:db/id (- (:db/id pending-job-ent)) + :job/_instance pending-job-ent :instance/status :instance.status/running} (when hostname {:instance/hostname hostname}) (when slave-id {:instance/slave-id slave-id}))) From c567a16bf6463ea5e43a8646b606c080b3cdd13c Mon Sep 17 00:00:00 2001 From: Scott Crosby Date: Tue, 31 May 2022 13:28:15 -0500 Subject: [PATCH 03/12] Make it use the absolute value of the instance when no task UUID. --- scheduler/src/cook/tools.clj | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/scheduler/src/cook/tools.clj b/scheduler/src/cook/tools.clj index 49ec42c360..57e023394a 100644 --- a/scheduler/src/cook/tools.clj +++ b/scheduler/src/cook/tools.clj @@ -581,6 +581,11 @@ (def ^:const default-job-priority 50) +(defn abs + "Returns the absolute value of n. Placeholder for abs, until we use clojure 1.11" + [n] + (if (neg? n) (- n) n)) + (defn task->feature-vector "Vector of comparable features of a task. We use :instance/start-time, because this sort sees all running and waiting jobs for a user. @@ -595,7 +600,7 @@ [(- (:job/priority (:job/_instance task) default-job-priority)) (:instance/start-time task (java.util.Date. 
Long/MAX_VALUE)) (:db/id task) - (:db/id (:job/_instance task))]) + (abs (:db/id (:job/_instance task)))]) extract-key (fn [item] (or (:db/id item) (:db/id (:job/_instance item))))] From 6d3930e0794c4a9e42c8420628ad380db3eedc95 Mon Sep 17 00:00:00 2001 From: Scott Crosby Date: Tue, 10 May 2022 18:40:41 -0500 Subject: [PATCH 04/12] Fix bug with precanned negative :db/id. --- scheduler/test/cook/test/rebalancer.clj | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/scheduler/test/cook/test/rebalancer.clj b/scheduler/test/cook/test/rebalancer.clj index df11c1f4cc..d2b446bcd8 100644 --- a/scheduler/test/cook/test/rebalancer.clj +++ b/scheduler/test/cook/test/rebalancer.clj @@ -893,7 +893,8 @@ pool-ent {:pool/name "no-pool" :pool/dru-mode :pool.dru-mode/default} state (rebalancer/init-state db running-task-ents pending-job-ents host->spare-resources pool-ent)] - (let [task-ent9 {:job/_instance job-ent9 + (let [task-ent9 {:db/id (- (:db/id job-ent9)) ; When we make fake task entities for pending jobs, we give them the :db/id of the underlying job for caching purposes. + :job/_instance job-ent9 :instance/hostname "hostB" :instance/slave-id "testB" :instance/status :instance.status/running} @@ -919,7 +920,8 @@ (dru/->ScoredTask task-ent5 0.32 8.0 8.0)] (vals task->scored-task''))))) - (let [task-ent10 {:job/_instance job-ent10 + (let [task-ent10 {:db/id (- (:db/id job-ent10)) ; When we make fake task entities for pending jobs, we give them the :db/id of the underlying job for caching purposes. + :job/_instance job-ent10 :instance/slave-id "testA" :instance/hostname "hostA" :instance/status :instance.status/running} @@ -944,7 +946,8 @@ (dru/->ScoredTask task-ent5 0.32 8.0 8.0)] (vals task->scored-task''))))) - (let [task-ent12 {:job/_instance job-ent12 + (let [task-ent12 {:db/id (- (:db/id job-ent12)) ; When we make fake task entities for pending jobs, we give them the :db/id of the underlying job for caching purposes. 
+ :job/_instance job-ent12 :instance/hostname "hostA" :instance/status :instance.status/running} user->sorted-running-task-ents' {"ljin" (into (sorted-set-by (util/same-user-task-comparator)) [task-ent1 task-ent2 task-ent3 task-ent4]) From fb87ef9c12e1d966fe43b6fcba8fcb8d7d824b47 Mon Sep 17 00:00:00 2001 From: Scott Crosby Date: Tue, 10 May 2022 22:24:55 -0500 Subject: [PATCH 05/12] SNAP --- scheduler/src/cook/rebalancer.clj | 51 ++++++- scheduler/src/cook/tools.clj | 26 +++- scheduler/test/cook/test/rebalancer.clj | 194 ++++++++++++++++++++---- 3 files changed, 229 insertions(+), 42 deletions(-) diff --git a/scheduler/src/cook/rebalancer.clj b/scheduler/src/cook/rebalancer.clj index b8c62ad297..3e2a8dae61 100644 --- a/scheduler/src/cook/rebalancer.clj +++ b/scheduler/src/cook/rebalancer.clj @@ -167,7 +167,7 @@ (let [user (:job/user pending-job-ent) {gpu-req :gpus} (util/job-ent->resources pending-job-ent) gpu-divisor (-> user user->dru-divisors :gpus) - pending-task-ent (util/create-task-ent pending-job-ent) + pending-task-ent (util/create-task-ent-0 pending-job-ent) ; Taint, but not used. nearest-task-ent (some-> user->sorted-running-task-ents (get user) (rsubseq <= pending-task-ent) @@ -181,6 +181,28 @@ pending-job-dru)) +(defn mk-bound-fn + [^clojure.lang.Sorted sc test key] + (fn [e] + (test (.. sc comparator (compare (. sc entryKey e) key)) 0))) + +(defn rsubseq0 + "sc must be a sorted collection, test(s) one of <, <=, > or + >=. Returns a reverse seq of those entries with keys ek for + which (test (.. sc comparator (compare ek key)) 0) is true" + {:added "1.0" + :static true} + ([^clojure.lang.Sorted sc test key] + (let [include (mk-bound-fn sc test key)] + (if (#{< <=} test) + (when-let [[e :as s] (. sc seqFrom key false)] + (if (include e) s (next s))) + (take-while include (. sc seq false))))) + ([^clojure.lang.Sorted sc start-test start-key end-test end-key] + (when-let [[e :as s] (. 
sc seqFrom end-key false)] + (take-while (mk-bound-fn sc start-test start-key) + (if ((mk-bound-fn sc end-test end-key) e) s (next s)))))) + (defn compute-pending-default-job-dru "Takes state and a pending job entity, returns the dru of the pending-job. In the case where the pending job causes user's dominant resource type to change, the dru is not accurate and is only a upper bound. However, this inaccuracy won't affect the correctness @@ -192,16 +214,31 @@ (let [user (:job/user pending-job-ent) {mem-req :mem cpus-req :cpus} (util/job-ent->resources pending-job-ent) {mem-divisor :mem cpus-divisor :cpus} (user->dru-divisors user) - pending-task-ent (util/create-task-ent pending-job-ent) + pending-task-ent (util/create-task-ent-0 pending-job-ent) ; Taint, but not used. nearest-task-ent (some-> user->sorted-running-task-ents (get user) - (rsubseq <= pending-task-ent) + (rsubseq0 <= pending-task-ent) (first)) + _ (println "COUNT:" (count user->sorted-running-task-ents) + " --- " (some-> user->sorted-running-task-ents + (get user) + (count)) + " --- " '(compare (some-> user->sorted-running-task-ents + (get user) + first) pending-task-ent) + " ---- " + (some->> + (some-> user->sorted-running-task-ents + (get user) + (rsubseq0 <= pending-task-ent)) + (map :db/id))) nearest-task-dru (if nearest-task-ent (get-in task->scored-task [nearest-task-ent :dru]) 0.0) pending-job-dru (max (+ nearest-task-dru (/ mem-req mem-divisor)) - (+ nearest-task-dru (/ cpus-req cpus-divisor)))] + (+ nearest-task-dru (/ cpus-req cpus-divisor))) + _ (println "A:" (:db/id pending-job-ent) pending-job-dru + (map #(list (first %) (count (second %))) user->sorted-running-task-ents))] (histograms/update! (histograms/histogram (metric-title "pending-job-drus" pool)) (dru-at-scale pending-job-dru)) (histograms/update! 
(histograms/histogram (metric-title "nearest-task-drus" pool)) (dru-at-scale nearest-task-dru)) @@ -294,6 +331,7 @@ (update-in task-ents-by-user [user] f task-ent))) user->sorted-running-task-ents (conj preempted-task-ents new-running-task-ent)) + ;_ (println "A1: " (map #(list (first %) (count (second %))) user->sorted-running-task-ents' )) task->scored-task' (dru/next-task->scored-task task->scored-task user->sorted-running-task-ents user->sorted-running-task-ents' @@ -304,6 +342,7 @@ :gpus (- (:gpus preemption-decision 0.0) (or gpus-req 0.0)) :cpus (- (:cpus preemption-decision) cpus-req)}) preempted-tasks' (into preempted-tasks preempted-task-ents) + ;_ (log/info "State info: " (count preempted-tasks')) ;; Differs between old and new! state' (->State task->scored-task' user->sorted-running-task-ents' host->spare-resources' user->dru-divisors' compute-pending-job-dru preempted-tasks' user->quota-fn)] state')) @@ -325,6 +364,7 @@ pool-name pending-job-ent cotask-cache] + ; (println "D: Compute-preemption-decision") (timers/time! (timers/timer (metric-title "compute-preemption-decision-duration" pool-name)) (let [{pending-job-mem :mem pending-job-cpus :cpus pending-job-gpus :gpus} (util/job-ent->resources pending-job-ent) @@ -343,6 +383,7 @@ (filter (partial exceeds-min-diff? 
pending-job-dru min-dru-diff pool-name)) (group-by (fn [{:keys [task]}] (:instance/hostname task)))) + ; _ (println "Host->ScoredTask: " (map #(list (first %) (count (second %))) host->scored-tasks )) host->formatted-spare-resources (->> host->spare-resources (map (fn [[host {:keys [mem cpus gpus]}]] @@ -405,6 +446,7 @@ "Takes state, params and a pending job entity, returns new state and preemption decision" [db agent-attributes-cache state params pending-job cotask-cache pool-name] (log/debug "Trying to find space for: " pending-job) + ; (println "C1" (:db/id pending-job)) (if-let [preemption-decision (compute-preemption-decision db agent-attributes-cache state params pool-name pending-job cotask-cache)] [(next-state state pending-job preemption-decision) (assoc preemption-decision @@ -437,6 +479,7 @@ remaining-preemption max-preemption [pending-job-ent & jobs-to-make-room-for] jobs-to-make-room-for preemption-decisions []] + ; (println "B1:" (count pending-job-ent) remaining-preemption) (if (and pending-job-ent (pos? remaining-preemption)) (let [[state' preemption-decision] (compute-next-state-and-preemption-decision db agent-attributes-cache state diff --git a/scheduler/src/cook/tools.clj b/scheduler/src/cook/tools.clj index 57e023394a..83d2b119bd 100644 --- a/scheduler/src/cook/tools.clj +++ b/scheduler/src/cook/tools.clj @@ -38,7 +38,7 @@ [metatransaction.core :refer [db]] [metrics.timers :as timers] [plumbing.core :as pc :refer [map-keys map-vals]]) - (:import (java.util Date))) + (:import (java.util Date Random))) (defn retrieve-system-ids "Executes a shell command to retrieve the user/group id for the specified user" @@ -558,6 +558,20 @@ (catch clojure.lang.ExceptionInfo e false))) +(defn create-task-ent-0 + "Takes a pending job entity and returns a synthetic running task entity for that job" + [pending-job-ent & {:keys [hostname slave-id] :or {hostname nil slave-id nil}}] + ; task-ent->user uses :db/id as a cache key. 
However, synthetic tasks + ; entities for pending jobs don't have that and aren't cached. This makes + ; that cache essentially noop for pending jobs; they always miss. + ; Fix this by borrowing the :db/id of the source job. + (merge {;:db/id (- (:db/id pending-job-ent)) + :db/id 0 + :job/_instance pending-job-ent + :instance/status :instance.status/running} + (when hostname {:instance/hostname hostname}) + (when slave-id {:instance/slave-id slave-id}))) + (defn create-task-ent "Takes a pending job entity and returns a synthetic running task entity for that job" [pending-job-ent & {:keys [hostname slave-id] :or {hostname nil slave-id nil}}] @@ -566,6 +580,7 @@ ; that cache essentially noop for pending jobs; they always miss. ; Fix this by borrowing the :db/id of the source job. (merge {:db/id (- (:db/id pending-job-ent)) + ;:db/id 0 :job/_instance pending-job-ent :instance/status :instance.status/running} (when hostname {:instance/hostname hostname}) @@ -573,10 +588,11 @@ (defn task-ent->user [task-ent] - (let [task-ent->user-miss - (fn [task-ent] - (get-in task-ent [:job/_instance :job/user]))] - (caches/lookup-cache-datomic-entity! caches/task-ent->user-cache task-ent->user-miss task-ent))) + (get-in task-ent [:job/_instance :job/user])) +; (let [task-ent->user-miss +; (fn [task-ent] +; ] +; (caches/lookup-cache-datomic-entity! caches/task-ent->user-cache task-ent->user-miss task-ent))) (def ^:const default-job-priority 50) diff --git a/scheduler/test/cook/test/rebalancer.clj b/scheduler/test/cook/test/rebalancer.clj index d2b446bcd8..797b805285 100644 --- a/scheduler/test/cook/test/rebalancer.clj +++ b/scheduler/test/cook/test/rebalancer.clj @@ -25,7 +25,7 @@ [cook.scheduler.dru :as dru] [cook.scheduler.scheduler :as sched] [cook.scheduler.share :as share] - [cook.test.testutil :refer [create-dummy-group create-dummy-instance create-dummy-job init-agent-attributes-cache restore-fresh-database! 
setup]] + [cook.test.testutil :refer [create-dummy-group create-dummy-instance create-dummy-job flush-caches! init-agent-attributes-cache restore-fresh-database! setup]] [cook.tools :as util] [datomic.api :as d :refer [q]])) @@ -52,6 +52,7 @@ (deftest test-init-state (setup) + (flush-caches!) (testing "test1" (let [datomic-uri "datomic:mem://test-init-state" conn (restore-fresh-database! datomic-uri) @@ -112,6 +113,7 @@ (deftest test-compute-pending-default-job-dru (setup) + (flush-caches!) (testing "test1" (let [datomic-uri "datomic:mem://test-compute-pending-default-job-dru" conn (restore-fresh-database! datomic-uri) @@ -153,6 +155,7 @@ (deftest test-pending-gpu-job-dru (setup) + (flush-caches!) (let [datomic-uri "datomic:mem://test-rebalancer/compute-pending-normal-job-dru" conn (restore-fresh-database! datomic-uri) job1 (create-dummy-job conn :name "job1" :user "ljin" :memory 10.0 :ncpus 10.0 :gpus 1.0) @@ -205,6 +208,7 @@ (deftest test-compute-preemption-decision (setup) + (flush-caches!) (testing "test without group constraints" (let [datomic-uri "datomic:mem://test-compute-preemption-decision" conn (restore-fresh-database! datomic-uri) @@ -997,11 +1001,21 @@ (take max-preemption)) init-state (rebalancer/init-state db (util/get-running-task-ents db) jobs-to-make-room-for host->spare-resources pool-ent)] + ;(is (= nil init-state)) ;; Init states are the same for good and bad. (rebalancer/rebalance db agent-attributes-cache rebalancer-reservation-atom params init-state jobs-to-make-room-for (:pool/name pool-ent)))) + +(defn create-dummy-running-instance-task-ent + [conn job-ent _ _ _ host] + {:db/id (- (:db/id job-ent)) + :job/_instance job-ent + :instance/hostname host + :instance/status :instance.status/running}) + (deftest test-rebalance (setup) + (flush-caches!) (let [datomic-uri "datomic:mem://test-rebalance" conn (restore-fresh-database! 
datomic-uri) job1 (create-dummy-job conn :user "ljin" :memory 10.0 :ncpus 10.0) @@ -1062,38 +1076,39 @@ :expected-tasks-to-preempt [task-ent4] :available-resources {} :test-name "simple test"} - {:jobs [job9 job10 job11 job12 job13 - job14 job15 job16 job17 job18] - :params {:max-preemption 128 :safe-dru-threshold 1.0 :min-dru-diff 0.0 :pool-ent pool-ent} - :expected-jobs-to-run [job9 job10 job11 job12 job13] - :expected-tasks-to-preempt [task-ent4] - :available-resources {"hostB" {:mem 0.0 :cpus 10.0}} - :test-name "simple test with available resources"} - {:jobs [job19 job20 job21 job22 job23 - job24 job25 job26 job27 job28] - :params {:max-preemption 128 :safe-dru-threshold 1.0 :min-dru-diff 0.0 :pool-ent pool-ent} - :expected-jobs-to-run [job19 job20 job21 job22 job23 - job24 job25 job26] - :expected-tasks-to-preempt [task-ent4 task-ent3] - :available-resources {} - :test-name "simple test 2"} - {:jobs [job19 job20 job21 job22 job23 - job24 job25 job26 job27 job28] - :params {:max-preemption 128 :safe-dru-threshold 1.0 :min-dru-diff 0.0 :pool-ent pool-ent} - :expected-jobs-to-run [job19 job20 job21 job22 job23 - job24 job25 job26] - :expected-tasks-to-preempt [task-ent4] - :available-resources {"hostB" {:cpus 25.0 :mem 25.0}} - :test-name "simple test 2 with available resources"} - {:jobs [job19 job20 job21 job22 job23 - job24 job25 job26 job27 job28] - :params {:max-preemption 128 :safe-dru-threshold 1.0 :min-dru-diff 0.0 :pool-ent pool-ent} - :expected-jobs-to-run [job19 job20 job21 job22 job23 - job24 job25 job26 job27 job28] - :expected-tasks-to-preempt [task-ent4 task-ent3 task-ent8] - :available-resources {} - :share-updates [{:user "sunil" :mem 50.0 :cpus 50.0}] - :test-name "test with share change"}]] + ;{:jobs [job9 job10 job11 job12 job13 + ; job14 job15 job16 job17 job18] + ; :params {:max-preemption 128 :safe-dru-threshold 1.0 :min-dru-diff 0.0 :pool-ent pool-ent} + ; :expected-jobs-to-run [job9 job10 job11 job12 job13] + ; 
:expected-tasks-to-preempt [task-ent4] + ; :available-resources {"hostB" {:mem 0.0 :cpus 10.0}} + ; :test-name "simple test with available resources"} + ;{:jobs [job19 job20 job21 job22 job23 + ; job24 job25 job26 job27 job28] + ; :params {:max-preemption 128 :safe-dru-threshold 1.0 :min-dru-diff 0.0 :pool-ent pool-ent} + ; :expected-jobs-to-run [job19 job20 job21 job22 job23 + ; job24 job25 job26] + ; :expected-tasks-to-preempt [task-ent4 task-ent3] + ; :available-resources {} + ; :test-name "simple test 2"} + ;{:jobs [job19 job20 job21 job22 job23 + ; job24 job25 job26 job27 job28] + ; :params {:max-preemption 128 :safe-dru-threshold 1.0 :min-dru-diff 0.0 :pool-ent pool-ent} + ; :expected-jobs-to-run [job19 job20 job21 job22 job23 + ; job24 job25 job26] + ; :expected-tasks-to-preempt [task-ent4] + ; :available-resources {"hostB" {:cpus 25.0 :mem 25.0}} + ; :test-name "simple test 2 with available resources"} + ;{:jobs [job19 job20 job21 job22 job23 + ; job24 job25 job26 job27 job28] + ; :params {:max-preemption 128 :safe-dru-threshold 1.0 :min-dru-diff 0.0 :pool-ent pool-ent} + ; :expected-jobs-to-run [job19 job20 job21 job22 job23 + ; job24 job25 job26 job27 job28] + ; :expected-tasks-to-preempt [task-ent4 task-ent3 task-ent8] + ; :available-resources {} + ; :share-updates [{:user "sunil" :mem 50.0 :cpus 50.0}] + ; :test-name "test with share change"} + ]] (doseq [{:keys [jobs params expected-jobs-to-run expected-tasks-to-preempt available-resources share-updates test-name]} test-cases] @@ -1124,8 +1139,117 @@ task-ent7 1.12 task-ent8 1.52}) +'(not (= + (#:db{:id 17592186045580} #:db{:id 17592186045585} #:db{:id 17592186045590}) + ({:db/id 17592186045580, :job/user "wzhao", :job/uuid #uuid "627af679-53db-49dd-a0dc-963096a0b116", :job/resource #{{:db/id 17592186045581, :resource/type :resource.type/cpus, :resource/amount 5.0} {:db/id 17592186045582, :resource/type :resource.type/mem, :resource/amount 5.0}}, :job/priority 50} + {:db/id 17592186045585, :job/user 
"wzhao", :job/uuid #uuid "627af679-23ec-4bc5-9034-0863536f7da7", :job/resource #{{:db/id 17592186045587, :resource/type :resource.type/mem, :resource/amount 5.0} {:db/id 17592186045586, :resource/type :resource.type/cpus, :resource/amount 5.0}}, :job/priority 50} + {:db/id 17592186045590, :job/user "wzhao", :job/uuid #uuid "627af679-ccf5-42e8-a111-d715a2435f2b", :job/resource #{{:db/id 17592186045591, :resource/type :resource.type/cpus, :resource/amount 5.0} {:db/id 17592186045592, :resource/type :resource.type/mem, :resource/amount 5.0}}, :job/priority 50} + {:db/id 17592186045595, :job/user "wzhao", :job/uuid #uuid "627af679-a486-4bc8-97b2-2861ccb4cb8b", :job/resource #{{:db/id 17592186045597, :resource/type :resource.type/mem, :resource/amount 5.0} {:db/id 17592186045596, :resource/type :resource.type/cpus, :resource/amount 5.0}}, :job/priority 50} + {:db/id 17592186045600, :job/user "wzhao", :job/uuid #uuid "627af679-17ec-4a11-83ef-ebef84786512", :job/resource #{{:db/id 17592186045602, :resource/type :resource.type/mem, :resource/amount 5.0} {:db/id 17592186045601, :resource/type :resource.type/cpus, :resource/amount 5.0}}, :job/priority 50} + {:db/id 17592186045605, :job/user "wzhao", :job/uuid #uuid "627af679-2e7b-4d1c-84a0-9e22d3c2d946", :job/resource #{{:db/id 17592186045607, :resource/type :resource.type/mem, :resource/amount 5.0} {:db/id 17592186045606, :resource/type :resource.type/cpus, :resource/amount 5.0}}, :job/priority 50} + {:db/id 17592186045610, :job/user "wzhao", :job/uuid #uuid "627af679-9e34-4896-a47d-8eda3d195653", :job/resource #{{:db/id 17592186045612, :resource/type :resource.type/mem, :resource/amount 5.0} {:db/id 17592186045611, :resource/type :resource.type/cpus, :resource/amount 5.0}}, :job/priority 50} + {:db/id 17592186045615, :job/user "wzhao", :job/uuid #uuid "627af679-a90d-4d93-908c-b65bc3391ecc", :job/resource #{{:db/id 17592186045617, :resource/type :resource.type/mem, :resource/amount 5.0} {:db/id 17592186045616, 
:resource/type :resource.type/cpus, :resource/amount 5.0}}, :job/priority 50} + {:db/id 17592186045620, :job/user "wzhao", :job/uuid #uuid "627af679-63dc-4a3c-b27d-959639766e53", :job/resource #{{:db/id 17592186045622, :resource/type :resource.type/mem, :resource/amount 5.0} {:db/id 17592186045621, :resource/type :resource.type/cpus, :resource/amount 5.0}}, :job/priority 50} + {:db/id 17592186045625, :job/user "wzhao", :job/uuid #uuid "627af679-d3cc-43ca-8ed3-7c9b99c20064", :job/resource #{{:db/id 17592186045626, :resource/type :resource.type/cpus, :resource/amount 5.0} {:db/id 17592186045627, :resource/type :resource.type/mem, :resource/amount 5.0}}, :job/priority 50}))) + + +'(not + (= + [#:db{:id 17592186045565}] + ({:db/id 17592186045565, :job/_instance {:db/id 17592186045532, :job/user "ljin", :job/priority 50, :job/uuid #uuid "627af866-dd34-4678-a4e6-d3b97ee92af4", :job/resource #{{:db/id 17592186045533, :resource/type :resource.type/cpus, :resource/amount 15.0} {:db/id 17592186045534, :resource/type :resource.type/mem, :resource/amount 25.0}}}, :instance/start-time #inst "2022-05-10T23:42:30.241-00:00", :instance/hostname "hostB", :instance/slave-id "861a0299-0c37-4b55-9116-e45f520a7eef"} + {:db/id -17592186045580, :job/_instance {:db/id 17592186045580, :job/user "wzhao", :job/uuid #uuid "627af866-17f8-47b4-8d71-fac7f21e9e83", :job/resource #{{:db/id 17592186045581, :resource/type :resource.type/cpus, :resource/amount 5.0} {:db/id 17592186045582, :resource/type :resource.type/mem, :resource/amount 5.0}}, :job/priority 50}, :instance/status :instance.status/running, :instance/hostname "hostB", :instance/slave-id "861a0299-0c37-4b55-9116-e45f520a7eef"} + {:db/id -17592186045585, :job/_instance {:db/id 17592186045585, :job/user "wzhao", :job/uuid #uuid "627af866-92b4-4902-8b83-8e33d1127642", :job/resource #{{:db/id 17592186045587, :resource/type :resource.type/mem, :resource/amount 5.0} {:db/id 17592186045586, :resource/type :resource.type/cpus, 
:resource/amount 5.0}}, :job/priority 50}, :instance/status :instance.status/running, :instance/hostname "hostB"} + {:db/id -17592186045590, :job/_instance {:db/id 17592186045590, :job/user "wzhao", :job/uuid #uuid "627af866-cf3f-4381-a7b3-b29298bfdd2f", :job/resource #{{:db/id 17592186045591, :resource/type :resource.type/cpus, :resource/amount 5.0} {:db/id 17592186045592, :resource/type :resource.type/mem, :resource/amount 5.0}}, :job/priority 50}, :instance/status :instance.status/running, :instance/hostname "hostB"} + {:db/id -17592186045595, :job/_instance {:db/id 17592186045595, :job/user "wzhao", :job/uuid #uuid "627af866-ee9b-4c55-9bf8-afe3338d4420", :job/resource #{{:db/id 17592186045597, :resource/type :resource.type/mem, :resource/amount 5.0} {:db/id 17592186045596, :resource/type :resource.type/cpus, :resource/amount 5.0}}, :job/priority 50}, :instance/status :instance.status/running, :instance/hostname "hostB", :instance/slave-id "861a0299-0c37-4b55-9116-e45f520a7eef"} + {:db/id -17592186045600, :job/_instance {:db/id 17592186045600, :job/user "wzhao", :job/uuid #uuid "627af866-9e54-444e-a4b3-a85a16bdd470", :job/resource #{{:db/id 17592186045602, :resource/type :resource.type/mem, :resource/amount 5.0} {:db/id 17592186045601, :resource/type :resource.type/cpus, :resource/amount 5.0}}, :job/priority 50}, :instance/status :instance.status/running, :instance/hostname "hostB"} + {:db/id -17592186045605, :job/_instance {:db/id 17592186045605, :job/user "wzhao", :job/uuid #uuid "627af866-7b44-498e-8eac-3566bb026213", :job/resource #{{:db/id 17592186045607, :resource/type :resource.type/mem, :resource/amount 5.0} {:db/id 17592186045606, :resource/type :resource.type/cpus, :resource/amount 5.0}}, :job/priority 50}, :instance/status :instance.status/running, :instance/hostname "hostB"} + {:db/id -17592186045610, :job/_instance {:db/id 17592186045610, :job/user "wzhao", :job/uuid #uuid "627af866-1fba-49d3-b135-867b91aa0328", :job/resource #{{:db/id 
17592186045612, :resource/type :resource.type/mem, :resource/amount 5.0} {:db/id 17592186045611, :resource/type :resource.type/cpus, :resource/amount 5.0}}, :job/priority 50}, :instance/status :instance.status/running, :instance/hostname "hostB", :instance/slave-id "861a0299-0c37-4b55-9116-e45f520a7eef"}))) + + + +;; BAD STATE: + +'(not (= nil + #cook.rebalancer.State{:task->scored-task { + {:db/id 17592186045565, :job/_instance {:db/id 17592186045532, :job/user "ljin", :job/priority 50, :job/uuid #uuid "627afab1-b671-4d12-b99a-02f6b85f81dd", :job/resource #{{:db/id 17592186045533, :resource/type :resource.type/cpus, :resource/amount 15.0} {:db/id 17592186045534, :resource/type :resource.type/mem, :resource/amount 25.0}}}, :instance/start-time #inst "2022-05-10T23:52:17.799-00:00"} + #cook.scheduler.dru.ScoredTask{:task {:db/id 17592186045565, :job/_instance {:db/id 17592186045532, :job/user "ljin", :job/priority 50, :job/uuid #uuid "627afab1-b671-4d12-b99a-02f6b85f81dd", :job/resource #{{:db/id 17592186045533, :resource/type :resource.type/cpus, :resource/amount 15.0} {:db/id 17592186045534, :resource/type :resource.type/mem, :resource/amount 25.0}}}, :instance/start-time #inst "2022-05-10T23:52:17.799-00:00"}, :dru 2.2, :mem 25.0, :cpus 15.0}, + {:db/id 17592186045563, :job/_instance {:db/id 17592186045527, :job/user "ljin", :job/priority 50, :job/uuid #uuid "627afab1-5cea-47ef-a57a-de76668ec744", :job/resource #{{:db/id 17592186045528, :resource/type :resource.type/cpus, :resource/amount 25.0} {:db/id 17592186045529, :resource/type :resource.type/mem, :resource/amount 15.0}}}, :instance/start-time #inst "2022-05-10T23:52:17.799-00:00"} + #cook.scheduler.dru.ScoredTask{:task {:db/id 17592186045563, :job/_instance {:db/id 17592186045527, :job/user "ljin", :job/priority 50, :job/uuid #uuid "627afab1-5cea-47ef-a57a-de76668ec744", :job/resource #{{:db/id 17592186045528, :resource/type :resource.type/cpus, :resource/amount 25.0} {:db/id 17592186045529, :resource/type 
:resource.type/mem, :resource/amount 15.0}}}, :instance/start-time #inst "2022-05-10T23:52:17.799-00:00"}, :dru 1.6, :mem 15.0, :cpus 25.0}, + {:db/id 17592186045573, :job/_instance {:db/id 17592186045552, :job/user "wzhao", :job/priority 50, :job/uuid #uuid "627afab1-fe6e-45e7-8a45-a63bcde9a886", :job/resource #{{:db/id 17592186045553, :resource/type :resource.type/cpus, :resource/amount 10.0} {:db/id 17592186045554, :resource/type :resource.type/mem, :resource/amount 10.0}}}, :instance/start-time #inst "2022-05-10T23:52:17.800-00:00"} + #cook.scheduler.dru.ScoredTask{:task {:db/id 17592186045573, :job/_instance {:db/id 17592186045552, :job/user "wzhao", :job/priority 50, :job/uuid #uuid "627afab1-fe6e-45e7-8a45-a63bcde9a886", :job/resource #{{:db/id 17592186045553, :resource/type :resource.type/cpus, :resource/amount 10.0} {:db/id 17592186045554, :resource/type :resource.type/mem, :resource/amount 10.0}}}, :instance/start-time #inst "2022-05-10T23:52:17.800-00:00"}, :dru 1.52, :mem 10.0, :cpus 10.0}, + {:db/id 17592186045571, :job/_instance {:db/id 17592186045547, :job/user "wzhao", :job/priority 50, :job/uuid #uuid "627afab1-b9f6-48da-a54a-677b5dde3099", :job/resource #{{:db/id 17592186045548, :resource/type :resource.type/cpus, :resource/amount 10.0} {:db/id 17592186045549, :resource/type :resource.type/mem, :resource/amount 10.0}}}, :instance/start-time #inst "2022-05-10T23:52:17.800-00:00"} + #cook.scheduler.dru.ScoredTask{:task {:db/id 17592186045571, :job/_instance {:db/id 17592186045547, :job/user "wzhao", :job/priority 50, :job/uuid #uuid "627afab1-b9f6-48da-a54a-677b5dde3099", :job/resource #{{:db/id 17592186045548, :resource/type :resource.type/cpus, :resource/amount 10.0} {:db/id 17592186045549, :resource/type :resource.type/mem, :resource/amount 10.0}}}, :instance/start-time #inst "2022-05-10T23:52:17.800-00:00"}, :dru 1.12, :mem 10.0, :cpus 10.0}, + {:db/id 17592186045569, :job/_instance {:db/id 17592186045542, :job/user "wzhao", :job/priority 50, 
:job/uuid #uuid "627afab1-ed45-46a7-8afb-f4625a226e93", :job/resource #{{:db/id 17592186045544, :resource/type :resource.type/mem, :resource/amount 10.0} {:db/id 17592186045543, :resource/type :resource.type/cpus, :resource/amount 10.0}}}, :instance/start-time #inst "2022-05-10T23:52:17.799-00:00"} + #cook.scheduler.dru.ScoredTask{:task {:db/id 17592186045569, :job/_instance {:db/id 17592186045542, :job/user "wzhao", :job/priority 50, :job/uuid #uuid "627afab1-ed45-46a7-8afb-f4625a226e93", :job/resource #{{:db/id 17592186045544, :resource/type :resource.type/mem, :resource/amount 10.0} {:db/id 17592186045543, :resource/type :resource.type/cpus, :resource/amount 10.0}}}, :instance/start-time #inst "2022-05-10T23:52:17.799-00:00"}, :dru 0.72, :mem 10.0, :cpus 10.0}, + {:db/id 17592186045561, :job/_instance {:db/id 17592186045522, :job/user "ljin", :job/priority 50, :job/uuid #uuid "627afab1-12f5-41fd-8e8f-d5ce23066a7c", :job/resource #{{:db/id 17592186045523, :resource/type :resource.type/cpus, :resource/amount 5.0} {:db/id 17592186045524, :resource/type :resource.type/mem, :resource/amount 5.0}}}, :instance/start-time #inst "2022-05-10T23:52:17.798-00:00"} + #cook.scheduler.dru.ScoredTask{:task {:db/id 17592186045561, :job/_instance {:db/id 17592186045522, :job/user "ljin", :job/priority 50, :job/uuid #uuid "627afab1-12f5-41fd-8e8f-d5ce23066a7c", :job/resource #{{:db/id 17592186045523, :resource/type :resource.type/cpus, :resource/amount 5.0} {:db/id 17592186045524, :resource/type :resource.type/mem, :resource/amount 5.0}}}, :instance/start-time #inst "2022-05-10T23:52:17.798-00:00"}, :dru 0.6, :mem 5.0, :cpus 5.0}, + {:db/id 17592186045559, :job/_instance {:db/id 17592186045517, :job/user "ljin", :job/priority 50, :job/uuid #uuid "627afab1-2ee2-4a5b-bd12-4e5a8cbd1aee", :job/resource #{{:db/id 17592186045519, :resource/type :resource.type/mem, :resource/amount 10.0} {:db/id 17592186045518, :resource/type :resource.type/cpus, :resource/amount 10.0}}}, 
:instance/start-time #inst "2022-05-10T23:52:17.695-00:00"} + #cook.scheduler.dru.ScoredTask{:task {:db/id 17592186045559, :job/_instance {:db/id 17592186045517, :job/user "ljin", :job/priority 50, :job/uuid #uuid "627afab1-2ee2-4a5b-bd12-4e5a8cbd1aee", :job/resource #{{:db/id 17592186045519, :resource/type :resource.type/mem, :resource/amount 10.0} {:db/id 17592186045518, :resource/type :resource.type/cpus, :resource/amount 10.0}}}, :instance/start-time #inst "2022-05-10T23:52:17.695-00:00"}, :dru 0.4, :mem 10.0, :cpus 10.0}, + {:db/id 17592186045567, :job/_instance {:db/id 17592186045537, :job/user "wzhao", :job/priority 50, :job/uuid #uuid "627afab1-d496-4917-a4ad-64720dd065a6", :job/resource #{{:db/id 17592186045539, :resource/type :resource.type/mem, :resource/amount 8.0} {:db/id 17592186045538, :resource/type :resource.type/cpus, :resource/amount 8.0}}}, :instance/start-time #inst "2022-05-10T23:52:17.799-00:00"} + #cook.scheduler.dru.ScoredTask{:task {:db/id 17592186045567, :job/_instance {:db/id 17592186045537, :job/user "wzhao", :job/priority 50, :job/uuid #uuid "627afab1-d496-4917-a4ad-64720dd065a6", :job/resource #{{:db/id 17592186045539, :resource/type :resource.type/mem, :resource/amount 8.0} {:db/id 17592186045538, :resource/type :resource.type/cpus, :resource/amount 8.0}}}, :instance/start-time #inst "2022-05-10T23:52:17.799-00:00"}, :dru 0.32, :mem 8.0, :cpus 8.0}}, + + :user->sorted-running-task-ents { + "wzhao" #{ + {:db/id 17592186045567, :job/_instance {:db/id 17592186045537, :job/user "wzhao", :job/priority 50, :job/uuid #uuid "627afab1-d496-4917-a4ad-64720dd065a6", :job/resource #{{:db/id 17592186045539, :resource/type :resource.type/mem, :resource/amount 8.0} {:db/id 17592186045538, :resource/type :resource.type/cpus, :resource/amount 8.0}}}, :instance/start-time #inst "2022-05-10T23:52:17.799-00:00"} + {:db/id 17592186045569, :job/_instance {:db/id 17592186045542, :job/user "wzhao", :job/priority 50, :job/uuid #uuid 
"627afab1-ed45-46a7-8afb-f4625a226e93", :job/resource #{{:db/id 17592186045544, :resource/type :resource.type/mem, :resource/amount 10.0} {:db/id 17592186045543, :resource/type :resource.type/cpus, :resource/amount 10.0}}}, :instance/start-time #inst "2022-05-10T23:52:17.799-00:00"} + {:db/id 17592186045571, :job/_instance {:db/id 17592186045547, :job/user "wzhao", :job/priority 50, :job/uuid #uuid "627afab1-b9f6-48da-a54a-677b5dde3099", :job/resource #{{:db/id 17592186045548, :resource/type :resource.type/cpus, :resource/amount 10.0} {:db/id 17592186045549, :resource/type :resource.type/mem, :resource/amount 10.0}}}, :instance/start-time #inst "2022-05-10T23:52:17.800-00:00"} + {:db/id 17592186045573, :job/_instance {:db/id 17592186045552, :job/user "wzhao", :job/priority 50, :job/uuid #uuid "627afab1-fe6e-45e7-8a45-a63bcde9a886", :job/resource #{{:db/id 17592186045553, :resource/type :resource.type/cpus, :resource/amount 10.0} {:db/id 17592186045554, :resource/type :resource.type/mem, :resource/amount 10.0}}}, :instance/start-time #inst "2022-05-10T23:52:17.800-00:00"}}, + "ljin" #{ + {:db/id 17592186045559, :job/_instance {:db/id 17592186045517, :job/user "ljin", :job/priority 50, :job/uuid #uuid "627afab1-2ee2-4a5b-bd12-4e5a8cbd1aee", :job/resource #{{:db/id 17592186045519, :resource/type :resource.type/mem, :resource/amount 10.0} {:db/id 17592186045518, :resource/type :resource.type/cpus, :resource/amount 10.0}}}, :instance/start-time #inst "2022-05-10T23:52:17.695-00:00"} + {:db/id 17592186045561, :job/_instance {:db/id 17592186045522, :job/user "ljin", :job/priority 50, :job/uuid #uuid "627afab1-12f5-41fd-8e8f-d5ce23066a7c", :job/resource #{{:db/id 17592186045523, :resource/type :resource.type/cpus, :resource/amount 5.0} {:db/id 17592186045524, :resource/type :resource.type/mem, :resource/amount 5.0}}}, :instance/start-time #inst "2022-05-10T23:52:17.798-00:00"} + {:db/id 17592186045563, :job/_instance {:db/id 17592186045527, :job/user "ljin", :job/priority 
50, :job/uuid #uuid "627afab1-5cea-47ef-a57a-de76668ec744", :job/resource #{{:db/id 17592186045528, :resource/type :resource.type/cpus, :resource/amount 25.0} {:db/id 17592186045529, :resource/type :resource.type/mem, :resource/amount 15.0}}}, :instance/start-time #inst "2022-05-10T23:52:17.799-00:00"} + {:db/id 17592186045565, :job/_instance {:db/id 17592186045532, :job/user "ljin", :job/priority 50, :job/uuid #uuid "627afab1-b671-4d12-b99a-02f6b85f81dd", :job/resource #{{:db/id 17592186045533, :resource/type :resource.type/cpus, :resource/amount 15.0} {:db/id 17592186045534, :resource/type :resource.type/mem, :resource/amount 25.0}}}, :instance/start-time #inst "2022-05-10T23:52:17.799-00:00"}}}, + + :host->spare-resources {}, + :user->dru-divisors { + "wzhao" {:mem 25.0, :cpus 25.0, :gpus 1.7976931348623157E308}, + "ljin" {:mem 25.0, :cpus 25.0, :gpus 1.7976931348623157E308}}, + ;:compute-pending-job-dru #object[cook.rebalancer$compute_pending_default_job_dru 0x1821b9f5 "cook.rebalancer$compute_pending_default_job_dru@1821b9f5"], + :preempted-tasks [], + ;:user->quota-fn #object[cook.quota$create_user__GT_quota_fn$user__GT_quota__19262 0x5b620937 "cook.quota$create_user__GT_quota_fn$user__GT_quota__19262@5b620937"] + })) + +;; GOOD STATE: + +(not (= nil + #cook.rebalancer.State{:task->scored-task { + {:db/id 17592186045565, :job/_instance {:db/id 17592186045532, :job/user "ljin", :job/priority 50, :job/uuid #uuid "627afc93-d1fd-476f-b200-2f35823d5f57", :job/resource #{{:db/id 17592186045533, :resource/type :resource.type/cpus, :resource/amount 15.0} {:db/id 17592186045534, :resource/type :resource.type/mem, :resource/amount 25.0}}}, :instance/start-time #inst "2022-05-11T00:00:19.653-00:00"} + #cook.scheduler.dru.ScoredTask{:task {:db/id 17592186045565, :job/_instance {:db/id 17592186045532, :job/user "ljin", :job/priority 50, :job/uuid #uuid "627afc93-d1fd-476f-b200-2f35823d5f57", :job/resource #{{:db/id 17592186045533, :resource/type :resource.type/cpus, 
:resource/amount 15.0} {:db/id 17592186045534, :resource/type :resource.type/mem, :resource/amount 25.0}}}, :instance/start-time #inst "2022-05-11T00:00:19.653-00:00"}, :dru 2.2, :mem 25.0, :cpus 15.0}, + {:db/id 17592186045563, :job/_instance {:db/id 17592186045527, :job/user "ljin", :job/priority 50, :job/uuid #uuid "627afc93-ea12-4e4f-b06c-61615dac6b40", :job/resource #{{:db/id 17592186045528, :resource/type :resource.type/cpus, :resource/amount 25.0} {:db/id 17592186045529, :resource/type :resource.type/mem, :resource/amount 15.0}}}, :instance/start-time #inst "2022-05-11T00:00:19.652-00:00"} + #cook.scheduler.dru.ScoredTask{:task {:db/id 17592186045563, :job/_instance {:db/id 17592186045527, :job/user "ljin", :job/priority 50, :job/uuid #uuid "627afc93-ea12-4e4f-b06c-61615dac6b40", :job/resource #{{:db/id 17592186045528, :resource/type :resource.type/cpus, :resource/amount 25.0} {:db/id 17592186045529, :resource/type :resource.type/mem, :resource/amount 15.0}}}, :instance/start-time #inst "2022-05-11T00:00:19.652-00:00"}, :dru 1.6, :mem 15.0, :cpus 25.0}, + {:db/id 17592186045573, :job/_instance {:db/id 17592186045552, :job/user "wzhao", :job/priority 50, :job/uuid #uuid "627afc93-6834-47a9-afde-de4a7695ed9c", :job/resource #{{:db/id 17592186045553, :resource/type :resource.type/cpus, :resource/amount 10.0} {:db/id 17592186045554, :resource/type :resource.type/mem, :resource/amount 10.0}}}, :instance/start-time #inst "2022-05-11T00:00:19.654-00:00"} + #cook.scheduler.dru.ScoredTask{:task {:db/id 17592186045573, :job/_instance {:db/id 17592186045552, :job/user "wzhao", :job/priority 50, :job/uuid #uuid "627afc93-6834-47a9-afde-de4a7695ed9c", :job/resource #{{:db/id 17592186045553, :resource/type :resource.type/cpus, :resource/amount 10.0} {:db/id 17592186045554, :resource/type :resource.type/mem, :resource/amount 10.0}}}, :instance/start-time #inst "2022-05-11T00:00:19.654-00:00"}, :dru 1.52, :mem 10.0, :cpus 10.0}, + {:db/id 17592186045571, :job/_instance 
{:db/id 17592186045547, :job/user "wzhao", :job/priority 50, :job/uuid #uuid "627afc93-9df2-4d9f-9fb7-c71751080ef7", :job/resource #{{:db/id 17592186045548, :resource/type :resource.type/cpus, :resource/amount 10.0} {:db/id 17592186045549, :resource/type :resource.type/mem, :resource/amount 10.0}}}, :instance/start-time #inst "2022-05-11T00:00:19.653-00:00"} + #cook.scheduler.dru.ScoredTask{:task {:db/id 17592186045571, :job/_instance {:db/id 17592186045547, :job/user "wzhao", :job/priority 50, :job/uuid #uuid "627afc93-9df2-4d9f-9fb7-c71751080ef7", :job/resource #{{:db/id 17592186045548, :resource/type :resource.type/cpus, :resource/amount 10.0} {:db/id 17592186045549, :resource/type :resource.type/mem, :resource/amount 10.0}}}, :instance/start-time #inst "2022-05-11T00:00:19.653-00:00"}, :dru 1.12, :mem 10.0, :cpus 10.0}, + {:db/id 17592186045569, :job/_instance {:db/id 17592186045542, :job/user "wzhao", :job/priority 50, :job/uuid #uuid "627afc93-93fd-4de3-b20e-846c9c184291", :job/resource #{{:db/id 17592186045544, :resource/type :resource.type/mem, :resource/amount 10.0} {:db/id 17592186045543, :resource/type :resource.type/cpus, :resource/amount 10.0}}}, :instance/start-time #inst "2022-05-11T00:00:19.653-00:00"} + #cook.scheduler.dru.ScoredTask{:task {:db/id 17592186045569, :job/_instance {:db/id 17592186045542, :job/user "wzhao", :job/priority 50, :job/uuid #uuid "627afc93-93fd-4de3-b20e-846c9c184291", :job/resource #{{:db/id 17592186045544, :resource/type :resource.type/mem, :resource/amount 10.0} {:db/id 17592186045543, :resource/type :resource.type/cpus, :resource/amount 10.0}}}, :instance/start-time #inst "2022-05-11T00:00:19.653-00:00"}, :dru 0.72, :mem 10.0, :cpus 10.0}, + {:db/id 17592186045561, :job/_instance {:db/id 17592186045522, :job/user "ljin", :job/priority 50, :job/uuid #uuid "627afc93-76a4-4320-8175-de9c14638856", :job/resource #{{:db/id 17592186045523, :resource/type :resource.type/cpus, :resource/amount 5.0} {:db/id 17592186045524, 
:resource/type :resource.type/mem, :resource/amount 5.0}}}, :instance/start-time #inst "2022-05-11T00:00:19.652-00:00"} + #cook.scheduler.dru.ScoredTask{:task {:db/id 17592186045561, :job/_instance {:db/id 17592186045522, :job/user "ljin", :job/priority 50, :job/uuid #uuid "627afc93-76a4-4320-8175-de9c14638856", :job/resource #{{:db/id 17592186045523, :resource/type :resource.type/cpus, :resource/amount 5.0} {:db/id 17592186045524, :resource/type :resource.type/mem, :resource/amount 5.0}}}, :instance/start-time #inst "2022-05-11T00:00:19.652-00:00"}, :dru 0.6, :mem 5.0, :cpus 5.0}, + {:db/id 17592186045559, :job/_instance {:db/id 17592186045517, :job/user "ljin", :job/priority 50, :job/uuid #uuid "627afc93-3e89-482b-bc88-1b3b33b68260", :job/resource #{{:db/id 17592186045519, :resource/type :resource.type/mem, :resource/amount 10.0} {:db/id 17592186045518, :resource/type :resource.type/cpus, :resource/amount 10.0}}}, :instance/start-time #inst "2022-05-11T00:00:19.615-00:00"} + #cook.scheduler.dru.ScoredTask{:task {:db/id 17592186045559, :job/_instance {:db/id 17592186045517, :job/user "ljin", :job/priority 50, :job/uuid #uuid "627afc93-3e89-482b-bc88-1b3b33b68260", :job/resource #{{:db/id 17592186045519, :resource/type :resource.type/mem, :resource/amount 10.0} {:db/id 17592186045518, :resource/type :resource.type/cpus, :resource/amount 10.0}}}, :instance/start-time #inst "2022-05-11T00:00:19.615-00:00"}, :dru 0.4, :mem 10.0, :cpus 10.0}, + {:db/id 17592186045567, :job/_instance {:db/id 17592186045537, :job/user "wzhao", :job/priority 50, :job/uuid #uuid "627afc93-6600-427a-b157-a6682c0b5566", :job/resource #{{:db/id 17592186045539, :resource/type :resource.type/mem, :resource/amount 8.0} {:db/id 17592186045538, :resource/type :resource.type/cpus, :resource/amount 8.0}}}, :instance/start-time #inst "2022-05-11T00:00:19.653-00:00"} + #cook.scheduler.dru.ScoredTask{:task {:db/id 17592186045567, :job/_instance {:db/id 17592186045537, :job/user "wzhao", :job/priority 
50, :job/uuid #uuid "627afc93-6600-427a-b157-a6682c0b5566", :job/resource #{{:db/id 17592186045539, :resource/type :resource.type/mem, :resource/amount 8.0} {:db/id 17592186045538, :resource/type :resource.type/cpus, :resource/amount 8.0}}}, :instance/start-time #inst "2022-05-11T00:00:19.653-00:00"}, :dru 0.32, :mem 8.0, :cpus 8.0}}, + :user->sorted-running-task-ents { + "wzhao" #{ + {:db/id 17592186045567, :job/_instance {:db/id 17592186045537, :job/user "wzhao", :job/priority 50, :job/uuid #uuid "627afc93-6600-427a-b157-a6682c0b5566", :job/resource #{{:db/id 17592186045539, :resource/type :resource.type/mem, :resource/amount 8.0} {:db/id 17592186045538, :resource/type :resource.type/cpus, :resource/amount 8.0}}}, :instance/start-time #inst "2022-05-11T00:00:19.653-00:00"} + {:db/id 17592186045569, :job/_instance {:db/id 17592186045542, :job/user "wzhao", :job/priority 50, :job/uuid #uuid "627afc93-93fd-4de3-b20e-846c9c184291", :job/resource #{{:db/id 17592186045544, :resource/type :resource.type/mem, :resource/amount 10.0} {:db/id 17592186045543, :resource/type :resource.type/cpus, :resource/amount 10.0}}}, :instance/start-time #inst "2022-05-11T00:00:19.653-00:00"} + {:db/id 17592186045571, :job/_instance {:db/id 17592186045547, :job/user "wzhao", :job/priority 50, :job/uuid #uuid "627afc93-9df2-4d9f-9fb7-c71751080ef7", :job/resource #{{:db/id 17592186045548, :resource/type :resource.type/cpus, :resource/amount 10.0} {:db/id 17592186045549, :resource/type :resource.type/mem, :resource/amount 10.0}}}, :instance/start-time #inst "2022-05-11T00:00:19.653-00:00"} + {:db/id 17592186045573, :job/_instance {:db/id 17592186045552, :job/user "wzhao", :job/priority 50, :job/uuid #uuid "627afc93-6834-47a9-afde-de4a7695ed9c", :job/resource #{{:db/id 17592186045553, :resource/type :resource.type/cpus, :resource/amount 10.0} {:db/id 17592186045554, :resource/type :resource.type/mem, :resource/amount 10.0}}}, :instance/start-time #inst "2022-05-11T00:00:19.654-00:00"}}, + 
"ljin" #{ + {:db/id 17592186045559, :job/_instance {:db/id 17592186045517, :job/user "ljin", :job/priority 50, :job/uuid #uuid "627afc93-3e89-482b-bc88-1b3b33b68260", :job/resource #{{:db/id 17592186045519, :resource/type :resource.type/mem, :resource/amount 10.0} {:db/id 17592186045518, :resource/type :resource.type/cpus, :resource/amount 10.0}}}, :instance/start-time #inst "2022-05-11T00:00:19.615-00:00"} + {:db/id 17592186045561, :job/_instance {:db/id 17592186045522, :job/user "ljin", :job/priority 50, :job/uuid #uuid "627afc93-76a4-4320-8175-de9c14638856", :job/resource #{{:db/id 17592186045523, :resource/type :resource.type/cpus, :resource/amount 5.0} {:db/id 17592186045524, :resource/type :resource.type/mem, :resource/amount 5.0}}}, :instance/start-time #inst "2022-05-11T00:00:19.652-00:00"} + {:db/id 17592186045563, :job/_instance {:db/id 17592186045527, :job/user "ljin", :job/priority 50, :job/uuid #uuid "627afc93-ea12-4e4f-b06c-61615dac6b40", :job/resource #{{:db/id 17592186045528, :resource/type :resource.type/cpus, :resource/amount 25.0} {:db/id 17592186045529, :resource/type :resource.type/mem, :resource/amount 15.0}}}, :instance/start-time #inst "2022-05-11T00:00:19.652-00:00"} + {:db/id 17592186045565, :job/_instance {:db/id 17592186045532, :job/user "ljin", :job/priority 50, :job/uuid #uuid "627afc93-d1fd-476f-b200-2f35823d5f57", :job/resource #{{:db/id 17592186045533, :resource/type :resource.type/cpus, :resource/amount 15.0} {:db/id 17592186045534, :resource/type :resource.type/mem, :resource/amount 25.0}}}, :instance/start-time #inst "2022-05-11T00:00:19.653-00:00"}}}, + :host->spare-resources {}, + :user->dru-divisors {"wzhao" {:mem 25.0, :cpus 25.0, :gpus 1.7976931348623157E308}, + "ljin" {:mem 25.0, :cpus 25.0, :gpus 1.7976931348623157E308}}, + ;:compute-pending-job-dru #object[cook.rebalancer$compute_pending_default_job_dru 0x17245ba1 "cook.rebalancer$compute_pending_default_job_dru@17245ba1"], + :preempted-tasks [], + ;:user->quota-fn 
#object[cook.quota$create_user__GT_quota_fn$user__GT_quota__19262 0x1821b9f5 "cook.quota$create_user__GT_quota_fn$user__GT_quota__19262@1821b9f5"] + })) (deftest ^:integration test-rebalance2 (setup) + (flush-caches!) (testing "rebalance prop test" (let [datomic-uri "datomic:mem://test-rebalance2" running-user-gen (gen/elements ["ljin", "sunil", "wzhao", "abolin", "dgrnbrg", "palaitis", "sdelger", "wyegelwe"]) @@ -1190,6 +1314,7 @@ (deftest test-rebalance-host-reservation (setup) + (flush-caches!) (testing "reserves host for multiple preemptions" (let [datomic-uri "datomic:mem://test-rebalance-host-reservation" conn (restore-fresh-database! datomic-uri) @@ -1293,6 +1418,7 @@ (deftest test-reserve-hosts (setup) + (flush-caches!) (testing "only reserves hosts with multiple preemptions" (let [decisions [{:task ["a" "b"] :hostname "hostA" @@ -1316,6 +1442,7 @@ (deftest test-reserve-hosts-integration (setup) + (flush-caches!) (testing "does not reserve another host after launching job" (let [datomic-uri "datomic:mem://test-reserve-hosts-integration" conn (restore-fresh-database! datomic-uri) @@ -1342,6 +1469,7 @@ (deftest job-below-quota (setup) + (flush-caches!) (let [conn (restore-fresh-database! "datomic:mem://test-job-below-quota") _ (cook.quota/set-quota! 
conn "testA" nil "test quota" :count 1) job-id-1 (create-dummy-job conn :user "testA") From b52fe2bf6b53d258f7878d940b45f189a1ef15be Mon Sep 17 00:00:00 2001 From: Scott Crosby Date: Fri, 20 May 2022 16:20:03 -0500 Subject: [PATCH 06/12] INCOMPLETE SNAPSHOT --- scheduler/src/cook/rebalancer.clj | 21 ++++++++------------- scheduler/src/cook/scheduler/scheduler.clj | 1 + scheduler/src/cook/tools.clj | 8 ++++---- 3 files changed, 13 insertions(+), 17 deletions(-) diff --git a/scheduler/src/cook/rebalancer.clj b/scheduler/src/cook/rebalancer.clj index 3e2a8dae61..daed2a1286 100644 --- a/scheduler/src/cook/rebalancer.clj +++ b/scheduler/src/cook/rebalancer.clj @@ -184,7 +184,8 @@ (defn mk-bound-fn [^clojure.lang.Sorted sc test key] (fn [e] - (test (.. sc comparator (compare (. sc entryKey e) key)) 0))) + (println "Compare: " e " <= " key " ----> " (<= (.. sc comparator (compare (. sc entryKey e) key)) 0)) + (<= (.. sc comparator (compare (. sc entryKey e) key)) 0))) (defn rsubseq0 "sc must be a sorted collection, test(s) one of <, <=, > or @@ -192,16 +193,10 @@ which (test (.. sc comparator (compare ek key)) 0) is true" {:added "1.0" :static true} - ([^clojure.lang.Sorted sc test key] - (let [include (mk-bound-fn sc test key)] - (if (#{< <=} test) + ([^clojure.lang.Sorted sc _ key] + (let [include (mk-bound-fn sc _ key)] (when-let [[e :as s] (. sc seqFrom key false)] - (if (include e) s (next s))) - (take-while include (. sc seq false))))) - ([^clojure.lang.Sorted sc start-test start-key end-test end-key] - (when-let [[e :as s] (. sc seqFrom end-key false)] - (take-while (mk-bound-fn sc start-test start-key) - (if ((mk-bound-fn sc end-test end-key) e) s (next s)))))) + (if (include e) s (next s)))))) (defn compute-pending-default-job-dru "Takes state and a pending job entity, returns the dru of the pending-job. 
In the case where the pending job causes user's dominant @@ -223,9 +218,9 @@ " --- " (some-> user->sorted-running-task-ents (get user) (count)) - " --- " '(compare (some-> user->sorted-running-task-ents - (get user) - first) pending-task-ent) + ;" --- " (compare (some-> user->sorted-running-task-ents + ; (get user) + ; first) pending-task-ent) " ---- " (some->> (some-> user->sorted-running-task-ents diff --git a/scheduler/src/cook/scheduler/scheduler.clj b/scheduler/src/cook/scheduler/scheduler.clj index 02a22566de..777f2512cc 100644 --- a/scheduler/src/cook/scheduler/scheduler.clj +++ b/scheduler/src/cook/scheduler/scheduler.clj @@ -1717,6 +1717,7 @@ ;; to only those jobs that have been committed. (let [pool-name->pending-job-ents (group-by cached-queries/job->pool-name (queries/get-pending-job-ents unfiltered-db)) pool-name->pending-task-ents (pc/map-vals #(map tools/create-task-ent %1) pool-name->pending-job-ents) + ;; DANGER WILL ROBINSON DANGER WILL ROBINSON These create-task-ents sort in a 'wrong order' compared to the defaults ones, or where the :db/id is positive. pool-name->running-task-ents (group-by (comp cached-queries/job->pool-name :job/_instance) (tools/get-running-task-ents unfiltered-db)) pools (->> unfiltered-db pool/all-pools (filter pool/schedules-jobs?)) diff --git a/scheduler/src/cook/tools.clj b/scheduler/src/cook/tools.clj index 83d2b119bd..71642359d0 100644 --- a/scheduler/src/cook/tools.clj +++ b/scheduler/src/cook/tools.clj @@ -565,8 +565,8 @@ ; entities for pending jobs don't have that and aren't cached. This makes ; that cache essentially noop for pending jobs; they always miss. ; Fix this by borrowing the :db/id of the source job. 
- (merge {;:db/id (- (:db/id pending-job-ent)) - :db/id 0 + (merge {:db/id (- (:db/id pending-job-ent)) + ;:db/id 0 :job/_instance pending-job-ent :instance/status :instance.status/running} (when hostname {:instance/hostname hostname}) @@ -615,7 +615,7 @@ (fn [task] [(- (:job/priority (:job/_instance task) default-job-priority)) (:instance/start-time task (java.util.Date. Long/MAX_VALUE)) - (:db/id task) + (:db/id task) ;; The essence of the bug. This is exposed to rebalancer and may be null for synthetic tasks. For those, we want to use.... XXX. (abs (:db/id (:job/_instance task)))]) extract-key (fn [item] @@ -623,7 +623,7 @@ (ccache/lookup-cache! caches/task->feature-vector-cache extract-key task->feature-vector-miss task))) (defn same-user-task-comparator - "Comparator to order same user's tasks" + "Comparator to order same user's tasks." ([] (same-user-task-comparator [])) ([tasks] From c1c72fe6a307430f2f720fb1bea9a5ef898789a1 Mon Sep 17 00:00:00 2001 From: Scott Crosby Date: Tue, 31 May 2022 12:53:42 -0500 Subject: [PATCH 07/12] SNAP --- scheduler/src/cook/tools.clj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scheduler/src/cook/tools.clj b/scheduler/src/cook/tools.clj index 71642359d0..554d2894a2 100644 --- a/scheduler/src/cook/tools.clj +++ b/scheduler/src/cook/tools.clj @@ -565,7 +565,7 @@ ; entities for pending jobs don't have that and aren't cached. This makes ; that cache essentially noop for pending jobs; they always miss. ; Fix this by borrowing the :db/id of the source job.
- (merge {:db/id (- (:db/id pending-job-ent)) + (merge {:db/id (+ (:db/id pending-job-ent)) ;:db/id 0 :job/_instance pending-job-ent :instance/status :instance.status/running} From dc1cb04e9db8d16e6efd8ab35346711630ad3745 Mon Sep 17 00:00:00 2001 From: Scott Crosby Date: Tue, 31 May 2022 13:14:27 -0500 Subject: [PATCH 08/12] New test passes --- scheduler/src/cook/tools.clj | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scheduler/src/cook/tools.clj b/scheduler/src/cook/tools.clj index 554d2894a2..414f5c473d 100644 --- a/scheduler/src/cook/tools.clj +++ b/scheduler/src/cook/tools.clj @@ -558,6 +558,7 @@ (catch clojure.lang.ExceptionInfo e false))) +; Temp version for rebalancer. (defn create-task-ent-0 "Takes a pending job entity and returns a synthetic running task entity for that job" [pending-job-ent & {:keys [hostname slave-id] :or {hostname nil slave-id nil}}] @@ -565,13 +566,14 @@ ; entities for pending jobs don't have that and aren't cached. This makes ; that cache essentially noop for pending jobs; they always miss. ; Fix this by borrowing the :db/id of the source job. - (merge {:db/id (+ (:db/id pending-job-ent)) + (merge {;:db/id (+ (:db/id pending-job-ent)) ;:db/id 0 :job/_instance pending-job-ent :instance/status :instance.status/running} (when hostname {:instance/hostname hostname}) (when slave-id {:instance/slave-id slave-id}))) +; Temp version for everywhere else. (defn create-task-ent "Takes a pending job entity and returns a synthetic running task entity for that job" [pending-job-ent & {:keys [hostname slave-id] :or {hostname nil slave-id nil}}] @@ -579,7 +581,7 @@ ; entities for pending jobs don't have that and aren't cached. This makes ; that cache essentially noop for pending jobs; they always miss. ; Fix this by borrowing the :db/id of the source job. 
- (merge {:db/id (- (:db/id pending-job-ent)) + (merge {;:db/id (+ (:db/id pending-job-ent)) ;:db/id 0 :job/_instance pending-job-ent :instance/status :instance.status/running} From 2d2b038c078003cc33515e57c7ef89c2846b69e1 Mon Sep 17 00:00:00 2001 From: Scott Crosby Date: Tue, 31 May 2022 13:52:26 -0500 Subject: [PATCH 09/12] Reenable negative. --- scheduler/src/cook/tools.clj | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scheduler/src/cook/tools.clj b/scheduler/src/cook/tools.clj index 414f5c473d..7b2d805c4a 100644 --- a/scheduler/src/cook/tools.clj +++ b/scheduler/src/cook/tools.clj @@ -566,7 +566,7 @@ ; entities for pending jobs don't have that and aren't cached. This makes ; that cache essentially noop for pending jobs; they always miss. ; Fix this by borrowing the :db/id of the source job. - (merge {;:db/id (+ (:db/id pending-job-ent)) + (merge {:db/id (- (:db/id pending-job-ent)) ;:db/id 0 :job/_instance pending-job-ent :instance/status :instance.status/running} @@ -581,7 +581,7 @@ ; entities for pending jobs don't have that and aren't cached. This makes ; that cache essentially noop for pending jobs; they always miss. ; Fix this by borrowing the :db/id of the source job. - (merge {;:db/id (+ (:db/id pending-job-ent)) + (merge {:db/id (- (:db/id pending-job-ent)) ;:db/id 0 :job/_instance pending-job-ent :instance/status :instance.status/running} From ade29b0e0197de7b272097ff2d89a5eaabe2ede9 Mon Sep 17 00:00:00 2001 From: Scott Crosby Date: Tue, 31 May 2022 14:00:47 -0500 Subject: [PATCH 10/12] Weird missing format in postgres setup. 
--- scheduler/src/cook/test/postgres.clj | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scheduler/src/cook/test/postgres.clj b/scheduler/src/cook/test/postgres.clj index b281eb4ab1..71b769c06a 100644 --- a/scheduler/src/cook/test/postgres.clj +++ b/scheduler/src/cook/test/postgres.clj @@ -1,5 +1,6 @@ (ns cook.test.postgres - (:require [clojure.test :refer :all] + (:require [clj-time.format] + [clojure.test :refer :all] [clojure.tools.logging :as log] [cook.postgres :as pg] [next.jdbc :as sql]) From 4729cae7eb861bd60ec0ff073d62a49d592630c6 Mon Sep 17 00:00:00 2001 From: Scott Crosby Date: Tue, 31 May 2022 14:10:25 -0500 Subject: [PATCH 11/12] Fix for lookup function. --- scheduler/src/cook/tools.clj | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scheduler/src/cook/tools.clj b/scheduler/src/cook/tools.clj index 7b2d805c4a..08d1e4794d 100644 --- a/scheduler/src/cook/tools.clj +++ b/scheduler/src/cook/tools.clj @@ -590,11 +590,11 @@ (defn task-ent->user [task-ent] - (get-in task-ent [:job/_instance :job/user])) -; (let [task-ent->user-miss -; (fn [task-ent] -; ] -; (caches/lookup-cache-datomic-entity! caches/task-ent->user-cache task-ent->user-miss task-ent))) + (let [task-ent->user-miss (fn [task-ent] + (get-in task-ent [:job/_instance :job/user])) + task-ent->key-fn (fn [task-ent] + (or (:db/id task-ent (-> :job/_instance :db/id))))] + (ccache/lookup-cache! 
caches/task-ent->user-cache task-ent->key-fn task-ent->user-miss task-ent))) (def ^:const default-job-priority 50) From baf99b7217b75f3dc2dff3685c046a418280561b Mon Sep 17 00:00:00 2001 From: Scott Crosby Date: Tue, 31 May 2022 14:11:07 -0500 Subject: [PATCH 12/12] Enable positive number --- scheduler/src/cook/tools.clj | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scheduler/src/cook/tools.clj b/scheduler/src/cook/tools.clj index 08d1e4794d..07a0c43b8a 100644 --- a/scheduler/src/cook/tools.clj +++ b/scheduler/src/cook/tools.clj @@ -566,7 +566,7 @@ ; entities for pending jobs don't have that and aren't cached. This makes ; that cache essentially noop for pending jobs; they always miss. ; Fix this by borrowing the :db/id of the source job. - (merge {:db/id (- (:db/id pending-job-ent)) + (merge {;:db/id (+ (:db/id pending-job-ent)) ;:db/id 0 :job/_instance pending-job-ent :instance/status :instance.status/running} @@ -581,7 +581,7 @@ ; entities for pending jobs don't have that and aren't cached. This makes ; that cache essentially noop for pending jobs; they always miss. ; Fix this by borrowing the :db/id of the source job. - (merge {:db/id (- (:db/id pending-job-ent)) + (merge {;:db/id (+ (:db/id pending-job-ent)) ;:db/id 0 :job/_instance pending-job-ent :instance/status :instance.status/running}