-
Notifications
You must be signed in to change notification settings - Fork 634
Expand file tree
/
Copy pathconfig.yaml
More file actions
234 lines (224 loc) · 8.13 KB
/
config.yaml
File metadata and controls
234 lines (224 loc) · 8.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
# Test-wide parameters. CL2_* values are overrides read from the template
# data; the second argument of DefaultParam is presumably the fallback used
# when an override is not supplied.
{{$MODE := DefaultParam .CL2_MODE "Indexed"}}
# Wrapped in MinInt together with the node count, so it cannot exceed .Nodes.
{{$NODES_PER_NAMESPACE := MinInt .Nodes (DefaultParam .CL2_NODES_PER_NAMESPACE 100)}}
# QPS values used by the FastFill and SteadyState tuning sets respectively.
{{$LOAD_TEST_THROUGHPUT := DefaultParam .CL2_LOAD_TEST_THROUGHPUT 10}}
{{$STEADY_STATE_QPS := DefaultParam .CL2_STEADY_STATE_QPS 5}}
# Namespace count: node count divided by nodes per namespace.
{{$namespaces := DivideInt .Nodes $NODES_PER_NAMESPACE}}
# Node resource configuration
{{$gpusPerNode := DefaultParam .CL2_GPUS_PER_NODE 8}}
{{$totalGPUs := MultiplyInt $gpusPerNode .Nodes}}
# fast fill job configuration - for initial fill up
{{$fillPercentage := DefaultParam .CL2_FILL_PERCENTAGE 90}}
# Fill pod count = totalGPUs * fillPercentage / 100, then split per namespace.
{{$fillPodsCount := DivideInt (MultiplyInt $totalGPUs $fillPercentage) 100}}
{{$fillPodsPerNamespace := DivideInt $fillPodsCount $namespaces}}
{{$longJobSize := 1}}
{{$longJobRunningTime := DefaultParam .CL2_LONG_JOB_RUNNING_TIME "1h"}}
# churn job configuration for steady state
# Churn sizing is what remains of totalGPUs after subtracting
# fillPodsPerNamespace * namespaces, so any remainder lost by the per-namespace
# division above is counted here rather than dropped.
{{$smallJobPodsCount := SubtractInt $totalGPUs (MultiplyInt $fillPodsPerNamespace $namespaces)}}
{{$smallJobsPerNamespace := DivideInt $smallJobPodsCount $namespaces}}
{{$smallJobSize := 1}}
{{$smallJobCompletions := 10}}
{{$jobRunningTime := DefaultParam .CL2_JOB_RUNNING_TIME "30s"}}
# Extended-resource mode selects the test name and sets the resource name that
# is passed to the DRATestDriver dependency; the name stays empty otherwise.
{{$ENABLE_EXTENDED_RESOURCES := DefaultParam .CL2_ENABLE_EXTENDED_RESOURCES false}}
{{$extendedResourceName := ""}}
{{if $ENABLE_EXTENDED_RESOURCES}}
{{$extendedResourceName = DefaultParam .CL2_EXTENDED_RESOURCE_NAME "example.com/gpu"}}
name: dra-extended-resources-steady-state
{{else}}
name: dra-steady-state
{{end}}
# Number of test namespaces, as computed into $namespaces.
namespace:
  number: {{$namespaces}}
# Two rate limits: FastFill is referenced by the template-creation and fill
# steps, SteadyState by the churn job creation step.
tuningSets:
- name: FastFill
  qpsLoad:
    qps: {{$LOAD_TEST_THROUGHPUT}}
- name: SteadyState
  qpsLoad:
    qps: {{$STEADY_STATE_QPS}}
# Single test dependency, handled by the DRATestDriver method.
dependencies:
- name: Install dra-example-driver for test
  Method: DRATestDriver
  Timeout: 5m
  Params:
    WorkerNodeCount: {{.Nodes}}
    # ExtendedResourceName is only passed in extended-resource mode.
    {{if $ENABLE_EXTENDED_RESOURCES}}
    ExtendedResourceName: {{$extendedResourceName}}
    {{end}}
steps:
# Starts the fast-fill measurements (selector job-type = long-running) and the
# finished-jobs watcher for job-type = short-lived.
- name: Start measurements
  measurements:
  - Identifier: WaitForFinishedJobs
    Method: WaitForFinishedJobs
    Params:
      action: start
      labelSelector: job-type = short-lived
  - Identifier: WaitForControlledPodsRunning
    Method: WaitForControlledPodsRunning
    Params:
      action: start
      apiVersion: batch/v1
      kind: Job
      labelSelector: job-type = long-running
      operationTimeout: 120s
  - Identifier: FastFillPodStartupLatency
    Method: PodStartupLatency
    Params:
      action: start
      labelSelector: job-type = long-running
  - Identifier: FastFillClaimAllocationLatency
    Method: ResourceClaimAllocationLatency
    Params:
      action: start
  - Identifier: FastFillSchedulingMetrics
    Method: PrometheusSchedulingMetrics
    Params:
      action: start
  - Identifier: FastFillDRAMetrics
    Method: GenericPrometheusQuery
    Params:
      action: start
      metricName: fastfill_kubelet_latencies
      metricVersion: v1
      unit: s
      # p99 of the DRA operation and gRPC duration histograms. The [%v] range
      # is presumably substituted by the measurement - confirm before editing.
      # Single quotes keep the embedded double quotes and braces literal.
      queries:
      - name: p99_dra_prepare_resources
        query: 'histogram_quantile(0.99, sum(rate(dra_operations_duration_seconds_bucket{operation_name="PrepareResources"}[%v])) by (le))'
      - name: p99_dra_unprepare_operations
        query: 'histogram_quantile(0.99, sum(rate(dra_operations_duration_seconds_bucket{operation_name="UnprepareResources"}[%v])) by (le))'
      - name: p99_dra_grpc_node_prepare_resources
        query: 'histogram_quantile(0.99, sum(rate(dra_grpc_operations_duration_seconds_bucket{method_name=~".*NodePrepareResources"}[%v])) by (le))'
      - name: p99_dra_grpc_node_unprepare_resources
        query: 'histogram_quantile(0.99, sum(rate(dra_grpc_operations_duration_seconds_bucket{method_name=~".*NodeUnprepareResources"}[%v])) by (le))'
{{if not $ENABLE_EXTENDED_RESOURCES}}
# Emitted only when extended resources are disabled: one object from
# resourceclaimtemplate.yaml in every test namespace.
- name: Create ResourceClaimTemplates in namespaces
  phases:
  - namespaceRange:
      min: 1
      max: {{$namespaces}}
    replicasPerNamespace: 1
    tuningSet: FastFill
    objectBundle:
    - basename: single-gpu
      objectTemplatePath: "resourceclaimtemplate.yaml"
{{end}}
# Fast fill: creates $fillPodsPerNamespace long-running jobs in each namespace
# at the FastFill rate.
- name: Fill cluster to {{$fillPercentage}}% utilization
  phases:
  - namespaceRange:
      min: 1
      max: {{$namespaces}}
    replicasPerNamespace: {{$fillPodsPerNamespace}}
    tuningSet: FastFill
    objectBundle:
    - basename: long-running
      objectTemplatePath: "long-running-job.yaml"
      # Values handed to long-running-job.yaml.
      templateFillMap:
        Replicas: {{$longJobSize}}
        Mode: {{$MODE}}
        Sleep: {{$longJobRunningTime}}
        ExtendedResource: {{$ENABLE_EXTENDED_RESOURCES}}
# Gathers the WaitForControlledPodsRunning measurement for the long-running
# jobs, with a 15m timeout.
- name: Wait for fill pods to be running
  measurements:
  - Identifier: WaitForControlledPodsRunning
    Method: WaitForControlledPodsRunning
    Params:
      action: gather
      labelSelector: job-type = long-running
      timeout: 15m
# Gathers the four FastFill* measurements that were started in the
# "Start measurements" step.
- name: Gather measurements for long running pods
  measurements:
  - Identifier: FastFillSchedulingMetrics
    Method: PrometheusSchedulingMetrics
    Params:
      action: gather
  - Identifier: FastFillPodStartupLatency
    Method: PodStartupLatency
    Params:
      action: gather
  - Identifier: FastFillClaimAllocationLatency
    Method: ResourceClaimAllocationLatency
    Params:
      action: gather
  - Identifier: FastFillDRAMetrics
    Method: GenericPrometheusQuery
    Params:
      action: gather
# Starts a second set of measurements under Churn* identifiers, scoped to the
# short-lived jobs, so the steady-state phase is reported separately.
- name: reset metrics for steady state churn
  measurements:
  - Identifier: ChurnSchedulingMetrics
    Method: PrometheusSchedulingMetrics
    Params:
      action: start
  - Identifier: ChurnPodStartupLatency
    Method: PodStartupLatency
    Params:
      action: start
      labelSelector: job-type = short-lived
      perc50Threshold: 40s
      perc90Threshold: 60s
      perc99Threshold: 80s
  - Identifier: ChurnClaimAllocationLatency
    Method: ResourceClaimAllocationLatency
    Params:
      action: start
  - Identifier: ChurnDRAMetrics
    Method: GenericPrometheusQuery
    Params:
      action: start
      metricName: churn_kubelet_latencies
      metricVersion: v1
      unit: s
      # NOTE(review): the first two query names differ from the ones used for
      # fastfill_kubelet_latencies although the PromQL is the same. They are
      # left as-is because the names are emitted output - confirm whether
      # consumers depend on them before unifying.
      # Single quotes keep the embedded double quotes and braces literal.
      queries:
      - name: p99_prepare_operations
        query: 'histogram_quantile(0.99, sum(rate(dra_operations_duration_seconds_bucket{operation_name="PrepareResources"}[%v])) by (le))'
      - name: p99_unprepare_operations
        query: 'histogram_quantile(0.99, sum(rate(dra_operations_duration_seconds_bucket{operation_name="UnprepareResources"}[%v])) by (le))'
      - name: p99_dra_grpc_node_prepare_resources
        query: 'histogram_quantile(0.99, sum(rate(dra_grpc_operations_duration_seconds_bucket{method_name=~".*NodePrepareResources"}[%v])) by (le))'
      - name: p99_dra_grpc_node_unprepare_resources
        query: 'histogram_quantile(0.99, sum(rate(dra_grpc_operations_duration_seconds_bucket{method_name=~".*NodeUnprepareResources"}[%v])) by (le))'
# Churn: creates $smallJobsPerNamespace jobs from job.yaml in each namespace
# at the SteadyState rate.
- name: Create steady state {{$MODE}} jobs
  phases:
  - namespaceRange:
      min: 1
      max: {{$namespaces}}
    replicasPerNamespace: {{$smallJobsPerNamespace}}
    tuningSet: SteadyState
    objectBundle:
    - basename: small
      objectTemplatePath: "job.yaml"
      # Values handed to job.yaml.
      templateFillMap:
        Replicas: {{$smallJobSize}}
        CompletionReplicas: {{$smallJobCompletions}}
        Mode: {{$MODE}}
        Sleep: {{$jobRunningTime}}
        ExtendedResource: {{$ENABLE_EXTENDED_RESOURCES}}
# Gathers the WaitForFinishedJobs measurement for the short-lived jobs, with a
# 30m timeout.
- name: Wait for short-lived jobs to finish
  measurements:
  - Identifier: WaitForFinishedJobs
    Method: WaitForFinishedJobs
    Params:
      action: gather
      labelSelector: job-type = short-lived
      timeout: 30m
# Gathers the four Churn* measurements; the pod startup thresholds repeat the
# values given when the measurement was started.
- name: Measure scheduler metrics
  measurements:
  - Identifier: ChurnSchedulingMetrics
    Method: PrometheusSchedulingMetrics
    Params:
      action: gather
  - Identifier: ChurnPodStartupLatency
    Method: PodStartupLatency
    Params:
      action: gather
      perc50Threshold: 40s
      perc90Threshold: 60s
      perc99Threshold: 80s
  - Identifier: ChurnClaimAllocationLatency
    Method: ResourceClaimAllocationLatency
    Params:
      action: gather
  - Identifier: ChurnDRAMetrics
    Method: GenericPrometheusQuery
    Params:
      action: gather