xref: /linux/drivers/gpu/drm/scheduler/sched_rq.c (revision 16e7698bc04d3dd19d95a688e4b0297a0e28a93b)
1 // SPDX-License-Identifier: MIT
2 /* Copyright 2015 Advanced Micro Devices, Inc. */
3 /* Copyright (c) 2025 Valve Corporation */
4 
5 #include <linux/rbtree.h>
6 
7 #include <drm/drm_print.h>
8 #include <drm/gpu_scheduler.h>
9 
10 #include "sched_internal.h"
11 
12 static __always_inline bool
13 drm_sched_entity_compare_before(struct rb_node *a, const struct rb_node *b)
14 {
15 	struct drm_sched_entity *ea =
16 		rb_entry((a), struct drm_sched_entity, rb_tree_node);
17 	struct drm_sched_entity *eb =
18 		rb_entry((b), struct drm_sched_entity, rb_tree_node);
19 
20 	return ktime_before(ea->oldest_job_waiting, eb->oldest_job_waiting);
21 }
22 
23 static void drm_sched_rq_update_prio(struct drm_sched_rq *rq)
24 {
25 	enum drm_sched_priority prio = DRM_SCHED_PRIORITY_INVALID;
26 	struct rb_node *rb;
27 
28 	lockdep_assert_held(&rq->lock);
29 
30 	rb = rb_first_cached(&rq->rb_tree_root);
31 	if (rb) {
32 		struct drm_sched_entity *entity =
33 			rb_entry(rb, typeof(*entity), rb_tree_node);
34 
35 		/*
36 		 * The normal locking order is entity then run-queue so taking
37 		 * the entity lock here would be a locking inversion for the
38 		 * case when the current head of the run-queue is different from
39 		 * the one we already have locked. The unlocked read is fine
40 		 * though, because if the priority had just changed it is no big
41 		 * deal for our algorithm, but just a transient reachable only
42 		 * by drivers with userspace dynamic priority changes API. Equal
43 		 * in effect to the priority change becoming visible a few
44 		 * instructions later.
45 		 */
46 		prio = READ_ONCE(entity->priority);
47 	}
48 
49 	rq->head_prio = prio;
50 }
51 
52 static void drm_sched_rq_remove_tree_locked(struct drm_sched_entity *entity,
53 					    struct drm_sched_rq *rq)
54 {
55 	lockdep_assert_held(&entity->lock);
56 	lockdep_assert_held(&rq->lock);
57 
58 	if (!RB_EMPTY_NODE(&entity->rb_tree_node)) {
59 		rb_erase_cached(&entity->rb_tree_node, &rq->rb_tree_root);
60 		RB_CLEAR_NODE(&entity->rb_tree_node);
61 		drm_sched_rq_update_prio(rq);
62 	}
63 }
64 
65 static void drm_sched_rq_update_tree_locked(struct drm_sched_entity *entity,
66 					    struct drm_sched_rq *rq,
67 					    ktime_t ts)
68 {
69 	/*
70 	 * Both locks need to be grabbed, one to protect from entity->rq change
71 	 * for entity from within concurrent drm_sched_entity_select_rq and the
72 	 * other to update the rb tree structure.
73 	 */
74 	lockdep_assert_held(&entity->lock);
75 	lockdep_assert_held(&rq->lock);
76 
77 	drm_sched_rq_remove_tree_locked(entity, rq);
78 
79 	entity->oldest_job_waiting = ts;
80 
81 	rb_add_cached(&entity->rb_tree_node, &rq->rb_tree_root,
82 		      drm_sched_entity_compare_before);
83 	drm_sched_rq_update_prio(rq);
84 }
85 
86 /**
87  * drm_sched_rq_init - initialize a given run queue struct
88  * @rq: scheduler run queue
89  *
90  * Initializes a scheduler runqueue.
91  */
92 void drm_sched_rq_init(struct drm_sched_rq *rq)
93 {
94 	spin_lock_init(&rq->lock);
95 	INIT_LIST_HEAD(&rq->entities);
96 	rq->rb_tree_root = RB_ROOT_CACHED;
97 	rq->head_prio = DRM_SCHED_PRIORITY_INVALID;
98 }
99 
100 /*
101  * Core part of the CFS-like algorithm is that the virtual runtime of lower
102  * priority tasks should grow quicker than the higher priority ones, so that
103  * when we then schedule entities with the aim of keeping their accumulated
104  * virtual time balanced, we can approach fair distribution of GPU time.
105  *
106  * For converting the real GPU time into virtual we pick some multipliers with
107  * the idea to achieve the following GPU time distribution:
108  *
109  *  - Kernel priority gets roughly 2x GPU time compared to high.
110  *  - High gets ~4x relative to normal.
111  *  - Normal gets ~8x relative to low.
112  */
113 static const unsigned int vruntime_shift[] = {
114 	[DRM_SCHED_PRIORITY_KERNEL] = 1,
115 	[DRM_SCHED_PRIORITY_HIGH]   = 2,
116 	[DRM_SCHED_PRIORITY_NORMAL] = 4,
117 	[DRM_SCHED_PRIORITY_LOW]    = 7,
118 };
119 
120 static ktime_t
121 drm_sched_rq_get_min_vruntime(struct drm_sched_rq *rq)
122 {
123 	ktime_t vruntime = 0;
124 	struct rb_node *rb;
125 
126 	lockdep_assert_held(&rq->lock);
127 
128 	rb = rb_first_cached(&rq->rb_tree_root);
129 	if (rb) {
130 		struct drm_sched_entity *entity =
131 			rb_entry(rb, typeof(*entity), rb_tree_node);
132 		struct drm_sched_entity_stats *stats = entity->stats;
133 
134 		spin_lock(&stats->lock);
135 		vruntime = stats->vruntime;
136 		spin_unlock(&stats->lock);
137 	}
138 
139 	return vruntime;
140 }
141 
142 static void
143 drm_sched_entity_save_vruntime(struct drm_sched_entity *entity,
144 			       ktime_t min_vruntime)
145 {
146 	struct drm_sched_entity_stats *stats = entity->stats;
147 	ktime_t vruntime;
148 
149 	spin_lock(&stats->lock);
150 	vruntime = stats->vruntime;
151 	if (min_vruntime && vruntime > min_vruntime)
152 		vruntime = ktime_sub(vruntime, min_vruntime);
153 	else
154 		vruntime = 0;
155 	stats->vruntime = vruntime;
156 	spin_unlock(&stats->lock);
157 }
158 
159 static ktime_t
160 drm_sched_entity_restore_vruntime(struct drm_sched_entity *entity,
161 				  ktime_t min_vruntime,
162 				  enum drm_sched_priority rq_prio)
163 {
164 	struct drm_sched_entity_stats *stats = entity->stats;
165 	struct drm_gpu_scheduler *sched =
166 			container_of(entity->rq, typeof(*sched), rq);
167 	enum drm_sched_priority prio = entity->priority;
168 	unsigned long avg_us, sched_avg_us;
169 	ktime_t vruntime;
170 
171 	BUILD_BUG_ON(DRM_SCHED_PRIORITY_NORMAL < DRM_SCHED_PRIORITY_HIGH);
172 
173 	spin_lock(&stats->lock);
174 	vruntime = stats->vruntime;
175 	avg_us = ewma_drm_sched_avgtime_read(&stats->avg_job_us);
176 	/*
177 	 * Unlocked read of the scheduler average is fine since it is just
178 	 * heuristics and data type is a natural word size.
179 	 */
180 	sched_avg_us = ewma_drm_sched_avgtime_read(&sched->avg_job_us);
181 
182 	/*
183 	 * Special handling for entities which were picked from the top of the
184 	 * queue and are now re-joining the top with another one already there.
185 	 */
186 	if (!vruntime && rq_prio != DRM_SCHED_PRIORITY_INVALID) {
187 		if (prio > rq_prio) {
188 			/*
189 			 * Lower priority should not overtake higher when re-
190 			 * joining at the top of the queue so push it back
191 			 * somewhere behind the "middle" of the run-queue,
192 			 * proportional to the scheduler and entity average job
193 			 * durations.
194 			 */
195 			vruntime = us_to_ktime((1 + avg_us + sched_avg_us) <<
196 					       vruntime_shift[prio]);
197 		} else if (prio < rq_prio) {
198 			/*
199 			 * Higher priority can go first.
200 			 */
201 			vruntime = -ns_to_ktime(rq_prio - prio);
202 		} else {
203 			/* Favour entity with shorter jobs (interactivity). */
204 			if (avg_us <= sched_avg_us)
205 				vruntime = -ns_to_ktime(1);
206 			else
207 				vruntime = ns_to_ktime(1);
208 		}
209 	}
210 
211 	/*
212 	 * Restore saved relative position in the queue.
213 	 */
214 	vruntime = ktime_add(min_vruntime, vruntime);
215 
216 	stats->vruntime = vruntime;
217 	spin_unlock(&stats->lock);
218 
219 	return vruntime;
220 }
221 
222 static ktime_t drm_sched_entity_update_vruntime(struct drm_sched_entity *entity)
223 {
224 	struct drm_sched_entity_stats *stats = entity->stats;
225 	ktime_t runtime, prev;
226 
227 	spin_lock(&stats->lock);
228 	prev = stats->prev_runtime;
229 	runtime = stats->runtime;
230 	stats->prev_runtime = runtime;
231 	runtime = ktime_add_ns(stats->vruntime,
232 			       ktime_to_ns(ktime_sub(runtime, prev)) <<
233 			       vruntime_shift[entity->priority]);
234 	stats->vruntime = runtime;
235 	spin_unlock(&stats->lock);
236 
237 	return runtime;
238 }
239 
240 /**
241  * drm_sched_rq_add_entity - add an entity
242  * @entity: scheduler entity
243  *
244  * Adds a scheduler entity to the run queue.
245  *
246  * Return: DRM scheduler selected to handle this entity or NULL if entity has
247  * been stopped and cannot be submitted to.
248  */
249 struct drm_gpu_scheduler *
250 drm_sched_rq_add_entity(struct drm_sched_entity *entity)
251 {
252 	struct drm_gpu_scheduler *sched;
253 	struct drm_sched_rq *rq;
254 	ktime_t ts;
255 
256 	/* Add the entity to the run queue */
257 	spin_lock(&entity->lock);
258 	if (entity->stopped) {
259 		spin_unlock(&entity->lock);
260 
261 		DRM_ERROR("Trying to push to a killed entity\n");
262 		return NULL;
263 	}
264 
265 	rq = entity->rq;
266 	sched = container_of(rq, typeof(*sched), rq);
267 	spin_lock(&rq->lock);
268 
269 	if (list_empty(&entity->list)) {
270 		atomic_inc(sched->score);
271 		list_add_tail(&entity->list, &rq->entities);
272 	}
273 
274 	ts = drm_sched_rq_get_min_vruntime(rq);
275 	ts = drm_sched_entity_restore_vruntime(entity, ts, rq->head_prio);
276 	drm_sched_rq_update_tree_locked(entity, rq, ts);
277 
278 	spin_unlock(&rq->lock);
279 	spin_unlock(&entity->lock);
280 
281 	return sched;
282 }
283 
284 /**
285  * drm_sched_rq_remove_entity - remove an entity
286  * @rq: scheduler run queue
287  * @entity: scheduler entity
288  *
289  * Removes a scheduler entity from the run queue.
290  */
291 void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
292 				struct drm_sched_entity *entity)
293 {
294 	struct drm_gpu_scheduler *sched = container_of(rq, typeof(*sched), rq);
295 
296 	lockdep_assert_held(&entity->lock);
297 
298 	if (list_empty(&entity->list))
299 		return;
300 
301 	spin_lock(&rq->lock);
302 
303 	atomic_dec(sched->score);
304 	list_del_init(&entity->list);
305 
306 	drm_sched_rq_remove_tree_locked(entity, rq);
307 
308 	spin_unlock(&rq->lock);
309 }
310 
311 /**
312  * drm_sched_rq_pop_entity - pops an entity
313  * @entity: scheduler entity
314  *
315  * To be called every time after a job is popped from the entity.
316  */
317 void drm_sched_rq_pop_entity(struct drm_sched_entity *entity)
318 {
319 	struct drm_sched_job *next_job;
320 	struct drm_sched_rq *rq;
321 
322 	/*
323 	 * Update the entity's location in the min heap according to
324 	 * the timestamp of the next job, if any.
325 	 */
326 	spin_lock(&entity->lock);
327 	rq = entity->rq;
328 	spin_lock(&rq->lock);
329 	next_job = drm_sched_entity_queue_peek(entity);
330 	if (next_job) {
331 		ktime_t ts;
332 
333 		ts = drm_sched_entity_update_vruntime(entity);
334 		drm_sched_rq_update_tree_locked(entity, rq, ts);
335 	} else {
336 		ktime_t min_vruntime;
337 
338 		drm_sched_rq_remove_tree_locked(entity, rq);
339 		min_vruntime = drm_sched_rq_get_min_vruntime(rq);
340 		drm_sched_entity_save_vruntime(entity, min_vruntime);
341 	}
342 	spin_unlock(&rq->lock);
343 	spin_unlock(&entity->lock);
344 }
345 
346 /**
347  * drm_sched_select_entity - Select an entity which provides a job to run
348  * @sched: the gpu scheduler
349  *
350  * Find oldest waiting ready entity.
351  *
352  * Return an entity if one is found; return an error-pointer (!NULL) if an
353  * entity was ready, but the scheduler had insufficient credits to accommodate
354  * its job; return NULL, if no ready entity was found.
355  */
356 struct drm_sched_entity *
357 drm_sched_select_entity(struct drm_gpu_scheduler *sched)
358 {
359 	struct drm_sched_rq *rq = &sched->rq;
360 	struct rb_node *rb;
361 
362 	spin_lock(&rq->lock);
363 	for (rb = rb_first_cached(&rq->rb_tree_root); rb; rb = rb_next(rb)) {
364 		struct drm_sched_entity *entity;
365 
366 		entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
367 		if (drm_sched_entity_is_ready(entity)) {
368 			/* If we can't queue yet, preserve the current entity in
369 			 * terms of fairness.
370 			 */
371 			if (!drm_sched_can_queue(sched, entity)) {
372 				spin_unlock(&rq->lock);
373 				return ERR_PTR(-ENOSPC);
374 			}
375 
376 			reinit_completion(&entity->entity_idle);
377 			break;
378 		}
379 	}
380 	spin_unlock(&rq->lock);
381 
382 	return rb ? rb_entry(rb, struct drm_sched_entity, rb_tree_node) : NULL;
383 }
384