1 // SPDX-License-Identifier: MIT 2 /* Copyright 2015 Advanced Micro Devices, Inc. */ 3 /* Copyright (c) 2025 Valve Corporation */ 4 5 #include <linux/rbtree.h> 6 7 #include <drm/drm_print.h> 8 #include <drm/gpu_scheduler.h> 9 10 #include "sched_internal.h" 11 12 static __always_inline bool 13 drm_sched_entity_compare_before(struct rb_node *a, const struct rb_node *b) 14 { 15 struct drm_sched_entity *ea = 16 rb_entry((a), struct drm_sched_entity, rb_tree_node); 17 struct drm_sched_entity *eb = 18 rb_entry((b), struct drm_sched_entity, rb_tree_node); 19 20 return ktime_before(ea->oldest_job_waiting, eb->oldest_job_waiting); 21 } 22 23 static void drm_sched_rq_update_prio(struct drm_sched_rq *rq) 24 { 25 enum drm_sched_priority prio = DRM_SCHED_PRIORITY_INVALID; 26 struct rb_node *rb; 27 28 lockdep_assert_held(&rq->lock); 29 30 rb = rb_first_cached(&rq->rb_tree_root); 31 if (rb) { 32 struct drm_sched_entity *entity = 33 rb_entry(rb, typeof(*entity), rb_tree_node); 34 35 /* 36 * The normal locking order is entity then run-queue so taking 37 * the entity lock here would be a locking inversion for the 38 * case when the current head of the run-queue is different from 39 * the one we already have locked. The unlocked read is fine 40 * though, because if the priority had just changed it is no big 41 * deal for our algorithm, but just a transient reachable only 42 * by drivers with userspace dynamic priority changes API. Equal 43 * in effect to the priority change becoming visible a few 44 * instructions later. 45 */ 46 prio = READ_ONCE(entity->priority); 47 } 48 49 rq->head_prio = prio; 50 } 51 52 static void drm_sched_rq_remove_tree_locked(struct drm_sched_entity *entity, 53 struct drm_sched_rq *rq) 54 { 55 lockdep_assert_held(&entity->lock); 56 lockdep_assert_held(&rq->lock); 57 58 if (!RB_EMPTY_NODE(&entity->rb_tree_node)) { 59 rb_erase_cached(&entity->rb_tree_node, &rq->rb_tree_root); 60 RB_CLEAR_NODE(&entity->rb_tree_node); 61 drm_sched_rq_update_prio(rq); 62 } 63 } 64 65 static void drm_sched_rq_update_tree_locked(struct drm_sched_entity *entity, 66 struct drm_sched_rq *rq, 67 ktime_t ts) 68 { 69 /* 70 * Both locks need to be grabbed, one to protect from entity->rq change 71 * for entity from within concurrent drm_sched_entity_select_rq and the 72 * other to update the rb tree structure. 73 */ 74 lockdep_assert_held(&entity->lock); 75 lockdep_assert_held(&rq->lock); 76 77 drm_sched_rq_remove_tree_locked(entity, rq); 78 79 entity->oldest_job_waiting = ts; 80 81 rb_add_cached(&entity->rb_tree_node, &rq->rb_tree_root, 82 drm_sched_entity_compare_before); 83 drm_sched_rq_update_prio(rq); 84 } 85 86 /** 87 * drm_sched_rq_init - initialize a given run queue struct 88 * @rq: scheduler run queue 89 * 90 * Initializes a scheduler runqueue. 91 */ 92 void drm_sched_rq_init(struct drm_sched_rq *rq) 93 { 94 spin_lock_init(&rq->lock); 95 INIT_LIST_HEAD(&rq->entities); 96 rq->rb_tree_root = RB_ROOT_CACHED; 97 rq->head_prio = DRM_SCHED_PRIORITY_INVALID; 98 } 99 100 /* 101 * Core part of the CFS-like algorithm is that the virtual runtime of lower 102 * priority tasks should grow quicker than the higher priority ones, so that 103 * when we then schedule entities with the aim of keeping their accumulated 104 * virtual time balanced, we can approach fair distribution of GPU time. 105 * 106 * For converting the real GPU time into virtual we pick some multipliers with 107 * the idea to achieve the following GPU time distribution: 108 * 109 * - Kernel priority gets roughly 2x GPU time compared to high. 110 * - High gets ~4x relative to normal. 111 * - Normal gets ~8x relative to low. 112 */ 113 static const unsigned int vruntime_shift[] = { 114 [DRM_SCHED_PRIORITY_KERNEL] = 1, 115 [DRM_SCHED_PRIORITY_HIGH] = 2, 116 [DRM_SCHED_PRIORITY_NORMAL] = 4, 117 [DRM_SCHED_PRIORITY_LOW] = 7, 118 }; 119 120 static ktime_t 121 drm_sched_rq_get_min_vruntime(struct drm_sched_rq *rq) 122 { 123 ktime_t vruntime = 0; 124 struct rb_node *rb; 125 126 lockdep_assert_held(&rq->lock); 127 128 rb = rb_first_cached(&rq->rb_tree_root); 129 if (rb) { 130 struct drm_sched_entity *entity = 131 rb_entry(rb, typeof(*entity), rb_tree_node); 132 struct drm_sched_entity_stats *stats = entity->stats; 133 134 spin_lock(&stats->lock); 135 vruntime = stats->vruntime; 136 spin_unlock(&stats->lock); 137 } 138 139 return vruntime; 140 } 141 142 static void 143 drm_sched_entity_save_vruntime(struct drm_sched_entity *entity, 144 ktime_t min_vruntime) 145 { 146 struct drm_sched_entity_stats *stats = entity->stats; 147 ktime_t vruntime; 148 149 spin_lock(&stats->lock); 150 vruntime = stats->vruntime; 151 if (min_vruntime && vruntime > min_vruntime) 152 vruntime = ktime_sub(vruntime, min_vruntime); 153 else 154 vruntime = 0; 155 stats->vruntime = vruntime; 156 spin_unlock(&stats->lock); 157 } 158 159 static ktime_t 160 drm_sched_entity_restore_vruntime(struct drm_sched_entity *entity, 161 ktime_t min_vruntime, 162 enum drm_sched_priority rq_prio) 163 { 164 struct drm_sched_entity_stats *stats = entity->stats; 165 struct drm_gpu_scheduler *sched = 166 container_of(entity->rq, typeof(*sched), rq); 167 enum drm_sched_priority prio = entity->priority; 168 unsigned long avg_us, sched_avg_us; 169 ktime_t vruntime; 170 171 BUILD_BUG_ON(DRM_SCHED_PRIORITY_NORMAL < DRM_SCHED_PRIORITY_HIGH); 172 173 spin_lock(&stats->lock); 174 vruntime = stats->vruntime; 175 avg_us = ewma_drm_sched_avgtime_read(&stats->avg_job_us); 176 /* 177 * Unlocked read of the scheduler average is fine since it is just 178 * heuristics and data type is a natural word size. 179 */ 180 sched_avg_us = ewma_drm_sched_avgtime_read(&sched->avg_job_us); 181 182 /* 183 * Special handling for entities which were picked from the top of the 184 * queue and are now re-joining the top with another one already there. 185 */ 186 if (!vruntime && rq_prio != DRM_SCHED_PRIORITY_INVALID) { 187 if (prio > rq_prio) { 188 /* 189 * Lower priority should not overtake higher when re- 190 * joining at the top of the queue so push it back 191 * somewhere behind the "middle" of the run-queue, 192 * proportional to the scheduler and entity average job 193 * durations. 194 */ 195 vruntime = us_to_ktime((1 + avg_us + sched_avg_us) << 196 vruntime_shift[prio]); 197 } else if (prio < rq_prio) { 198 /* 199 * Higher priority can go first. 200 */ 201 vruntime = -ns_to_ktime(rq_prio - prio); 202 } else { 203 /* Favour entity with shorter jobs (interactivity). */ 204 if (avg_us <= sched_avg_us) 205 vruntime = -ns_to_ktime(1); 206 else 207 vruntime = ns_to_ktime(1); 208 } 209 } 210 211 /* 212 * Restore saved relative position in the queue. 213 */ 214 vruntime = ktime_add(min_vruntime, vruntime); 215 216 stats->vruntime = vruntime; 217 spin_unlock(&stats->lock); 218 219 return vruntime; 220 } 221 222 static ktime_t drm_sched_entity_update_vruntime(struct drm_sched_entity *entity) 223 { 224 struct drm_sched_entity_stats *stats = entity->stats; 225 ktime_t runtime, prev; 226 227 spin_lock(&stats->lock); 228 prev = stats->prev_runtime; 229 runtime = stats->runtime; 230 stats->prev_runtime = runtime; 231 runtime = ktime_add_ns(stats->vruntime, 232 ktime_to_ns(ktime_sub(runtime, prev)) << 233 vruntime_shift[entity->priority]); 234 stats->vruntime = runtime; 235 spin_unlock(&stats->lock); 236 237 return runtime; 238 } 239 240 /** 241 * drm_sched_rq_add_entity - add an entity 242 * @entity: scheduler entity 243 * 244 * Adds a scheduler entity to the run queue. 245 * 246 * Return: DRM scheduler selected to handle this entity or NULL if entity has 247 * been stopped and cannot be submitted to. 248 */ 249 struct drm_gpu_scheduler * 250 drm_sched_rq_add_entity(struct drm_sched_entity *entity) 251 { 252 struct drm_gpu_scheduler *sched; 253 struct drm_sched_rq *rq; 254 ktime_t ts; 255 256 /* Add the entity to the run queue */ 257 spin_lock(&entity->lock); 258 if (entity->stopped) { 259 spin_unlock(&entity->lock); 260 261 DRM_ERROR("Trying to push to a killed entity\n"); 262 return NULL; 263 } 264 265 rq = entity->rq; 266 sched = container_of(rq, typeof(*sched), rq); 267 spin_lock(&rq->lock); 268 269 if (list_empty(&entity->list)) { 270 atomic_inc(sched->score); 271 list_add_tail(&entity->list, &rq->entities); 272 } 273 274 ts = drm_sched_rq_get_min_vruntime(rq); 275 ts = drm_sched_entity_restore_vruntime(entity, ts, rq->head_prio); 276 drm_sched_rq_update_tree_locked(entity, rq, ts); 277 278 spin_unlock(&rq->lock); 279 spin_unlock(&entity->lock); 280 281 return sched; 282 } 283 284 /** 285 * drm_sched_rq_remove_entity - remove an entity 286 * @rq: scheduler run queue 287 * @entity: scheduler entity 288 * 289 * Removes a scheduler entity from the run queue. 290 */ 291 void drm_sched_rq_remove_entity(struct drm_sched_rq *rq, 292 struct drm_sched_entity *entity) 293 { 294 struct drm_gpu_scheduler *sched = container_of(rq, typeof(*sched), rq); 295 296 lockdep_assert_held(&entity->lock); 297 298 if (list_empty(&entity->list)) 299 return; 300 301 spin_lock(&rq->lock); 302 303 atomic_dec(sched->score); 304 list_del_init(&entity->list); 305 306 drm_sched_rq_remove_tree_locked(entity, rq); 307 308 spin_unlock(&rq->lock); 309 } 310 311 /** 312 * drm_sched_rq_pop_entity - pops an entity 313 * @entity: scheduler entity 314 * 315 * To be called every time after a job is popped from the entity. 316 */ 317 void drm_sched_rq_pop_entity(struct drm_sched_entity *entity) 318 { 319 struct drm_sched_job *next_job; 320 struct drm_sched_rq *rq; 321 322 /* 323 * Update the entity's location in the min heap according to 324 * the timestamp of the next job, if any. 325 */ 326 spin_lock(&entity->lock); 327 rq = entity->rq; 328 spin_lock(&rq->lock); 329 next_job = drm_sched_entity_queue_peek(entity); 330 if (next_job) { 331 ktime_t ts; 332 333 ts = drm_sched_entity_update_vruntime(entity); 334 drm_sched_rq_update_tree_locked(entity, rq, ts); 335 } else { 336 ktime_t min_vruntime; 337 338 drm_sched_rq_remove_tree_locked(entity, rq); 339 min_vruntime = drm_sched_rq_get_min_vruntime(rq); 340 drm_sched_entity_save_vruntime(entity, min_vruntime); 341 } 342 spin_unlock(&rq->lock); 343 spin_unlock(&entity->lock); 344 } 345 346 /** 347 * drm_sched_select_entity - Select an entity which provides a job to run 348 * @sched: the gpu scheduler 349 * 350 * Find oldest waiting ready entity. 351 * 352 * Return an entity if one is found; return an error-pointer (!NULL) if an 353 * entity was ready, but the scheduler had insufficient credits to accommodate 354 * its job; return NULL, if no ready entity was found. 355 */ 356 struct drm_sched_entity * 357 drm_sched_select_entity(struct drm_gpu_scheduler *sched) 358 { 359 struct drm_sched_rq *rq = &sched->rq; 360 struct rb_node *rb; 361 362 spin_lock(&rq->lock); 363 for (rb = rb_first_cached(&rq->rb_tree_root); rb; rb = rb_next(rb)) { 364 struct drm_sched_entity *entity; 365 366 entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node); 367 if (drm_sched_entity_is_ready(entity)) { 368 /* If we can't queue yet, preserve the current entity in 369 * terms of fairness. 370 */ 371 if (!drm_sched_can_queue(sched, entity)) { 372 spin_unlock(&rq->lock); 373 return ERR_PTR(-ENOSPC); 374 } 375 376 reinit_completion(&entity->entity_idle); 377 break; 378 } 379 } 380 spin_unlock(&rq->lock); 381 382 return rb ? rb_entry(rb, struct drm_sched_entity, rb_tree_node) : NULL; 383 } 384