xref: /linux/drivers/gpu/drm/scheduler/sched_main.c (revision 33b4e4fcd2980ee5fd754731ca9b0325f0344f04)
1 /*
2  * Copyright 2015 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  *
22  */
23 
24 /**
25  * DOC: Overview
26  *
27  * The GPU scheduler provides entities which allow userspace to push jobs
28  * into software queues which are then scheduled on a hardware run queue.
29  * The software queues have a priority among them. The scheduler selects entities
30  * from the run queue in FIFO order. The scheduler provides dependency handling
31  * between jobs. The driver is expected to provide callback functions to the
32  * scheduler for backend operations, such as submitting a job to the hardware run
33  * queue, returning the dependencies of a job, etc.
34  *
35  * The organisation of the scheduler is the following:
36  *
37  * 1. Each hw run queue has one scheduler
38  * 2. Each scheduler has multiple run queues with different priorities
39  *    (e.g., HIGH_HW, HIGH_SW, KERNEL, NORMAL)
40  * 3. Each scheduler run queue has a queue of entities to schedule
41  * 4. Entities themselves maintain a queue of jobs that will be scheduled on
42  *    the hardware.
43  *
44  * The jobs in an entity are always scheduled in the order in which they were pushed.
45  *
46  * Note that once a job has been taken from the entity's queue and pushed to the
47  * hardware, i.e. the pending queue, the entity must not be referenced anymore
48  * through the job's entity pointer.
49  */
50 
51 /**
52  * DOC: Flow Control
53  *
54  * The DRM GPU scheduler provides a flow control mechanism to regulate the rate
55  * at which the jobs fetched from scheduler entities are executed.
56  *
57  * In this context the &drm_gpu_scheduler keeps track of a driver specified
58  * credit limit representing the capacity of this scheduler and a credit count;
59  * every &drm_sched_job carries a driver specified number of credits.
60  *
61  * Once a job is executed (but not yet finished), the job's credits contribute
62  * to the scheduler's credit count until the job is finished. If by executing
63  * one more job the scheduler's credit count would exceed the scheduler's
64  * credit limit, the job won't be executed. Instead, the scheduler will wait
65  * until the credit count has decreased enough to not overflow its credit limit.
66  * This implies waiting for previously executed jobs.
67  */
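
/*
 * A minimal sketch of how a driver might hook into this flow control; the
 * my_* names and the choice of one credit per job are assumptions of this
 * example, not requirements:
 *
 *	// Scheduler side: the capacity is set once at init time.
 *	//   args.credit_limit = MY_RING_SIZE;
 *	//   drm_sched_init(&my_sched, &args);
 *
 *	// Job side: each job declares how much of that capacity it consumes.
 *	ret = drm_sched_job_init(&my_job->base, &my_entity, 1, my_priv,
 *				 my_client_id);
 *
 * run_job() is only invoked for a job once its credits fit into
 * credit_limit - credit_count, i.e. once enough earlier jobs have finished.
 */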
68 
69 #include <linux/export.h>
70 #include <linux/wait.h>
71 #include <linux/sched.h>
72 #include <linux/completion.h>
73 #include <linux/dma-resv.h>
74 #include <uapi/linux/sched/types.h>
75 
76 #include <drm/drm_print.h>
77 #include <drm/drm_gem.h>
78 #include <drm/drm_syncobj.h>
79 #include <drm/gpu_scheduler.h>
80 #include <drm/spsc_queue.h>
81 
82 #include "sched_internal.h"
83 
84 #define CREATE_TRACE_POINTS
85 #include "gpu_scheduler_trace.h"
86 
87 #ifdef CONFIG_LOCKDEP
88 static struct lockdep_map drm_sched_lockdep_map = {
89 	.name = "drm_sched_lockdep_map"
90 };
91 #endif
92 
93 int drm_sched_policy = DRM_SCHED_POLICY_FIFO;
94 
95 /**
96  * DOC: sched_policy (int)
97  * Used to override the default scheduling policy for entities in a run queue.
98  */
99 MODULE_PARM_DESC(sched_policy, "Specify the scheduling policy for entities on a run-queue, " __stringify(DRM_SCHED_POLICY_RR) " = Round Robin, " __stringify(DRM_SCHED_POLICY_FIFO) " = FIFO (default).");
100 module_param_named(sched_policy, drm_sched_policy, int, 0444);
101 
102 static u32 drm_sched_available_credits(struct drm_gpu_scheduler *sched)
103 {
104 	u32 credits;
105 
106 	WARN_ON(check_sub_overflow(sched->credit_limit,
107 				   atomic_read(&sched->credit_count),
108 				   &credits));
109 
110 	return credits;
111 }
112 
113 /**
114  * drm_sched_can_queue - Can we queue more to the hardware?
115  * @sched: scheduler instance
116  * @entity: the scheduler entity
117  *
118  * Return true if we can push at least one more job from @entity, false
119  * otherwise.
120  */
121 static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched,
122 				struct drm_sched_entity *entity)
123 {
124 	struct drm_sched_job *s_job;
125 
126 	s_job = drm_sched_entity_queue_peek(entity);
127 	if (!s_job)
128 		return false;
129 
130 	/* If a job exceeds the credit limit, truncate it to the credit limit
131 	 * itself to guarantee forward progress.
132 	 */
133 	if (s_job->credits > sched->credit_limit) {
134 		dev_WARN(sched->dev,
135 			 "Jobs may not exceed the credit limit, truncate.\n");
136 		s_job->credits = sched->credit_limit;
137 	}
138 
139 	return drm_sched_available_credits(sched) >= s_job->credits;
140 }
141 
142 static __always_inline bool drm_sched_entity_compare_before(struct rb_node *a,
143 							    const struct rb_node *b)
144 {
145 	struct drm_sched_entity *ent_a = rb_entry((a), struct drm_sched_entity, rb_tree_node);
146 	struct drm_sched_entity *ent_b = rb_entry((b), struct drm_sched_entity, rb_tree_node);
147 
148 	return ktime_before(ent_a->oldest_job_waiting, ent_b->oldest_job_waiting);
149 }
150 
151 static void drm_sched_rq_remove_fifo_locked(struct drm_sched_entity *entity,
152 					    struct drm_sched_rq *rq)
153 {
154 	if (!RB_EMPTY_NODE(&entity->rb_tree_node)) {
155 		rb_erase_cached(&entity->rb_tree_node, &rq->rb_tree_root);
156 		RB_CLEAR_NODE(&entity->rb_tree_node);
157 	}
158 }
159 
160 void drm_sched_rq_update_fifo_locked(struct drm_sched_entity *entity,
161 				     struct drm_sched_rq *rq,
162 				     ktime_t ts)
163 {
164 	/*
165 	 * Both locks need to be grabbed, one to protect against the entity's rq
166 	 * changing from within a concurrent drm_sched_entity_select_rq() and the
167 	 * other to protect the rb tree structure updates.
168 	 */
169 	lockdep_assert_held(&entity->lock);
170 	lockdep_assert_held(&rq->lock);
171 
172 	drm_sched_rq_remove_fifo_locked(entity, rq);
173 
174 	entity->oldest_job_waiting = ts;
175 
176 	rb_add_cached(&entity->rb_tree_node, &rq->rb_tree_root,
177 		      drm_sched_entity_compare_before);
178 }
179 
180 /**
181  * drm_sched_rq_init - initialize a given run queue struct
182  *
183  * @sched: scheduler instance to associate with this run queue
184  * @rq: scheduler run queue
185  *
186  * Initializes a scheduler runqueue.
187  */
188 static void drm_sched_rq_init(struct drm_gpu_scheduler *sched,
189 			      struct drm_sched_rq *rq)
190 {
191 	spin_lock_init(&rq->lock);
192 	INIT_LIST_HEAD(&rq->entities);
193 	rq->rb_tree_root = RB_ROOT_CACHED;
194 	rq->current_entity = NULL;
195 	rq->sched = sched;
196 }
197 
198 /**
199  * drm_sched_rq_add_entity - add an entity
200  *
201  * @rq: scheduler run queue
202  * @entity: scheduler entity
203  *
204  * Adds a scheduler entity to the run queue.
205  */
206 void drm_sched_rq_add_entity(struct drm_sched_rq *rq,
207 			     struct drm_sched_entity *entity)
208 {
209 	lockdep_assert_held(&entity->lock);
210 	lockdep_assert_held(&rq->lock);
211 
212 	if (!list_empty(&entity->list))
213 		return;
214 
215 	atomic_inc(rq->sched->score);
216 	list_add_tail(&entity->list, &rq->entities);
217 }
218 
219 /**
220  * drm_sched_rq_remove_entity - remove an entity
221  *
222  * @rq: scheduler run queue
223  * @entity: scheduler entity
224  *
225  * Removes a scheduler entity from the run queue.
226  */
227 void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
228 				struct drm_sched_entity *entity)
229 {
230 	lockdep_assert_held(&entity->lock);
231 
232 	if (list_empty(&entity->list))
233 		return;
234 
235 	spin_lock(&rq->lock);
236 
237 	atomic_dec(rq->sched->score);
238 	list_del_init(&entity->list);
239 
240 	if (rq->current_entity == entity)
241 		rq->current_entity = NULL;
242 
243 	if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)
244 		drm_sched_rq_remove_fifo_locked(entity, rq);
245 
246 	spin_unlock(&rq->lock);
247 }
248 
249 /**
250  * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
251  *
252  * @sched: the gpu scheduler
253  * @rq: scheduler run queue to check.
254  *
255  * Try to find the next ready entity.
256  *
257  * Return an entity if one is found; return an error-pointer (!NULL) if an
258  * entity was ready, but the scheduler had insufficient credits to accommodate
259  * its job; return NULL if no ready entity was found.
260  */
261 static struct drm_sched_entity *
262 drm_sched_rq_select_entity_rr(struct drm_gpu_scheduler *sched,
263 			      struct drm_sched_rq *rq)
264 {
265 	struct drm_sched_entity *entity;
266 
267 	spin_lock(&rq->lock);
268 
269 	entity = rq->current_entity;
270 	if (entity) {
271 		list_for_each_entry_continue(entity, &rq->entities, list) {
272 			if (drm_sched_entity_is_ready(entity)) {
273 				/* If we can't queue yet, preserve the current
274 				 * entity in terms of fairness.
275 				 */
276 				if (!drm_sched_can_queue(sched, entity)) {
277 					spin_unlock(&rq->lock);
278 					return ERR_PTR(-ENOSPC);
279 				}
280 
281 				rq->current_entity = entity;
282 				reinit_completion(&entity->entity_idle);
283 				spin_unlock(&rq->lock);
284 				return entity;
285 			}
286 		}
287 	}
288 
289 	list_for_each_entry(entity, &rq->entities, list) {
290 		if (drm_sched_entity_is_ready(entity)) {
291 			/* If we can't queue yet, preserve the current entity in
292 			 * terms of fairness.
293 			 */
294 			if (!drm_sched_can_queue(sched, entity)) {
295 				spin_unlock(&rq->lock);
296 				return ERR_PTR(-ENOSPC);
297 			}
298 
299 			rq->current_entity = entity;
300 			reinit_completion(&entity->entity_idle);
301 			spin_unlock(&rq->lock);
302 			return entity;
303 		}
304 
305 		if (entity == rq->current_entity)
306 			break;
307 	}
308 
309 	spin_unlock(&rq->lock);
310 
311 	return NULL;
312 }
313 
314 /**
315  * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
316  *
317  * @sched: the gpu scheduler
318  * @rq: scheduler run queue to check.
319  *
320  * Find oldest waiting ready entity.
321  *
322  * Return an entity if one is found; return an error-pointer (!NULL) if an
323  * entity was ready, but the scheduler had insufficient credits to accommodate
324  * its job; return NULL if no ready entity was found.
325  */
326 static struct drm_sched_entity *
327 drm_sched_rq_select_entity_fifo(struct drm_gpu_scheduler *sched,
328 				struct drm_sched_rq *rq)
329 {
330 	struct rb_node *rb;
331 
332 	spin_lock(&rq->lock);
333 	for (rb = rb_first_cached(&rq->rb_tree_root); rb; rb = rb_next(rb)) {
334 		struct drm_sched_entity *entity;
335 
336 		entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
337 		if (drm_sched_entity_is_ready(entity)) {
338 			/* If we can't queue yet, preserve the current entity in
339 			 * terms of fairness.
340 			 */
341 			if (!drm_sched_can_queue(sched, entity)) {
342 				spin_unlock(&rq->lock);
343 				return ERR_PTR(-ENOSPC);
344 			}
345 
346 			reinit_completion(&entity->entity_idle);
347 			break;
348 		}
349 	}
350 	spin_unlock(&rq->lock);
351 
352 	return rb ? rb_entry(rb, struct drm_sched_entity, rb_tree_node) : NULL;
353 }
354 
355 /**
356  * drm_sched_run_job_queue - enqueue run-job work
357  * @sched: scheduler instance
358  */
359 static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
360 {
361 	if (!READ_ONCE(sched->pause_submit))
362 		queue_work(sched->submit_wq, &sched->work_run_job);
363 }
364 
365 /**
366  * __drm_sched_run_free_queue - enqueue free-job work
367  * @sched: scheduler instance
368  */
369 static void __drm_sched_run_free_queue(struct drm_gpu_scheduler *sched)
370 {
371 	if (!READ_ONCE(sched->pause_submit))
372 		queue_work(sched->submit_wq, &sched->work_free_job);
373 }
374 
375 /**
376  * drm_sched_run_free_queue - enqueue free-job work if ready
377  * @sched: scheduler instance
378  */
379 static void drm_sched_run_free_queue(struct drm_gpu_scheduler *sched)
380 {
381 	struct drm_sched_job *job;
382 
383 	spin_lock(&sched->job_list_lock);
384 	job = list_first_entry_or_null(&sched->pending_list,
385 				       struct drm_sched_job, list);
386 	if (job && dma_fence_is_signaled(&job->s_fence->finished))
387 		__drm_sched_run_free_queue(sched);
388 	spin_unlock(&sched->job_list_lock);
389 }
390 
391 /**
392  * drm_sched_job_done - complete a job
393  * @s_job: pointer to the job which is done
394  *
395  * Finish the job's fence and resubmit the work items.
396  */
397 static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
398 {
399 	struct drm_sched_fence *s_fence = s_job->s_fence;
400 	struct drm_gpu_scheduler *sched = s_fence->sched;
401 
402 	atomic_sub(s_job->credits, &sched->credit_count);
403 	atomic_dec(sched->score);
404 
405 	trace_drm_sched_job_done(s_fence);
406 
407 	dma_fence_get(&s_fence->finished);
408 	drm_sched_fence_finished(s_fence, result);
409 	dma_fence_put(&s_fence->finished);
410 	__drm_sched_run_free_queue(sched);
411 }
412 
413 /**
414  * drm_sched_job_done_cb - the callback for a done job
415  * @f: fence
416  * @cb: fence callback
417  */
418 static void drm_sched_job_done_cb(struct dma_fence *f, struct dma_fence_cb *cb)
419 {
420 	struct drm_sched_job *s_job = container_of(cb, struct drm_sched_job, cb);
421 
422 	drm_sched_job_done(s_job, f->error);
423 }
424 
425 /**
426  * drm_sched_start_timeout - start timeout for reset worker
427  *
428  * @sched: scheduler instance to start the worker for
429  *
430  * Start the timeout for the given scheduler.
431  */
432 static void drm_sched_start_timeout(struct drm_gpu_scheduler *sched)
433 {
434 	lockdep_assert_held(&sched->job_list_lock);
435 
436 	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
437 	    !list_empty(&sched->pending_list))
438 		mod_delayed_work(sched->timeout_wq, &sched->work_tdr, sched->timeout);
439 }
440 
441 static void drm_sched_start_timeout_unlocked(struct drm_gpu_scheduler *sched)
442 {
443 	spin_lock(&sched->job_list_lock);
444 	drm_sched_start_timeout(sched);
445 	spin_unlock(&sched->job_list_lock);
446 }
447 
448 /**
449  * drm_sched_tdr_queue_imm - immediately start job timeout handler
450  *
451  * @sched: scheduler for which the timeout handling should be started.
452  *
453  * Start timeout handling immediately for the named scheduler.
454  */
455 void drm_sched_tdr_queue_imm(struct drm_gpu_scheduler *sched)
456 {
457 	spin_lock(&sched->job_list_lock);
458 	sched->timeout = 0;
459 	drm_sched_start_timeout(sched);
460 	spin_unlock(&sched->job_list_lock);
461 }
462 EXPORT_SYMBOL(drm_sched_tdr_queue_imm);
463 
464 /**
465  * drm_sched_fault - immediately start timeout handler
466  *
467  * @sched: scheduler where the timeout handling should be started.
468  *
469  * Start timeout handling immediately when the driver detects a hardware fault.
470  */
471 void drm_sched_fault(struct drm_gpu_scheduler *sched)
472 {
473 	if (sched->timeout_wq)
474 		mod_delayed_work(sched->timeout_wq, &sched->work_tdr, 0);
475 }
476 EXPORT_SYMBOL(drm_sched_fault);
477 
478 /**
479  * drm_sched_suspend_timeout - Suspend scheduler job timeout
480  *
481  * @sched: scheduler instance for which to suspend the timeout
482  *
483  * Suspend the delayed work timeout for the scheduler. This is done by
484  * modifying the delayed work timeout to an arbitrary large value,
485  * MAX_SCHEDULE_TIMEOUT in this case.
486  *
487  * Returns the timeout remaining
488  *
489  */
490 unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched)
491 {
492 	unsigned long sched_timeout, now = jiffies;
493 
494 	sched_timeout = sched->work_tdr.timer.expires;
495 
496 	/*
497 	 * Modify the timeout to an arbitrarily large value. This also prevents
498 	 * the timeout from being restarted when new submissions arrive.
499 	 */
500 	if (mod_delayed_work(sched->timeout_wq, &sched->work_tdr, MAX_SCHEDULE_TIMEOUT)
501 			&& time_after(sched_timeout, now))
502 		return sched_timeout - now;
503 	else
504 		return sched->timeout;
505 }
506 EXPORT_SYMBOL(drm_sched_suspend_timeout);
507 
508 /**
509  * drm_sched_resume_timeout - Resume scheduler job timeout
510  *
511  * @sched: scheduler instance for which to resume the timeout
512  * @remaining: remaining timeout
513  *
514  * Resume the delayed work timeout for the scheduler.
515  */
516 void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched,
517 		unsigned long remaining)
518 {
519 	spin_lock(&sched->job_list_lock);
520 
521 	if (list_empty(&sched->pending_list))
522 		cancel_delayed_work(&sched->work_tdr);
523 	else
524 		mod_delayed_work(sched->timeout_wq, &sched->work_tdr, remaining);
525 
526 	spin_unlock(&sched->job_list_lock);
527 }
528 EXPORT_SYMBOL(drm_sched_resume_timeout);
529 
530 static void drm_sched_job_begin(struct drm_sched_job *s_job)
531 {
532 	struct drm_gpu_scheduler *sched = s_job->sched;
533 
534 	spin_lock(&sched->job_list_lock);
535 	list_add_tail(&s_job->list, &sched->pending_list);
536 	drm_sched_start_timeout(sched);
537 	spin_unlock(&sched->job_list_lock);
538 }
539 
540 static void drm_sched_job_timedout(struct work_struct *work)
541 {
542 	struct drm_gpu_scheduler *sched;
543 	struct drm_sched_job *job;
544 	enum drm_gpu_sched_stat status = DRM_GPU_SCHED_STAT_NOMINAL;
545 
546 	sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
547 
548 	/* Protects against concurrent deletion in drm_sched_get_finished_job */
549 	spin_lock(&sched->job_list_lock);
550 	job = list_first_entry_or_null(&sched->pending_list,
551 				       struct drm_sched_job, list);
552 
553 	if (job) {
554 		/*
555 		 * Remove the bad job so it cannot be freed by a concurrent
556 		 * &struct drm_sched_backend_ops.free_job. It will be
557 		 * reinserted after the scheduler's work items have been
558 		 * cancelled, at which point it's safe.
559 		 */
560 		list_del_init(&job->list);
561 		spin_unlock(&sched->job_list_lock);
562 
563 		status = job->sched->ops->timedout_job(job);
564 
565 		/*
566 		 * The guilty job did complete and hence needs to be manually removed.
567 		 * See the drm_sched_stop() documentation.
568 		 */
569 		if (sched->free_guilty) {
570 			job->sched->ops->free_job(job);
571 			sched->free_guilty = false;
572 		}
573 	} else {
574 		spin_unlock(&sched->job_list_lock);
575 	}
576 
577 	if (status != DRM_GPU_SCHED_STAT_ENODEV)
578 		drm_sched_start_timeout_unlocked(sched);
579 }
580 
581 /**
582  * drm_sched_stop - stop the scheduler
583  *
584  * @sched: scheduler instance
585  * @bad: job which caused the time out
586  *
587  * Stops the scheduler and also removes and frees all completed jobs.
588  * Note: the bad job will not be freed, as it might be used later, and so it
589  * is the caller's responsibility to release it manually if it is not part of
590  * the pending list any more.
591  *
592  * This function is typically used for reset recovery (see the documentation of
593  * drm_sched_backend_ops.timedout_job() for details). Do not call it for
594  * scheduler teardown, i.e., before calling drm_sched_fini().
595  */
596 void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
597 {
598 	struct drm_sched_job *s_job, *tmp;
599 
600 	drm_sched_wqueue_stop(sched);
601 
602 	/*
603 	 * Reinsert the bad job here - now it's safe as
604 	 * drm_sched_get_finished_job() cannot race against us and release the
605 	 * bad job at this point - we parked (waited for) any in progress
606 	 * (earlier) cleanups and drm_sched_get_finished_job() will not be
607 	 * called now until the scheduler's work items are submitted again.
608 	 */
609 	if (bad && bad->sched == sched)
610 		/*
611 		 * Add at the head of the queue to reflect it was the earliest
612 		 * job extracted.
613 		 */
614 		list_add(&bad->list, &sched->pending_list);
615 
616 	/*
617 	 * Iterate the job list from the last to the first one and either deactivate
618 	 * their HW callbacks or remove them from the pending list if they have
619 	 * already signaled.
620 	 * This iteration is thread safe as the scheduler's work items have been
621 	 * cancelled.
622 	 */
623 	list_for_each_entry_safe_reverse(s_job, tmp, &sched->pending_list,
624 					 list) {
625 		if (s_job->s_fence->parent &&
626 		    dma_fence_remove_callback(s_job->s_fence->parent,
627 					      &s_job->cb)) {
628 			dma_fence_put(s_job->s_fence->parent);
629 			s_job->s_fence->parent = NULL;
630 			atomic_sub(s_job->credits, &sched->credit_count);
631 		} else {
632 			/*
633 			 * remove job from pending_list.
634 			 * Locking here is for concurrent resume timeout
635 			 */
636 			spin_lock(&sched->job_list_lock);
637 			list_del_init(&s_job->list);
638 			spin_unlock(&sched->job_list_lock);
639 
640 			/*
641 			 * Wait for job's HW fence callback to finish using s_job
642 			 * before releasing it.
643 			 *
644 			 * Job is still alive so fence refcount at least 1
645 			 */
646 			dma_fence_wait(&s_job->s_fence->finished, false);
647 
648 			/*
649 			 * We must keep bad job alive for later use during
650 			 * recovery by some of the drivers but leave a hint
651 			 * that the guilty job must be released.
652 			 */
653 			if (bad != s_job)
654 				sched->ops->free_job(s_job);
655 			else
656 				sched->free_guilty = true;
657 		}
658 	}
659 
660 	/*
661 	 * Stop the pending timer in flight as we rearm it in drm_sched_start. This
662 	 * prevents the pending timeout work in progress from firing right away after
663 	 * this TDR finished and before the newly restarted jobs have had a
664 	 * chance to complete.
665 	 */
666 	cancel_delayed_work(&sched->work_tdr);
667 }
668 EXPORT_SYMBOL(drm_sched_stop);
669 
670 /**
671  * drm_sched_start - recover jobs after a reset
672  *
673  * @sched: scheduler instance
674  * @errno: error to set on the pending fences
675  *
676  * This function is typically used for reset recovery (see the documentation of
677  * drm_sched_backend_ops.timedout_job() for details). Do not call it for
678  * scheduler startup. The scheduler itself is fully operational after
679  * drm_sched_init() succeeded.
680  */
681 void drm_sched_start(struct drm_gpu_scheduler *sched, int errno)
682 {
683 	struct drm_sched_job *s_job, *tmp;
684 
685 	/*
686 	 * Locking the list is not required here as the scheduler's work items
687 	 * are currently not running, so no new jobs are being inserted or
688  * removed. Also, concurrent GPU recoveries can't run in parallel.
689 	 */
690 	list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) {
691 		struct dma_fence *fence = s_job->s_fence->parent;
692 
693 		atomic_add(s_job->credits, &sched->credit_count);
694 
695 		if (!fence) {
696 			drm_sched_job_done(s_job, errno ?: -ECANCELED);
697 			continue;
698 		}
699 
700 		if (dma_fence_add_callback(fence, &s_job->cb,
701 					   drm_sched_job_done_cb))
702 			drm_sched_job_done(s_job, fence->error ?: errno);
703 	}
704 
705 	drm_sched_start_timeout_unlocked(sched);
706 	drm_sched_wqueue_start(sched);
707 }
708 EXPORT_SYMBOL(drm_sched_start);
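
/*
 * A hedged sketch of the reset-recovery sequence drm_sched_stop() and
 * drm_sched_start() are meant for, as it could look in a driver's
 * &drm_sched_backend_ops.timedout_job callback; my_reset_hw() is a
 * hypothetical driver function:
 *
 *	static enum drm_gpu_sched_stat my_timedout_job(struct drm_sched_job *job)
 *	{
 *		struct drm_gpu_scheduler *sched = job->sched;
 *
 *		drm_sched_stop(sched, job);	// park work items, prune finished jobs
 *		my_reset_hw(sched);		// driver-specific engine reset
 *		drm_sched_start(sched, 0);	// re-add fence callbacks, restart work items
 *
 *		return DRM_GPU_SCHED_STAT_NOMINAL;
 *	}
 */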
709 
710 /**
711  * drm_sched_resubmit_jobs - Deprecated, don't use in new code!
712  *
713  * @sched: scheduler instance
714  *
715  * Re-submitting jobs was a concept AMD came up with as a cheap way to implement
716  * recovery after a job timeout.
717  *
718  * This turned out to not work very well. First of all, there are many
719  * problems with the dma_fence implementation and requirements. Either the
720  * implementation risks deadlocks with core memory management or violates
721  * documented implementation details of the dma_fence object.
722  *
723  * Drivers can still save and restore their state for recovery operations, but
724  * we shouldn't make this a general scheduler feature around the dma_fence
725  * interface.
726  */
727 void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched)
728 {
729 	struct drm_sched_job *s_job, *tmp;
730 	uint64_t guilty_context;
731 	bool found_guilty = false;
732 	struct dma_fence *fence;
733 
734 	list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) {
735 		struct drm_sched_fence *s_fence = s_job->s_fence;
736 
737 		if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
738 			found_guilty = true;
739 			guilty_context = s_job->s_fence->scheduled.context;
740 		}
741 
742 		if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
743 			dma_fence_set_error(&s_fence->finished, -ECANCELED);
744 
745 		fence = sched->ops->run_job(s_job);
746 
747 		if (IS_ERR_OR_NULL(fence)) {
748 			if (IS_ERR(fence))
749 				dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
750 
751 			s_job->s_fence->parent = NULL;
752 		} else {
753 
754 			s_job->s_fence->parent = dma_fence_get(fence);
755 
756 			/* Drop for original kref_init */
757 			dma_fence_put(fence);
758 		}
759 	}
760 }
761 EXPORT_SYMBOL(drm_sched_resubmit_jobs);
762 
763 /**
764  * drm_sched_job_init - init a scheduler job
765  * @job: scheduler job to init
766  * @entity: scheduler entity to use
767  * @credits: the number of credits this job contributes to the scheduler's
768  * credit limit
769  * @owner: job owner for debugging
770  * @drm_client_id: &struct drm_file.client_id of the owner (used by trace
771  * events)
772  *
773  * Refer to drm_sched_entity_push_job() documentation
774  * for locking considerations.
775  *
776  * Drivers must make sure to call drm_sched_job_cleanup() if this function returns
777  * successfully, even when @job is aborted before drm_sched_job_arm() is called.
778  *
779  * Note that this function does not assign a valid value to each struct member
780  * of struct drm_sched_job. Take a look at that struct's documentation to see
781  * who sets which struct member with what lifetime.
782  *
783  * WARNING: amdgpu abuses &drm_sched.ready to signal when the hardware
784  * has died, which can mean that there's no valid runqueue for an @entity.
785  * This function returns -ENOENT in this case (which probably should be -EIO as
786  * a more meaningful return value).
787  *
788  * Returns 0 for success, negative error code otherwise.
789  */
790 int drm_sched_job_init(struct drm_sched_job *job,
791 		       struct drm_sched_entity *entity,
792 		       u32 credits, void *owner,
793 		       uint64_t drm_client_id)
794 {
795 	if (!entity->rq) {
796 		/* This will most likely be followed by missing frames
797 		 * or, worse, a blank screen, so leave a trail in the
798 		 * logs so that this can be debugged more easily.
799 		 */
800 		dev_err(job->sched->dev, "%s: entity has no rq!\n", __func__);
801 		return -ENOENT;
802 	}
803 
804 	if (unlikely(!credits)) {
805 		pr_err("*ERROR* %s: credits cannot be 0!\n", __func__);
806 		return -EINVAL;
807 	}
808 
809 	/*
810 	 * We don't know for sure how the user has allocated the struct. Thus,
811 	 * zero it so that disallowed (i.e., too early) usage of pointers that
812 	 * this function does not set is guaranteed to lead to a NULL pointer
813 	 * dereference instead of undefined behavior.
814 	 */
815 	memset(job, 0, sizeof(*job));
816 
817 	job->entity = entity;
818 	job->credits = credits;
819 	job->s_fence = drm_sched_fence_alloc(entity, owner, drm_client_id);
820 	if (!job->s_fence)
821 		return -ENOMEM;
822 
823 	INIT_LIST_HEAD(&job->list);
824 
825 	xa_init_flags(&job->dependencies, XA_FLAGS_ALLOC);
826 
827 	return 0;
828 }
829 EXPORT_SYMBOL(drm_sched_job_init);
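
/*
 * A sketch of the minimal submission path around drm_sched_job_init(); the
 * my_job wrapper and the other my_* names are assumptions of this example.
 * Note how an abort before drm_sched_job_arm() goes through
 * drm_sched_job_cleanup(), as required above:
 *
 *	ret = drm_sched_job_init(&my_job->base, entity, 1, my_priv, my_client_id);
 *	if (ret)
 *		return ret;
 *
 *	// drm_sched_job_add_dependency() consumes the fence reference.
 *	ret = drm_sched_job_add_dependency(&my_job->base,
 *					   dma_fence_get(my_wait_fence));
 *	if (ret) {
 *		drm_sched_job_cleanup(&my_job->base);
 *		return ret;
 *	}
 *
 *	drm_sched_job_arm(&my_job->base);
 *	drm_sched_entity_push_job(&my_job->base);
 */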
830 
831 /**
832  * drm_sched_job_arm - arm a scheduler job for execution
833  * @job: scheduler job to arm
834  *
835  * This arms a scheduler job for execution. Specifically it initializes the
836  * &drm_sched_job.s_fence of @job, so that it can be attached to struct dma_resv
837  * or other places that need to track the completion of this job. It also
838  * initializes sequence numbers, which are fundamental for fence ordering.
839  *
840  * Refer to drm_sched_entity_push_job() documentation for locking
841  * considerations.
842  *
843  * Once this function has been called, you *must* submit @job with
844  * drm_sched_entity_push_job().
845  *
846  * This can only be called if drm_sched_job_init() succeeded.
847  */
848 void drm_sched_job_arm(struct drm_sched_job *job)
849 {
850 	struct drm_gpu_scheduler *sched;
851 	struct drm_sched_entity *entity = job->entity;
852 
853 	BUG_ON(!entity);
854 	drm_sched_entity_select_rq(entity);
855 	sched = entity->rq->sched;
856 
857 	job->sched = sched;
858 	job->s_priority = entity->priority;
859 
860 	drm_sched_fence_init(job->s_fence, job->entity);
861 }
862 EXPORT_SYMBOL(drm_sched_job_arm);
863 
864 /**
865  * drm_sched_job_add_dependency - adds the fence as a job dependency
866  * @job: scheduler job to add the dependencies to
867  * @fence: the dma_fence to add to the list of dependencies.
868  *
869  * Note that @fence is consumed in both the success and error cases.
870  *
871  * Returns:
872  * 0 on success, or an error on failing to expand the array.
873  */
874 int drm_sched_job_add_dependency(struct drm_sched_job *job,
875 				 struct dma_fence *fence)
876 {
877 	struct dma_fence *entry;
878 	unsigned long index;
879 	u32 id = 0;
880 	int ret;
881 
882 	if (!fence)
883 		return 0;
884 
885 	/* Deduplicate if we already depend on a fence from the same context.
886 	 * This lets the size of the array of deps scale with the number of
887 	 * engines involved, rather than the number of BOs.
888 	 */
889 	xa_for_each(&job->dependencies, index, entry) {
890 		if (entry->context != fence->context)
891 			continue;
892 
893 		if (dma_fence_is_later(fence, entry)) {
894 			dma_fence_put(entry);
895 			xa_store(&job->dependencies, index, fence, GFP_KERNEL);
896 		} else {
897 			dma_fence_put(fence);
898 		}
899 		return 0;
900 	}
901 
902 	ret = xa_alloc(&job->dependencies, &id, fence, xa_limit_32b, GFP_KERNEL);
903 	if (ret != 0)
904 		dma_fence_put(fence);
905 
906 	return ret;
907 }
908 EXPORT_SYMBOL(drm_sched_job_add_dependency);
909 
910 /**
911  * drm_sched_job_add_syncobj_dependency - adds a syncobj's fence as a job dependency
912  * @job: scheduler job to add the dependencies to
913  * @file: drm file private pointer
914  * @handle: syncobj handle to lookup
915  * @point: timeline point
916  *
917  * This adds the fence matching the given syncobj to @job.
918  *
919  * Returns:
920  * 0 on success, or an error on failing to expand the array.
921  */
922 int drm_sched_job_add_syncobj_dependency(struct drm_sched_job *job,
923 					 struct drm_file *file,
924 					 u32 handle,
925 					 u32 point)
926 {
927 	struct dma_fence *fence;
928 	int ret;
929 
930 	ret = drm_syncobj_find_fence(file, handle, point, 0, &fence);
931 	if (ret)
932 		return ret;
933 
934 	return drm_sched_job_add_dependency(job, fence);
935 }
936 EXPORT_SYMBOL(drm_sched_job_add_syncobj_dependency);
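
/*
 * For illustration only: wiring a user-space "wait" syncobj from an assumed
 * submit ioctl into the job. args->in_handle and args->in_point stand in for
 * fields of a hypothetical UAPI structure:
 *
 *	ret = drm_sched_job_add_syncobj_dependency(&my_job->base, file_priv,
 *						   args->in_handle,
 *						   args->in_point);
 *	if (ret)
 *		goto err_cleanup;
 */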
937 
938 /**
939  * drm_sched_job_add_resv_dependencies - add all fences from the resv to the job
940  * @job: scheduler job to add the dependencies to
941  * @resv: the dma_resv object to get the fences from
942  * @usage: the dma_resv_usage to use to filter the fences
943  *
944  * This adds all fences matching the given usage from @resv to @job.
945  * Must be called with the @resv lock held.
946  *
947  * Returns:
948  * 0 on success, or an error on failing to expand the array.
949  */
950 int drm_sched_job_add_resv_dependencies(struct drm_sched_job *job,
951 					struct dma_resv *resv,
952 					enum dma_resv_usage usage)
953 {
954 	struct dma_resv_iter cursor;
955 	struct dma_fence *fence;
956 	int ret;
957 
958 	dma_resv_assert_held(resv);
959 
960 	dma_resv_for_each_fence(&cursor, resv, usage, fence) {
961 		/* Make sure to grab an additional ref on the added fence */
962 		dma_fence_get(fence);
963 		ret = drm_sched_job_add_dependency(job, fence);
964 		if (ret) {
965 			dma_fence_put(fence);
966 			return ret;
967 		}
968 	}
969 	return 0;
970 }
971 EXPORT_SYMBOL(drm_sched_job_add_resv_dependencies);
972 
973 /**
974  * drm_sched_job_add_implicit_dependencies - adds implicit dependencies as job
975  *   dependencies
976  * @job: scheduler job to add the dependencies to
977  * @obj: the gem object to add new dependencies from.
978  * @write: whether the job might write the object (so we need to depend on
979  * shared fences in the reservation object).
980  *
981  * This should be called after drm_gem_lock_reservations() on your array of
982  * GEM objects used in the job but before updating the reservations with your
983  * own fences.
984  *
985  * Returns:
986  * 0 on success, or an error on failing to expand the array.
987  */
988 int drm_sched_job_add_implicit_dependencies(struct drm_sched_job *job,
989 					    struct drm_gem_object *obj,
990 					    bool write)
991 {
992 	return drm_sched_job_add_resv_dependencies(job, obj->resv,
993 						   dma_resv_usage_rw(write));
994 }
995 EXPORT_SYMBOL(drm_sched_job_add_implicit_dependencies);
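
/*
 * An illustrative sketch only: pulling the implicit fences of all GEM objects
 * of a job while their reservations are held, before the driver installs its
 * own fences. my_job, bos[], nr_bos and bo_written[] are assumed driver-side
 * names:
 *
 *	// reservation locks held, e.g. via drm_gem_lock_reservations()
 *	for (i = 0; i < nr_bos; i++) {
 *		ret = drm_sched_job_add_implicit_dependencies(&my_job->base,
 *							      bos[i],
 *							      bo_written[i]);
 *		if (ret)
 *			goto err_cleanup;
 *	}
 */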
996 
997 /**
998  * drm_sched_job_has_dependency - check whether fence is the job's dependency
999  * @job: scheduler job to check
1000  * @fence: fence to look for
1001  *
1002  * Returns:
1003  * True if @fence is found within the job's dependencies, or otherwise false.
1004  */
1005 bool drm_sched_job_has_dependency(struct drm_sched_job *job,
1006 				  struct dma_fence *fence)
1007 {
1008 	struct dma_fence *f;
1009 	unsigned long index;
1010 
1011 	xa_for_each(&job->dependencies, index, f) {
1012 		if (f == fence)
1013 			return true;
1014 	}
1015 
1016 	return false;
1017 }
1018 EXPORT_SYMBOL(drm_sched_job_has_dependency);
1019 
1020 /**
1021  * drm_sched_job_cleanup - clean up scheduler job resources
1022  * @job: scheduler job to clean up
1023  *
1024  * Cleans up the resources allocated with drm_sched_job_init().
1025  *
1026  * Drivers should call this from their error unwind code if @job is aborted
1027  * before drm_sched_job_arm() is called.
1028  *
1029  * drm_sched_job_arm() is a point of no return since it initializes the fences
1030  * and their sequence numbers, etc. Once that function has been called, you *must*
1031  * submit it with drm_sched_entity_push_job() and cannot simply abort it by
1032  * calling drm_sched_job_cleanup().
1033  *
1034  * This function should be called in the &drm_sched_backend_ops.free_job callback.
1035  */
1036 void drm_sched_job_cleanup(struct drm_sched_job *job)
1037 {
1038 	struct dma_fence *fence;
1039 	unsigned long index;
1040 
1041 	if (kref_read(&job->s_fence->finished.refcount)) {
1042 		/* The job has been processed by the scheduler, i.e.,
1043 		 * drm_sched_job_arm() and drm_sched_entity_push_job() have
1044 		 * been called.
1045 		 */
1046 		dma_fence_put(&job->s_fence->finished);
1047 	} else {
1048 		/* The job was aborted before it has been committed to be run;
1049 		 * notably, drm_sched_job_arm() has not been called.
1050 		 */
1051 		drm_sched_fence_free(job->s_fence);
1052 	}
1053 
1054 	job->s_fence = NULL;
1055 
1056 	xa_for_each(&job->dependencies, index, fence) {
1057 		dma_fence_put(fence);
1058 	}
1059 	xa_destroy(&job->dependencies);
1060 
1061 }
1062 EXPORT_SYMBOL(drm_sched_job_cleanup);
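
/*
 * A minimal sketch of a &drm_sched_backend_ops.free_job implementation that
 * calls drm_sched_job_cleanup() as suggested above; struct my_job and its
 * refcounting are assumptions of this example:
 *
 *	static void my_free_job(struct drm_sched_job *sched_job)
 *	{
 *		struct my_job *job = container_of(sched_job, struct my_job, base);
 *
 *		drm_sched_job_cleanup(sched_job);
 *		kref_put(&job->refcount, my_job_release);
 *	}
 */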
1063 
1064 /**
1065  * drm_sched_wakeup - Wake up the scheduler if it is ready to queue
1066  * @sched: scheduler instance
1067  *
1068  * Wake up the scheduler if we can queue jobs.
1069  */
1070 void drm_sched_wakeup(struct drm_gpu_scheduler *sched)
1071 {
1072 	drm_sched_run_job_queue(sched);
1073 }
1074 
1075 /**
1076  * drm_sched_select_entity - Select next entity to process
1077  *
1078  * @sched: scheduler instance
1079  *
1080  * Return an entity to process or NULL if none are found.
1081  *
1082  * Note that we break out of the for-loop when "entity" is non-NULL, which can
1083  * also be an error pointer; this ensures we don't process lower priority
1084  * run-queues. See the comments in the respective functions.
1085  */
1086 static struct drm_sched_entity *
1087 drm_sched_select_entity(struct drm_gpu_scheduler *sched)
1088 {
1089 	struct drm_sched_entity *entity;
1090 	int i;
1091 
1092 	/* Start with the highest priority.
1093 	 */
1094 	for (i = DRM_SCHED_PRIORITY_KERNEL; i < sched->num_rqs; i++) {
1095 		entity = drm_sched_policy == DRM_SCHED_POLICY_FIFO ?
1096 			drm_sched_rq_select_entity_fifo(sched, sched->sched_rq[i]) :
1097 			drm_sched_rq_select_entity_rr(sched, sched->sched_rq[i]);
1098 		if (entity)
1099 			break;
1100 	}
1101 
1102 	return IS_ERR(entity) ? NULL : entity;
1103 }
1104 
1105 /**
1106  * drm_sched_get_finished_job - fetch the next finished job to be destroyed
1107  *
1108  * @sched: scheduler instance
1109  *
1110  * Returns the next finished job from the pending list (if there is one)
1111  * that is ready to be destroyed.
1112  */
1113 static struct drm_sched_job *
1114 drm_sched_get_finished_job(struct drm_gpu_scheduler *sched)
1115 {
1116 	struct drm_sched_job *job, *next;
1117 
1118 	spin_lock(&sched->job_list_lock);
1119 
1120 	job = list_first_entry_or_null(&sched->pending_list,
1121 				       struct drm_sched_job, list);
1122 
1123 	if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
1124 		/* remove job from pending_list */
1125 		list_del_init(&job->list);
1126 
1127 		/* cancel this job's TO timer */
1128 		cancel_delayed_work(&sched->work_tdr);
1129 		/* make the scheduled timestamp more accurate */
1130 		next = list_first_entry_or_null(&sched->pending_list,
1131 						typeof(*next), list);
1132 
1133 		if (next) {
1134 			if (test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT,
1135 				     &next->s_fence->scheduled.flags))
1136 				next->s_fence->scheduled.timestamp =
1137 					dma_fence_timestamp(&job->s_fence->finished);
1138 			/* start TO timer for next job */
1139 			drm_sched_start_timeout(sched);
1140 		}
1141 	} else {
1142 		job = NULL;
1143 	}
1144 
1145 	spin_unlock(&sched->job_list_lock);
1146 
1147 	return job;
1148 }
1149 
1150 /**
1151  * drm_sched_pick_best - Get a drm sched from a sched_list with the least load
1152  * @sched_list: list of drm_gpu_schedulers
1153  * @num_sched_list: number of drm_gpu_schedulers in the sched_list
1154  *
1155  * Returns a pointer to the sched with the least load, or NULL if none of the
1156  * drm_gpu_schedulers are ready.
1157  */
1158 struct drm_gpu_scheduler *
1159 drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
1160 		     unsigned int num_sched_list)
1161 {
1162 	struct drm_gpu_scheduler *sched, *picked_sched = NULL;
1163 	int i;
1164 	unsigned int min_score = UINT_MAX, num_score;
1165 
1166 	for (i = 0; i < num_sched_list; ++i) {
1167 		sched = sched_list[i];
1168 
1169 		if (!sched->ready) {
1170 			DRM_WARN("scheduler %s is not ready, skipping",
1171 				 sched->name);
1172 			continue;
1173 		}
1174 
1175 		num_score = atomic_read(sched->score);
1176 		if (num_score < min_score) {
1177 			min_score = num_score;
1178 			picked_sched = sched;
1179 		}
1180 	}
1181 
1182 	return picked_sched;
1183 }
1184 EXPORT_SYMBOL(drm_sched_pick_best);
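
/*
 * A sketch of how drm_sched_pick_best() is commonly combined with entity
 * creation; drm_sched_entity_init() is implemented in sched_entity.c and the
 * my_* variables are assumptions of this example:
 *
 *	struct drm_gpu_scheduler *best;
 *
 *	best = drm_sched_pick_best(my_sched_list, my_num_scheds);
 *	if (!best)
 *		return -ENODEV;
 *
 *	ret = drm_sched_entity_init(&my_entity, DRM_SCHED_PRIORITY_NORMAL,
 *				    &best, 1, NULL);
 */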
1185 
1186 /**
1187  * drm_sched_free_job_work - worker to call free_job
1188  *
1189  * @w: free job work
1190  */
1191 static void drm_sched_free_job_work(struct work_struct *w)
1192 {
1193 	struct drm_gpu_scheduler *sched =
1194 		container_of(w, struct drm_gpu_scheduler, work_free_job);
1195 	struct drm_sched_job *job;
1196 
1197 	job = drm_sched_get_finished_job(sched);
1198 	if (job)
1199 		sched->ops->free_job(job);
1200 
1201 	drm_sched_run_free_queue(sched);
1202 	drm_sched_run_job_queue(sched);
1203 }
1204 
1205 /**
1206  * drm_sched_run_job_work - worker to call run_job
1207  *
1208  * @w: run job work
1209  */
1210 static void drm_sched_run_job_work(struct work_struct *w)
1211 {
1212 	struct drm_gpu_scheduler *sched =
1213 		container_of(w, struct drm_gpu_scheduler, work_run_job);
1214 	struct drm_sched_entity *entity;
1215 	struct dma_fence *fence;
1216 	struct drm_sched_fence *s_fence;
1217 	struct drm_sched_job *sched_job;
1218 	int r;
1219 
1220 	/* Find entity with a ready job */
1221 	entity = drm_sched_select_entity(sched);
1222 	if (!entity)
1223 		return;	/* No more work */
1224 
1225 	sched_job = drm_sched_entity_pop_job(entity);
1226 	if (!sched_job) {
1227 		complete_all(&entity->entity_idle);
1228 		drm_sched_run_job_queue(sched);
1229 		return;
1230 	}
1231 
1232 	s_fence = sched_job->s_fence;
1233 
1234 	atomic_add(sched_job->credits, &sched->credit_count);
1235 	drm_sched_job_begin(sched_job);
1236 
1237 	trace_drm_sched_job_run(sched_job, entity);
1238 	/*
1239 	 * The run_job() callback must by definition return a fence whose
1240 	 * refcount has been incremented for the scheduler already.
1241 	 */
1242 	fence = sched->ops->run_job(sched_job);
1243 	complete_all(&entity->entity_idle);
1244 	drm_sched_fence_scheduled(s_fence, fence);
1245 
1246 	if (!IS_ERR_OR_NULL(fence)) {
1247 		r = dma_fence_add_callback(fence, &sched_job->cb,
1248 					   drm_sched_job_done_cb);
1249 		if (r == -ENOENT)
1250 			drm_sched_job_done(sched_job, fence->error);
1251 		else if (r)
1252 			DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n", r);
1253 
1254 		dma_fence_put(fence);
1255 	} else {
1256 		drm_sched_job_done(sched_job, IS_ERR(fence) ?
1257 				   PTR_ERR(fence) : 0);
1258 	}
1259 
1260 	wake_up(&sched->job_scheduled);
1261 	drm_sched_run_job_queue(sched);
1262 }
1263 
1264 /**
1265  * drm_sched_init - Init a gpu scheduler instance
1266  *
1267  * @sched: scheduler instance
1268  * @args: scheduler initialization arguments
1269  *
1270  * Return 0 on success, otherwise error code.
1271  */
1272 int drm_sched_init(struct drm_gpu_scheduler *sched, const struct drm_sched_init_args *args)
1273 {
1274 	int i;
1275 
1276 	sched->ops = args->ops;
1277 	sched->credit_limit = args->credit_limit;
1278 	sched->name = args->name;
1279 	sched->timeout = args->timeout;
1280 	sched->hang_limit = args->hang_limit;
1281 	sched->timeout_wq = args->timeout_wq ? args->timeout_wq : system_wq;
1282 	sched->score = args->score ? args->score : &sched->_score;
1283 	sched->dev = args->dev;
1284 
1285 	if (args->num_rqs > DRM_SCHED_PRIORITY_COUNT) {
1286 		/* This is a gross violation; tell drivers what the problem is.
1287 		 */
1288 		dev_err(sched->dev, "%s: num_rqs cannot be greater than DRM_SCHED_PRIORITY_COUNT\n",
1289 			__func__);
1290 		return -EINVAL;
1291 	} else if (sched->sched_rq) {
1292 		/* Not an error, but warn anyway so drivers can
1293 		 * fine-tune their DRM calling order, and return
1294 		 * success.
1295 		 */
1296 		dev_warn(sched->dev, "%s: scheduler already initialized!\n", __func__);
1297 		return 0;
1298 	}
1299 
1300 	if (args->submit_wq) {
1301 		sched->submit_wq = args->submit_wq;
1302 		sched->own_submit_wq = false;
1303 	} else {
1304 #ifdef CONFIG_LOCKDEP
1305 		sched->submit_wq = alloc_ordered_workqueue_lockdep_map(args->name,
1306 								       WQ_MEM_RECLAIM,
1307 								       &drm_sched_lockdep_map);
1308 #else
1309 		sched->submit_wq = alloc_ordered_workqueue(args->name, WQ_MEM_RECLAIM);
1310 #endif
1311 		if (!sched->submit_wq)
1312 			return -ENOMEM;
1313 
1314 		sched->own_submit_wq = true;
1315 	}
1316 
1317 	sched->sched_rq = kmalloc_array(args->num_rqs, sizeof(*sched->sched_rq),
1318 					GFP_KERNEL | __GFP_ZERO);
1319 	if (!sched->sched_rq)
1320 		goto Out_check_own;
1321 	sched->num_rqs = args->num_rqs;
1322 	for (i = DRM_SCHED_PRIORITY_KERNEL; i < sched->num_rqs; i++) {
1323 		sched->sched_rq[i] = kzalloc(sizeof(*sched->sched_rq[i]), GFP_KERNEL);
1324 		if (!sched->sched_rq[i])
1325 			goto Out_unroll;
1326 		drm_sched_rq_init(sched, sched->sched_rq[i]);
1327 	}
1328 
1329 	init_waitqueue_head(&sched->job_scheduled);
1330 	INIT_LIST_HEAD(&sched->pending_list);
1331 	spin_lock_init(&sched->job_list_lock);
1332 	atomic_set(&sched->credit_count, 0);
1333 	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
1334 	INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
1335 	INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
1336 	atomic_set(&sched->_score, 0);
1337 	atomic64_set(&sched->job_id_count, 0);
1338 	sched->pause_submit = false;
1339 
1340 	sched->ready = true;
1341 	return 0;
1342 Out_unroll:
1343 	for (--i ; i >= DRM_SCHED_PRIORITY_KERNEL; i--)
1344 		kfree(sched->sched_rq[i]);
1345 
1346 	kfree(sched->sched_rq);
1347 	sched->sched_rq = NULL;
1348 Out_check_own:
1349 	if (sched->own_submit_wq)
1350 		destroy_workqueue(sched->submit_wq);
1351 	dev_err(sched->dev, "%s: Failed to setup GPU scheduler--out of memory\n", __func__);
1352 	return -ENOMEM;
1353 }
1354 EXPORT_SYMBOL(drm_sched_init);
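
/*
 * A sketch of a typical drm_sched_init() call; the concrete values and the
 * my_* names are placeholders, but the fields shown match what this function
 * reads from &struct drm_sched_init_args:
 *
 *	const struct drm_sched_init_args args = {
 *		.ops = &my_sched_ops,
 *		.submit_wq = NULL,		// scheduler allocates its own
 *		.num_rqs = DRM_SCHED_PRIORITY_COUNT,
 *		.credit_limit = my_ring_size,
 *		.hang_limit = 0,
 *		.timeout = msecs_to_jiffies(500),
 *		.timeout_wq = NULL,		// defaults to system_wq
 *		.score = NULL,			// defaults to &sched->_score
 *		.name = "my-ring",
 *		.dev = my_dev,
 *	};
 *
 *	ret = drm_sched_init(&my_sched, &args);
 */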
1355 
1356 /**
1357  * drm_sched_fini - Destroy a gpu scheduler
1358  *
1359  * @sched: scheduler instance
1360  *
1361  * Tears down and cleans up the scheduler.
1362  *
1363  * This stops submission of new jobs to the hardware through
1364  * drm_sched_backend_ops.run_job(). Consequently, drm_sched_backend_ops.free_job()
1365  * will not be called for all jobs still in drm_gpu_scheduler.pending_list.
1366  * will not be called for the jobs still in drm_gpu_scheduler.pending_list.
1367  * sure that:
1368  *
1369  *  a) drm_sched_fini() is only called after drm_sched_backend_ops.free_job()
1370  *     has been called for all submitted jobs, or that
1371  *  b) the jobs for which drm_sched_backend_ops.free_job() has not been called
1372  *     after drm_sched_fini() ran are freed manually.
1373  *
1374  * FIXME: Take care of the above problem and prevent this function from leaking
1375  * the jobs in drm_gpu_scheduler.pending_list under any circumstances.
1376  */
1377 void drm_sched_fini(struct drm_gpu_scheduler *sched)
1378 {
1379 	struct drm_sched_entity *s_entity;
1380 	int i;
1381 
1382 	drm_sched_wqueue_stop(sched);
1383 
1384 	for (i = DRM_SCHED_PRIORITY_KERNEL; i < sched->num_rqs; i++) {
1385 		struct drm_sched_rq *rq = sched->sched_rq[i];
1386 
1387 		spin_lock(&rq->lock);
1388 		list_for_each_entry(s_entity, &rq->entities, list)
1389 			/*
1390 			 * Prevents reinsertion and marks job_queue as idle;
1391 			 * it will be removed from the rq in drm_sched_entity_fini()
1392 			 * eventually
1393 			 */
1394 			s_entity->stopped = true;
1395 		spin_unlock(&rq->lock);
1396 		kfree(sched->sched_rq[i]);
1397 	}
1398 
1399 	/* Wake up everyone stuck in drm_sched_entity_flush for this scheduler */
1400 	wake_up_all(&sched->job_scheduled);
1401 
1402 	/* Confirm no work left behind accessing device structures */
1403 	cancel_delayed_work_sync(&sched->work_tdr);
1404 
1405 	if (sched->own_submit_wq)
1406 		destroy_workqueue(sched->submit_wq);
1407 	sched->ready = false;
1408 	kfree(sched->sched_rq);
1409 	sched->sched_rq = NULL;
1410 }
1411 EXPORT_SYMBOL(drm_sched_fini);
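
/*
 * A hedged sketch of a teardown order that satisfies the requirement above:
 * entities are destroyed first (drm_sched_entity_destroy() is implemented in
 * sched_entity.c), any jobs still on the pending list are waited for or freed
 * by the driver, and only then is the scheduler torn down.
 * my_wait_for_pending_jobs() is a hypothetical driver helper:
 *
 *	drm_sched_entity_destroy(&my_entity);
 *	my_wait_for_pending_jobs(&my_sched);
 *	drm_sched_fini(&my_sched);
 */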
1412 
1413 /**
1414  * drm_sched_increase_karma - Update sched_entity guilty flag
1415  *
1416  * @bad: The job guilty of time out
1417  *
1418  * Increment on every hang caused by the 'bad' job. If this exceeds the hang
1419  * limit of the scheduler, then the respective sched entity is marked guilty and
1420  * jobs from it will not be scheduled further.
1421  */
1422 void drm_sched_increase_karma(struct drm_sched_job *bad)
1423 {
1424 	int i;
1425 	struct drm_sched_entity *tmp;
1426 	struct drm_sched_entity *entity;
1427 	struct drm_gpu_scheduler *sched = bad->sched;
1428 
1429 	/* Don't change @bad's karma if it's from the KERNEL RQ,
1430 	 * because sometimes a GPU hang would corrupt kernel jobs (like VM updating
1431 	 * jobs), but keep in mind that kernel jobs are always considered good.
1432 	 */
1433 	if (bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) {
1434 		atomic_inc(&bad->karma);
1435 
1436 		for (i = DRM_SCHED_PRIORITY_HIGH; i < sched->num_rqs; i++) {
1437 			struct drm_sched_rq *rq = sched->sched_rq[i];
1438 
1439 			spin_lock(&rq->lock);
1440 			list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
1441 				if (bad->s_fence->scheduled.context ==
1442 				    entity->fence_context) {
1443 					if (entity->guilty)
1444 						atomic_set(entity->guilty, 1);
1445 					break;
1446 				}
1447 			}
1448 			spin_unlock(&rq->lock);
1449 			if (&entity->list != &rq->entities)
1450 				break;
1451 		}
1452 	}
1453 }
1454 EXPORT_SYMBOL(drm_sched_increase_karma);
1455 
1456 /**
1457  * drm_sched_wqueue_ready - Is the scheduler ready for submission
1458  *
1459  * @sched: scheduler instance
1460  *
1461  * Returns true if submission is ready
1462  */
1463 bool drm_sched_wqueue_ready(struct drm_gpu_scheduler *sched)
1464 {
1465 	return sched->ready;
1466 }
1467 EXPORT_SYMBOL(drm_sched_wqueue_ready);
1468 
1469 /**
1470  * drm_sched_wqueue_stop - stop scheduler submission
1471  * @sched: scheduler instance
1472  *
1473  * Stops the scheduler from pulling new jobs from entities. It also stops
1474  * freeing jobs automatically through drm_sched_backend_ops.free_job().
1475  */
1476 void drm_sched_wqueue_stop(struct drm_gpu_scheduler *sched)
1477 {
1478 	WRITE_ONCE(sched->pause_submit, true);
1479 	cancel_work_sync(&sched->work_run_job);
1480 	cancel_work_sync(&sched->work_free_job);
1481 }
1482 EXPORT_SYMBOL(drm_sched_wqueue_stop);
1483 
1484 /**
1485  * drm_sched_wqueue_start - start scheduler submission
1486  * @sched: scheduler instance
1487  *
1488  * Restarts the scheduler after drm_sched_wqueue_stop() has stopped it.
1489  *
1490  * This function is not necessary for 'conventional' startup. The scheduler is
1491  * fully operational after drm_sched_init() succeeded.
1492  */
1493 void drm_sched_wqueue_start(struct drm_gpu_scheduler *sched)
1494 {
1495 	WRITE_ONCE(sched->pause_submit, false);
1496 	queue_work(sched->submit_wq, &sched->work_run_job);
1497 	queue_work(sched->submit_wq, &sched->work_free_job);
1498 }
1499 EXPORT_SYMBOL(drm_sched_wqueue_start);
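
/*
 * Sketch only: a driver pausing and resuming the scheduler's internal work
 * items around a device-level operation such as suspend or reset. Whether
 * this is appropriate is driver specific:
 *
 *	drm_sched_wqueue_stop(&my_sched);	// before powering down / resetting
 *	// ... device is quiesced and brought back ...
 *	drm_sched_wqueue_start(&my_sched);	// after the device is back up
 */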
1500