xref: /linux/drivers/gpu/drm/panthor/panthor_sched.c (revision 2fe3c78a2c26dd5ee811024a1b7d6cfb4d654319)
1 // SPDX-License-Identifier: GPL-2.0 or MIT
2 /* Copyright 2023 Collabora ltd. */
3 
4 #include <drm/drm_drv.h>
5 #include <drm/drm_exec.h>
6 #include <drm/drm_gem_shmem_helper.h>
7 #include <drm/drm_managed.h>
8 #include <drm/gpu_scheduler.h>
9 #include <drm/panthor_drm.h>
10 
11 #include <linux/build_bug.h>
12 #include <linux/clk.h>
13 #include <linux/delay.h>
14 #include <linux/dma-mapping.h>
15 #include <linux/dma-resv.h>
16 #include <linux/firmware.h>
17 #include <linux/interrupt.h>
18 #include <linux/io.h>
19 #include <linux/iopoll.h>
20 #include <linux/iosys-map.h>
21 #include <linux/module.h>
22 #include <linux/platform_device.h>
23 #include <linux/pm_runtime.h>
24 
25 #include "panthor_devfreq.h"
26 #include "panthor_device.h"
27 #include "panthor_fw.h"
28 #include "panthor_gem.h"
29 #include "panthor_gpu.h"
30 #include "panthor_heap.h"
31 #include "panthor_mmu.h"
32 #include "panthor_regs.h"
33 #include "panthor_sched.h"
34 
35 /**
36  * DOC: Scheduler
37  *
38  * Mali CSF hardware adopts a firmware-assisted scheduling model, where
39  * the firmware takes care of scheduling aspects, to some extent.
40  *
41  * Scheduling happens at the scheduling group level; each group
42  * contains 1 to N queues (N is FW/hardware dependent, and exposed
43  * through the firmware interface). Each queue is assigned a command
44  * stream ring buffer, which serves as a way to get jobs submitted to
45  * the GPU, among other things.
46  *
47  * The firmware can schedule a maximum of M groups (M is FW/hardware
48  * dependent, and exposed through the firmware interface). Past
49  * this maximum number of groups, the kernel must take care of
50  * rotating the groups passed to the firmware so every group gets
51  * a chance to have its queues scheduled for execution.
52  *
53  * The current implementation only supports kernel-mode queues.
54  * In other words, userspace doesn't have access to the ring-buffer.
55  * Instead, userspace passes indirect command stream buffers that are
56  * called from the queue ring-buffer by the kernel using a pre-defined
57  * sequence of command stream instructions to ensure the userspace driver
58  * always gets consistent results (cache maintenance,
59  * synchronization, ...).
60  *
61  * We rely on the drm_gpu_scheduler framework to deal with job
62  * dependencies and submission. As any other driver dealing with a
63  * FW-scheduler, we use the 1:1 entity:scheduler mode, such that each
64  * entity has its own job scheduler. When a job is ready to be executed
65  * (all its dependencies are met), it is pushed to the appropriate
66  * queue ring-buffer, and the group is scheduled for execution if it
67  * wasn't already active.
68  *
69  * Kernel-side group scheduling is timeslice-based. When there are fewer
70  * groups than slots, the periodic tick is disabled and we
71  * just let the FW schedule the active groups. When there are more
72  * groups than slots, we give each group a chance to execute for
73  * a given amount of time, and then re-evaluate and pick new groups
74  * to schedule. The group selection algorithm is based on
75  * priority+round-robin.
76  *
77  * Even though user-mode queues are out of scope right now, the
78  * current design takes them into account by avoiding any guess about the
79  * group/queue state that would be based on information we wouldn't have
80  * if userspace were in charge of the ring-buffer. That's also one of the
81  * reasons we don't do 'cooperative' scheduling (encoding FW group slot
82  * reservation as a dma_fence that would be returned from the
83  * drm_gpu_scheduler::prepare_job() hook, and treating group rotation as
84  * a queue of waiters, ordered by job submission order). This approach
85  * would work for kernel-mode queues, but would make user-mode queues a
86  * lot more complicated to retrofit.
87  */
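/*
 * Below is a minimal, illustrative sketch (not part of the driver) of the
 * priority+round-robin group selection described above. The types and the
 * helper name are hypothetical simplifications; the real implementation
 * walks the per-priority panthor_scheduler::groups::runnable lists during
 * the scheduler tick.
 *
 *	struct toy_group { int priority; int id; };
 *
 *	// Pick up to slot_count groups, highest priority first. Groups of
 *	// equal priority are taken in list order, which, combined with
 *	// re-queueing scheduled-out groups at the tail of their list, gives
 *	// round-robin behavior within a priority level.
 *	static int toy_pick_groups(struct toy_group *runnable, int count,
 *				   int slot_count, struct toy_group **picked)
 *	{
 *		int picked_count = 0;
 *
 *		for (int prio = 3; prio >= 0; prio--) {
 *			for (int i = 0; i < count; i++) {
 *				if (picked_count == slot_count)
 *					return picked_count;
 *				if (runnable[i].priority == prio)
 *					picked[picked_count++] = &runnable[i];
 *			}
 *		}
 *		return picked_count;
 *	}
 */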
88 
89 #define JOB_TIMEOUT_MS				5000
90 
91 #define MIN_CS_PER_CSG				8
92 
93 #define MIN_CSGS				3
94 #define MAX_CSG_PRIO				0xf
95 
96 struct panthor_group;
97 
98 /**
99  * struct panthor_csg_slot - Command stream group slot
100  *
101  * This represents a FW slot for a scheduling group.
102  */
103 struct panthor_csg_slot {
104 	/** @group: Scheduling group bound to this slot. */
105 	struct panthor_group *group;
106 
107 	/** @priority: Group priority. */
108 	u8 priority;
109 
110 	/**
111 	 * @idle: True if the group bound to this slot is idle.
112 	 *
113 	 * A group is idle when it has nothing waiting for execution on
114 	 * any of its queues, or when its queues are blocked waiting for something
115 	 * to happen (synchronization object).
116 	 */
117 	bool idle;
118 };
119 
120 /**
121  * enum panthor_csg_priority - Group priority
122  */
123 enum panthor_csg_priority {
124 	/** @PANTHOR_CSG_PRIORITY_LOW: Low priority group. */
125 	PANTHOR_CSG_PRIORITY_LOW = 0,
126 
127 	/** @PANTHOR_CSG_PRIORITY_MEDIUM: Medium priority group. */
128 	PANTHOR_CSG_PRIORITY_MEDIUM,
129 
130 	/** @PANTHOR_CSG_PRIORITY_HIGH: High priority group. */
131 	PANTHOR_CSG_PRIORITY_HIGH,
132 
133 	/**
134 	 * @PANTHOR_CSG_PRIORITY_RT: Real-time priority group.
135 	 *
136 	 * Real-time priority allows one to preempt scheduling of other
137 	 * non-real-time groups. When such a group becomes executable,
138 	 * it will evict the group with the lowest non-rt priority if
139 	 * there's no free group slot available.
140 	 *
141 	 * Currently not exposed to userspace.
142 	 */
143 	PANTHOR_CSG_PRIORITY_RT,
144 
145 	/** @PANTHOR_CSG_PRIORITY_COUNT: Number of priority levels. */
146 	PANTHOR_CSG_PRIORITY_COUNT,
147 };
148 
149 /**
150  * struct panthor_scheduler - Object used to manage the scheduler
151  */
152 struct panthor_scheduler {
153 	/** @ptdev: Device. */
154 	struct panthor_device *ptdev;
155 
156 	/**
157 	 * @wq: Workqueue used by our internal scheduler logic and
158 	 * drm_gpu_scheduler.
159 	 *
160 	 * Used for the scheduler tick, group updates or other kinds of FW
161 	 * event processing that can't be handled in the threaded interrupt
162 	 * path. Also passed to the drm_gpu_scheduler instances embedded
163 	 * in panthor_queue.
164 	 */
165 	struct workqueue_struct *wq;
166 
167 	/**
168 	 * @heap_alloc_wq: Workqueue used to schedule tiler_oom works.
169 	 *
170 	 * We have a queue dedicated to heap chunk allocation works to avoid
171 	 * blocking the rest of the scheduler if the allocation tries to
172 	 * reclaim memory.
173 	 */
174 	struct workqueue_struct *heap_alloc_wq;
175 
176 	/** @tick_work: Work executed on a scheduling tick. */
177 	struct delayed_work tick_work;
178 
179 	/**
180 	 * @sync_upd_work: Work used to process synchronization object updates.
181 	 *
182 	 * We use this work to unblock queues/groups that were waiting on a
183 	 * synchronization object.
184 	 */
185 	struct work_struct sync_upd_work;
186 
187 	/**
188 	 * @fw_events_work: Work used to process FW events outside the interrupt path.
189 	 *
190 	 * Even if the interrupt is threaded, we need any event processing
191 	 * that requires taking the panthor_scheduler::lock to be processed
192 	 * outside the interrupt path so we don't block the tick logic when
193 	 * it calls panthor_fw_{csg,wait}_wait_acks(). Since most of the
194 	 * event processing requires taking this lock, we just delegate all
195 	 * FW event processing to the scheduler workqueue.
196 	 */
197 	struct work_struct fw_events_work;
198 
199 	/**
200 	 * @fw_events: Bitmask encoding pending FW events.
201 	 */
202 	atomic_t fw_events;
203 
204 	/**
205 	 * @resched_target: When the next tick should occur.
206 	 *
207 	 * Expressed in jiffies.
208 	 */
209 	u64 resched_target;
210 
211 	/**
212 	 * @last_tick: When the last tick occurred.
213 	 *
214 	 * Expressed in jiffies.
215 	 */
216 	u64 last_tick;
217 
218 	/** @tick_period: Tick period in jiffies. */
219 	u64 tick_period;
220 
221 	/**
222 	 * @lock: Lock protecting access to all the scheduler fields.
223 	 *
224 	 * Should be taken in the tick work, the irq handler, and anywhere the @groups
225 	 * fields are touched.
226 	 */
227 	struct mutex lock;
228 
229 	/** @groups: Various lists used to classify groups. */
230 	struct {
231 		/**
232 		 * @runnable: Runnable group lists.
233 		 *
234 		 * When a group has queues that want to execute something,
235 		 * its panthor_group::run_node should be inserted here.
236 		 *
237 		 * One list per-priority.
238 		 */
239 		struct list_head runnable[PANTHOR_CSG_PRIORITY_COUNT];
240 
241 		/**
242 		 * @idle: Idle group lists.
243 		 *
244 		 * When all queues of a group are idle (either because they
245 		 * have nothing to execute, or because they are blocked), the
246 		 * panthor_group::run_node field should be inserted here.
247 		 *
248 		 * One list per-priority.
249 		 */
250 		struct list_head idle[PANTHOR_CSG_PRIORITY_COUNT];
251 
252 		/**
253 		 * @waiting: List of groups whose queues are blocked on a
254 		 * synchronization object.
255 		 *
256 		 * Insert panthor_group::wait_node here when a group is waiting
257 		 * for synchronization objects to be signaled.
258 		 *
259 		 * This list is evaluated in the @sync_upd_work work.
260 		 */
261 		struct list_head waiting;
262 	} groups;
263 
264 	/**
265 	 * @csg_slots: FW command stream group slots.
266 	 */
267 	struct panthor_csg_slot csg_slots[MAX_CSGS];
268 
269 	/** @csg_slot_count: Number of command stream group slots exposed by the FW. */
270 	u32 csg_slot_count;
271 
272 	/** @cs_slot_count: Number of command stream slots per group slot exposed by the FW. */
273 	u32 cs_slot_count;
274 
275 	/** @as_slot_count: Number of address space slots supported by the MMU. */
276 	u32 as_slot_count;
277 
278 	/** @used_csg_slot_count: Number of command stream group slots currently used. */
279 	u32 used_csg_slot_count;
280 
281 	/** @sb_slot_count: Number of scoreboard slots. */
282 	u32 sb_slot_count;
283 
284 	/**
285 	 * @might_have_idle_groups: True if an active group might have become idle.
286 	 *
287 	 * This will force a tick, so other runnable groups can be scheduled if one
288 	 * or more active groups became idle.
289 	 */
290 	bool might_have_idle_groups;
291 
292 	/** @pm: Power management related fields. */
293 	struct {
294 		/** @has_ref: True if the scheduler owns a runtime PM reference. */
295 		bool has_ref;
296 	} pm;
297 
298 	/** @reset: Reset related fields. */
299 	struct {
300 		/** @lock: Lock protecting the other reset fields. */
301 		struct mutex lock;
302 
303 		/**
304 		 * @in_progress: True if a reset is in progress.
305 		 *
306 		 * Set to true in panthor_sched_pre_reset() and back to false in
307 		 * panthor_sched_post_reset().
308 		 */
309 		atomic_t in_progress;
310 
311 		/**
312 		 * @stopped_groups: List containing all groups that were stopped
313 		 * before a reset.
314 		 *
315 		 * Insert panthor_group::run_node in the pre_reset path.
316 		 */
317 		struct list_head stopped_groups;
318 	} reset;
319 };
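/*
 * Illustrative sketch (an assumption about usage, not a quote of the tick
 * logic) of how @resched_target, documented above, can be turned into the
 * delay passed to the delayed tick work: if the target is already in the
 * past, tick immediately, otherwise wait for the remaining time.
 *
 *	u64 now = get_jiffies_64();
 *	u64 delay = now < sched->resched_target ?
 *		    sched->resched_target - now : 0;
 *
 *	sched_queue_delayed_work(sched, tick, delay);
 */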
320 
321 /**
322  * struct panthor_syncobj_32b - 32-bit FW synchronization object
323  */
324 struct panthor_syncobj_32b {
325 	/** @seqno: Sequence number. */
326 	u32 seqno;
327 
328 	/**
329 	 * @status: Status.
330 	 *
331 	 * Not zero on failure.
332 	 */
333 	u32 status;
334 };
335 
336 /**
337  * struct panthor_syncobj_64b - 64-bit FW synchronization object
338  */
339 struct panthor_syncobj_64b {
340 	/** @seqno: Sequence number. */
341 	u64 seqno;
342 
343 	/**
344 	 * @status: Status.
345 	 *
346 	 * Not zero on failure.
347 	 */
348 	u32 status;
349 
350 	/** @pad: MBZ. */
351 	u32 pad;
352 };
353 
354 /**
355  * struct panthor_queue - Execution queue
356  */
357 struct panthor_queue {
358 	/** @scheduler: DRM scheduler used for this queue. */
359 	struct drm_gpu_scheduler scheduler;
360 
361 	/** @entity: DRM scheduling entity used for this queue. */
362 	struct drm_sched_entity entity;
363 
364 	/**
365 	 * @remaining_time: Time remaining before the job timeout expires.
366 	 *
367 	 * The job timeout is suspended when the queue is not scheduled by the
368 	 * FW. Every time we suspend the timer, we need to save the remaining
369 	 * time so we can restore it later on.
370 	 */
371 	unsigned long remaining_time;
372 
373 	/** @timeout_suspended: True if the job timeout was suspended. */
374 	bool timeout_suspended;
375 
376 	/**
377 	 * @doorbell_id: Doorbell assigned to this queue.
378 	 *
379 	 * Right now, all groups share the same doorbell, and the doorbell ID
380 	 * is set to group_slot + 1 when the group is assigned a slot. But
381 	 * we might decide to provide fine-grained doorbell assignment at some
382 	 * point, so we don't have to wake up all queues in a group every time
383 	 * one of them is updated.
384 	 */
385 	u8 doorbell_id;
386 
387 	/**
388 	 * @priority: Priority of the queue inside the group.
389 	 *
390 	 * Must be less than 16 (Only 4 bits available).
391 	 */
392 	u8 priority;
393 #define CSF_MAX_QUEUE_PRIO	GENMASK(3, 0)
394 
395 	/** @ringbuf: Command stream ring-buffer. */
396 	struct panthor_kernel_bo *ringbuf;
397 
398 	/** @iface: Firmware interface. */
399 	struct {
400 		/** @mem: FW memory allocated for this interface. */
401 		struct panthor_kernel_bo *mem;
402 
403 		/** @input: Input interface. */
404 		struct panthor_fw_ringbuf_input_iface *input;
405 
406 		/** @output: Output interface. */
407 		const struct panthor_fw_ringbuf_output_iface *output;
408 
409 		/** @input_fw_va: FW virtual address of the input interface buffer. */
410 		u32 input_fw_va;
411 
412 		/** @output_fw_va: FW virtual address of the output interface buffer. */
413 		u32 output_fw_va;
414 	} iface;
415 
416 	/**
417 	 * @syncwait: Stores information about the synchronization object this
418 	 * queue is waiting on.
419 	 */
420 	struct {
421 		/** @gpu_va: GPU address of the synchronization object. */
422 		u64 gpu_va;
423 
424 		/** @ref: Reference value to compare against. */
425 		u64 ref;
426 
427 		/** @gt: True if this is a greater-than test. */
428 		bool gt;
429 
430 		/** @sync64: True if this is a 64-bit sync object. */
431 		bool sync64;
432 
433 		/** @obj: Buffer object holding the synchronization object. */
434 		struct drm_gem_object *obj;
435 
436 		/** @offset: Offset of the synchronization object inside @obj. */
437 		u64 offset;
438 
439 		/**
440 		 * @kmap: Kernel mapping of the buffer object holding the
441 		 * synchronization object.
442 		 */
443 		void *kmap;
444 	} syncwait;
445 
446 	/** @fence_ctx: Fence context fields. */
447 	struct {
448 		/** @lock: Used to protect access to all fences allocated by this context. */
449 		spinlock_t lock;
450 
451 		/**
452 		 * @id: Fence context ID.
453 		 *
454 		 * Allocated with dma_fence_context_alloc().
455 		 */
456 		u64 id;
457 
458 		/** @seqno: Sequence number of the last initialized fence. */
459 		atomic64_t seqno;
460 
461 		/**
462 		 * @last_fence: Fence of the last submitted job.
463 		 *
464 		 * We return this fence when we get an empty command stream.
465 		 * This way, we are guaranteed that all earlier jobs have completed
466 		 * when drm_sched_job::s_fence::finished is signaled, without having
467 		 * to feed the CS ring buffer with a dummy job that only signals the fence.
468 		 */
469 		struct dma_fence *last_fence;
470 
471 		/**
472 		 * @in_flight_jobs: List containing all in-flight jobs.
473 		 *
474 		 * Used to keep track of and signal panthor_job::done_fence when the
475 		 * synchronization object attached to the queue is signaled.
476 		 */
477 		struct list_head in_flight_jobs;
478 	} fence_ctx;
479 };
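/*
 * Minimal sketch (an assumption about the submission path, not a quote of
 * it) of how the @last_fence field documented above lets us handle an
 * empty command stream without pushing a dummy job: the new job simply
 * reuses the fence of the last submitted job, which only signals once all
 * earlier jobs on the queue have completed.
 *
 *	if (!job->call_info.size) {
 *		job->done_fence = dma_fence_get(queue->fence_ctx.last_fence);
 *		return dma_fence_get(job->done_fence);
 *	}
 */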
480 
481 /**
482  * enum panthor_group_state - Scheduling group state.
483  */
484 enum panthor_group_state {
485 	/** @PANTHOR_CS_GROUP_CREATED: Group was created, but not scheduled yet. */
486 	PANTHOR_CS_GROUP_CREATED,
487 
488 	/** @PANTHOR_CS_GROUP_ACTIVE: Group is currently scheduled. */
489 	PANTHOR_CS_GROUP_ACTIVE,
490 
491 	/**
492 	 * @PANTHOR_CS_GROUP_SUSPENDED: Group was scheduled at least once, but is
493 	 * inactive/suspended right now.
494 	 */
495 	PANTHOR_CS_GROUP_SUSPENDED,
496 
497 	/**
498 	 * @PANTHOR_CS_GROUP_TERMINATED: Group was terminated.
499 	 *
500 	 * Can no longer be scheduled. The only allowed action is a destruction.
501 	 */
502 	PANTHOR_CS_GROUP_TERMINATED,
503 
504 	/**
505 	 * @PANTHOR_CS_GROUP_UNKNOWN_STATE: Group is in an unknown state.
506 	 *
507 	 * The FW returned an inconsistent state. The group is flagged unusable
508 	 * and can no longer be scheduled. The only allowed action is a
509 	 * destruction.
510 	 *
511 	 * When that happens, we also schedule a FW reset, to start from a fresh
512 	 * state.
513 	 */
514 	PANTHOR_CS_GROUP_UNKNOWN_STATE,
515 };
516 
517 /**
518  * struct panthor_group - Scheduling group object
519  */
520 struct panthor_group {
521 	/** @refcount: Reference count */
522 	struct kref refcount;
523 
524 	/** @ptdev: Device. */
525 	struct panthor_device *ptdev;
526 
527 	/** @vm: VM bound to the group. */
528 	struct panthor_vm *vm;
529 
530 	/** @compute_core_mask: Mask of shader cores that can be used for compute jobs. */
531 	u64 compute_core_mask;
532 
533 	/** @fragment_core_mask: Mask of shader cores that can be used for fragment jobs. */
534 	u64 fragment_core_mask;
535 
536 	/** @tiler_core_mask: Mask of tiler cores that can be used for tiler jobs. */
537 	u64 tiler_core_mask;
538 
539 	/** @max_compute_cores: Maximum number of shader cores used for compute jobs. */
540 	u8 max_compute_cores;
541 
542 	/** @max_fragment_cores: Maximum number of shader cores used for fragment jobs. */
543 	u8 max_fragment_cores;
544 
545 	/** @max_tiler_cores: Maximum number of tiler cores used for tiler jobs. */
546 	u8 max_tiler_cores;
547 
548 	/** @priority: Group priority (check panthor_csg_priority). */
549 	u8 priority;
550 
551 	/** @blocked_queues: Bitmask reflecting the blocked queues. */
552 	u32 blocked_queues;
553 
554 	/** @idle_queues: Bitmask reflecting the idle queues. */
555 	u32 idle_queues;
556 
557 	/** @fatal_lock: Lock used to protect access to fatal fields. */
558 	spinlock_t fatal_lock;
559 
560 	/** @fatal_queues: Bitmask reflecting the queues that hit a fatal exception. */
561 	u32 fatal_queues;
562 
563 	/** @tiler_oom: Mask of queues that have a tiler OOM event to process. */
564 	atomic_t tiler_oom;
565 
566 	/** @queue_count: Number of queues in this group. */
567 	u32 queue_count;
568 
569 	/** @queues: Queues owned by this group. */
570 	struct panthor_queue *queues[MAX_CS_PER_CSG];
571 
572 	/**
573 	 * @csg_id: ID of the FW group slot.
574 	 *
575 	 * -1 when the group is not scheduled/active.
576 	 */
577 	int csg_id;
578 
579 	/**
580 	 * @destroyed: True when the group has been destroyed.
581 	 *
582 	 * If a group is destroyed it becomes useless: no further jobs can be submitted
583 	 * to its queues. We simply wait for all references to be dropped so we can
584 	 * release the group object.
585 	 */
586 	bool destroyed;
587 
588 	/**
589 	 * @timedout: True when a timeout occurred on any of the queues owned by
590 	 * this group.
591 	 *
592 	 * Timeouts can be reported by drm_sched or by the FW. In any case, any
593 	 * timeout situation is unrecoverable, and the group becomes useless.
594 	 * We simply wait for all references to be dropped so we can release the
595 	 * group object.
596 	 */
597 	bool timedout;
598 
599 	/**
600 	 * @syncobjs: Pool of per-queue synchronization objects.
601 	 *
602 	 * One sync object per queue. The position of the sync object is
603 	 * determined by the queue index.
604 	 */
605 	struct panthor_kernel_bo *syncobjs;
606 
607 	/** @state: Group state. */
608 	enum panthor_group_state state;
609 
610 	/**
611 	 * @suspend_buf: Suspend buffer.
612 	 *
613 	 * Stores the state of the group and its queues when a group is suspended.
614 	 * Used at resume time to restore the group in its previous state.
615 	 *
616 	 * The size of the suspend buffer is exposed through the FW interface.
617 	 */
618 	struct panthor_kernel_bo *suspend_buf;
619 
620 	/**
621 	 * @protm_suspend_buf: Protection mode suspend buffer.
622 	 *
623 	 * Stores the state of the group and its queues when a group that's in
624 	 * protection mode is suspended.
625 	 *
626 	 * Used at resume time to restore the group in its previous state.
627 	 *
628 	 * The size of the protection mode suspend buffer is exposed through the
629 	 * FW interface.
630 	 */
631 	struct panthor_kernel_bo *protm_suspend_buf;
632 
633 	/** @sync_upd_work: Work used to check/signal job fences. */
634 	struct work_struct sync_upd_work;
635 
636 	/** @tiler_oom_work: Work used to process tiler OOM events happening on this group. */
637 	struct work_struct tiler_oom_work;
638 
639 	/** @term_work: Work used to finish the group termination procedure. */
640 	struct work_struct term_work;
641 
642 	/**
643 	 * @release_work: Work used to release group resources.
644 	 *
645 	 * We need to postpone the group release to avoid a deadlock when
646 	 * the last ref is released in the tick work.
647 	 */
648 	struct work_struct release_work;
649 
650 	/**
651 	 * @run_node: Node used to insert the group in the
652 	 * panthor_scheduler::groups::{runnable,idle} and
653 	 * panthor_scheduler::reset::stopped_groups lists.
654 	 */
655 	struct list_head run_node;
656 
657 	/**
658 	 * @wait_node: Node used to insert the group in the
659 	 * panthor_scheduler::groups::waiting list.
660 	 */
661 	struct list_head wait_node;
662 };
663 
664 /**
665  * group_queue_work() - Queue a group work
666  * @group: Group to queue the work for.
667  * @wname: Work name.
668  *
669  * Grabs a ref and queues a work item to the scheduler workqueue. If
670  * the work was already queued, we release the reference we grabbed.
671  *
672  * Work callbacks must release the reference we grabbed here.
673  */
674 #define group_queue_work(group, wname) \
675 	do { \
676 		group_get(group); \
677 		if (!queue_work((group)->ptdev->scheduler->wq, &(group)->wname ## _work)) \
678 			group_put(group); \
679 	} while (0)
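/*
 * Usage example (this is how the sync_upd work is queued later in this
 * file): grab a group reference, queue the work item, and drop the
 * reference again if the work was already pending. The work handler is
 * then responsible for calling group_put() once it is done.
 *
 *	group_queue_work(group, sync_upd);
 */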
680 
681 /**
682  * sched_queue_work() - Queue a scheduler work.
683  * @sched: Scheduler object.
684  * @wname: Work name.
685  *
686  * Conditionally queues a scheduler work if no reset is pending/in-progress.
687  */
688 #define sched_queue_work(sched, wname) \
689 	do { \
690 		if (!atomic_read(&(sched)->reset.in_progress) && \
691 		    !panthor_device_reset_is_pending((sched)->ptdev)) \
692 			queue_work((sched)->wq, &(sched)->wname ## _work); \
693 	} while (0)
694 
695 /**
696  * sched_queue_delayed_work() - Queue a scheduler delayed work.
697  * @sched: Scheduler object.
698  * @wname: Work name.
699  * @delay: Work delay in jiffies.
700  *
701  * Conditionally queues a scheduler delayed work if no reset is
702  * pending/in-progress.
703  */
704 #define sched_queue_delayed_work(sched, wname, delay) \
705 	do { \
706 		if (!atomic_read(&sched->reset.in_progress) && \
707 		    !panthor_device_reset_is_pending((sched)->ptdev)) \
708 			mod_delayed_work((sched)->wq, &(sched)->wname ## _work, delay); \
709 	} while (0)
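/*
 * Usage sketch: both helpers take the work name, not the work struct,
 * and the ## concatenation resolves it to the matching *_work field. For
 * instance, requesting an immediate tick or a synchronization-object
 * update pass (as done in the event handlers below) looks like:
 *
 *	sched_queue_delayed_work(sched, tick, 0);
 *	sched_queue_work(ptdev->scheduler, sync_upd);
 */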
710 
711 /*
712  * We currently set the maximum number of groups per file to an arbitrarily
713  * low value, but this can be raised if we need more.
714  */
715 #define MAX_GROUPS_PER_POOL 128
716 
717 /**
718  * struct panthor_group_pool - Group pool
719  *
720  * Each file gets assigned a group pool.
721  */
722 struct panthor_group_pool {
723 	/** @xa: Xarray used to manage group handles. */
724 	struct xarray xa;
725 };
726 
727 /**
728  * struct panthor_job - Used to manage GPU job
729  */
730 struct panthor_job {
731 	/** @base: Inherit from drm_sched_job. */
732 	struct drm_sched_job base;
733 
734 	/** @refcount: Reference count. */
735 	struct kref refcount;
736 
737 	/** @group: Group of the queue this job will be pushed to. */
738 	struct panthor_group *group;
739 
740 	/** @queue_idx: Index of the queue inside @group. */
741 	u32 queue_idx;
742 
743 	/** @call_info: Information about the userspace command stream call. */
744 	struct {
745 		/** @start: GPU address of the userspace command stream. */
746 		u64 start;
747 
748 		/** @size: Size of the userspace command stream. */
749 		u32 size;
750 
751 		/**
752 		 * @latest_flush: Flush ID at the time the userspace command
753 		 * stream was built.
754 		 *
755 		 * Needed for the flush reduction mechanism.
756 		 */
757 		u32 latest_flush;
758 	} call_info;
759 
760 	/** @ringbuf: Position of this job in the ring buffer. */
761 	struct {
762 		/** @start: Start offset. */
763 		u64 start;
764 
765 		/** @end: End offset. */
766 		u64 end;
767 	} ringbuf;
768 
769 	/**
770 	 * @node: Used to insert the job in the panthor_queue::fence_ctx::in_flight_jobs
771 	 * list.
772 	 */
773 	struct list_head node;
774 
775 	/** @done_fence: Fence signaled when the job is finished or cancelled. */
776 	struct dma_fence *done_fence;
777 };
778 
779 static void
780 panthor_queue_put_syncwait_obj(struct panthor_queue *queue)
781 {
782 	if (queue->syncwait.kmap) {
783 		struct iosys_map map = IOSYS_MAP_INIT_VADDR(queue->syncwait.kmap);
784 
785 		drm_gem_vunmap_unlocked(queue->syncwait.obj, &map);
786 		queue->syncwait.kmap = NULL;
787 	}
788 
789 	drm_gem_object_put(queue->syncwait.obj);
790 	queue->syncwait.obj = NULL;
791 }
792 
793 static void *
794 panthor_queue_get_syncwait_obj(struct panthor_group *group, struct panthor_queue *queue)
795 {
796 	struct panthor_device *ptdev = group->ptdev;
797 	struct panthor_gem_object *bo;
798 	struct iosys_map map;
799 	int ret;
800 
801 	if (queue->syncwait.kmap)
802 		return queue->syncwait.kmap + queue->syncwait.offset;
803 
804 	bo = panthor_vm_get_bo_for_va(group->vm,
805 				      queue->syncwait.gpu_va,
806 				      &queue->syncwait.offset);
807 	if (drm_WARN_ON(&ptdev->base, IS_ERR_OR_NULL(bo)))
808 		goto err_put_syncwait_obj;
809 
810 	queue->syncwait.obj = &bo->base.base;
811 	ret = drm_gem_vmap_unlocked(queue->syncwait.obj, &map);
812 	if (drm_WARN_ON(&ptdev->base, ret))
813 		goto err_put_syncwait_obj;
814 
815 	queue->syncwait.kmap = map.vaddr;
816 	if (drm_WARN_ON(&ptdev->base, !queue->syncwait.kmap))
817 		goto err_put_syncwait_obj;
818 
819 	return queue->syncwait.kmap + queue->syncwait.offset;
820 
821 err_put_syncwait_obj:
822 	panthor_queue_put_syncwait_obj(queue);
823 	return NULL;
824 }
825 
826 static void group_free_queue(struct panthor_group *group, struct panthor_queue *queue)
827 {
828 	if (IS_ERR_OR_NULL(queue))
829 		return;
830 
831 	if (queue->entity.fence_context)
832 		drm_sched_entity_destroy(&queue->entity);
833 
834 	if (queue->scheduler.ops)
835 		drm_sched_fini(&queue->scheduler);
836 
837 	panthor_queue_put_syncwait_obj(queue);
838 
839 	panthor_kernel_bo_destroy(queue->ringbuf);
840 	panthor_kernel_bo_destroy(queue->iface.mem);
841 
842 	/* Release the last_fence we were holding, if any. */
843 	dma_fence_put(queue->fence_ctx.last_fence);
844 
845 	kfree(queue);
846 }
847 
848 static void group_release_work(struct work_struct *work)
849 {
850 	struct panthor_group *group = container_of(work,
851 						   struct panthor_group,
852 						   release_work);
853 	u32 i;
854 
855 	for (i = 0; i < group->queue_count; i++)
856 		group_free_queue(group, group->queues[i]);
857 
858 	panthor_kernel_bo_destroy(group->suspend_buf);
859 	panthor_kernel_bo_destroy(group->protm_suspend_buf);
860 	panthor_kernel_bo_destroy(group->syncobjs);
861 
862 	panthor_vm_put(group->vm);
863 	kfree(group);
864 }
865 
866 static void group_release(struct kref *kref)
867 {
868 	struct panthor_group *group = container_of(kref,
869 						   struct panthor_group,
870 						   refcount);
871 	struct panthor_device *ptdev = group->ptdev;
872 
873 	drm_WARN_ON(&ptdev->base, group->csg_id >= 0);
874 	drm_WARN_ON(&ptdev->base, !list_empty(&group->run_node));
875 	drm_WARN_ON(&ptdev->base, !list_empty(&group->wait_node));
876 
877 	queue_work(panthor_cleanup_wq, &group->release_work);
878 }
879 
880 static void group_put(struct panthor_group *group)
881 {
882 	if (group)
883 		kref_put(&group->refcount, group_release);
884 }
885 
886 static struct panthor_group *
887 group_get(struct panthor_group *group)
888 {
889 	if (group)
890 		kref_get(&group->refcount);
891 
892 	return group;
893 }
894 
895 /**
896  * group_bind_locked() - Bind a group to a group slot
897  * @group: Group.
898  * @csg_id: Slot.
899  *
900  * Return: 0 on success, a negative error code otherwise.
901  */
902 static int
903 group_bind_locked(struct panthor_group *group, u32 csg_id)
904 {
905 	struct panthor_device *ptdev = group->ptdev;
906 	struct panthor_csg_slot *csg_slot;
907 	int ret;
908 
909 	lockdep_assert_held(&ptdev->scheduler->lock);
910 
911 	if (drm_WARN_ON(&ptdev->base, group->csg_id != -1 || csg_id >= MAX_CSGS ||
912 			ptdev->scheduler->csg_slots[csg_id].group))
913 		return -EINVAL;
914 
915 	ret = panthor_vm_active(group->vm);
916 	if (ret)
917 		return ret;
918 
919 	csg_slot = &ptdev->scheduler->csg_slots[csg_id];
920 	group_get(group);
921 	group->csg_id = csg_id;
922 
923 	/* Dummy doorbell allocation: doorbell is assigned to the group and
924 	 * all queues use the same doorbell.
925 	 *
926 	 * TODO: Implement LRU-based doorbell assignment, so the most often
927 	 * updated queues get their own doorbell, thus avoiding useless checks
928 	 * on queues belonging to the same group that are rarely updated.
929 	 */
930 	for (u32 i = 0; i < group->queue_count; i++)
931 		group->queues[i]->doorbell_id = csg_id + 1;
932 
933 	csg_slot->group = group;
934 
935 	return 0;
936 }
937 
938 /**
939  * group_unbind_locked() - Unbind a group from a slot.
940  * @group: Group to unbind.
941  *
942  * Return: 0 on success, a negative error code otherwise.
943  */
944 static int
945 group_unbind_locked(struct panthor_group *group)
946 {
947 	struct panthor_device *ptdev = group->ptdev;
948 	struct panthor_csg_slot *slot;
949 
950 	lockdep_assert_held(&ptdev->scheduler->lock);
951 
952 	if (drm_WARN_ON(&ptdev->base, group->csg_id < 0 || group->csg_id >= MAX_CSGS))
953 		return -EINVAL;
954 
955 	if (drm_WARN_ON(&ptdev->base, group->state == PANTHOR_CS_GROUP_ACTIVE))
956 		return -EINVAL;
957 
958 	slot = &ptdev->scheduler->csg_slots[group->csg_id];
959 	panthor_vm_idle(group->vm);
960 	group->csg_id = -1;
961 
962 	/* Tiler OOM events will be re-issued next time the group is scheduled. */
963 	atomic_set(&group->tiler_oom, 0);
964 	cancel_work(&group->tiler_oom_work);
965 
966 	for (u32 i = 0; i < group->queue_count; i++)
967 		group->queues[i]->doorbell_id = -1;
968 
969 	slot->group = NULL;
970 
971 	group_put(group);
972 	return 0;
973 }
974 
975 /**
976  * cs_slot_prog_locked() - Program a queue slot
977  * @ptdev: Device.
978  * @csg_id: Group slot ID.
979  * @cs_id: Queue slot ID.
980  *
981  * Program a queue slot with the queue information so things can start being
982  * executed on this queue.
983  *
984  * The group slot must have a group bound to it already (group_bind_locked()).
985  */
986 static void
987 cs_slot_prog_locked(struct panthor_device *ptdev, u32 csg_id, u32 cs_id)
988 {
989 	struct panthor_queue *queue = ptdev->scheduler->csg_slots[csg_id].group->queues[cs_id];
990 	struct panthor_fw_cs_iface *cs_iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id);
991 
992 	lockdep_assert_held(&ptdev->scheduler->lock);
993 
994 	queue->iface.input->extract = queue->iface.output->extract;
995 	drm_WARN_ON(&ptdev->base, queue->iface.input->insert < queue->iface.input->extract);
996 
997 	cs_iface->input->ringbuf_base = panthor_kernel_bo_gpuva(queue->ringbuf);
998 	cs_iface->input->ringbuf_size = panthor_kernel_bo_size(queue->ringbuf);
999 	cs_iface->input->ringbuf_input = queue->iface.input_fw_va;
1000 	cs_iface->input->ringbuf_output = queue->iface.output_fw_va;
1001 	cs_iface->input->config = CS_CONFIG_PRIORITY(queue->priority) |
1002 				  CS_CONFIG_DOORBELL(queue->doorbell_id);
1003 	cs_iface->input->ack_irq_mask = ~0;
1004 	panthor_fw_update_reqs(cs_iface, req,
1005 			       CS_IDLE_SYNC_WAIT |
1006 			       CS_IDLE_EMPTY |
1007 			       CS_STATE_START |
1008 			       CS_EXTRACT_EVENT,
1009 			       CS_IDLE_SYNC_WAIT |
1010 			       CS_IDLE_EMPTY |
1011 			       CS_STATE_MASK |
1012 			       CS_EXTRACT_EVENT);
1013 	if (queue->iface.input->insert != queue->iface.input->extract && queue->timeout_suspended) {
1014 		drm_sched_resume_timeout(&queue->scheduler, queue->remaining_time);
1015 		queue->timeout_suspended = false;
1016 	}
1017 }
1018 
1019 /**
1020  * cs_slot_reset_locked() - Reset a queue slot
1021  * @ptdev: Device.
1022  * @csg_id: Group slot.
1023  * @cs_id: Queue slot.
1024  *
1025  * Change the queue slot state to STOP and suspend the queue timeout if
1026  * the queue is not blocked.
1027  *
1028  * The group slot must have a group bound to it (group_bind_locked()).
1029  */
1030 static int
1031 cs_slot_reset_locked(struct panthor_device *ptdev, u32 csg_id, u32 cs_id)
1032 {
1033 	struct panthor_fw_cs_iface *cs_iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id);
1034 	struct panthor_group *group = ptdev->scheduler->csg_slots[csg_id].group;
1035 	struct panthor_queue *queue = group->queues[cs_id];
1036 
1037 	lockdep_assert_held(&ptdev->scheduler->lock);
1038 
1039 	panthor_fw_update_reqs(cs_iface, req,
1040 			       CS_STATE_STOP,
1041 			       CS_STATE_MASK);
1042 
1043 	/* If the queue is blocked, we want to keep the timeout running, so
1044 	 * we can detect unbounded waits and kill the group when that happens.
1045 	 */
1046 	if (!(group->blocked_queues & BIT(cs_id)) && !queue->timeout_suspended) {
1047 		queue->remaining_time = drm_sched_suspend_timeout(&queue->scheduler);
1048 		queue->timeout_suspended = true;
1049 		WARN_ON(queue->remaining_time > msecs_to_jiffies(JOB_TIMEOUT_MS));
1050 	}
1051 
1052 	return 0;
1053 }
1054 
1055 /**
1056  * csg_slot_sync_priority_locked() - Synchronize the group slot priority
1057  * @ptdev: Device.
1058  * @csg_id: Group slot ID.
1059  *
1060  * Group slot priority update happens asynchronously. When we receive a
1061  * %CSG_ENDPOINT_CONFIG acknowledgment, we know the update is effective,
1062  * and can reflect it in our panthor_csg_slot object.
1063  */
1064 static void
1065 csg_slot_sync_priority_locked(struct panthor_device *ptdev, u32 csg_id)
1066 {
1067 	struct panthor_csg_slot *csg_slot = &ptdev->scheduler->csg_slots[csg_id];
1068 	struct panthor_fw_csg_iface *csg_iface;
1069 
1070 	lockdep_assert_held(&ptdev->scheduler->lock);
1071 
1072 	csg_iface = panthor_fw_get_csg_iface(ptdev, csg_id);
1073 	csg_slot->priority = (csg_iface->input->endpoint_req & CSG_EP_REQ_PRIORITY_MASK) >> 28;
1074 }
1075 
1076 /**
1077  * cs_slot_sync_queue_state_locked() - Synchronize the queue slot state
1078  * @ptdev: Device.
1079  * @csg_id: Group slot.
1080  * @cs_id: Queue slot.
1081  *
1082  * Queue state is updated on group suspend or STATUS_UPDATE event.
1083  */
1084 static void
1085 cs_slot_sync_queue_state_locked(struct panthor_device *ptdev, u32 csg_id, u32 cs_id)
1086 {
1087 	struct panthor_group *group = ptdev->scheduler->csg_slots[csg_id].group;
1088 	struct panthor_queue *queue = group->queues[cs_id];
1089 	struct panthor_fw_cs_iface *cs_iface =
1090 		panthor_fw_get_cs_iface(group->ptdev, csg_id, cs_id);
1091 
1092 	u32 status_wait_cond;
1093 
1094 	switch (cs_iface->output->status_blocked_reason) {
1095 	case CS_STATUS_BLOCKED_REASON_UNBLOCKED:
1096 		if (queue->iface.input->insert == queue->iface.output->extract &&
1097 		    cs_iface->output->status_scoreboards == 0)
1098 			group->idle_queues |= BIT(cs_id);
1099 		break;
1100 
1101 	case CS_STATUS_BLOCKED_REASON_SYNC_WAIT:
1102 		if (list_empty(&group->wait_node)) {
1103 			list_move_tail(&group->wait_node,
1104 				       &group->ptdev->scheduler->groups.waiting);
1105 		}
1106 		group->blocked_queues |= BIT(cs_id);
1107 		queue->syncwait.gpu_va = cs_iface->output->status_wait_sync_ptr;
1108 		queue->syncwait.ref = cs_iface->output->status_wait_sync_value;
1109 		status_wait_cond = cs_iface->output->status_wait & CS_STATUS_WAIT_SYNC_COND_MASK;
1110 		queue->syncwait.gt = status_wait_cond == CS_STATUS_WAIT_SYNC_COND_GT;
1111 		if (cs_iface->output->status_wait & CS_STATUS_WAIT_SYNC_64B) {
1112 			u64 sync_val_hi = cs_iface->output->status_wait_sync_value_hi;
1113 
1114 			queue->syncwait.sync64 = true;
1115 			queue->syncwait.ref |= sync_val_hi << 32;
1116 		} else {
1117 			queue->syncwait.sync64 = false;
1118 		}
1119 		break;
1120 
1121 	default:
1122 		/* Other reasons are not blocking. Consider the queue as runnable
1123 		 * in those cases.
1124 		 */
1125 		break;
1126 	}
1127 }
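/*
 * Illustrative sketch (a hypothetical helper, not the actual sync update
 * path) of how the syncwait state recorded above can later be checked
 * against the synchronization object (struct panthor_syncobj_32b/64b)
 * located at syncwait.gpu_va: the queue is considered unblocked once the
 * object's seqno satisfies the GT/LE condition that was captured.
 *
 *	static bool toy_syncwait_signaled(struct panthor_queue *queue,
 *					  const void *syncobj)
 *	{
 *		u64 cur, ref = queue->syncwait.ref;
 *
 *		if (queue->syncwait.sync64)
 *			cur = ((const struct panthor_syncobj_64b *)syncobj)->seqno;
 *		else
 *			cur = ((const struct panthor_syncobj_32b *)syncobj)->seqno;
 *
 *		return queue->syncwait.gt ? cur > ref : cur <= ref;
 *	}
 */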
1128 
1129 static void
1130 csg_slot_sync_queues_state_locked(struct panthor_device *ptdev, u32 csg_id)
1131 {
1132 	struct panthor_csg_slot *csg_slot = &ptdev->scheduler->csg_slots[csg_id];
1133 	struct panthor_group *group = csg_slot->group;
1134 	u32 i;
1135 
1136 	lockdep_assert_held(&ptdev->scheduler->lock);
1137 
1138 	group->idle_queues = 0;
1139 	group->blocked_queues = 0;
1140 
1141 	for (i = 0; i < group->queue_count; i++) {
1142 		if (group->queues[i])
1143 			cs_slot_sync_queue_state_locked(ptdev, csg_id, i);
1144 	}
1145 }
1146 
1147 static void
1148 csg_slot_sync_state_locked(struct panthor_device *ptdev, u32 csg_id)
1149 {
1150 	struct panthor_csg_slot *csg_slot = &ptdev->scheduler->csg_slots[csg_id];
1151 	struct panthor_fw_csg_iface *csg_iface;
1152 	struct panthor_group *group;
1153 	enum panthor_group_state new_state, old_state;
1154 	u32 csg_state;
1155 
1156 	lockdep_assert_held(&ptdev->scheduler->lock);
1157 
1158 	csg_iface = panthor_fw_get_csg_iface(ptdev, csg_id);
1159 	group = csg_slot->group;
1160 
1161 	if (!group)
1162 		return;
1163 
1164 	old_state = group->state;
1165 	csg_state = csg_iface->output->ack & CSG_STATE_MASK;
1166 	switch (csg_state) {
1167 	case CSG_STATE_START:
1168 	case CSG_STATE_RESUME:
1169 		new_state = PANTHOR_CS_GROUP_ACTIVE;
1170 		break;
1171 	case CSG_STATE_TERMINATE:
1172 		new_state = PANTHOR_CS_GROUP_TERMINATED;
1173 		break;
1174 	case CSG_STATE_SUSPEND:
1175 		new_state = PANTHOR_CS_GROUP_SUSPENDED;
1176 		break;
1177 	default:
1178 		/* The unknown state might be caused by a FW state corruption,
1179 		 * which means the group metadata can't be trusted anymore, and
1180 		 * the SUSPEND operation might propagate the corruption to the
1181 		 * suspend buffers. Flag the group state as unknown to make
1182 		 * sure it's unusable after that point.
1183 		 */
1184 		drm_err(&ptdev->base, "Invalid state on CSG %d (state=%d)",
1185 			csg_id, csg_state);
1186 		new_state = PANTHOR_CS_GROUP_UNKNOWN_STATE;
1187 		break;
1188 	}
1189 
1190 	if (old_state == new_state)
1191 		return;
1192 
1193 	/* The unknown state might be caused by a FW issue, reset the FW to
1194 	 * take a fresh start.
1195 	 */
1196 	if (new_state == PANTHOR_CS_GROUP_UNKNOWN_STATE)
1197 		panthor_device_schedule_reset(ptdev);
1198 
1199 	if (new_state == PANTHOR_CS_GROUP_SUSPENDED)
1200 		csg_slot_sync_queues_state_locked(ptdev, csg_id);
1201 
1202 	if (old_state == PANTHOR_CS_GROUP_ACTIVE) {
1203 		u32 i;
1204 
1205 		/* Reset the queue slots so we start from a clean
1206 		 * state when starting/resuming a new group on this
1207 		 * CSG slot. No wait needed here, and no doorbell ring
1208 		 * either, since the CS slot will only be re-used
1209 		 * on the next CSG start operation.
1210 		 */
1211 		for (i = 0; i < group->queue_count; i++) {
1212 			if (group->queues[i])
1213 				cs_slot_reset_locked(ptdev, csg_id, i);
1214 		}
1215 	}
1216 
1217 	group->state = new_state;
1218 }
1219 
1220 static int
1221 csg_slot_prog_locked(struct panthor_device *ptdev, u32 csg_id, u32 priority)
1222 {
1223 	struct panthor_fw_csg_iface *csg_iface;
1224 	struct panthor_csg_slot *csg_slot;
1225 	struct panthor_group *group;
1226 	u32 queue_mask = 0, i;
1227 
1228 	lockdep_assert_held(&ptdev->scheduler->lock);
1229 
1230 	if (priority > MAX_CSG_PRIO)
1231 		return -EINVAL;
1232 
1233 	if (drm_WARN_ON(&ptdev->base, csg_id >= MAX_CSGS))
1234 		return -EINVAL;
1235 
1236 	csg_slot = &ptdev->scheduler->csg_slots[csg_id];
1237 	group = csg_slot->group;
1238 	if (!group || group->state == PANTHOR_CS_GROUP_ACTIVE)
1239 		return 0;
1240 
1241 	csg_iface = panthor_fw_get_csg_iface(group->ptdev, csg_id);
1242 
1243 	for (i = 0; i < group->queue_count; i++) {
1244 		if (group->queues[i]) {
1245 			cs_slot_prog_locked(ptdev, csg_id, i);
1246 			queue_mask |= BIT(i);
1247 		}
1248 	}
1249 
1250 	csg_iface->input->allow_compute = group->compute_core_mask;
1251 	csg_iface->input->allow_fragment = group->fragment_core_mask;
1252 	csg_iface->input->allow_other = group->tiler_core_mask;
1253 	csg_iface->input->endpoint_req = CSG_EP_REQ_COMPUTE(group->max_compute_cores) |
1254 					 CSG_EP_REQ_FRAGMENT(group->max_fragment_cores) |
1255 					 CSG_EP_REQ_TILER(group->max_tiler_cores) |
1256 					 CSG_EP_REQ_PRIORITY(priority);
1257 	csg_iface->input->config = panthor_vm_as(group->vm);
1258 
1259 	if (group->suspend_buf)
1260 		csg_iface->input->suspend_buf = panthor_kernel_bo_gpuva(group->suspend_buf);
1261 	else
1262 		csg_iface->input->suspend_buf = 0;
1263 
1264 	if (group->protm_suspend_buf) {
1265 		csg_iface->input->protm_suspend_buf =
1266 			panthor_kernel_bo_gpuva(group->protm_suspend_buf);
1267 	} else {
1268 		csg_iface->input->protm_suspend_buf = 0;
1269 	}
1270 
1271 	csg_iface->input->ack_irq_mask = ~0;
1272 	panthor_fw_toggle_reqs(csg_iface, doorbell_req, doorbell_ack, queue_mask);
1273 	return 0;
1274 }
1275 
1276 static void
1277 cs_slot_process_fatal_event_locked(struct panthor_device *ptdev,
1278 				   u32 csg_id, u32 cs_id)
1279 {
1280 	struct panthor_scheduler *sched = ptdev->scheduler;
1281 	struct panthor_csg_slot *csg_slot = &sched->csg_slots[csg_id];
1282 	struct panthor_group *group = csg_slot->group;
1283 	struct panthor_fw_cs_iface *cs_iface;
1284 	u32 fatal;
1285 	u64 info;
1286 
1287 	lockdep_assert_held(&sched->lock);
1288 
1289 	cs_iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id);
1290 	fatal = cs_iface->output->fatal;
1291 	info = cs_iface->output->fatal_info;
1292 
1293 	if (group)
1294 		group->fatal_queues |= BIT(cs_id);
1295 
1296 	if (CS_EXCEPTION_TYPE(fatal) == DRM_PANTHOR_EXCEPTION_CS_UNRECOVERABLE) {
1297 		/* If this exception is unrecoverable, queue a reset, and make
1298 		 * sure we stop scheduling groups until the reset has happened.
1299 		 */
1300 		panthor_device_schedule_reset(ptdev);
1301 		cancel_delayed_work(&sched->tick_work);
1302 	} else {
1303 		sched_queue_delayed_work(sched, tick, 0);
1304 	}
1305 
1306 	drm_warn(&ptdev->base,
1307 		 "CSG slot %d CS slot: %d\n"
1308 		 "CS_FATAL.EXCEPTION_TYPE: 0x%x (%s)\n"
1309 		 "CS_FATAL.EXCEPTION_DATA: 0x%x\n"
1310 		 "CS_FATAL_INFO.EXCEPTION_DATA: 0x%llx\n",
1311 		 csg_id, cs_id,
1312 		 (unsigned int)CS_EXCEPTION_TYPE(fatal),
1313 		 panthor_exception_name(ptdev, CS_EXCEPTION_TYPE(fatal)),
1314 		 (unsigned int)CS_EXCEPTION_DATA(fatal),
1315 		 info);
1316 }
1317 
1318 static void
1319 cs_slot_process_fault_event_locked(struct panthor_device *ptdev,
1320 				   u32 csg_id, u32 cs_id)
1321 {
1322 	struct panthor_scheduler *sched = ptdev->scheduler;
1323 	struct panthor_csg_slot *csg_slot = &sched->csg_slots[csg_id];
1324 	struct panthor_group *group = csg_slot->group;
1325 	struct panthor_queue *queue = group && cs_id < group->queue_count ?
1326 				      group->queues[cs_id] : NULL;
1327 	struct panthor_fw_cs_iface *cs_iface;
1328 	u32 fault;
1329 	u64 info;
1330 
1331 	lockdep_assert_held(&sched->lock);
1332 
1333 	cs_iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id);
1334 	fault = cs_iface->output->fault;
1335 	info = cs_iface->output->fault_info;
1336 
1337 	if (queue && CS_EXCEPTION_TYPE(fault) == DRM_PANTHOR_EXCEPTION_CS_INHERIT_FAULT) {
1338 		u64 cs_extract = queue->iface.output->extract;
1339 		struct panthor_job *job;
1340 
1341 		spin_lock(&queue->fence_ctx.lock);
1342 		list_for_each_entry(job, &queue->fence_ctx.in_flight_jobs, node) {
1343 			if (cs_extract >= job->ringbuf.end)
1344 				continue;
1345 
1346 			if (cs_extract < job->ringbuf.start)
1347 				break;
1348 
1349 			dma_fence_set_error(job->done_fence, -EINVAL);
1350 		}
1351 		spin_unlock(&queue->fence_ctx.lock);
1352 	}
1353 
1354 	drm_warn(&ptdev->base,
1355 		 "CSG slot %d CS slot: %d\n"
1356 		 "CS_FAULT.EXCEPTION_TYPE: 0x%x (%s)\n"
1357 		 "CS_FAULT.EXCEPTION_DATA: 0x%x\n"
1358 		 "CS_FAULT_INFO.EXCEPTION_DATA: 0x%llx\n",
1359 		 csg_id, cs_id,
1360 		 (unsigned int)CS_EXCEPTION_TYPE(fault),
1361 		 panthor_exception_name(ptdev, CS_EXCEPTION_TYPE(fault)),
1362 		 (unsigned int)CS_EXCEPTION_DATA(fault),
1363 		 info);
1364 }
1365 
1366 static int group_process_tiler_oom(struct panthor_group *group, u32 cs_id)
1367 {
1368 	struct panthor_device *ptdev = group->ptdev;
1369 	struct panthor_scheduler *sched = ptdev->scheduler;
1370 	u32 renderpasses_in_flight, pending_frag_count;
1371 	struct panthor_heap_pool *heaps = NULL;
1372 	u64 heap_address, new_chunk_va = 0;
1373 	u32 vt_start, vt_end, frag_end;
1374 	int ret, csg_id;
1375 
1376 	mutex_lock(&sched->lock);
1377 	csg_id = group->csg_id;
1378 	if (csg_id >= 0) {
1379 		struct panthor_fw_cs_iface *cs_iface;
1380 
1381 		cs_iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id);
1382 		heaps = panthor_vm_get_heap_pool(group->vm, false);
1383 		heap_address = cs_iface->output->heap_address;
1384 		vt_start = cs_iface->output->heap_vt_start;
1385 		vt_end = cs_iface->output->heap_vt_end;
1386 		frag_end = cs_iface->output->heap_frag_end;
1387 		renderpasses_in_flight = vt_start - frag_end;
1388 		pending_frag_count = vt_end - frag_end;
1389 	}
1390 	mutex_unlock(&sched->lock);
1391 
1392 	/* The group got scheduled out, we stop here. We will get a new tiler OOM event
1393 	 * when it's scheduled again.
1394 	 */
1395 	if (unlikely(csg_id < 0))
1396 		return 0;
1397 
1398 	if (IS_ERR(heaps) || frag_end > vt_end || vt_end >= vt_start) {
1399 		ret = -EINVAL;
1400 	} else {
1401 		/* We do the allocation without holding the scheduler lock to avoid
1402 		 * blocking the scheduling.
1403 		 */
1404 		ret = panthor_heap_grow(heaps, heap_address,
1405 					renderpasses_in_flight,
1406 					pending_frag_count, &new_chunk_va);
1407 	}
1408 
1409 	/* If the heap context doesn't have memory for us, we want to let the
1410 	 * FW try to reclaim memory by waiting for fragment jobs to land or by
1411 	 * executing the tiler OOM exception handler, which is supposed to
1412 	 * implement incremental rendering.
1413 	 */
1414 	if (ret && ret != -ENOMEM) {
1415 		drm_warn(&ptdev->base, "Failed to extend the tiler heap\n");
1416 		group->fatal_queues |= BIT(cs_id);
1417 		sched_queue_delayed_work(sched, tick, 0);
1418 		goto out_put_heap_pool;
1419 	}
1420 
1421 	mutex_lock(&sched->lock);
1422 	csg_id = group->csg_id;
1423 	if (csg_id >= 0) {
1424 		struct panthor_fw_csg_iface *csg_iface;
1425 		struct panthor_fw_cs_iface *cs_iface;
1426 
1427 		csg_iface = panthor_fw_get_csg_iface(ptdev, csg_id);
1428 		cs_iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id);
1429 
1430 		cs_iface->input->heap_start = new_chunk_va;
1431 		cs_iface->input->heap_end = new_chunk_va;
1432 		panthor_fw_update_reqs(cs_iface, req, cs_iface->output->ack, CS_TILER_OOM);
1433 		panthor_fw_toggle_reqs(csg_iface, doorbell_req, doorbell_ack, BIT(cs_id));
1434 		panthor_fw_ring_csg_doorbells(ptdev, BIT(csg_id));
1435 	}
1436 	mutex_unlock(&sched->lock);
1437 
1438 	/* We allocated a chunk, but couldn't link it to the heap
1439 	 * context because the group was scheduled out while we were
1440 	 * allocating memory. We need to return this chunk to the heap.
1441 	 */
1442 	if (unlikely(csg_id < 0 && new_chunk_va))
1443 		panthor_heap_return_chunk(heaps, heap_address, new_chunk_va);
1444 
1445 	ret = 0;
1446 
1447 out_put_heap_pool:
1448 	panthor_heap_pool_put(heaps);
1449 	return ret;
1450 }
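/*
 * Worked example (illustrative numbers, and an assumed interpretation of
 * the heap_vt_start/heap_vt_end/heap_frag_end counters used above): with
 * vt_start = 12, vt_end = 10 and frag_end = 7, renderpasses_in_flight =
 * 12 - 7 = 5 render passes have started their vertex/tiling stage but not
 * yet completed their fragment stage, and pending_frag_count = 10 - 7 = 3
 * of them have finished vertex/tiling and are only waiting on fragment
 * work. Both values are handed to panthor_heap_grow() so it can decide
 * how to size the new chunk.
 */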
1451 
1452 static void group_tiler_oom_work(struct work_struct *work)
1453 {
1454 	struct panthor_group *group =
1455 		container_of(work, struct panthor_group, tiler_oom_work);
1456 	u32 tiler_oom = atomic_xchg(&group->tiler_oom, 0);
1457 
1458 	while (tiler_oom) {
1459 		u32 cs_id = ffs(tiler_oom) - 1;
1460 
1461 		group_process_tiler_oom(group, cs_id);
1462 		tiler_oom &= ~BIT(cs_id);
1463 	}
1464 
1465 	group_put(group);
1466 }
1467 
1468 static void
1469 cs_slot_process_tiler_oom_event_locked(struct panthor_device *ptdev,
1470 				       u32 csg_id, u32 cs_id)
1471 {
1472 	struct panthor_scheduler *sched = ptdev->scheduler;
1473 	struct panthor_csg_slot *csg_slot = &sched->csg_slots[csg_id];
1474 	struct panthor_group *group = csg_slot->group;
1475 
1476 	lockdep_assert_held(&sched->lock);
1477 
1478 	if (drm_WARN_ON(&ptdev->base, !group))
1479 		return;
1480 
1481 	atomic_or(BIT(cs_id), &group->tiler_oom);
1482 
1483 	/* We don't use group_queue_work() here because we want to queue the
1484 	 * work item to the heap_alloc_wq.
1485 	 */
1486 	group_get(group);
1487 	if (!queue_work(sched->heap_alloc_wq, &group->tiler_oom_work))
1488 		group_put(group);
1489 }
1490 
1491 static bool cs_slot_process_irq_locked(struct panthor_device *ptdev,
1492 				       u32 csg_id, u32 cs_id)
1493 {
1494 	struct panthor_fw_cs_iface *cs_iface;
1495 	u32 req, ack, events;
1496 
1497 	lockdep_assert_held(&ptdev->scheduler->lock);
1498 
1499 	cs_iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id);
1500 	req = cs_iface->input->req;
1501 	ack = cs_iface->output->ack;
1502 	events = (req ^ ack) & CS_EVT_MASK;
1503 
1504 	if (events & CS_FATAL)
1505 		cs_slot_process_fatal_event_locked(ptdev, csg_id, cs_id);
1506 
1507 	if (events & CS_FAULT)
1508 		cs_slot_process_fault_event_locked(ptdev, csg_id, cs_id);
1509 
1510 	if (events & CS_TILER_OOM)
1511 		cs_slot_process_tiler_oom_event_locked(ptdev, csg_id, cs_id);
1512 
1513 	/* We don't acknowledge the TILER_OOM event since its handling is
1514 	 * deferred to a separate work.
1515 	 */
1516 	panthor_fw_update_reqs(cs_iface, req, ack, CS_FATAL | CS_FAULT);
1517 
1518 	return (events & (CS_FAULT | CS_TILER_OOM)) != 0;
1519 }
1520 
1521 static void csg_slot_sync_idle_state_locked(struct panthor_device *ptdev, u32 csg_id)
1522 {
1523 	struct panthor_csg_slot *csg_slot = &ptdev->scheduler->csg_slots[csg_id];
1524 	struct panthor_fw_csg_iface *csg_iface;
1525 
1526 	lockdep_assert_held(&ptdev->scheduler->lock);
1527 
1528 	csg_iface = panthor_fw_get_csg_iface(ptdev, csg_id);
1529 	csg_slot->idle = csg_iface->output->status_state & CSG_STATUS_STATE_IS_IDLE;
1530 }
1531 
1532 static void csg_slot_process_idle_event_locked(struct panthor_device *ptdev, u32 csg_id)
1533 {
1534 	struct panthor_scheduler *sched = ptdev->scheduler;
1535 
1536 	lockdep_assert_held(&sched->lock);
1537 
1538 	sched->might_have_idle_groups = true;
1539 
1540 	/* Schedule a tick so we can evict idle groups and schedule non-idle
1541 	 * ones. This will also update runtime PM and devfreq busy/idle states,
1542 	 * so the device can lower its frequency or get suspended.
1543 	 */
1544 	sched_queue_delayed_work(sched, tick, 0);
1545 }
1546 
1547 static void csg_slot_sync_update_locked(struct panthor_device *ptdev,
1548 					u32 csg_id)
1549 {
1550 	struct panthor_csg_slot *csg_slot = &ptdev->scheduler->csg_slots[csg_id];
1551 	struct panthor_group *group = csg_slot->group;
1552 
1553 	lockdep_assert_held(&ptdev->scheduler->lock);
1554 
1555 	if (group)
1556 		group_queue_work(group, sync_upd);
1557 
1558 	sched_queue_work(ptdev->scheduler, sync_upd);
1559 }
1560 
1561 static void
1562 csg_slot_process_progress_timer_event_locked(struct panthor_device *ptdev, u32 csg_id)
1563 {
1564 	struct panthor_scheduler *sched = ptdev->scheduler;
1565 	struct panthor_csg_slot *csg_slot = &sched->csg_slots[csg_id];
1566 	struct panthor_group *group = csg_slot->group;
1567 
1568 	lockdep_assert_held(&sched->lock);
1569 
1570 	drm_warn(&ptdev->base, "CSG slot %d progress timeout\n", csg_id);
1571 
1572 	group = csg_slot->group;
1573 	if (!drm_WARN_ON(&ptdev->base, !group))
1574 		group->timedout = true;
1575 
1576 	sched_queue_delayed_work(sched, tick, 0);
1577 }
1578 
1579 static void sched_process_csg_irq_locked(struct panthor_device *ptdev, u32 csg_id)
1580 {
1581 	u32 req, ack, cs_irq_req, cs_irq_ack, cs_irqs, csg_events;
1582 	struct panthor_fw_csg_iface *csg_iface;
1583 	u32 ring_cs_db_mask = 0;
1584 
1585 	lockdep_assert_held(&ptdev->scheduler->lock);
1586 
1587 	if (drm_WARN_ON(&ptdev->base, csg_id >= ptdev->scheduler->csg_slot_count))
1588 		return;
1589 
1590 	csg_iface = panthor_fw_get_csg_iface(ptdev, csg_id);
1591 	req = READ_ONCE(csg_iface->input->req);
1592 	ack = READ_ONCE(csg_iface->output->ack);
1593 	cs_irq_req = READ_ONCE(csg_iface->output->cs_irq_req);
1594 	cs_irq_ack = READ_ONCE(csg_iface->input->cs_irq_ack);
1595 	csg_events = (req ^ ack) & CSG_EVT_MASK;
1596 
1597 	/* There may not be any pending CSG/CS interrupts to process */
1598 	if (req == ack && cs_irq_req == cs_irq_ack)
1599 		return;
1600 
1601 	/* Immediately set the IRQ_ACK bits to match the IRQ_REQ bits before
1602 	 * examining the CS_ACK & CS_REQ bits. This ensures the Host doesn't
1603 	 * miss an interrupt for a CS in the race scenario where, while the
1604 	 * Host is servicing an interrupt for that CS, the firmware sends
1605 	 * another interrupt for the same CS.
1606 	 */
1607 	csg_iface->input->cs_irq_ack = cs_irq_req;
1608 
1609 	panthor_fw_update_reqs(csg_iface, req, ack,
1610 			       CSG_SYNC_UPDATE |
1611 			       CSG_IDLE |
1612 			       CSG_PROGRESS_TIMER_EVENT);
1613 
1614 	if (csg_events & CSG_IDLE)
1615 		csg_slot_process_idle_event_locked(ptdev, csg_id);
1616 
1617 	if (csg_events & CSG_PROGRESS_TIMER_EVENT)
1618 		csg_slot_process_progress_timer_event_locked(ptdev, csg_id);
1619 
1620 	cs_irqs = cs_irq_req ^ cs_irq_ack;
1621 	while (cs_irqs) {
1622 		u32 cs_id = ffs(cs_irqs) - 1;
1623 
1624 		if (cs_slot_process_irq_locked(ptdev, csg_id, cs_id))
1625 			ring_cs_db_mask |= BIT(cs_id);
1626 
1627 		cs_irqs &= ~BIT(cs_id);
1628 	}
1629 
1630 	if (csg_events & CSG_SYNC_UPDATE)
1631 		csg_slot_sync_update_locked(ptdev, csg_id);
1632 
1633 	if (ring_cs_db_mask)
1634 		panthor_fw_toggle_reqs(csg_iface, doorbell_req, doorbell_ack, ring_cs_db_mask);
1635 
1636 	panthor_fw_ring_csg_doorbells(ptdev, BIT(csg_id));
1637 }
1638 
1639 static void sched_process_idle_event_locked(struct panthor_device *ptdev)
1640 {
1641 	struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
1642 
1643 	lockdep_assert_held(&ptdev->scheduler->lock);
1644 
1645 	/* Acknowledge the idle event and schedule a tick. */
1646 	panthor_fw_update_reqs(glb_iface, req, glb_iface->output->ack, GLB_IDLE);
1647 	sched_queue_delayed_work(ptdev->scheduler, tick, 0);
1648 }
1649 
1650 /**
1651  * sched_process_global_irq_locked() - Process the scheduling part of a global IRQ
1652  * @ptdev: Device.
1653  */
1654 static void sched_process_global_irq_locked(struct panthor_device *ptdev)
1655 {
1656 	struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
1657 	u32 req, ack, evts;
1658 
1659 	lockdep_assert_held(&ptdev->scheduler->lock);
1660 
1661 	req = READ_ONCE(glb_iface->input->req);
1662 	ack = READ_ONCE(glb_iface->output->ack);
1663 	evts = (req ^ ack) & GLB_EVT_MASK;
1664 
1665 	if (evts & GLB_IDLE)
1666 		sched_process_idle_event_locked(ptdev);
1667 }
1668 
1669 static void process_fw_events_work(struct work_struct *work)
1670 {
1671 	struct panthor_scheduler *sched = container_of(work, struct panthor_scheduler,
1672 						      fw_events_work);
1673 	u32 events = atomic_xchg(&sched->fw_events, 0);
1674 	struct panthor_device *ptdev = sched->ptdev;
1675 
1676 	mutex_lock(&sched->lock);
1677 
1678 	if (events & JOB_INT_GLOBAL_IF) {
1679 		sched_process_global_irq_locked(ptdev);
1680 		events &= ~JOB_INT_GLOBAL_IF;
1681 	}
1682 
1683 	while (events) {
1684 		u32 csg_id = ffs(events) - 1;
1685 
1686 		sched_process_csg_irq_locked(ptdev, csg_id);
1687 		events &= ~BIT(csg_id);
1688 	}
1689 
1690 	mutex_unlock(&sched->lock);
1691 }
1692 
1693 /**
1694  * panthor_sched_report_fw_events() - Report FW events to the scheduler.
1695  */
1696 void panthor_sched_report_fw_events(struct panthor_device *ptdev, u32 events)
1697 {
1698 	if (!ptdev->scheduler)
1699 		return;
1700 
1701 	atomic_or(events, &ptdev->scheduler->fw_events);
1702 	sched_queue_work(ptdev->scheduler, fw_events);
1703 }
1704 
1705 static const char *fence_get_driver_name(struct dma_fence *fence)
1706 {
1707 	return "panthor";
1708 }
1709 
1710 static const char *queue_fence_get_timeline_name(struct dma_fence *fence)
1711 {
1712 	return "queue-fence";
1713 }
1714 
1715 static const struct dma_fence_ops panthor_queue_fence_ops = {
1716 	.get_driver_name = fence_get_driver_name,
1717 	.get_timeline_name = queue_fence_get_timeline_name,
1718 };
1719 
1720 struct panthor_csg_slots_upd_ctx {
1721 	u32 update_mask;
1722 	u32 timedout_mask;
1723 	struct {
1724 		u32 value;
1725 		u32 mask;
1726 	} requests[MAX_CSGS];
1727 };
1728 
1729 static void csgs_upd_ctx_init(struct panthor_csg_slots_upd_ctx *ctx)
1730 {
1731 	memset(ctx, 0, sizeof(*ctx));
1732 }
1733 
1734 static void csgs_upd_ctx_queue_reqs(struct panthor_device *ptdev,
1735 				    struct panthor_csg_slots_upd_ctx *ctx,
1736 				    u32 csg_id, u32 value, u32 mask)
1737 {
1738 	if (drm_WARN_ON(&ptdev->base, !mask) ||
1739 	    drm_WARN_ON(&ptdev->base, csg_id >= ptdev->scheduler->csg_slot_count))
1740 		return;
1741 
1742 	ctx->requests[csg_id].value = (ctx->requests[csg_id].value & ~mask) | (value & mask);
1743 	ctx->requests[csg_id].mask |= mask;
1744 	ctx->update_mask |= BIT(csg_id);
1745 }
1746 
1747 static int csgs_upd_ctx_apply_locked(struct panthor_device *ptdev,
1748 				     struct panthor_csg_slots_upd_ctx *ctx)
1749 {
1750 	struct panthor_scheduler *sched = ptdev->scheduler;
1751 	u32 update_slots = ctx->update_mask;
1752 
1753 	lockdep_assert_held(&sched->lock);
1754 
1755 	if (!ctx->update_mask)
1756 		return 0;
1757 
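	/* First pass: copy the queued requests to each CSG interface, then
	 * ring the doorbells.
	 */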
1758 	while (update_slots) {
1759 		struct panthor_fw_csg_iface *csg_iface;
1760 		u32 csg_id = ffs(update_slots) - 1;
1761 
1762 		update_slots &= ~BIT(csg_id);
1763 		csg_iface = panthor_fw_get_csg_iface(ptdev, csg_id);
1764 		panthor_fw_update_reqs(csg_iface, req,
1765 				       ctx->requests[csg_id].value,
1766 				       ctx->requests[csg_id].mask);
1767 	}
1768 
1769 	panthor_fw_ring_csg_doorbells(ptdev, ctx->update_mask);
1770 
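	/* Second pass: wait for each CSG to acknowledge its requests and
	 * sync the slot state back.
	 */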
1771 	update_slots = ctx->update_mask;
1772 	while (update_slots) {
1773 		struct panthor_fw_csg_iface *csg_iface;
1774 		u32 csg_id = ffs(update_slots) - 1;
1775 		u32 req_mask = ctx->requests[csg_id].mask, acked;
1776 		int ret;
1777 
1778 		update_slots &= ~BIT(csg_id);
1779 		csg_iface = panthor_fw_get_csg_iface(ptdev, csg_id);
1780 
1781 		ret = panthor_fw_csg_wait_acks(ptdev, csg_id, req_mask, &acked, 100);
1782 
1783 		if (acked & CSG_ENDPOINT_CONFIG)
1784 			csg_slot_sync_priority_locked(ptdev, csg_id);
1785 
1786 		if (acked & CSG_STATE_MASK)
1787 			csg_slot_sync_state_locked(ptdev, csg_id);
1788 
1789 		if (acked & CSG_STATUS_UPDATE) {
1790 			csg_slot_sync_queues_state_locked(ptdev, csg_id);
1791 			csg_slot_sync_idle_state_locked(ptdev, csg_id);
1792 		}
1793 
1794 		if (ret && acked != req_mask &&
1795 		    ((csg_iface->input->req ^ csg_iface->output->ack) & req_mask) != 0) {
1796 			drm_err(&ptdev->base, "CSG %d update request timed out", csg_id);
1797 			ctx->timedout_mask |= BIT(csg_id);
1798 		}
1799 	}
1800 
1801 	if (ctx->timedout_mask)
1802 		return -ETIMEDOUT;
1803 
1804 	return 0;
1805 }
1806 
1807 struct panthor_sched_tick_ctx {
1808 	struct list_head old_groups[PANTHOR_CSG_PRIORITY_COUNT];
1809 	struct list_head groups[PANTHOR_CSG_PRIORITY_COUNT];
1810 	u32 idle_group_count;
1811 	u32 group_count;
1812 	enum panthor_csg_priority min_priority;
1813 	struct panthor_vm *vms[MAX_CS_PER_CSG];
1814 	u32 as_count;
1815 	bool immediate_tick;
1816 	u32 csg_upd_failed_mask;
1817 };
1818 
1819 static bool
1820 tick_ctx_is_full(const struct panthor_scheduler *sched,
1821 		 const struct panthor_sched_tick_ctx *ctx)
1822 {
1823 	return ctx->group_count == sched->csg_slot_count;
1824 }
1825 
1826 static bool
1827 group_is_idle(struct panthor_group *group)
1828 {
1829 	struct panthor_device *ptdev = group->ptdev;
1830 	u32 inactive_queues;
1831 
1832 	if (group->csg_id >= 0)
1833 		return ptdev->scheduler->csg_slots[group->csg_id].idle;
1834 
1835 	inactive_queues = group->idle_queues | group->blocked_queues;
1836 	return hweight32(inactive_queues) == group->queue_count;
1837 }
1838 
1839 static bool
1840 group_can_run(struct panthor_group *group)
1841 {
1842 	return group->state != PANTHOR_CS_GROUP_TERMINATED &&
1843 	       group->state != PANTHOR_CS_GROUP_UNKNOWN_STATE &&
1844 	       !group->destroyed && group->fatal_queues == 0 &&
1845 	       !group->timedout;
1846 }
1847 
1848 static void
1849 tick_ctx_pick_groups_from_list(const struct panthor_scheduler *sched,
1850 			       struct panthor_sched_tick_ctx *ctx,
1851 			       struct list_head *queue,
1852 			       bool skip_idle_groups,
1853 			       bool owned_by_tick_ctx)
1854 {
1855 	struct panthor_group *group, *tmp;
1856 
1857 	if (tick_ctx_is_full(sched, ctx))
1858 		return;
1859 
1860 	list_for_each_entry_safe(group, tmp, queue, run_node) {
1861 		u32 i;
1862 
1863 		if (!group_can_run(group))
1864 			continue;
1865 
1866 		if (skip_idle_groups && group_is_idle(group))
1867 			continue;
1868 
1869 		for (i = 0; i < ctx->as_count; i++) {
1870 			if (ctx->vms[i] == group->vm)
1871 				break;
1872 		}
1873 
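		/* Skip the group if its VM isn't in the tick context yet and
		 * all AS slots are already claimed.
		 */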
1874 		if (i == ctx->as_count && ctx->as_count == sched->as_slot_count)
1875 			continue;
1876 
1877 		if (!owned_by_tick_ctx)
1878 			group_get(group);
1879 
1880 		list_move_tail(&group->run_node, &ctx->groups[group->priority]);
1881 		ctx->group_count++;
1882 		if (group_is_idle(group))
1883 			ctx->idle_group_count++;
1884 
1885 		if (i == ctx->as_count)
1886 			ctx->vms[ctx->as_count++] = group->vm;
1887 
1888 		if (ctx->min_priority > group->priority)
1889 			ctx->min_priority = group->priority;
1890 
1891 		if (tick_ctx_is_full(sched, ctx))
1892 			return;
1893 	}
1894 }
1895 
1896 static void
1897 tick_ctx_insert_old_group(struct panthor_scheduler *sched,
1898 			  struct panthor_sched_tick_ctx *ctx,
1899 			  struct panthor_group *group,
1900 			  bool full_tick)
1901 {
1902 	struct panthor_csg_slot *csg_slot = &sched->csg_slots[group->csg_id];
1903 	struct panthor_group *other_group;
1904 
1905 	if (!full_tick) {
1906 		list_add_tail(&group->run_node, &ctx->old_groups[group->priority]);
1907 		return;
1908 	}
1909 
1910 	/* Rotate to make sure groups with lower CSG slot
1911 	 * priorities have a chance to get a higher CSG slot
1912 	 * priority next time they get picked. This priority
1913 	 * has an impact on resource request ordering, so it's
1914 	 * important to make sure we don't let one group starve
1915 	 * all other groups with the same group priority.
1916 	 */
1917 	list_for_each_entry(other_group,
1918 			    &ctx->old_groups[csg_slot->group->priority],
1919 			    run_node) {
1920 		struct panthor_csg_slot *other_csg_slot = &sched->csg_slots[other_group->csg_id];
1921 
1922 		if (other_csg_slot->priority > csg_slot->priority) {
1923 			list_add_tail(&csg_slot->group->run_node, &other_group->run_node);
1924 			return;
1925 		}
1926 	}
1927 
1928 	list_add_tail(&group->run_node, &ctx->old_groups[group->priority]);
1929 }
1930 
1931 static void
1932 tick_ctx_init(struct panthor_scheduler *sched,
1933 	      struct panthor_sched_tick_ctx *ctx,
1934 	      bool full_tick)
1935 {
1936 	struct panthor_device *ptdev = sched->ptdev;
1937 	struct panthor_csg_slots_upd_ctx upd_ctx;
1938 	int ret;
1939 	u32 i;
1940 
1941 	memset(ctx, 0, sizeof(*ctx));
1942 	csgs_upd_ctx_init(&upd_ctx);
1943 
1944 	ctx->min_priority = PANTHOR_CSG_PRIORITY_COUNT;
1945 	for (i = 0; i < ARRAY_SIZE(ctx->groups); i++) {
1946 		INIT_LIST_HEAD(&ctx->groups[i]);
1947 		INIT_LIST_HEAD(&ctx->old_groups[i]);
1948 	}
1949 
1950 	for (i = 0; i < sched->csg_slot_count; i++) {
1951 		struct panthor_csg_slot *csg_slot = &sched->csg_slots[i];
1952 		struct panthor_group *group = csg_slot->group;
1953 		struct panthor_fw_csg_iface *csg_iface;
1954 
1955 		if (!group)
1956 			continue;
1957 
1958 		csg_iface = panthor_fw_get_csg_iface(ptdev, i);
1959 		group_get(group);
1960 
1961 		/* If there were unhandled faults on the VM, force processing of
1962 		 * CSG IRQs, so we can flag the faulty queue.
1963 		 */
1964 		if (panthor_vm_has_unhandled_faults(group->vm)) {
1965 			sched_process_csg_irq_locked(ptdev, i);
1966 
1967 			/* No fatal fault reported, flag all queues as faulty. */
1968 			if (!group->fatal_queues)
1969 				group->fatal_queues |= GENMASK(group->queue_count - 1, 0);
1970 		}
1971 
1972 		tick_ctx_insert_old_group(sched, ctx, group, full_tick);
1973 		csgs_upd_ctx_queue_reqs(ptdev, &upd_ctx, i,
1974 					csg_iface->output->ack ^ CSG_STATUS_UPDATE,
1975 					CSG_STATUS_UPDATE);
1976 	}
1977 
1978 	ret = csgs_upd_ctx_apply_locked(ptdev, &upd_ctx);
1979 	if (ret) {
1980 		panthor_device_schedule_reset(ptdev);
1981 		ctx->csg_upd_failed_mask |= upd_ctx.timedout_mask;
1982 	}
1983 }
1984 
1985 #define NUM_INSTRS_PER_SLOT		16
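/* Each job occupies a fixed slot of NUM_INSTRS_PER_SLOT instructions in the
 * queue ring buffer (see the call_instrs sequence built in queue_run_job()).
 */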
1986 
1987 static void
1988 group_term_post_processing(struct panthor_group *group)
1989 {
1990 	struct panthor_job *job, *tmp;
1991 	LIST_HEAD(faulty_jobs);
1992 	bool cookie;
1993 	u32 i = 0;
1994 
1995 	if (drm_WARN_ON(&group->ptdev->base, group_can_run(group)))
1996 		return;
1997 
1998 	cookie = dma_fence_begin_signalling();
1999 	for (i = 0; i < group->queue_count; i++) {
2000 		struct panthor_queue *queue = group->queues[i];
2001 		struct panthor_syncobj_64b *syncobj;
2002 		int err;
2003 
2004 		if (group->fatal_queues & BIT(i))
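		/* Pick an error code matching the reason the group can no
		 * longer run.
		 */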
2005 			err = -EINVAL;
2006 		else if (group->timedout)
2007 			err = -ETIMEDOUT;
2008 		else
2009 			err = -ECANCELED;
2010 
2011 		if (!queue)
2012 			continue;
2013 
2014 		spin_lock(&queue->fence_ctx.lock);
2015 		list_for_each_entry_safe(job, tmp, &queue->fence_ctx.in_flight_jobs, node) {
2016 			list_move_tail(&job->node, &faulty_jobs);
2017 			dma_fence_set_error(job->done_fence, err);
2018 			dma_fence_signal_locked(job->done_fence);
2019 		}
2020 		spin_unlock(&queue->fence_ctx.lock);
2021 
2022 		/* Manually update the syncobj seqno to unblock waiters. */
2023 		syncobj = group->syncobjs->kmap + (i * sizeof(*syncobj));
2024 		syncobj->status = ~0;
2025 		syncobj->seqno = atomic64_read(&queue->fence_ctx.seqno);
2026 		sched_queue_work(group->ptdev->scheduler, sync_upd);
2027 	}
2028 	dma_fence_end_signalling(cookie);
2029 
2030 	list_for_each_entry_safe(job, tmp, &faulty_jobs, node) {
2031 		list_del_init(&job->node);
2032 		panthor_job_put(&job->base);
2033 	}
2034 }
2035 
2036 static void group_term_work(struct work_struct *work)
2037 {
2038 	struct panthor_group *group =
2039 		container_of(work, struct panthor_group, term_work);
2040 
2041 	group_term_post_processing(group);
2042 	group_put(group);
2043 }
2044 
2045 static void
2046 tick_ctx_cleanup(struct panthor_scheduler *sched,
2047 		 struct panthor_sched_tick_ctx *ctx)
2048 {
2049 	struct panthor_group *group, *tmp;
2050 	u32 i;
2051 
2052 	for (i = 0; i < ARRAY_SIZE(ctx->old_groups); i++) {
2053 		list_for_each_entry_safe(group, tmp, &ctx->old_groups[i], run_node) {
2054 			/* If everything went fine, we should only have groups
2055 			 * to be terminated in the old_groups lists.
2056 			 */
2057 			drm_WARN_ON(&group->ptdev->base, !ctx->csg_upd_failed_mask &&
2058 				    group_can_run(group));
2059 
2060 			if (!group_can_run(group)) {
2061 				list_del_init(&group->run_node);
2062 				list_del_init(&group->wait_node);
2063 				group_queue_work(group, term);
2064 			} else if (group->csg_id >= 0) {
2065 				list_del_init(&group->run_node);
2066 			} else {
2067 				list_move(&group->run_node,
2068 					  group_is_idle(group) ?
2069 					  &sched->groups.idle[group->priority] :
2070 					  &sched->groups.runnable[group->priority]);
2071 			}
2072 			group_put(group);
2073 		}
2074 	}
2075 
2076 	for (i = 0; i < ARRAY_SIZE(ctx->groups); i++) {
2077 		/* If everything went fine, the groups to schedule lists should
2078 		 * be empty.
2079 		 */
2080 		drm_WARN_ON(&group->ptdev->base,
2081 			    !ctx->csg_upd_failed_mask && !list_empty(&ctx->groups[i]));
2082 
2083 		list_for_each_entry_safe(group, tmp, &ctx->groups[i], run_node) {
2084 			if (group->csg_id >= 0) {
2085 				list_del_init(&group->run_node);
2086 			} else {
2087 				list_move(&group->run_node,
2088 					  group_is_idle(group) ?
2089 					  &sched->groups.idle[group->priority] :
2090 					  &sched->groups.runnable[group->priority]);
2091 			}
2092 			group_put(group);
2093 		}
2094 	}
2095 }
2096 
2097 static void
2098 tick_ctx_apply(struct panthor_scheduler *sched, struct panthor_sched_tick_ctx *ctx)
2099 {
2100 	struct panthor_group *group, *tmp;
2101 	struct panthor_device *ptdev = sched->ptdev;
2102 	struct panthor_csg_slot *csg_slot;
2103 	int prio, new_csg_prio = MAX_CSG_PRIO, i;
2104 	u32 free_csg_slots = 0;
2105 	struct panthor_csg_slots_upd_ctx upd_ctx;
2106 	int ret;
2107 
2108 	csgs_upd_ctx_init(&upd_ctx);
2109 
2110 	for (prio = PANTHOR_CSG_PRIORITY_COUNT - 1; prio >= 0; prio--) {
2111 		/* Suspend or terminate evicted groups. */
2112 		list_for_each_entry(group, &ctx->old_groups[prio], run_node) {
2113 			bool term = !group_can_run(group);
2114 			int csg_id = group->csg_id;
2115 
2116 			if (drm_WARN_ON(&ptdev->base, csg_id < 0))
2117 				continue;
2118 
2119 			csg_slot = &sched->csg_slots[csg_id];
2120 			csgs_upd_ctx_queue_reqs(ptdev, &upd_ctx, csg_id,
2121 						term ? CSG_STATE_TERMINATE : CSG_STATE_SUSPEND,
2122 						CSG_STATE_MASK);
2123 		}
2124 
2125 		/* Update priorities on already running groups. */
2126 		list_for_each_entry(group, &ctx->groups[prio], run_node) {
2127 			struct panthor_fw_csg_iface *csg_iface;
2128 			int csg_id = group->csg_id;
2129 
2130 			if (csg_id < 0) {
2131 				new_csg_prio--;
2132 				continue;
2133 			}
2134 
2135 			csg_slot = &sched->csg_slots[csg_id];
2136 			csg_iface = panthor_fw_get_csg_iface(ptdev, csg_id);
2137 			if (csg_slot->priority == new_csg_prio) {
2138 				new_csg_prio--;
2139 				continue;
2140 			}
2141 
2142 			panthor_fw_update_reqs(csg_iface, endpoint_req,
2143 					       CSG_EP_REQ_PRIORITY(new_csg_prio),
2144 					       CSG_EP_REQ_PRIORITY_MASK);
2145 			csgs_upd_ctx_queue_reqs(ptdev, &upd_ctx, csg_id,
2146 						csg_iface->output->ack ^ CSG_ENDPOINT_CONFIG,
2147 						CSG_ENDPOINT_CONFIG);
2148 			new_csg_prio--;
2149 		}
2150 	}
2151 
2152 	ret = csgs_upd_ctx_apply_locked(ptdev, &upd_ctx);
2153 	if (ret) {
2154 		panthor_device_schedule_reset(ptdev);
2155 		ctx->csg_upd_failed_mask |= upd_ctx.timedout_mask;
2156 		return;
2157 	}
2158 
2159 	/* Unbind evicted groups. */
2160 	for (prio = PANTHOR_CSG_PRIORITY_COUNT - 1; prio >= 0; prio--) {
2161 		list_for_each_entry(group, &ctx->old_groups[prio], run_node) {
2162 			/* This group is gone. Process its pending interrupts
2163 			 * so the slot is left clean before a new group is
2164 			 * started on it.
2165 			 */
2166 			if (group->csg_id >= 0)
2167 				sched_process_csg_irq_locked(ptdev, group->csg_id);
2168 
2169 			group_unbind_locked(group);
2170 		}
2171 	}
2172 
2173 	for (i = 0; i < sched->csg_slot_count; i++) {
2174 		if (!sched->csg_slots[i].group)
2175 			free_csg_slots |= BIT(i);
2176 	}
2177 
2178 	csgs_upd_ctx_init(&upd_ctx);
2179 	new_csg_prio = MAX_CSG_PRIO;
2180 
2181 	/* Start new groups. */
2182 	for (prio = PANTHOR_CSG_PRIORITY_COUNT - 1; prio >= 0; prio--) {
2183 		list_for_each_entry(group, &ctx->groups[prio], run_node) {
2184 			int csg_id = group->csg_id;
2185 			struct panthor_fw_csg_iface *csg_iface;
2186 
2187 			if (csg_id >= 0) {
2188 				new_csg_prio--;
2189 				continue;
2190 			}
2191 
2192 			csg_id = ffs(free_csg_slots) - 1;
2193 			if (drm_WARN_ON(&ptdev->base, csg_id < 0))
2194 				break;
2195 
2196 			csg_iface = panthor_fw_get_csg_iface(ptdev, csg_id);
2197 			csg_slot = &sched->csg_slots[csg_id];
2198 			group_bind_locked(group, csg_id);
2199 			csg_slot_prog_locked(ptdev, csg_id, new_csg_prio--);
2200 			csgs_upd_ctx_queue_reqs(ptdev, &upd_ctx, csg_id,
2201 						group->state == PANTHOR_CS_GROUP_SUSPENDED ?
2202 						CSG_STATE_RESUME : CSG_STATE_START,
2203 						CSG_STATE_MASK);
2204 			csgs_upd_ctx_queue_reqs(ptdev, &upd_ctx, csg_id,
2205 						csg_iface->output->ack ^ CSG_ENDPOINT_CONFIG,
2206 						CSG_ENDPOINT_CONFIG);
2207 			free_csg_slots &= ~BIT(csg_id);
2208 		}
2209 	}
2210 
2211 	ret = csgs_upd_ctx_apply_locked(ptdev, &upd_ctx);
2212 	if (ret) {
2213 		panthor_device_schedule_reset(ptdev);
2214 		ctx->csg_upd_failed_mask |= upd_ctx.timedout_mask;
2215 		return;
2216 	}
2217 
2218 	for (prio = PANTHOR_CSG_PRIORITY_COUNT - 1; prio >= 0; prio--) {
2219 		list_for_each_entry_safe(group, tmp, &ctx->groups[prio], run_node) {
2220 			list_del_init(&group->run_node);
2221 
2222 			/* If the group has been destroyed while we were
2223 			 * scheduling, ask for an immediate tick to
2224 			 * re-evaluate as soon as possible and get rid of
2225 			 * this dangling group.
2226 			 */
2227 			if (group->destroyed)
2228 				ctx->immediate_tick = true;
2229 			group_put(group);
2230 		}
2231 
2232 		/* Return evicted groups to the idle or run queues. Groups
2233 		 * that can no longer be run (because they've been destroyed
2234 		 * or experienced an unrecoverable error) will be scheduled
2235 		 * for destruction in tick_ctx_cleanup().
2236 		 */
2237 		list_for_each_entry_safe(group, tmp, &ctx->old_groups[prio], run_node) {
2238 			if (!group_can_run(group))
2239 				continue;
2240 
2241 			if (group_is_idle(group))
2242 				list_move_tail(&group->run_node, &sched->groups.idle[prio]);
2243 			else
2244 				list_move_tail(&group->run_node, &sched->groups.runnable[prio]);
2245 			group_put(group);
2246 		}
2247 	}
2248 
2249 	sched->used_csg_slot_count = ctx->group_count;
2250 	sched->might_have_idle_groups = ctx->idle_group_count > 0;
2251 }
2252 
2253 static u64
2254 tick_ctx_update_resched_target(struct panthor_scheduler *sched,
2255 			       const struct panthor_sched_tick_ctx *ctx)
2256 {
2257 	/* We had space left, no need to reschedule until some external event happens. */
2258 	if (!tick_ctx_is_full(sched, ctx))
2259 		goto no_tick;
2260 
2261 	/* If idle groups were scheduled, no need to wake up until some external
2262 	 * event happens (group unblocked, new job submitted, ...).
2263 	 */
2264 	if (ctx->idle_group_count)
2265 		goto no_tick;
2266 
2267 	if (drm_WARN_ON(&sched->ptdev->base, ctx->min_priority >= PANTHOR_CSG_PRIORITY_COUNT))
2268 		goto no_tick;
2269 
2270 	/* If there are groups of the same priority waiting, we need to
2271 	 * keep the scheduler ticking; otherwise, we'll just wait for
2272 	 * new groups with higher priority to be queued.
2273 	 */
2274 	if (!list_empty(&sched->groups.runnable[ctx->min_priority])) {
2275 		u64 resched_target = sched->last_tick + sched->tick_period;
2276 
2277 		if (time_before64(sched->resched_target, sched->last_tick) ||
2278 		    time_before64(resched_target, sched->resched_target))
2279 			sched->resched_target = resched_target;
2280 
2281 		return sched->resched_target - sched->last_tick;
2282 	}
2283 
2284 no_tick:
2285 	sched->resched_target = U64_MAX;
2286 	return U64_MAX;
2287 }
2288 
2289 static void tick_work(struct work_struct *work)
2290 {
2291 	struct panthor_scheduler *sched = container_of(work, struct panthor_scheduler,
2292 						      tick_work.work);
2293 	struct panthor_device *ptdev = sched->ptdev;
2294 	struct panthor_sched_tick_ctx ctx;
2295 	u64 remaining_jiffies = 0, resched_delay;
2296 	u64 now = get_jiffies_64();
2297 	int prio, ret, cookie;
2298 
2299 	if (!drm_dev_enter(&ptdev->base, &cookie))
2300 		return;
2301 
2302 	ret = pm_runtime_resume_and_get(ptdev->base.dev);
2303 	if (drm_WARN_ON(&ptdev->base, ret))
2304 		goto out_dev_exit;
2305 
2306 	if (time_before64(now, sched->resched_target))
2307 		remaining_jiffies = sched->resched_target - now;
2308 
2309 	mutex_lock(&sched->lock);
2310 	if (panthor_device_reset_is_pending(sched->ptdev))
2311 		goto out_unlock;
2312 
2313 	tick_ctx_init(sched, &ctx, remaining_jiffies != 0);
2314 	if (ctx.csg_upd_failed_mask)
2315 		goto out_cleanup_ctx;
2316 
2317 	if (remaining_jiffies) {
2318 		/* Scheduling forced in the middle of a tick. Only RT groups
2319 		 * can preempt non-RT ones. Currently running RT groups can't be
2320 		 * preempted.
2321 		 */
2322 		for (prio = PANTHOR_CSG_PRIORITY_COUNT - 1;
2323 		     prio >= 0 && !tick_ctx_is_full(sched, &ctx);
2324 		     prio--) {
2325 			tick_ctx_pick_groups_from_list(sched, &ctx, &ctx.old_groups[prio],
2326 						       true, true);
2327 			if (prio == PANTHOR_CSG_PRIORITY_RT) {
2328 				tick_ctx_pick_groups_from_list(sched, &ctx,
2329 							       &sched->groups.runnable[prio],
2330 							       true, false);
2331 			}
2332 		}
2333 	}
2334 
2335 	/* First pick non-idle groups */
2336 	for (prio = PANTHOR_CSG_PRIORITY_COUNT - 1;
2337 	     prio >= 0 && !tick_ctx_is_full(sched, &ctx);
2338 	     prio--) {
2339 		tick_ctx_pick_groups_from_list(sched, &ctx, &sched->groups.runnable[prio],
2340 					       true, false);
2341 		tick_ctx_pick_groups_from_list(sched, &ctx, &ctx.old_groups[prio], true, true);
2342 	}
2343 
2344 	/* If we have free CSG slots left, pick idle groups */
2345 	for (prio = PANTHOR_CSG_PRIORITY_COUNT - 1;
2346 	     prio >= 0 && !tick_ctx_is_full(sched, &ctx);
2347 	     prio--) {
2348 		/* Check the old_groups lists first to avoid reprogramming the slots */
2349 		tick_ctx_pick_groups_from_list(sched, &ctx, &ctx.old_groups[prio], false, true);
2350 		tick_ctx_pick_groups_from_list(sched, &ctx, &sched->groups.idle[prio],
2351 					       false, false);
2352 	}
2353 
2354 	tick_ctx_apply(sched, &ctx);
2355 	if (ctx.csg_upd_failed_mask)
2356 		goto out_cleanup_ctx;
2357 
2358 	if (ctx.idle_group_count == ctx.group_count) {
2359 		panthor_devfreq_record_idle(sched->ptdev);
2360 		if (sched->pm.has_ref) {
2361 			pm_runtime_put_autosuspend(ptdev->base.dev);
2362 			sched->pm.has_ref = false;
2363 		}
2364 	} else {
2365 		panthor_devfreq_record_busy(sched->ptdev);
2366 		if (!sched->pm.has_ref) {
2367 			pm_runtime_get(ptdev->base.dev);
2368 			sched->pm.has_ref = true;
2369 		}
2370 	}
2371 
2372 	sched->last_tick = now;
2373 	resched_delay = tick_ctx_update_resched_target(sched, &ctx);
2374 	if (ctx.immediate_tick)
2375 		resched_delay = 0;
2376 
2377 	if (resched_delay != U64_MAX)
2378 		sched_queue_delayed_work(sched, tick, resched_delay);
2379 
2380 out_cleanup_ctx:
2381 	tick_ctx_cleanup(sched, &ctx);
2382 
2383 out_unlock:
2384 	mutex_unlock(&sched->lock);
2385 	pm_runtime_mark_last_busy(ptdev->base.dev);
2386 	pm_runtime_put_autosuspend(ptdev->base.dev);
2387 
2388 out_dev_exit:
2389 	drm_dev_exit(cookie);
2390 }
2391 
2392 static int panthor_queue_eval_syncwait(struct panthor_group *group, u8 queue_idx)
2393 {
2394 	struct panthor_queue *queue = group->queues[queue_idx];
2395 	union {
2396 		struct panthor_syncobj_64b sync64;
2397 		struct panthor_syncobj_32b sync32;
2398 	} *syncobj;
2399 	bool result;
2400 	u64 value;
2401 
2402 	syncobj = panthor_queue_get_syncwait_obj(group, queue);
2403 	if (!syncobj)
2404 		return -EINVAL;
2405 
2406 	value = queue->syncwait.sync64 ?
2407 		syncobj->sync64.seqno :
2408 		syncobj->sync32.seqno;
2409 
2410 	if (queue->syncwait.gt)
2411 		result = value > queue->syncwait.ref;
2412 	else
2413 		result = value <= queue->syncwait.ref;
2414 
2415 	if (result)
2416 		panthor_queue_put_syncwait_obj(queue);
2417 
2418 	return result;
2419 }
2420 
2421 static void sync_upd_work(struct work_struct *work)
2422 {
2423 	struct panthor_scheduler *sched = container_of(work,
2424 						      struct panthor_scheduler,
2425 						      sync_upd_work);
2426 	struct panthor_group *group, *tmp;
2427 	bool immediate_tick = false;
2428 
2429 	mutex_lock(&sched->lock);
2430 	list_for_each_entry_safe(group, tmp, &sched->groups.waiting, wait_node) {
2431 		u32 tested_queues = group->blocked_queues;
2432 		u32 unblocked_queues = 0;
2433 
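		/* Re-evaluate the sync-wait condition of every blocked queue
		 * in this group.
		 */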
2434 		while (tested_queues) {
2435 			u32 cs_id = ffs(tested_queues) - 1;
2436 			int ret;
2437 
2438 			ret = panthor_queue_eval_syncwait(group, cs_id);
2439 			drm_WARN_ON(&group->ptdev->base, ret < 0);
2440 			if (ret)
2441 				unblocked_queues |= BIT(cs_id);
2442 
2443 			tested_queues &= ~BIT(cs_id);
2444 		}
2445 
2446 		if (unblocked_queues) {
2447 			group->blocked_queues &= ~unblocked_queues;
2448 
2449 			if (group->csg_id < 0) {
2450 				list_move(&group->run_node,
2451 					  &sched->groups.runnable[group->priority]);
2452 				if (group->priority == PANTHOR_CSG_PRIORITY_RT)
2453 					immediate_tick = true;
2454 			}
2455 		}
2456 
2457 		if (!group->blocked_queues)
2458 			list_del_init(&group->wait_node);
2459 	}
2460 	mutex_unlock(&sched->lock);
2461 
2462 	if (immediate_tick)
2463 		sched_queue_delayed_work(sched, tick, 0);
2464 }
2465 
2466 static void group_schedule_locked(struct panthor_group *group, u32 queue_mask)
2467 {
2468 	struct panthor_device *ptdev = group->ptdev;
2469 	struct panthor_scheduler *sched = ptdev->scheduler;
2470 	struct list_head *queue = &sched->groups.runnable[group->priority];
2471 	u64 delay_jiffies = 0;
2472 	bool was_idle;
2473 	u64 now;
2474 
2475 	if (!group_can_run(group))
2476 		return;
2477 
2478 	/* All updated queues are blocked, no need to wake up the scheduler. */
2479 	if ((queue_mask & group->blocked_queues) == queue_mask)
2480 		return;
2481 
2482 	was_idle = group_is_idle(group);
2483 	group->idle_queues &= ~queue_mask;
2484 
2485 	/* Don't mess with the lists if we're in the middle of a reset. */
2486 	if (atomic_read(&sched->reset.in_progress))
2487 		return;
2488 
2489 	if (was_idle && !group_is_idle(group))
2490 		list_move_tail(&group->run_node, queue);
2491 
2492 	/* RT groups are preemptive. */
2493 	if (group->priority == PANTHOR_CSG_PRIORITY_RT) {
2494 		sched_queue_delayed_work(sched, tick, 0);
2495 		return;
2496 	}
2497 
2498 	/* Some groups might be idle, force an immediate tick to
2499 	 * re-evaluate.
2500 	 */
2501 	if (sched->might_have_idle_groups) {
2502 		sched_queue_delayed_work(sched, tick, 0);
2503 		return;
2504 	}
2505 
2506 	/* Scheduler is ticking, nothing to do. */
2507 	if (sched->resched_target != U64_MAX) {
2508 		/* If there are free slots, force an immediate tick. */
2509 		if (sched->used_csg_slot_count < sched->csg_slot_count)
2510 			sched_queue_delayed_work(sched, tick, 0);
2511 
2512 		return;
2513 	}
2514 
2515 	/* Scheduler tick was off, recalculate the resched_target based on the
2516 	 * last tick event, and queue the scheduler work.
2517 	 */
2518 	now = get_jiffies_64();
2519 	sched->resched_target = sched->last_tick + sched->tick_period;
2520 	if (sched->used_csg_slot_count == sched->csg_slot_count &&
2521 	    time_before64(now, sched->resched_target))
2522 		delay_jiffies = min_t(unsigned long, sched->resched_target - now, ULONG_MAX);
2523 
2524 	sched_queue_delayed_work(sched, tick, delay_jiffies);
2525 }
2526 
2527 static void queue_stop(struct panthor_queue *queue,
2528 		       struct panthor_job *bad_job)
2529 {
2530 	drm_sched_stop(&queue->scheduler, bad_job ? &bad_job->base : NULL);
2531 }
2532 
2533 static void queue_start(struct panthor_queue *queue)
2534 {
2535 	struct panthor_job *job;
2536 
2537 	/* Re-assign the parent fences. */
2538 	list_for_each_entry(job, &queue->scheduler.pending_list, base.list)
2539 		job->base.s_fence->parent = dma_fence_get(job->done_fence);
2540 
2541 	drm_sched_start(&queue->scheduler, true);
2542 }
2543 
2544 static void panthor_group_stop(struct panthor_group *group)
2545 {
2546 	struct panthor_scheduler *sched = group->ptdev->scheduler;
2547 
2548 	lockdep_assert_held(&sched->reset.lock);
2549 
2550 	for (u32 i = 0; i < group->queue_count; i++)
2551 		queue_stop(group->queues[i], NULL);
2552 
2553 	group_get(group);
2554 	list_move_tail(&group->run_node, &sched->reset.stopped_groups);
2555 }
2556 
2557 static void panthor_group_start(struct panthor_group *group)
2558 {
2559 	struct panthor_scheduler *sched = group->ptdev->scheduler;
2560 
2561 	lockdep_assert_held(&group->ptdev->scheduler->reset.lock);
2562 
2563 	for (u32 i = 0; i < group->queue_count; i++)
2564 		queue_start(group->queues[i]);
2565 
2566 	if (group_can_run(group)) {
2567 		list_move_tail(&group->run_node,
2568 			       group_is_idle(group) ?
2569 			       &sched->groups.idle[group->priority] :
2570 			       &sched->groups.runnable[group->priority]);
2571 	} else {
2572 		list_del_init(&group->run_node);
2573 		list_del_init(&group->wait_node);
2574 		group_queue_work(group, term);
2575 	}
2576 
2577 	group_put(group);
2578 }
2579 
2580 static void panthor_sched_immediate_tick(struct panthor_device *ptdev)
2581 {
2582 	struct panthor_scheduler *sched = ptdev->scheduler;
2583 
2584 	sched_queue_delayed_work(sched, tick, 0);
2585 }
2586 
2587 /**
2588  * panthor_sched_report_mmu_fault() - Report MMU faults to the scheduler.
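 * @ptdev: Device.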
2589  */
2590 void panthor_sched_report_mmu_fault(struct panthor_device *ptdev)
2591 {
2592 	/* Force a tick to immediately kill faulty groups. */
2593 	if (ptdev->scheduler)
2594 		panthor_sched_immediate_tick(ptdev);
2595 }
2596 
2597 void panthor_sched_resume(struct panthor_device *ptdev)
2598 {
2599 	/* Force a tick to re-evaluate after a resume. */
2600 	panthor_sched_immediate_tick(ptdev);
2601 }
2602 
2603 void panthor_sched_suspend(struct panthor_device *ptdev)
2604 {
2605 	struct panthor_scheduler *sched = ptdev->scheduler;
2606 	struct panthor_csg_slots_upd_ctx upd_ctx;
2607 	struct panthor_group *group;
2608 	u32 suspended_slots;
2609 	u32 i;
2610 
2611 	mutex_lock(&sched->lock);
2612 	csgs_upd_ctx_init(&upd_ctx);
2613 	for (i = 0; i < sched->csg_slot_count; i++) {
2614 		struct panthor_csg_slot *csg_slot = &sched->csg_slots[i];
2615 
2616 		if (csg_slot->group) {
2617 			csgs_upd_ctx_queue_reqs(ptdev, &upd_ctx, i,
2618 						group_can_run(csg_slot->group) ?
2619 						CSG_STATE_SUSPEND : CSG_STATE_TERMINATE,
2620 						CSG_STATE_MASK);
2621 		}
2622 	}
2623 
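	/* Slots we queued a state change for are suspend candidates. Those
	 * that time out are dropped from the mask below.
	 */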
2624 	suspended_slots = upd_ctx.update_mask;
2625 
2626 	csgs_upd_ctx_apply_locked(ptdev, &upd_ctx);
2627 	suspended_slots &= ~upd_ctx.timedout_mask;
2628 
2629 	if (upd_ctx.timedout_mask) {
2630 		u32 slot_mask = upd_ctx.timedout_mask;
2631 
2632 		drm_err(&ptdev->base, "CSG suspend failed, escalating to termination");
2633 		csgs_upd_ctx_init(&upd_ctx);
2634 		while (slot_mask) {
2635 			u32 csg_id = ffs(slot_mask) - 1;
2636 
2637 			csgs_upd_ctx_queue_reqs(ptdev, &upd_ctx, csg_id,
2638 						CSG_STATE_TERMINATE,
2639 						CSG_STATE_MASK);
2640 			slot_mask &= ~BIT(csg_id);
2641 		}
2642 
2643 		csgs_upd_ctx_apply_locked(ptdev, &upd_ctx);
2644 
2645 		slot_mask = upd_ctx.timedout_mask;
2646 		while (slot_mask) {
2647 			u32 csg_id = ffs(slot_mask) - 1;
2648 			struct panthor_csg_slot *csg_slot = &sched->csg_slots[csg_id];
2649 
2650 			/* The terminate command timed out, but the soft-reset will
2651 			 * automatically terminate all active groups, so let's
2652 			 * force the state to halted here.
2653 			 */
2654 			if (csg_slot->group->state != PANTHOR_CS_GROUP_TERMINATED)
2655 				csg_slot->group->state = PANTHOR_CS_GROUP_TERMINATED;
2656 			slot_mask &= ~BIT(csg_id);
2657 		}
2658 	}
2659 
2660 	/* Flush L2 and LSC caches to make sure suspend state is up-to-date.
2661 	 * If the flush fails, flag all queues for termination.
2662 	 */
2663 	if (suspended_slots) {
2664 		bool flush_caches_failed = false;
2665 		u32 slot_mask = suspended_slots;
2666 
2667 		if (panthor_gpu_flush_caches(ptdev, CACHE_CLEAN, CACHE_CLEAN, 0))
2668 			flush_caches_failed = true;
2669 
2670 		while (slot_mask) {
2671 			u32 csg_id = ffs(slot_mask) - 1;
2672 			struct panthor_csg_slot *csg_slot = &sched->csg_slots[csg_id];
2673 
2674 			if (flush_caches_failed)
2675 				csg_slot->group->state = PANTHOR_CS_GROUP_TERMINATED;
2676 			else
2677 				csg_slot_sync_update_locked(ptdev, csg_id);
2678 
2679 			slot_mask &= ~BIT(csg_id);
2680 		}
2681 	}
2682 
2683 	for (i = 0; i < sched->csg_slot_count; i++) {
2684 		struct panthor_csg_slot *csg_slot = &sched->csg_slots[i];
2685 
2686 		group = csg_slot->group;
2687 		if (!group)
2688 			continue;
2689 
2690 		group_get(group);
2691 
2692 		if (group->csg_id >= 0)
2693 			sched_process_csg_irq_locked(ptdev, group->csg_id);
2694 
2695 		group_unbind_locked(group);
2696 
2697 		drm_WARN_ON(&group->ptdev->base, !list_empty(&group->run_node));
2698 
2699 		if (group_can_run(group)) {
2700 			list_add(&group->run_node,
2701 				 &sched->groups.idle[group->priority]);
2702 		} else {
2703 			/* We don't bother stopping the scheduler if the group is
2704 			 * faulty; the group termination work will finish the job.
2705 			 */
2706 			list_del_init(&group->wait_node);
2707 			group_queue_work(group, term);
2708 		}
2709 		group_put(group);
2710 	}
2711 	mutex_unlock(&sched->lock);
2712 }
2713 
2714 void panthor_sched_pre_reset(struct panthor_device *ptdev)
2715 {
2716 	struct panthor_scheduler *sched = ptdev->scheduler;
2717 	struct panthor_group *group, *group_tmp;
2718 	u32 i;
2719 
2720 	mutex_lock(&sched->reset.lock);
2721 	atomic_set(&sched->reset.in_progress, true);
2722 
2723 	/* Cancel all scheduler works. Once this is done, these works can't be
2724 	 * scheduled again until the reset operation is complete.
2725 	 */
2726 	cancel_work_sync(&sched->sync_upd_work);
2727 	cancel_delayed_work_sync(&sched->tick_work);
2728 
2729 	panthor_sched_suspend(ptdev);
2730 
2731 	/* Stop all groups that might still accept jobs, so we aren't handed
2732 	 * new jobs while we're resetting.
2733 	 */
2734 	for (i = 0; i < ARRAY_SIZE(sched->groups.runnable); i++) {
2735 		/* All groups should be in the idle lists. */
2736 		drm_WARN_ON(&ptdev->base, !list_empty(&sched->groups.runnable[i]));
2737 		list_for_each_entry_safe(group, group_tmp, &sched->groups.runnable[i], run_node)
2738 			panthor_group_stop(group);
2739 	}
2740 
2741 	for (i = 0; i < ARRAY_SIZE(sched->groups.idle); i++) {
2742 		list_for_each_entry_safe(group, group_tmp, &sched->groups.idle[i], run_node)
2743 			panthor_group_stop(group);
2744 	}
2745 
2746 	mutex_unlock(&sched->reset.lock);
2747 }
2748 
2749 void panthor_sched_post_reset(struct panthor_device *ptdev, bool reset_failed)
2750 {
2751 	struct panthor_scheduler *sched = ptdev->scheduler;
2752 	struct panthor_group *group, *group_tmp;
2753 
2754 	mutex_lock(&sched->reset.lock);
2755 
2756 	list_for_each_entry_safe(group, group_tmp, &sched->reset.stopped_groups, run_node) {
2757 		/* Consider all previously running groups as terminated if the
2758 		 * reset failed.
2759 		 */
2760 		if (reset_failed)
2761 			group->state = PANTHOR_CS_GROUP_TERMINATED;
2762 
2763 		panthor_group_start(group);
2764 	}
2765 
2766 	/* We're done resetting the GPU, clear the reset.in_progress bit so we can
2767 	 * kick the scheduler.
2768 	 */
2769 	atomic_set(&sched->reset.in_progress, false);
2770 	mutex_unlock(&sched->reset.lock);
2771 
2772 	/* No need to queue a tick and update syncs if the reset failed. */
2773 	if (!reset_failed) {
2774 		sched_queue_delayed_work(sched, tick, 0);
2775 		sched_queue_work(sched, sync_upd);
2776 	}
2777 }
2778 
2779 static void group_sync_upd_work(struct work_struct *work)
2780 {
2781 	struct panthor_group *group =
2782 		container_of(work, struct panthor_group, sync_upd_work);
2783 	struct panthor_job *job, *job_tmp;
2784 	LIST_HEAD(done_jobs);
2785 	u32 queue_idx;
2786 	bool cookie;
2787 
2788 	cookie = dma_fence_begin_signalling();
2789 	for (queue_idx = 0; queue_idx < group->queue_count; queue_idx++) {
2790 		struct panthor_queue *queue = group->queues[queue_idx];
2791 		struct panthor_syncobj_64b *syncobj;
2792 
2793 		if (!queue)
2794 			continue;
2795 
2796 		syncobj = group->syncobjs->kmap + (queue_idx * sizeof(*syncobj));
2797 
2798 		spin_lock(&queue->fence_ctx.lock);
2799 		list_for_each_entry_safe(job, job_tmp, &queue->fence_ctx.in_flight_jobs, node) {
2800 			if (syncobj->seqno < job->done_fence->seqno)
2801 				break;
2802 
2803 			list_move_tail(&job->node, &done_jobs);
2804 			dma_fence_signal_locked(job->done_fence);
2805 		}
2806 		spin_unlock(&queue->fence_ctx.lock);
2807 	}
2808 	dma_fence_end_signalling(cookie);
2809 
2810 	list_for_each_entry_safe(job, job_tmp, &done_jobs, node) {
2811 		list_del_init(&job->node);
2812 		panthor_job_put(&job->base);
2813 	}
2814 
2815 	group_put(group);
2816 }
2817 
2818 static struct dma_fence *
2819 queue_run_job(struct drm_sched_job *sched_job)
2820 {
2821 	struct panthor_job *job = container_of(sched_job, struct panthor_job, base);
2822 	struct panthor_group *group = job->group;
2823 	struct panthor_queue *queue = group->queues[job->queue_idx];
2824 	struct panthor_device *ptdev = group->ptdev;
2825 	struct panthor_scheduler *sched = ptdev->scheduler;
2826 	u32 ringbuf_size = panthor_kernel_bo_size(queue->ringbuf);
2827 	u32 ringbuf_insert = queue->iface.input->insert & (ringbuf_size - 1);
2828 	u64 addr_reg = ptdev->csif_info.cs_reg_count -
2829 		       ptdev->csif_info.unpreserved_cs_reg_count;
2830 	u64 val_reg = addr_reg + 2;
2831 	u64 sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) +
2832 			job->queue_idx * sizeof(struct panthor_syncobj_64b);
2833 	u32 waitall_mask = GENMASK(sched->sb_slot_count - 1, 0);
2834 	struct dma_fence *done_fence;
2835 	int ret;
2836 
2837 	u64 call_instrs[NUM_INSTRS_PER_SLOT] = {
2838 		/* MOV32 rX+2, cs.latest_flush */
2839 		(2ull << 56) | (val_reg << 48) | job->call_info.latest_flush,
2840 
2841 		/* FLUSH_CACHE2.clean_inv_all.no_wait.signal(0) rX+2 */
2842 		(36ull << 56) | (0ull << 48) | (val_reg << 40) | (0 << 16) | 0x233,
2843 
2844 		/* MOV48 rX:rX+1, cs.start */
2845 		(1ull << 56) | (addr_reg << 48) | job->call_info.start,
2846 
2847 		/* MOV32 rX+2, cs.size */
2848 		(2ull << 56) | (val_reg << 48) | job->call_info.size,
2849 
2850 		/* WAIT(0) => waits for FLUSH_CACHE2 instruction */
2851 		(3ull << 56) | (1 << 16),
2852 
2853 		/* CALL rX:rX+1, rX+2 */
2854 		(32ull << 56) | (addr_reg << 40) | (val_reg << 32),
2855 
2856 		/* MOV48 rX:rX+1, sync_addr */
2857 		(1ull << 56) | (addr_reg << 48) | sync_addr,
2858 
2859 		/* MOV48 rX+2, #1 */
2860 		(1ull << 56) | (val_reg << 48) | 1,
2861 
2862 		/* WAIT(all) */
2863 		(3ull << 56) | (waitall_mask << 16),
2864 
2865 		/* SYNC_ADD64.system_scope.propagate_err.nowait rX:rX+1, rX+2 */
2866 		(51ull << 56) | (0ull << 48) | (addr_reg << 40) | (val_reg << 32) | (0 << 16) | 1,
2867 
2868 		/* ERROR_BARRIER, so we can recover from faults at job
2869 		 * boundaries.
2870 		 */
2871 		(47ull << 56),
2872 	};
2873 
2874 	/* Need to be cacheline aligned to please the prefetcher. */
2875 	static_assert(sizeof(call_instrs) % 64 == 0,
2876 		      "call_instrs is not aligned on a cacheline");
2877 
2878 	/* Stream size is zero, nothing to do except making sure all previously
2879 	 * submitted jobs are done before we signal the
2880 	 * drm_sched_job::s_fence::finished fence.
2881 	 */
2882 	if (!job->call_info.size) {
2883 		job->done_fence = dma_fence_get(queue->fence_ctx.last_fence);
2884 		return dma_fence_get(job->done_fence);
2885 	}
2886 
2887 	ret = pm_runtime_resume_and_get(ptdev->base.dev);
2888 	if (drm_WARN_ON(&ptdev->base, ret))
2889 		return ERR_PTR(ret);
2890 
2891 	mutex_lock(&sched->lock);
2892 	if (!group_can_run(group)) {
2893 		done_fence = ERR_PTR(-ECANCELED);
2894 		goto out_unlock;
2895 	}
2896 
2897 	dma_fence_init(job->done_fence,
2898 		       &panthor_queue_fence_ops,
2899 		       &queue->fence_ctx.lock,
2900 		       queue->fence_ctx.id,
2901 		       atomic64_inc_return(&queue->fence_ctx.seqno));
2902 
2903 	memcpy(queue->ringbuf->kmap + ringbuf_insert,
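	/* Copy the call sequence into the ring buffer at the current insert
	 * point (masked, since the ring buffer size is a power of two).
	 */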
2904 	       call_instrs, sizeof(call_instrs));
2905 
2906 	panthor_job_get(&job->base);
2907 	spin_lock(&queue->fence_ctx.lock);
2908 	list_add_tail(&job->node, &queue->fence_ctx.in_flight_jobs);
2909 	spin_unlock(&queue->fence_ctx.lock);
2910 
2911 	job->ringbuf.start = queue->iface.input->insert;
2912 	job->ringbuf.end = job->ringbuf.start + sizeof(call_instrs);
2913 
2914 	/* Make sure the ring buffer is updated before the INSERT
2915 	 * register.
2916 	 */
2917 	wmb();
2918 
2919 	queue->iface.input->extract = queue->iface.output->extract;
2920 	queue->iface.input->insert = job->ringbuf.end;
2921 
2922 	if (group->csg_id < 0) {
2923 		/* If the queue is blocked, we want to keep the timeout running, so we
2924 		 * can detect unbounded waits and kill the group when that happens.
2925 		 * Otherwise, we suspend the timeout so the time we spend waiting for
2926 		 * a CSG slot is not counted.
2927 		 */
2928 		if (!(group->blocked_queues & BIT(job->queue_idx)) &&
2929 		    !queue->timeout_suspended) {
2930 			queue->remaining_time = drm_sched_suspend_timeout(&queue->scheduler);
2931 			queue->timeout_suspended = true;
2932 		}
2933 
2934 		group_schedule_locked(group, BIT(job->queue_idx));
2935 	} else {
2936 		gpu_write(ptdev, CSF_DOORBELL(queue->doorbell_id), 1);
2937 		if (!sched->pm.has_ref &&
2938 		    !(group->blocked_queues & BIT(job->queue_idx))) {
2939 			pm_runtime_get(ptdev->base.dev);
2940 			sched->pm.has_ref = true;
2941 		}
2942 		panthor_devfreq_record_busy(sched->ptdev);
2943 	}
2944 
2945 	/* Update the last fence. */
2946 	dma_fence_put(queue->fence_ctx.last_fence);
2947 	queue->fence_ctx.last_fence = dma_fence_get(job->done_fence);
2948 
2949 	done_fence = dma_fence_get(job->done_fence);
2950 
2951 out_unlock:
2952 	mutex_unlock(&sched->lock);
2953 	pm_runtime_mark_last_busy(ptdev->base.dev);
2954 	pm_runtime_put_autosuspend(ptdev->base.dev);
2955 
2956 	return done_fence;
2957 }
2958 
2959 static enum drm_gpu_sched_stat
2960 queue_timedout_job(struct drm_sched_job *sched_job)
2961 {
2962 	struct panthor_job *job = container_of(sched_job, struct panthor_job, base);
2963 	struct panthor_group *group = job->group;
2964 	struct panthor_device *ptdev = group->ptdev;
2965 	struct panthor_scheduler *sched = ptdev->scheduler;
2966 	struct panthor_queue *queue = group->queues[job->queue_idx];
2967 
2968 	drm_warn(&ptdev->base, "job timeout\n");
2969 
2970 	drm_WARN_ON(&ptdev->base, atomic_read(&sched->reset.in_progress));
2971 
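	/* Stop the drm_sched instance while we mark the group as timed out. */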
2972 	queue_stop(queue, job);
2973 
2974 	mutex_lock(&sched->lock);
2975 	group->timedout = true;
2976 	if (group->csg_id >= 0) {
2977 		sched_queue_delayed_work(ptdev->scheduler, tick, 0);
2978 	} else {
2979 		/* Remove from the run queues, so the scheduler can't
2980 		 * pick the group on the next tick.
2981 		 */
2982 		list_del_init(&group->run_node);
2983 		list_del_init(&group->wait_node);
2984 
2985 		group_queue_work(group, term);
2986 	}
2987 	mutex_unlock(&sched->lock);
2988 
2989 	queue_start(queue);
2990 
2991 	return DRM_GPU_SCHED_STAT_NOMINAL;
2992 }
2993 
2994 static void queue_free_job(struct drm_sched_job *sched_job)
2995 {
2996 	drm_sched_job_cleanup(sched_job);
2997 	panthor_job_put(sched_job);
2998 }
2999 
3000 static const struct drm_sched_backend_ops panthor_queue_sched_ops = {
3001 	.run_job = queue_run_job,
3002 	.timedout_job = queue_timedout_job,
3003 	.free_job = queue_free_job,
3004 };
3005 
3006 static struct panthor_queue *
3007 group_create_queue(struct panthor_group *group,
3008 		   const struct drm_panthor_queue_create *args)
3009 {
3010 	struct drm_gpu_scheduler *drm_sched;
3011 	struct panthor_queue *queue;
3012 	int ret;
3013 
3014 	if (args->pad[0] || args->pad[1] || args->pad[2])
3015 		return ERR_PTR(-EINVAL);
3016 
3017 	if (args->ringbuf_size < SZ_4K || args->ringbuf_size > SZ_64K ||
3018 	    !is_power_of_2(args->ringbuf_size))
3019 		return ERR_PTR(-EINVAL);
3020 
3021 	if (args->priority > CSF_MAX_QUEUE_PRIO)
3022 		return ERR_PTR(-EINVAL);
3023 
3024 	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
3025 	if (!queue)
3026 		return ERR_PTR(-ENOMEM);
3027 
3028 	queue->fence_ctx.id = dma_fence_context_alloc(1);
3029 	spin_lock_init(&queue->fence_ctx.lock);
3030 	INIT_LIST_HEAD(&queue->fence_ctx.in_flight_jobs);
3031 
3032 	queue->priority = args->priority;
3033 
3034 	queue->ringbuf = panthor_kernel_bo_create(group->ptdev, group->vm,
3035 						  args->ringbuf_size,
3036 						  DRM_PANTHOR_BO_NO_MMAP,
3037 						  DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC |
3038 						  DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED,
3039 						  PANTHOR_VM_KERNEL_AUTO_VA);
3040 	if (IS_ERR(queue->ringbuf)) {
3041 		ret = PTR_ERR(queue->ringbuf);
3042 		goto err_free_queue;
3043 	}
3044 
3045 	ret = panthor_kernel_bo_vmap(queue->ringbuf);
3046 	if (ret)
3047 		goto err_free_queue;
3048 
3049 	queue->iface.mem = panthor_fw_alloc_queue_iface_mem(group->ptdev,
3050 							    &queue->iface.input,
3051 							    &queue->iface.output,
3052 							    &queue->iface.input_fw_va,
3053 							    &queue->iface.output_fw_va);
3054 	if (IS_ERR(queue->iface.mem)) {
3055 		ret = PTR_ERR(queue->iface.mem);
3056 		goto err_free_queue;
3057 	}
3058 
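	/* One drm_sched credit per job slot: the credit limit is the number
	 * of jobs the ring buffer can hold.
	 */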
3059 	ret = drm_sched_init(&queue->scheduler, &panthor_queue_sched_ops,
3060 			     group->ptdev->scheduler->wq, 1,
3061 			     args->ringbuf_size / (NUM_INSTRS_PER_SLOT * sizeof(u64)),
3062 			     0, msecs_to_jiffies(JOB_TIMEOUT_MS),
3063 			     group->ptdev->reset.wq,
3064 			     NULL, "panthor-queue", group->ptdev->base.dev);
3065 	if (ret)
3066 		goto err_free_queue;
3067 
3068 	drm_sched = &queue->scheduler;
3069 	ret = drm_sched_entity_init(&queue->entity, 0, &drm_sched, 1, NULL);
3070 
3071 	return queue;
3072 
3073 err_free_queue:
3074 	group_free_queue(group, queue);
3075 	return ERR_PTR(ret);
3076 }
3077 
3078 #define MAX_GROUPS_PER_POOL		128
3079 
3080 int panthor_group_create(struct panthor_file *pfile,
3081 			 const struct drm_panthor_group_create *group_args,
3082 			 const struct drm_panthor_queue_create *queue_args)
3083 {
3084 	struct panthor_device *ptdev = pfile->ptdev;
3085 	struct panthor_group_pool *gpool = pfile->groups;
3086 	struct panthor_scheduler *sched = ptdev->scheduler;
3087 	struct panthor_fw_csg_iface *csg_iface = panthor_fw_get_csg_iface(ptdev, 0);
3088 	struct panthor_group *group = NULL;
3089 	u32 gid, i, suspend_size;
3090 	int ret;
3091 
3092 	if (group_args->pad)
3093 		return -EINVAL;
3094 
3095 	if (group_args->priority >= PANTHOR_CSG_PRIORITY_COUNT)
3096 		return -EINVAL;
3097 
3098 	if ((group_args->compute_core_mask & ~ptdev->gpu_info.shader_present) ||
3099 	    (group_args->fragment_core_mask & ~ptdev->gpu_info.shader_present) ||
3100 	    (group_args->tiler_core_mask & ~ptdev->gpu_info.tiler_present))
3101 		return -EINVAL;
3102 
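	/* A group can't request more cores of a given type than its core
	 * masks expose.
	 */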
3103 	if (hweight64(group_args->compute_core_mask) < group_args->max_compute_cores ||
3104 	    hweight64(group_args->fragment_core_mask) < group_args->max_fragment_cores ||
3105 	    hweight64(group_args->tiler_core_mask) < group_args->max_tiler_cores)
3106 		return -EINVAL;
3107 
3108 	group = kzalloc(sizeof(*group), GFP_KERNEL);
3109 	if (!group)
3110 		return -ENOMEM;
3111 
3112 	spin_lock_init(&group->fatal_lock);
3113 	kref_init(&group->refcount);
3114 	group->state = PANTHOR_CS_GROUP_CREATED;
3115 	group->csg_id = -1;
3116 
3117 	group->ptdev = ptdev;
3118 	group->max_compute_cores = group_args->max_compute_cores;
3119 	group->compute_core_mask = group_args->compute_core_mask;
3120 	group->max_fragment_cores = group_args->max_fragment_cores;
3121 	group->fragment_core_mask = group_args->fragment_core_mask;
3122 	group->max_tiler_cores = group_args->max_tiler_cores;
3123 	group->tiler_core_mask = group_args->tiler_core_mask;
3124 	group->priority = group_args->priority;
3125 
3126 	INIT_LIST_HEAD(&group->wait_node);
3127 	INIT_LIST_HEAD(&group->run_node);
3128 	INIT_WORK(&group->term_work, group_term_work);
3129 	INIT_WORK(&group->sync_upd_work, group_sync_upd_work);
3130 	INIT_WORK(&group->tiler_oom_work, group_tiler_oom_work);
3131 	INIT_WORK(&group->release_work, group_release_work);
3132 
3133 	group->vm = panthor_vm_pool_get_vm(pfile->vms, group_args->vm_id);
3134 	if (!group->vm) {
3135 		ret = -EINVAL;
3136 		goto err_put_group;
3137 	}
3138 
3139 	suspend_size = csg_iface->control->suspend_size;
3140 	group->suspend_buf = panthor_fw_alloc_suspend_buf_mem(ptdev, suspend_size);
3141 	if (IS_ERR(group->suspend_buf)) {
3142 		ret = PTR_ERR(group->suspend_buf);
3143 		group->suspend_buf = NULL;
3144 		goto err_put_group;
3145 	}
3146 
3147 	suspend_size = csg_iface->control->protm_suspend_size;
3148 	group->protm_suspend_buf = panthor_fw_alloc_suspend_buf_mem(ptdev, suspend_size);
3149 	if (IS_ERR(group->protm_suspend_buf)) {
3150 		ret = PTR_ERR(group->protm_suspend_buf);
3151 		group->protm_suspend_buf = NULL;
3152 		goto err_put_group;
3153 	}
3154 
3155 	group->syncobjs = panthor_kernel_bo_create(ptdev, group->vm,
3156 						   group_args->queues.count *
3157 						   sizeof(struct panthor_syncobj_64b),
3158 						   DRM_PANTHOR_BO_NO_MMAP,
3159 						   DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC |
3160 						   DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED,
3161 						   PANTHOR_VM_KERNEL_AUTO_VA);
3162 	if (IS_ERR(group->syncobjs)) {
3163 		ret = PTR_ERR(group->syncobjs);
3164 		goto err_put_group;
3165 	}
3166 
3167 	ret = panthor_kernel_bo_vmap(group->syncobjs);
3168 	if (ret)
3169 		goto err_put_group;
3170 
3171 	memset(group->syncobjs->kmap, 0,
3172 	       group_args->queues.count * sizeof(struct panthor_syncobj_64b));
3173 
3174 	for (i = 0; i < group_args->queues.count; i++) {
3175 		group->queues[i] = group_create_queue(group, &queue_args[i]);
3176 		if (IS_ERR(group->queues[i])) {
3177 			ret = PTR_ERR(group->queues[i]);
3178 			group->queues[i] = NULL;
3179 			goto err_put_group;
3180 		}
3181 
3182 		group->queue_count++;
3183 	}
3184 
3185 	group->idle_queues = GENMASK(group->queue_count - 1, 0);
3186 
3187 	ret = xa_alloc(&gpool->xa, &gid, group, XA_LIMIT(1, MAX_GROUPS_PER_POOL), GFP_KERNEL);
3188 	if (ret)
3189 		goto err_put_group;
3190 
3191 	mutex_lock(&sched->reset.lock);
3192 	if (atomic_read(&sched->reset.in_progress)) {
3193 		panthor_group_stop(group);
3194 	} else {
3195 		mutex_lock(&sched->lock);
3196 		list_add_tail(&group->run_node,
3197 			      &sched->groups.idle[group->priority]);
3198 		mutex_unlock(&sched->lock);
3199 	}
3200 	mutex_unlock(&sched->reset.lock);
3201 
3202 	return gid;
3203 
3204 err_put_group:
3205 	group_put(group);
3206 	return ret;
3207 }
3208 
3209 int panthor_group_destroy(struct panthor_file *pfile, u32 group_handle)
3210 {
3211 	struct panthor_group_pool *gpool = pfile->groups;
3212 	struct panthor_device *ptdev = pfile->ptdev;
3213 	struct panthor_scheduler *sched = ptdev->scheduler;
3214 	struct panthor_group *group;
3215 
3216 	group = xa_erase(&gpool->xa, group_handle);
3217 	if (!group)
3218 		return -EINVAL;
3219 
3220 	for (u32 i = 0; i < group->queue_count; i++) {
3221 		if (group->queues[i])
3222 			drm_sched_entity_destroy(&group->queues[i]->entity);
3223 	}
3224 
3225 	mutex_lock(&sched->reset.lock);
3226 	mutex_lock(&sched->lock);
3227 	group->destroyed = true;
3228 	if (group->csg_id >= 0) {
3229 		sched_queue_delayed_work(sched, tick, 0);
3230 	} else if (!atomic_read(&sched->reset.in_progress)) {
3231 		/* Remove from the run queues, so the scheduler can't
3232 		 * pick the group on the next tick.
3233 		 */
3234 		list_del_init(&group->run_node);
3235 		list_del_init(&group->wait_node);
3236 		group_queue_work(group, term);
3237 	}
3238 	mutex_unlock(&sched->lock);
3239 	mutex_unlock(&sched->reset.lock);
3240 
3241 	group_put(group);
3242 	return 0;
3243 }
3244 
3245 int panthor_group_get_state(struct panthor_file *pfile,
3246 			    struct drm_panthor_group_get_state *get_state)
3247 {
3248 	struct panthor_group_pool *gpool = pfile->groups;
3249 	struct panthor_device *ptdev = pfile->ptdev;
3250 	struct panthor_scheduler *sched = ptdev->scheduler;
3251 	struct panthor_group *group;
3252 
3253 	if (get_state->pad)
3254 		return -EINVAL;
3255 
3256 	group = group_get(xa_load(&gpool->xa, get_state->group_handle));
3257 	if (!group)
3258 		return -EINVAL;
3259 
3260 	memset(get_state, 0, sizeof(*get_state));
3261 
3262 	mutex_lock(&sched->lock);
3263 	if (group->timedout)
3264 		get_state->state |= DRM_PANTHOR_GROUP_STATE_TIMEDOUT;
3265 	if (group->fatal_queues) {
3266 		get_state->state |= DRM_PANTHOR_GROUP_STATE_FATAL_FAULT;
3267 		get_state->fatal_queues = group->fatal_queues;
3268 	}
3269 	mutex_unlock(&sched->lock);
3270 
3271 	group_put(group);
3272 	return 0;
3273 }
3274 
3275 int panthor_group_pool_create(struct panthor_file *pfile)
3276 {
3277 	struct panthor_group_pool *gpool;
3278 
3279 	gpool = kzalloc(sizeof(*gpool), GFP_KERNEL);
3280 	if (!gpool)
3281 		return -ENOMEM;
3282 
3283 	xa_init_flags(&gpool->xa, XA_FLAGS_ALLOC1);
3284 	pfile->groups = gpool;
3285 	return 0;
3286 }
3287 
3288 void panthor_group_pool_destroy(struct panthor_file *pfile)
3289 {
3290 	struct panthor_group_pool *gpool = pfile->groups;
3291 	struct panthor_group *group;
3292 	unsigned long i;
3293 
3294 	if (IS_ERR_OR_NULL(gpool))
3295 		return;
3296 
3297 	xa_for_each(&gpool->xa, i, group)
3298 		panthor_group_destroy(pfile, i);
3299 
3300 	xa_destroy(&gpool->xa);
3301 	kfree(gpool);
3302 	pfile->groups = NULL;
3303 }
3304 
3305 static void job_release(struct kref *ref)
3306 {
3307 	struct panthor_job *job = container_of(ref, struct panthor_job, refcount);
3308 
3309 	drm_WARN_ON(&job->group->ptdev->base, !list_empty(&job->node));
3310 
3311 	if (job->base.s_fence)
3312 		drm_sched_job_cleanup(&job->base);
3313 
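	/* done_fence->ops is only set once dma_fence_init() has run, so an
	 * uninitialized fence must be freed directly.
	 */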
3314 	if (job->done_fence && job->done_fence->ops)
3315 		dma_fence_put(job->done_fence);
3316 	else
3317 		dma_fence_free(job->done_fence);
3318 
3319 	group_put(job->group);
3320 
3321 	kfree(job);
3322 }
3323 
3324 struct drm_sched_job *panthor_job_get(struct drm_sched_job *sched_job)
3325 {
3326 	if (sched_job) {
3327 		struct panthor_job *job = container_of(sched_job, struct panthor_job, base);
3328 
3329 		kref_get(&job->refcount);
3330 	}
3331 
3332 	return sched_job;
3333 }
3334 
3335 void panthor_job_put(struct drm_sched_job *sched_job)
3336 {
3337 	struct panthor_job *job = container_of(sched_job, struct panthor_job, base);
3338 
3339 	if (sched_job)
3340 		kref_put(&job->refcount, job_release);
3341 }
3342 
3343 struct panthor_vm *panthor_job_vm(struct drm_sched_job *sched_job)
3344 {
3345 	struct panthor_job *job = container_of(sched_job, struct panthor_job, base);
3346 
3347 	return job->group->vm;
3348 }
3349 
3350 struct drm_sched_job *
3351 panthor_job_create(struct panthor_file *pfile,
3352 		   u16 group_handle,
3353 		   const struct drm_panthor_queue_submit *qsubmit)
3354 {
3355 	struct panthor_group_pool *gpool = pfile->groups;
3356 	struct panthor_job *job;
3357 	int ret;
3358 
3359 	if (qsubmit->pad)
3360 		return ERR_PTR(-EINVAL);
3361 
3362 	/* stream_addr and stream_size must be both zero or both non-zero. */
3363 	if ((qsubmit->stream_size == 0) != (qsubmit->stream_addr == 0))
3364 		return ERR_PTR(-EINVAL);
3365 
3366 	/* Make sure the address is aligned on a 64-byte (cacheline) boundary and
3367 	 * the size on an 8-byte (instruction size) boundary.
3368 	 */
3369 	if ((qsubmit->stream_addr & 63) || (qsubmit->stream_size & 7))
3370 		return ERR_PTR(-EINVAL);
3371 
3372 	/* bits 24:30 must be zero. */
3373 	if (qsubmit->latest_flush & GENMASK(30, 24))
3374 		return ERR_PTR(-EINVAL);
3375 
3376 	job = kzalloc(sizeof(*job), GFP_KERNEL);
3377 	if (!job)
3378 		return ERR_PTR(-ENOMEM);
3379 
3380 	kref_init(&job->refcount);
3381 	job->queue_idx = qsubmit->queue_index;
3382 	job->call_info.size = qsubmit->stream_size;
3383 	job->call_info.start = qsubmit->stream_addr;
3384 	job->call_info.latest_flush = qsubmit->latest_flush;
3385 	INIT_LIST_HEAD(&job->node);
3386 
3387 	job->group = group_get(xa_load(&gpool->xa, group_handle));
3388 	if (!job->group) {
3389 		ret = -EINVAL;
3390 		goto err_put_job;
3391 	}
3392 
3393 	if (job->queue_idx >= job->group->queue_count ||
3394 	    !job->group->queues[job->queue_idx]) {
3395 		ret = -EINVAL;
3396 		goto err_put_job;
3397 	}
3398 
3399 	/* Empty command streams don't need a fence, they'll pick the one from
3400 	 * the previously submitted job.
3401 	 */
3402 	if (job->call_info.size) {
3403 		job->done_fence = kzalloc(sizeof(*job->done_fence), GFP_KERNEL);
3404 		if (!job->done_fence) {
3405 			ret = -ENOMEM;
3406 			goto err_put_job;
3407 		}
3408 	}
3409 
3410 	ret = drm_sched_job_init(&job->base,
3411 				 &job->group->queues[job->queue_idx]->entity,
3412 				 1, job->group);
3413 	if (ret)
3414 		goto err_put_job;
3415 
3416 	return &job->base;
3417 
3418 err_put_job:
3419 	panthor_job_put(&job->base);
3420 	return ERR_PTR(ret);
3421 }
3422 
3423 void panthor_job_update_resvs(struct drm_exec *exec, struct drm_sched_job *sched_job)
3424 {
3425 	struct panthor_job *job = container_of(sched_job, struct panthor_job, base);
3426 
3427 	/* Still not sure why we want USAGE_WRITE for external objects, since I
3428 	 * was assuming this would be handled through explicit syncs being imported
3429 	 * to external BOs with DMA_BUF_IOCTL_IMPORT_SYNC_FILE, but other drivers
3430 	 * seem to pass DMA_RESV_USAGE_WRITE, so there must be a good reason.
3431 	 */
3432 	panthor_vm_update_resvs(job->group->vm, exec, &sched_job->s_fence->finished,
3433 				DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_WRITE);
3434 }
3435 
3436 void panthor_sched_unplug(struct panthor_device *ptdev)
3437 {
3438 	struct panthor_scheduler *sched = ptdev->scheduler;
3439 
3440 	cancel_delayed_work_sync(&sched->tick_work);
3441 
3442 	mutex_lock(&sched->lock);
3443 	if (sched->pm.has_ref) {
3444 		pm_runtime_put(ptdev->base.dev);
3445 		sched->pm.has_ref = false;
3446 	}
3447 	mutex_unlock(&sched->lock);
3448 }
3449 
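/*
 * Scheduler teardown: called directly if panthor_sched_init() fails early,
 * and registered as a drmm release action otherwise.
 */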
3450 static void panthor_sched_fini(struct drm_device *ddev, void *res)
3451 {
3452 	struct panthor_scheduler *sched = res;
3453 	int prio;
3454 
3455 	if (!sched || !sched->csg_slot_count)
3456 		return;
3457 
3458 	cancel_delayed_work_sync(&sched->tick_work);
3459 
3460 	if (sched->wq)
3461 		destroy_workqueue(sched->wq);
3462 
3463 	if (sched->heap_alloc_wq)
3464 		destroy_workqueue(sched->heap_alloc_wq);
3465 
3466 	for (prio = PANTHOR_CSG_PRIORITY_COUNT - 1; prio >= 0; prio--) {
3467 		drm_WARN_ON(ddev, !list_empty(&sched->groups.runnable[prio]));
3468 		drm_WARN_ON(ddev, !list_empty(&sched->groups.idle[prio]));
3469 	}
3470 
3471 	drm_WARN_ON(ddev, !list_empty(&sched->groups.waiting));
3472 }
3473 
3474 int panthor_sched_init(struct panthor_device *ptdev)
3475 {
3476 	struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
3477 	struct panthor_fw_csg_iface *csg_iface = panthor_fw_get_csg_iface(ptdev, 0);
3478 	struct panthor_fw_cs_iface *cs_iface = panthor_fw_get_cs_iface(ptdev, 0, 0);
3479 	struct panthor_scheduler *sched;
3480 	u32 gpu_as_count, num_groups;
3481 	int prio, ret;
3482 
3483 	sched = drmm_kzalloc(&ptdev->base, sizeof(*sched), GFP_KERNEL);
3484 	if (!sched)
3485 		return -ENOMEM;
3486 
3487 	/* The highest bit in JOB_INT_* is reserved for global IRQs. That
3488 	 * leaves 31 bits for CSG IRQs, hence the MAX_CSGS clamp here.
3489 	 */
3490 	num_groups = min_t(u32, MAX_CSGS, glb_iface->control->group_num);
3491 
3492 	/* The FW-side scheduler might deadlock if two groups with the same
3493 	 * priority try to access a set of resources that overlaps, with part
3494 	 * of the resources being allocated to one group and the other part to
3495 	 * the other group, both groups waiting for the remaining resources to
3496 	 * be allocated. To avoid that, it is recommended to assign each CSG a
3497 	 * different priority. In theory we could allow several groups to have
3498 	 * the same CSG priority if they don't request the same resources, but
3499 	 * that makes the scheduling logic more complicated, so let's clamp
3500 	 * the number of CSG slots to MAX_CSG_PRIO + 1 for now.
3501 	 */
3502 	num_groups = min_t(u32, MAX_CSG_PRIO + 1, num_groups);
3503 
3504 	/* We need at least one AS for the MCU (AS0) and one for GPU contexts. */
3505 	gpu_as_count = hweight32(ptdev->gpu_info.as_present & GENMASK(31, 1));
3506 	if (!gpu_as_count) {
3507 		drm_err(&ptdev->base, "Not enough AS (%d, expected at least 2)",
3508 			gpu_as_count + 1);
3509 		return -EINVAL;
3510 	}
3511 
3512 	sched->ptdev = ptdev;
3513 	sched->sb_slot_count = CS_FEATURES_SCOREBOARDS(cs_iface->control->features);
3514 	sched->csg_slot_count = num_groups;
3515 	sched->cs_slot_count = csg_iface->control->stream_num;
3516 	sched->as_slot_count = gpu_as_count;
3517 	ptdev->csif_info.csg_slot_count = sched->csg_slot_count;
3518 	ptdev->csif_info.cs_slot_count = sched->cs_slot_count;
3519 	ptdev->csif_info.scoreboard_slot_count = sched->sb_slot_count;
3520 
3521 	sched->last_tick = 0;
3522 	sched->resched_target = U64_MAX;
3523 	sched->tick_period = msecs_to_jiffies(10);
3524 	INIT_DELAYED_WORK(&sched->tick_work, tick_work);
3525 	INIT_WORK(&sched->sync_upd_work, sync_upd_work);
3526 	INIT_WORK(&sched->fw_events_work, process_fw_events_work);
3527 
3528 	ret = drmm_mutex_init(&ptdev->base, &sched->lock);
3529 	if (ret)
3530 		return ret;
3531 
3532 	for (prio = PANTHOR_CSG_PRIORITY_COUNT - 1; prio >= 0; prio--) {
3533 		INIT_LIST_HEAD(&sched->groups.runnable[prio]);
3534 		INIT_LIST_HEAD(&sched->groups.idle[prio]);
3535 	}
3536 	INIT_LIST_HEAD(&sched->groups.waiting);
3537 
3538 	ret = drmm_mutex_init(&ptdev->base, &sched->reset.lock);
3539 	if (ret)
3540 		return ret;
3541 
3542 	INIT_LIST_HEAD(&sched->reset.stopped_groups);
3543 
3544 	/* sched->heap_alloc_wq will be used for heap chunk allocation on
3545 	 * tiler OOM events, which means we can't use the same workqueue for
3546 	 * the scheduler because work items queued by the scheduler are in
3547 	 * the dma-signalling path. Allocate a dedicated heap_alloc_wq to
3548 	 * work around this limitation.
3549 	 *
3550 	 * FIXME: Ultimately, what we need is a failable/non-blocking GEM
3551 	 * allocation path that we can call when a heap OOM is reported. The
3552 	 * FW is smart enough to fall back on other methods if the kernel can't
3553 	 * allocate memory, and to fail the tiling job if none of these
3554 	 * countermeasures work.
3555 	 *
3556 	 * Set WQ_MEM_RECLAIM on sched->wq to unblock the situation when the
3557 	 * system is running out of memory.
3558 	 */
3559 	sched->heap_alloc_wq = alloc_workqueue("panthor-heap-alloc", WQ_UNBOUND, 0);
3560 	sched->wq = alloc_workqueue("panthor-csf-sched", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
3561 	if (!sched->wq || !sched->heap_alloc_wq) {
3562 		panthor_sched_fini(&ptdev->base, sched);
3563 		drm_err(&ptdev->base, "Failed to allocate the workqueues");
3564 		return -ENOMEM;
3565 	}
3566 
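	/* Register panthor_sched_fini() as a drmm release action. If that
	 * fails, drmm_add_action_or_reset() runs the action immediately.
	 */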
3567 	ret = drmm_add_action_or_reset(&ptdev->base, panthor_sched_fini, sched);
3568 	if (ret)
3569 		return ret;
3570 
3571 	ptdev->scheduler = sched;
3572 	return 0;
3573 }
3574