xref: /linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision c01044cc819160323f3ca4acd44fca487c4432e6)
1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Ben Widawsky <ben@bwidawsk.net>
25  *    Michel Thierry <michel.thierry@intel.com>
26  *    Thomas Daniel <thomas.daniel@intel.com>
27  *    Oscar Mateo <oscar.mateo@intel.com>
28  *
29  */
30 
31 /**
32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
33  *
34  * Motivation:
35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36  * These expanded contexts enable a number of new abilities, especially
37  * "Execlists" (also implemented in this file).
38  *
39  * One of the main differences with the legacy HW contexts is that logical
40  * ring contexts incorporate many more things into the context's state, like
41  * PDPs or ringbuffer control registers:
42  *
43  * The reason why PDPs are included in the context is straightforward: as
44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45  * contained there means you don't need to do a ppgtt->switch_mm yourself;
46  * instead, the GPU will do it for you on the context switch.
47  *
48  * But what about the ringbuffer control registers (head, tail, etc.)?
49  * Shouldn't we just need a set of those per engine command streamer? This is
50  * where the name "Logical Rings" starts to make sense: by virtualizing the
51  * rings, the engine cs shifts to a new "ring buffer" with every context
52  * switch. When you want to submit a workload to the GPU you: A) choose your
53  * context, B) find its appropriate virtualized ring, C) write commands to it
54  * and then, finally, D) tell the GPU to switch to that context.
55  *
56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57  * to a context is via a context execution list, ergo "Execlists".
58  *
59  * LRC implementation:
60  * Regarding the creation of contexts, we have:
61  *
62  * - One global default context.
63  * - One local default context for each opened fd.
64  * - One local extra context for each context create ioctl call.
65  *
66  * Now that ringbuffers belong per-context (and not per-engine, like before)
67  * and that contexts are uniquely tied to a given engine (and not reusable,
68  * like before) we need:
69  *
70  * - One ringbuffer per-engine inside each context.
71  * - One backing object per-engine inside each context.
72  *
73  * The global default context starts its life with these new objects fully
74  * allocated and populated. The local default context for each opened fd is
75  * more complex, because we don't know at creation time which engine is going
76  * to use them. To handle this, we have implemented a deferred creation of LR
77  * contexts:
78  *
79  * The local context starts its life as a hollow or blank holder, that only
80  * gets populated for a given engine once we receive an execbuffer. If later
81  * on we receive another execbuffer ioctl for the same context but a different
82  * engine, we allocate/populate a new ringbuffer and context backing object and
83  * so on.
84  *
85  * Finally, regarding local contexts created using the ioctl call: as they are
86  * only allowed with the render ring, we can allocate & populate them right
87  * away (no need to defer anything, at least for now).
88  *
89  * Execlists implementation:
90  * Execlists are the new method by which, on gen8+ hardware, workloads are
91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92  * This method works as follows:
93  *
94  * When a request is committed, its commands (the BB start and any leading or
95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96  * for the appropriate context. The tail pointer in the hardware context is not
97  * updated at this time, but instead, kept by the driver in the ringbuffer
98  * structure. A structure representing this request is added to a request queue
99  * for the appropriate engine: this structure contains a copy of the context's
100  * tail after the request was written to the ring buffer and a pointer to the
101  * context itself.
102  *
103  * If the engine's request queue was empty before the request was added, the
104  * queue is processed immediately. Otherwise the queue will be processed during
105  * a context switch interrupt. In any case, elements on the queue will get sent
106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107  * globally unique 20-bit submission ID.
108  *
109  * When execution of a request completes, the GPU updates the context status
110  * buffer with a context complete event and generates a context switch interrupt.
111  * During the interrupt handling, the driver examines the events in the buffer:
112  * for each context complete event, if the announced ID matches that on the head
113  * of the request queue, then that request is retired and removed from the queue.
114  *
115  * After processing, if any requests were retired and the queue is not empty
116  * then a new execution list can be submitted. The two requests at the front of
117  * the queue are next to be submitted but since a context may not occur twice in
118  * an execution list, if subsequent requests have the same ID as the first then
119  * the two requests must be combined. This is done simply by discarding requests
120  * at the head of the queue until either only one request is left (in which case
121  * we use a NULL second context) or the first two requests have unique IDs.
122  *
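 * For example, a minimal sketch of this pairing rule (queue_head(),
 * next_in_queue(), same_context() and elsp_submit() are illustrative
 * placeholders for exposition, not driver functions):
 *
 *	first = queue_head();
 *	second = next_in_queue(first);
 *	while (second && same_context(first, second))
 *		second = next_in_queue(second);
 *	elsp_submit(first, second);
 *
 * where a NULL second simply means that only a single element is loaded into
 * the execution list.
 *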
123  * By always executing the first two requests in the queue the driver ensures
124  * that the GPU is kept as busy as possible. In the case where a single context
125  * completes but a second context is still executing, the request for this second
126  * context will be at the head of the queue when we remove the first one. This
127  * request will then be resubmitted along with a new request for a different context,
128  * which will cause the hardware to continue executing the second request and queue
129  * the new request (the GPU detects the condition of a context getting preempted
130  * with the same context and optimizes the context switch flow by not doing
131  * preemption, but just sampling the new tail pointer).
132  *
133  */
134 #include <linux/interrupt.h>
135 
136 #include "i915_drv.h"
137 #include "i915_perf.h"
138 #include "i915_trace.h"
139 #include "i915_vgpu.h"
140 #include "intel_context.h"
141 #include "intel_engine_pm.h"
142 #include "intel_gt.h"
143 #include "intel_gt_pm.h"
144 #include "intel_gt_requests.h"
145 #include "intel_lrc_reg.h"
146 #include "intel_mocs.h"
147 #include "intel_reset.h"
148 #include "intel_ring.h"
149 #include "intel_workarounds.h"
150 #include "shmem_utils.h"
151 
152 #define RING_EXECLIST_QFULL		(1 << 0x2)
153 #define RING_EXECLIST1_VALID		(1 << 0x3)
154 #define RING_EXECLIST0_VALID		(1 << 0x4)
155 #define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
156 #define RING_EXECLIST1_ACTIVE		(1 << 0x11)
157 #define RING_EXECLIST0_ACTIVE		(1 << 0x12)
158 
159 #define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
160 #define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
161 #define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
162 #define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
163 #define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
164 #define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)
165 
166 #define GEN8_CTX_STATUS_COMPLETED_MASK \
167 	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
168 
169 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
170 
171 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
172 #define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
173 #define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
174 #define GEN12_IDLE_CTX_ID		0x7FF
175 #define GEN12_CSB_CTX_VALID(csb_dw) \
176 	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
177 
178 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
179 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
180 
181 struct virtual_engine {
182 	struct intel_engine_cs base;
183 	struct intel_context context;
184 
185 	/*
186 	 * We allow only a single request through the virtual engine at a time
187 	 * (each request in the timeline waits for the completion fence of
188 	 * the previous before being submitted). By restricting ourselves to
189 	 * only submitting a single request, each request is placed on to a
190 	 * physical engine to maximise load spreading (by virtue of the late greedy
191 	 * scheduling -- each real engine takes the next available request
192 	 * upon idling).
193 	 */
194 	struct i915_request *request;
195 
196 	/*
197 	 * We keep a rbtree of available virtual engines inside each physical
198 	 * engine, sorted by priority. Here we preallocate the nodes we need
199 	 * for the virtual engine, indexed by physical_engine->id.
200 	 */
201 	struct ve_node {
202 		struct rb_node rb;
203 		int prio;
204 	} nodes[I915_NUM_ENGINES];
205 
206 	/*
207 	 * Keep track of bonded pairs -- restrictions upon our selection
208 	 * of physical engines any particular request may be submitted to.
209 	 * If we receive a submit-fence from a master engine, we will only
210 	 * use one of sibling_mask physical engines.
211 	 */
212 	struct ve_bond {
213 		const struct intel_engine_cs *master;
214 		intel_engine_mask_t sibling_mask;
215 	} *bonds;
216 	unsigned int num_bonds;
217 
218 	/* And finally, which physical engines this virtual engine maps onto. */
219 	unsigned int num_siblings;
220 	struct intel_engine_cs *siblings[];
221 };
222 
223 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
224 {
225 	GEM_BUG_ON(!intel_engine_is_virtual(engine));
226 	return container_of(engine, struct virtual_engine, base);
227 }
228 
229 static int __execlists_context_alloc(struct intel_context *ce,
230 				     struct intel_engine_cs *engine);
231 
232 static void execlists_init_reg_state(u32 *reg_state,
233 				     const struct intel_context *ce,
234 				     const struct intel_engine_cs *engine,
235 				     const struct intel_ring *ring,
236 				     bool close);
237 static void
238 __execlists_update_reg_state(const struct intel_context *ce,
239 			     const struct intel_engine_cs *engine,
240 			     u32 head);
241 
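/*
 * The lrc_ring_*() helpers below return the dword offset within the context
 * image at which a given register's entry lives (i.e. the slot holding the
 * register address inside its MI_LOAD_REGISTER_IMM block, with the value
 * stored at offset + 1), or -1 when that location is not known for this
 * engine/gen.
 */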
242 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
243 {
244 	if (INTEL_GEN(engine->i915) >= 12)
245 		return 0x60;
246 	else if (INTEL_GEN(engine->i915) >= 9)
247 		return 0x54;
248 	else if (engine->class == RENDER_CLASS)
249 		return 0x58;
250 	else
251 		return -1;
252 }
253 
254 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
255 {
256 	if (INTEL_GEN(engine->i915) >= 12)
257 		return 0x74;
258 	else if (INTEL_GEN(engine->i915) >= 9)
259 		return 0x68;
260 	else if (engine->class == RENDER_CLASS)
261 		return 0xd8;
262 	else
263 		return -1;
264 }
265 
266 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
267 {
268 	if (INTEL_GEN(engine->i915) >= 12)
269 		return 0x12;
270 	else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS)
271 		return 0x18;
272 	else
273 		return -1;
274 }
275 
276 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
277 {
278 	int x;
279 
280 	x = lrc_ring_wa_bb_per_ctx(engine);
281 	if (x < 0)
282 		return x;
283 
284 	return x + 2;
285 }
286 
287 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
288 {
289 	int x;
290 
291 	x = lrc_ring_indirect_ptr(engine);
292 	if (x < 0)
293 		return x;
294 
295 	return x + 2;
296 }
297 
298 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
299 {
300 	if (engine->class != RENDER_CLASS)
301 		return -1;
302 
303 	if (INTEL_GEN(engine->i915) >= 12)
304 		return 0xb6;
305 	else if (INTEL_GEN(engine->i915) >= 11)
306 		return 0xaa;
307 	else
308 		return -1;
309 }
310 
311 static u32
312 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
313 {
314 	switch (INTEL_GEN(engine->i915)) {
315 	default:
316 		MISSING_CASE(INTEL_GEN(engine->i915));
317 		fallthrough;
318 	case 12:
319 		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
320 	case 11:
321 		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
322 	case 10:
323 		return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
324 	case 9:
325 		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
326 	case 8:
327 		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
328 	}
329 }
330 
331 static void
332 lrc_ring_setup_indirect_ctx(u32 *regs,
333 			    const struct intel_engine_cs *engine,
334 			    u32 ctx_bb_ggtt_addr,
335 			    u32 size)
336 {
337 	GEM_BUG_ON(!size);
338 	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
339 	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
340 	regs[lrc_ring_indirect_ptr(engine) + 1] =
341 		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
342 
343 	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
344 	regs[lrc_ring_indirect_offset(engine) + 1] =
345 		lrc_ring_indirect_offset_default(engine) << 6;
346 }
347 
348 static u32 intel_context_get_runtime(const struct intel_context *ce)
349 {
350 	/*
351 	 * We can use either ppHWSP[16] which is recorded before the context
352 	 * switch (and so excludes the cost of context switches) or use the
353 	 * value from the context image itself, which is saved/restored earlier
354 	 * and so includes the cost of the save.
355 	 */
356 	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
357 }
358 
359 static void mark_eio(struct i915_request *rq)
360 {
361 	if (i915_request_completed(rq))
362 		return;
363 
364 	GEM_BUG_ON(i915_request_signaled(rq));
365 
366 	i915_request_set_error_once(rq, -EIO);
367 	i915_request_mark_complete(rq);
368 }
369 
370 static struct i915_request *
371 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
372 {
373 	struct i915_request *active = rq;
374 
375 	rcu_read_lock();
376 	list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
377 		if (i915_request_completed(rq))
378 			break;
379 
380 		active = rq;
381 	}
382 	rcu_read_unlock();
383 
384 	return active;
385 }
386 
387 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
388 {
389 	return (i915_ggtt_offset(engine->status_page.vma) +
390 		I915_GEM_HWS_PREEMPT_ADDR);
391 }
392 
393 static inline void
394 ring_set_paused(const struct intel_engine_cs *engine, int state)
395 {
396 	/*
397 	 * We inspect HWS_PREEMPT with a semaphore inside
398 	 * engine->emit_fini_breadcrumb. If the dword is true,
399 	 * the ring is paused as the semaphore will busywait
400 	 * until the dword is false.
401 	 */
402 	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
403 	if (state)
404 		wmb();
405 }
406 
407 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
408 {
409 	return rb_entry(rb, struct i915_priolist, node);
410 }
411 
412 static inline int rq_prio(const struct i915_request *rq)
413 {
414 	return READ_ONCE(rq->sched.attr.priority);
415 }
416 
417 static int effective_prio(const struct i915_request *rq)
418 {
419 	int prio = rq_prio(rq);
420 
421 	/*
422 	 * If this request is special and must not be interrupted at any
423 	 * cost, so be it. Note we are only checking the most recent request
424 	 * in the context and so may be masking an earlier vip request. It
425 	 * is hoped that under the conditions where nopreempt is used, this
426 	 * will not matter (i.e. all requests to that context will be
427 	 * nopreempt for as long as desired).
428 	 */
429 	if (i915_request_has_nopreempt(rq))
430 		prio = I915_PRIORITY_UNPREEMPTABLE;
431 
432 	return prio;
433 }
434 
435 static int queue_prio(const struct intel_engine_execlists *execlists)
436 {
437 	struct i915_priolist *p;
438 	struct rb_node *rb;
439 
440 	rb = rb_first_cached(&execlists->queue);
441 	if (!rb)
442 		return INT_MIN;
443 
444 	/*
445 	 * As the priolist[] are inverted, with the highest priority in [0],
446 	 * we have to flip the index value to become priority.
447 	 */
448 	p = to_priolist(rb);
449 	if (!I915_USER_PRIORITY_SHIFT)
450 		return p->priority;
451 
452 	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
453 }
454 
455 static inline bool need_preempt(const struct intel_engine_cs *engine,
456 				const struct i915_request *rq,
457 				struct rb_node *rb)
458 {
459 	int last_prio;
460 
461 	if (!intel_engine_has_semaphores(engine))
462 		return false;
463 
464 	/*
465 	 * Check if the current priority hint merits a preemption attempt.
466 	 *
467 	 * We record the highest value priority we saw during rescheduling
468 	 * prior to this dequeue, therefore we know that if it is strictly
469 	 * less than the current tail of ELSP[0], we do not need to force
470 	 * a preempt-to-idle cycle.
471 	 *
472 	 * However, the priority hint is a mere hint that we may need to
473 	 * preempt. If that hint is stale or we may be trying to preempt
474 	 * ourselves, ignore the request.
475 	 *
476 	 * More naturally we would write
477 	 *      prio >= max(0, last);
478 	 * except that we wish to prevent triggering preemption at the same
479 	 * priority level: the task that is running should remain running
480 	 * to preserve FIFO ordering of dependencies.
481 	 */
482 	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
483 	if (engine->execlists.queue_priority_hint <= last_prio)
484 		return false;
485 
486 	/*
487 	 * Check against the first request in ELSP[1], it will, thanks to the
488 	 * power of PI, be the highest priority of that context.
489 	 */
490 	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
491 	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
492 		return true;
493 
494 	if (rb) {
495 		struct virtual_engine *ve =
496 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
497 		bool preempt = false;
498 
499 		if (engine == ve->siblings[0]) { /* only preempt one sibling */
500 			struct i915_request *next;
501 
502 			rcu_read_lock();
503 			next = READ_ONCE(ve->request);
504 			if (next)
505 				preempt = rq_prio(next) > last_prio;
506 			rcu_read_unlock();
507 		}
508 
509 		if (preempt)
510 			return preempt;
511 	}
512 
513 	/*
514 	 * If the inflight context did not trigger the preemption, then maybe
515 	 * it was the set of queued requests? Pick the highest priority in
516 	 * the queue (the first active priolist) and see if it deserves to be
517 	 * running instead of ELSP[0].
518 	 *
519 	 * The highest priority request in the queue cannot be either
520 	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
521 	 * context, its priority would not exceed ELSP[0] aka last_prio.
522 	 */
523 	return queue_prio(&engine->execlists) > last_prio;
524 }
525 
526 __maybe_unused static inline bool
527 assert_priority_queue(const struct i915_request *prev,
528 		      const struct i915_request *next)
529 {
530 	/*
531 	 * Without preemption, the prev may refer to the still active element
532 	 * which we refuse to let go.
533 	 *
534 	 * Even with preemption, there are times when we think it is better not
535 	 * to preempt and leave an ostensibly lower priority request in flight.
536 	 */
537 	if (i915_request_is_active(prev))
538 		return true;
539 
540 	return rq_prio(prev) >= rq_prio(next);
541 }
542 
543 /*
544  * The context descriptor encodes various attributes of a context,
545  * including its GTT address and some flags. Because it's fairly
546  * expensive to calculate, we'll just do it once and cache the result,
547  * which remains valid until the context is unpinned.
548  *
549  * This is what a descriptor looks like, from LSB to MSB::
550  *
551  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
552  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
553  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
554  *      bits 53-54:    mbz, reserved for use by hardware
555  *      bits 55-63:    group ID, currently unused and set to 0
556  *
557  * Starting from Gen11, the upper dword of the descriptor has a new format:
558  *
559  *      bits 32-36:    reserved
560  *      bits 37-47:    SW context ID
561  *      bits 48-53:    engine instance
562  *      bit 54:        mbz, reserved for use by hardware
563  *      bits 55-60:    SW counter
564  *      bits 61-63:    engine class
565  *
566  * engine info, SW context ID and SW counter need to form a unique number
567  * (Context ID) per lrc.
568  */
569 static u32
570 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
571 {
572 	u32 desc;
573 
574 	desc = INTEL_LEGACY_32B_CONTEXT;
575 	if (i915_vm_is_4lvl(ce->vm))
576 		desc = INTEL_LEGACY_64B_CONTEXT;
577 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
578 
579 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
580 	if (IS_GEN(engine->i915, 8))
581 		desc |= GEN8_CTX_L3LLC_COHERENT;
582 
583 	return i915_ggtt_offset(ce->state) | desc;
584 }
585 
586 static inline unsigned int dword_in_page(void *addr)
587 {
588 	return offset_in_page(addr) / sizeof(u32);
589 }
590 
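/*
 * The reg_offsets() tables below (gen8_xcs_offsets et al) are a compact,
 * byte-coded description of the LRI lists making up the default context
 * image, decoded by set_offsets():
 *
 *  - NOP(x):    a byte with BIT(7) set, meaning "skip x dwords" (optionally
 *               clearing them to MI_NOOP),
 *  - LRI(n, f): an MI_LOAD_REGISTER_IMM header for n registers with flags f,
 *  - REG/REG16: a register offset (relative to the engine's mmio_base,
 *               stored in dwords) encoded 7 bits per byte, with BIT(7)
 *               marking a continuation byte,
 *  - END(n):    a terminating zero byte followed by the total size of the
 *               register state in dwords, used to pad out the remainder.
 */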
591 static void set_offsets(u32 *regs,
592 			const u8 *data,
593 			const struct intel_engine_cs *engine,
594 			bool clear)
595 #define NOP(x) (BIT(7) | (x))
596 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
597 #define POSTED BIT(0)
598 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
599 #define REG16(x) \
600 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
601 	(((x) >> 2) & 0x7f)
602 #define END(total_state_size) 0, (total_state_size)
603 {
604 	const u32 base = engine->mmio_base;
605 
606 	while (*data) {
607 		u8 count, flags;
608 
609 		if (*data & BIT(7)) { /* skip */
610 			count = *data++ & ~BIT(7);
611 			if (clear)
612 				memset32(regs, MI_NOOP, count);
613 			regs += count;
614 			continue;
615 		}
616 
617 		count = *data & 0x3f;
618 		flags = *data >> 6;
619 		data++;
620 
621 		*regs = MI_LOAD_REGISTER_IMM(count);
622 		if (flags & POSTED)
623 			*regs |= MI_LRI_FORCE_POSTED;
624 		if (INTEL_GEN(engine->i915) >= 11)
625 			*regs |= MI_LRI_LRM_CS_MMIO;
626 		regs++;
627 
628 		GEM_BUG_ON(!count);
629 		do {
630 			u32 offset = 0;
631 			u8 v;
632 
633 			do {
634 				v = *data++;
635 				offset <<= 7;
636 				offset |= v & ~BIT(7);
637 			} while (v & BIT(7));
638 
639 			regs[0] = base + (offset << 2);
640 			if (clear)
641 				regs[1] = 0;
642 			regs += 2;
643 		} while (--count);
644 	}
645 
646 	if (clear) {
647 		u8 count = *++data;
648 
649 		/* Clear past the tail for HW access */
650 		GEM_BUG_ON(dword_in_page(regs) > count);
651 		memset32(regs, MI_NOOP, count - dword_in_page(regs));
652 
653 		/* Close the batch; used mainly by live_lrc_layout() */
654 		*regs = MI_BATCH_BUFFER_END;
655 		if (INTEL_GEN(engine->i915) >= 10)
656 			*regs |= BIT(0);
657 	}
658 }
659 
660 static const u8 gen8_xcs_offsets[] = {
661 	NOP(1),
662 	LRI(11, 0),
663 	REG16(0x244),
664 	REG(0x034),
665 	REG(0x030),
666 	REG(0x038),
667 	REG(0x03c),
668 	REG(0x168),
669 	REG(0x140),
670 	REG(0x110),
671 	REG(0x11c),
672 	REG(0x114),
673 	REG(0x118),
674 
675 	NOP(9),
676 	LRI(9, 0),
677 	REG16(0x3a8),
678 	REG16(0x28c),
679 	REG16(0x288),
680 	REG16(0x284),
681 	REG16(0x280),
682 	REG16(0x27c),
683 	REG16(0x278),
684 	REG16(0x274),
685 	REG16(0x270),
686 
687 	NOP(13),
688 	LRI(2, 0),
689 	REG16(0x200),
690 	REG(0x028),
691 
692 	END(80)
693 };
694 
695 static const u8 gen9_xcs_offsets[] = {
696 	NOP(1),
697 	LRI(14, POSTED),
698 	REG16(0x244),
699 	REG(0x034),
700 	REG(0x030),
701 	REG(0x038),
702 	REG(0x03c),
703 	REG(0x168),
704 	REG(0x140),
705 	REG(0x110),
706 	REG(0x11c),
707 	REG(0x114),
708 	REG(0x118),
709 	REG(0x1c0),
710 	REG(0x1c4),
711 	REG(0x1c8),
712 
713 	NOP(3),
714 	LRI(9, POSTED),
715 	REG16(0x3a8),
716 	REG16(0x28c),
717 	REG16(0x288),
718 	REG16(0x284),
719 	REG16(0x280),
720 	REG16(0x27c),
721 	REG16(0x278),
722 	REG16(0x274),
723 	REG16(0x270),
724 
725 	NOP(13),
726 	LRI(1, POSTED),
727 	REG16(0x200),
728 
729 	NOP(13),
730 	LRI(44, POSTED),
731 	REG(0x028),
732 	REG(0x09c),
733 	REG(0x0c0),
734 	REG(0x178),
735 	REG(0x17c),
736 	REG16(0x358),
737 	REG(0x170),
738 	REG(0x150),
739 	REG(0x154),
740 	REG(0x158),
741 	REG16(0x41c),
742 	REG16(0x600),
743 	REG16(0x604),
744 	REG16(0x608),
745 	REG16(0x60c),
746 	REG16(0x610),
747 	REG16(0x614),
748 	REG16(0x618),
749 	REG16(0x61c),
750 	REG16(0x620),
751 	REG16(0x624),
752 	REG16(0x628),
753 	REG16(0x62c),
754 	REG16(0x630),
755 	REG16(0x634),
756 	REG16(0x638),
757 	REG16(0x63c),
758 	REG16(0x640),
759 	REG16(0x644),
760 	REG16(0x648),
761 	REG16(0x64c),
762 	REG16(0x650),
763 	REG16(0x654),
764 	REG16(0x658),
765 	REG16(0x65c),
766 	REG16(0x660),
767 	REG16(0x664),
768 	REG16(0x668),
769 	REG16(0x66c),
770 	REG16(0x670),
771 	REG16(0x674),
772 	REG16(0x678),
773 	REG16(0x67c),
774 	REG(0x068),
775 
776 	END(176)
777 };
778 
779 static const u8 gen12_xcs_offsets[] = {
780 	NOP(1),
781 	LRI(13, POSTED),
782 	REG16(0x244),
783 	REG(0x034),
784 	REG(0x030),
785 	REG(0x038),
786 	REG(0x03c),
787 	REG(0x168),
788 	REG(0x140),
789 	REG(0x110),
790 	REG(0x1c0),
791 	REG(0x1c4),
792 	REG(0x1c8),
793 	REG(0x180),
794 	REG16(0x2b4),
795 
796 	NOP(5),
797 	LRI(9, POSTED),
798 	REG16(0x3a8),
799 	REG16(0x28c),
800 	REG16(0x288),
801 	REG16(0x284),
802 	REG16(0x280),
803 	REG16(0x27c),
804 	REG16(0x278),
805 	REG16(0x274),
806 	REG16(0x270),
807 
808 	END(80)
809 };
810 
811 static const u8 gen8_rcs_offsets[] = {
812 	NOP(1),
813 	LRI(14, POSTED),
814 	REG16(0x244),
815 	REG(0x034),
816 	REG(0x030),
817 	REG(0x038),
818 	REG(0x03c),
819 	REG(0x168),
820 	REG(0x140),
821 	REG(0x110),
822 	REG(0x11c),
823 	REG(0x114),
824 	REG(0x118),
825 	REG(0x1c0),
826 	REG(0x1c4),
827 	REG(0x1c8),
828 
829 	NOP(3),
830 	LRI(9, POSTED),
831 	REG16(0x3a8),
832 	REG16(0x28c),
833 	REG16(0x288),
834 	REG16(0x284),
835 	REG16(0x280),
836 	REG16(0x27c),
837 	REG16(0x278),
838 	REG16(0x274),
839 	REG16(0x270),
840 
841 	NOP(13),
842 	LRI(1, 0),
843 	REG(0x0c8),
844 
845 	END(80)
846 };
847 
848 static const u8 gen9_rcs_offsets[] = {
849 	NOP(1),
850 	LRI(14, POSTED),
851 	REG16(0x244),
852 	REG(0x34),
853 	REG(0x30),
854 	REG(0x38),
855 	REG(0x3c),
856 	REG(0x168),
857 	REG(0x140),
858 	REG(0x110),
859 	REG(0x11c),
860 	REG(0x114),
861 	REG(0x118),
862 	REG(0x1c0),
863 	REG(0x1c4),
864 	REG(0x1c8),
865 
866 	NOP(3),
867 	LRI(9, POSTED),
868 	REG16(0x3a8),
869 	REG16(0x28c),
870 	REG16(0x288),
871 	REG16(0x284),
872 	REG16(0x280),
873 	REG16(0x27c),
874 	REG16(0x278),
875 	REG16(0x274),
876 	REG16(0x270),
877 
878 	NOP(13),
879 	LRI(1, 0),
880 	REG(0xc8),
881 
882 	NOP(13),
883 	LRI(44, POSTED),
884 	REG(0x28),
885 	REG(0x9c),
886 	REG(0xc0),
887 	REG(0x178),
888 	REG(0x17c),
889 	REG16(0x358),
890 	REG(0x170),
891 	REG(0x150),
892 	REG(0x154),
893 	REG(0x158),
894 	REG16(0x41c),
895 	REG16(0x600),
896 	REG16(0x604),
897 	REG16(0x608),
898 	REG16(0x60c),
899 	REG16(0x610),
900 	REG16(0x614),
901 	REG16(0x618),
902 	REG16(0x61c),
903 	REG16(0x620),
904 	REG16(0x624),
905 	REG16(0x628),
906 	REG16(0x62c),
907 	REG16(0x630),
908 	REG16(0x634),
909 	REG16(0x638),
910 	REG16(0x63c),
911 	REG16(0x640),
912 	REG16(0x644),
913 	REG16(0x648),
914 	REG16(0x64c),
915 	REG16(0x650),
916 	REG16(0x654),
917 	REG16(0x658),
918 	REG16(0x65c),
919 	REG16(0x660),
920 	REG16(0x664),
921 	REG16(0x668),
922 	REG16(0x66c),
923 	REG16(0x670),
924 	REG16(0x674),
925 	REG16(0x678),
926 	REG16(0x67c),
927 	REG(0x68),
928 
929 	END(176)
930 };
931 
932 static const u8 gen11_rcs_offsets[] = {
933 	NOP(1),
934 	LRI(15, POSTED),
935 	REG16(0x244),
936 	REG(0x034),
937 	REG(0x030),
938 	REG(0x038),
939 	REG(0x03c),
940 	REG(0x168),
941 	REG(0x140),
942 	REG(0x110),
943 	REG(0x11c),
944 	REG(0x114),
945 	REG(0x118),
946 	REG(0x1c0),
947 	REG(0x1c4),
948 	REG(0x1c8),
949 	REG(0x180),
950 
951 	NOP(1),
952 	LRI(9, POSTED),
953 	REG16(0x3a8),
954 	REG16(0x28c),
955 	REG16(0x288),
956 	REG16(0x284),
957 	REG16(0x280),
958 	REG16(0x27c),
959 	REG16(0x278),
960 	REG16(0x274),
961 	REG16(0x270),
962 
963 	LRI(1, POSTED),
964 	REG(0x1b0),
965 
966 	NOP(10),
967 	LRI(1, 0),
968 	REG(0x0c8),
969 
970 	END(80)
971 };
972 
973 static const u8 gen12_rcs_offsets[] = {
974 	NOP(1),
975 	LRI(13, POSTED),
976 	REG16(0x244),
977 	REG(0x034),
978 	REG(0x030),
979 	REG(0x038),
980 	REG(0x03c),
981 	REG(0x168),
982 	REG(0x140),
983 	REG(0x110),
984 	REG(0x1c0),
985 	REG(0x1c4),
986 	REG(0x1c8),
987 	REG(0x180),
988 	REG16(0x2b4),
989 
990 	NOP(5),
991 	LRI(9, POSTED),
992 	REG16(0x3a8),
993 	REG16(0x28c),
994 	REG16(0x288),
995 	REG16(0x284),
996 	REG16(0x280),
997 	REG16(0x27c),
998 	REG16(0x278),
999 	REG16(0x274),
1000 	REG16(0x270),
1001 
1002 	LRI(3, POSTED),
1003 	REG(0x1b0),
1004 	REG16(0x5a8),
1005 	REG16(0x5ac),
1006 
1007 	NOP(6),
1008 	LRI(1, 0),
1009 	REG(0x0c8),
1010 	NOP(3 + 9 + 1),
1011 
1012 	LRI(51, POSTED),
1013 	REG16(0x588),
1014 	REG16(0x588),
1015 	REG16(0x588),
1016 	REG16(0x588),
1017 	REG16(0x588),
1018 	REG16(0x588),
1019 	REG(0x028),
1020 	REG(0x09c),
1021 	REG(0x0c0),
1022 	REG(0x178),
1023 	REG(0x17c),
1024 	REG16(0x358),
1025 	REG(0x170),
1026 	REG(0x150),
1027 	REG(0x154),
1028 	REG(0x158),
1029 	REG16(0x41c),
1030 	REG16(0x600),
1031 	REG16(0x604),
1032 	REG16(0x608),
1033 	REG16(0x60c),
1034 	REG16(0x610),
1035 	REG16(0x614),
1036 	REG16(0x618),
1037 	REG16(0x61c),
1038 	REG16(0x620),
1039 	REG16(0x624),
1040 	REG16(0x628),
1041 	REG16(0x62c),
1042 	REG16(0x630),
1043 	REG16(0x634),
1044 	REG16(0x638),
1045 	REG16(0x63c),
1046 	REG16(0x640),
1047 	REG16(0x644),
1048 	REG16(0x648),
1049 	REG16(0x64c),
1050 	REG16(0x650),
1051 	REG16(0x654),
1052 	REG16(0x658),
1053 	REG16(0x65c),
1054 	REG16(0x660),
1055 	REG16(0x664),
1056 	REG16(0x668),
1057 	REG16(0x66c),
1058 	REG16(0x670),
1059 	REG16(0x674),
1060 	REG16(0x678),
1061 	REG16(0x67c),
1062 	REG(0x068),
1063 	REG(0x084),
1064 	NOP(1),
1065 
1066 	END(192)
1067 };
1068 
1069 #undef END
1070 #undef REG16
1071 #undef REG
1072 #undef LRI
1073 #undef NOP
1074 
1075 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
1076 {
1077 	/*
1078 	 * The gen12+ lists only have the registers we program in the basic
1079 	 * default state. We rely on the context image using relative
1080 	 * addressing to automatically fix up the register state between the
1081 	 * physical engines for the virtual engine.
1082 	 */
1083 	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
1084 		   !intel_engine_has_relative_mmio(engine));
1085 
1086 	if (engine->class == RENDER_CLASS) {
1087 		if (INTEL_GEN(engine->i915) >= 12)
1088 			return gen12_rcs_offsets;
1089 		else if (INTEL_GEN(engine->i915) >= 11)
1090 			return gen11_rcs_offsets;
1091 		else if (INTEL_GEN(engine->i915) >= 9)
1092 			return gen9_rcs_offsets;
1093 		else
1094 			return gen8_rcs_offsets;
1095 	} else {
1096 		if (INTEL_GEN(engine->i915) >= 12)
1097 			return gen12_xcs_offsets;
1098 		else if (INTEL_GEN(engine->i915) >= 9)
1099 			return gen9_xcs_offsets;
1100 		else
1101 			return gen8_xcs_offsets;
1102 	}
1103 }
1104 
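/*
 * Unsubmit the requests that were in flight but have not yet completed and
 * push them back for resubmission: requests native to this engine go back
 * onto its priority queue, while requests that arrived via a virtual engine
 * are handed back to it so that they may migrate to another physical engine.
 */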
1105 static struct i915_request *
1106 __unwind_incomplete_requests(struct intel_engine_cs *engine)
1107 {
1108 	struct i915_request *rq, *rn, *active = NULL;
1109 	struct list_head *pl;
1110 	int prio = I915_PRIORITY_INVALID;
1111 
1112 	lockdep_assert_held(&engine->active.lock);
1113 
1114 	list_for_each_entry_safe_reverse(rq, rn,
1115 					 &engine->active.requests,
1116 					 sched.link) {
1117 		if (i915_request_completed(rq))
1118 			continue; /* XXX */
1119 
1120 		__i915_request_unsubmit(rq);
1121 
1122 		/*
1123 		 * Push the request back into the queue for later resubmission.
1124 		 * If this request is not native to this physical engine (i.e.
1125 		 * it came from a virtual source), push it back onto the virtual
1126 		 * engine so that it can be moved across onto another physical
1127 		 * engine as load dictates.
1128 		 */
1129 		if (likely(rq->execution_mask == engine->mask)) {
1130 			GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
1131 			if (rq_prio(rq) != prio) {
1132 				prio = rq_prio(rq);
1133 				pl = i915_sched_lookup_priolist(engine, prio);
1134 			}
1135 			GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
1136 
1137 			list_move(&rq->sched.link, pl);
1138 			set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
1139 
1140 			/* Check in case we roll back so far that we wrap [size/2] */
1141 			if (intel_ring_direction(rq->ring,
1142 						 intel_ring_wrap(rq->ring,
1143 								 rq->tail),
1144 						 rq->ring->tail) > 0)
1145 				rq->context->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1146 
1147 			active = rq;
1148 		} else {
1149 			struct intel_engine_cs *owner = rq->context->engine;
1150 
1151 			/*
1152 			 * Decouple the virtual breadcrumb before moving it
1153 			 * back to the virtual engine -- we don't want the
1154 			 * request to complete in the background and try
1155 			 * and cancel the breadcrumb on the virtual engine
1156 			 * (instead of the old engine where it is linked)!
1157 			 */
1158 			if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
1159 				     &rq->fence.flags)) {
1160 				spin_lock_nested(&rq->lock,
1161 						 SINGLE_DEPTH_NESTING);
1162 				i915_request_cancel_breadcrumb(rq);
1163 				spin_unlock(&rq->lock);
1164 			}
1165 			WRITE_ONCE(rq->engine, owner);
1166 			owner->submit_request(rq);
1167 			active = NULL;
1168 		}
1169 	}
1170 
1171 	return active;
1172 }
1173 
1174 struct i915_request *
1175 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1176 {
1177 	struct intel_engine_cs *engine =
1178 		container_of(execlists, typeof(*engine), execlists);
1179 
1180 	return __unwind_incomplete_requests(engine);
1181 }
1182 
1183 static inline void
1184 execlists_context_status_change(struct i915_request *rq, unsigned long status)
1185 {
1186 	/*
1187 	 * Only used when GVT-g is enabled now. When GVT-g is disabled,
1188 	 * the compiler should eliminate this function as dead code.
1189 	 */
1190 	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1191 		return;
1192 
1193 	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1194 				   status, rq);
1195 }
1196 
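/*
 * Busyness accounting: intel_engine_context_in/out() bracket the period in
 * which at least one context is executing on the engine. stats.active counts
 * the contexts currently in flight, and the first-in/last-out transitions
 * record the busy time under the stats seqlock.
 */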
1197 static void intel_engine_context_in(struct intel_engine_cs *engine)
1198 {
1199 	unsigned long flags;
1200 
1201 	if (atomic_add_unless(&engine->stats.active, 1, 0))
1202 		return;
1203 
1204 	write_seqlock_irqsave(&engine->stats.lock, flags);
1205 	if (!atomic_add_unless(&engine->stats.active, 1, 0)) {
1206 		engine->stats.start = ktime_get();
1207 		atomic_inc(&engine->stats.active);
1208 	}
1209 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1210 }
1211 
1212 static void intel_engine_context_out(struct intel_engine_cs *engine)
1213 {
1214 	unsigned long flags;
1215 
1216 	GEM_BUG_ON(!atomic_read(&engine->stats.active));
1217 
1218 	if (atomic_add_unless(&engine->stats.active, -1, 1))
1219 		return;
1220 
1221 	write_seqlock_irqsave(&engine->stats.lock, flags);
1222 	if (atomic_dec_and_test(&engine->stats.active)) {
1223 		engine->stats.total =
1224 			ktime_add(engine->stats.total,
1225 				  ktime_sub(ktime_get(), engine->stats.start));
1226 	}
1227 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1228 }
1229 
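/*
 * Debug-only sanity check (CONFIG_DRM_I915_DEBUG_GEM): verify, and fix up,
 * the critical ring registers in the context image before it is submitted.
 */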
1230 static void
1231 execlists_check_context(const struct intel_context *ce,
1232 			const struct intel_engine_cs *engine)
1233 {
1234 	const struct intel_ring *ring = ce->ring;
1235 	u32 *regs = ce->lrc_reg_state;
1236 	bool valid = true;
1237 	int x;
1238 
1239 	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1240 		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1241 		       engine->name,
1242 		       regs[CTX_RING_START],
1243 		       i915_ggtt_offset(ring->vma));
1244 		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1245 		valid = false;
1246 	}
1247 
1248 	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1249 	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1250 		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1251 		       engine->name,
1252 		       regs[CTX_RING_CTL],
1253 		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1254 		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1255 		valid = false;
1256 	}
1257 
1258 	x = lrc_ring_mi_mode(engine);
1259 	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1260 		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1261 		       engine->name, regs[x + 1]);
1262 		regs[x + 1] &= ~STOP_RING;
1263 		regs[x + 1] |= STOP_RING << 16;
1264 		valid = false;
1265 	}
1266 
1267 	WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
1268 }
1269 
1270 static void restore_default_state(struct intel_context *ce,
1271 				  struct intel_engine_cs *engine)
1272 {
1273 	u32 *regs;
1274 
1275 	regs = memset(ce->lrc_reg_state, 0, engine->context_size - PAGE_SIZE);
1276 	execlists_init_reg_state(regs, ce, engine, ce->ring, true);
1277 
1278 	ce->runtime.last = intel_context_get_runtime(ce);
1279 }
1280 
1281 static void reset_active(struct i915_request *rq,
1282 			 struct intel_engine_cs *engine)
1283 {
1284 	struct intel_context * const ce = rq->context;
1285 	u32 head;
1286 
1287 	/*
1288 	 * The executing context has been cancelled. We want to prevent
1289 	 * further execution along this context and propagate the error on
1290 	 * to anything depending on its results.
1291 	 *
1292 	 * In __i915_request_submit(), we apply the -EIO and remove the
1293 	 * requests' payloads for any banned requests. But first, we must
1294 	 * rewind the context back to the start of the incomplete request so
1295 	 * that we do not jump back into the middle of the batch.
1296 	 *
1297 	 * We preserve the breadcrumbs and semaphores of the incomplete
1298 	 * requests so that inter-timeline dependencies (i.e other timelines)
1299 	 * remain correctly ordered. And we defer to __i915_request_submit()
1300 	 * so that all asynchronous waits are correctly handled.
1301 	 */
1302 	ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1303 		     rq->fence.context, rq->fence.seqno);
1304 
1305 	/* On resubmission of the active request, payload will be scrubbed */
1306 	if (i915_request_completed(rq))
1307 		head = rq->tail;
1308 	else
1309 		head = active_request(ce->timeline, rq)->head;
1310 	head = intel_ring_wrap(ce->ring, head);
1311 
1312 	/* Scrub the context image to prevent replaying the previous batch */
1313 	restore_default_state(ce, engine);
1314 	__execlists_update_reg_state(ce, engine, head);
1315 
1316 	/* We've switched away, so this should be a no-op, but intent matters */
1317 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1318 }
1319 
1320 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1321 {
1322 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1323 	ce->runtime.num_underflow += dt < 0;
1324 	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1325 #endif
1326 }
1327 
1328 static void intel_context_update_runtime(struct intel_context *ce)
1329 {
1330 	u32 old;
1331 	s32 dt;
1332 
1333 	if (intel_context_is_barrier(ce))
1334 		return;
1335 
1336 	old = ce->runtime.last;
1337 	ce->runtime.last = intel_context_get_runtime(ce);
1338 	dt = ce->runtime.last - old;
1339 
1340 	if (unlikely(dt <= 0)) {
1341 		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1342 			 old, ce->runtime.last, dt);
1343 		st_update_runtime_underflow(ce, dt);
1344 		return;
1345 	}
1346 
1347 	ewma_runtime_add(&ce->runtime.avg, dt);
1348 	ce->runtime.total += dt;
1349 }
1350 
1351 static inline struct intel_engine_cs *
1352 __execlists_schedule_in(struct i915_request *rq)
1353 {
1354 	struct intel_engine_cs * const engine = rq->engine;
1355 	struct intel_context * const ce = rq->context;
1356 
1357 	intel_context_get(ce);
1358 
1359 	if (unlikely(intel_context_is_banned(ce)))
1360 		reset_active(rq, engine);
1361 
1362 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1363 		execlists_check_context(ce, engine);
1364 
1365 	if (ce->tag) {
1366 		/* Use a fixed tag for OA and friends */
1367 		GEM_BUG_ON(ce->tag <= BITS_PER_LONG);
1368 		ce->lrc.ccid = ce->tag;
1369 	} else {
1370 		/* We don't need a strict matching tag, just different values */
1371 		unsigned int tag = ffs(READ_ONCE(engine->context_tag));
1372 
1373 		GEM_BUG_ON(tag == 0 || tag >= BITS_PER_LONG);
1374 		clear_bit(tag - 1, &engine->context_tag);
1375 		ce->lrc.ccid = tag << (GEN11_SW_CTX_ID_SHIFT - 32);
1376 
1377 		BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID);
1378 	}
1379 
1380 	ce->lrc.ccid |= engine->execlists.ccid;
1381 
1382 	__intel_gt_pm_get(engine->gt);
1383 	if (engine->fw_domain && !atomic_fetch_inc(&engine->fw_active))
1384 		intel_uncore_forcewake_get(engine->uncore, engine->fw_domain);
1385 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1386 	intel_engine_context_in(engine);
1387 
1388 	return engine;
1389 }
1390 
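/*
 * ce->inflight packs the engine pointer with a small count in its low bits:
 * the first submission of a context installs the engine via
 * __execlists_schedule_in(), back-to-back resubmissions merely bump the
 * count, and execlists_schedule_out() drops it again, only tearing down the
 * state once the final reference is released.
 */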
1391 static inline struct i915_request *
1392 execlists_schedule_in(struct i915_request *rq, int idx)
1393 {
1394 	struct intel_context * const ce = rq->context;
1395 	struct intel_engine_cs *old;
1396 
1397 	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1398 	trace_i915_request_in(rq, idx);
1399 
1400 	old = READ_ONCE(ce->inflight);
1401 	do {
1402 		if (!old) {
1403 			WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1404 			break;
1405 		}
1406 	} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1407 
1408 	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1409 	return i915_request_get(rq);
1410 }
1411 
1412 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1413 {
1414 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1415 	struct i915_request *next = READ_ONCE(ve->request);
1416 
1417 	if (next == rq || (next && next->execution_mask & ~rq->execution_mask))
1418 		tasklet_hi_schedule(&ve->base.execlists.tasklet);
1419 }
1420 
1421 static inline void
1422 __execlists_schedule_out(struct i915_request *rq,
1423 			 struct intel_engine_cs * const engine,
1424 			 unsigned int ccid)
1425 {
1426 	struct intel_context * const ce = rq->context;
1427 
1428 	/*
1429 	 * NB process_csb() is not under the engine->active.lock and hence
1430 	 * schedule_out can race with schedule_in meaning that we should
1431 	 * refrain from doing non-trivial work here.
1432 	 */
1433 
1434 	/*
1435 	 * If we have just completed this context, the engine may now be
1436 	 * idle and we want to re-enter powersaving.
1437 	 */
1438 	if (list_is_last_rcu(&rq->link, &ce->timeline->requests) &&
1439 	    i915_request_completed(rq))
1440 		intel_engine_add_retire(engine, ce->timeline);
1441 
1442 	ccid >>= GEN11_SW_CTX_ID_SHIFT - 32;
1443 	ccid &= GEN12_MAX_CONTEXT_HW_ID;
1444 	if (ccid < BITS_PER_LONG) {
1445 		GEM_BUG_ON(ccid == 0);
1446 		GEM_BUG_ON(test_bit(ccid - 1, &engine->context_tag));
1447 		set_bit(ccid - 1, &engine->context_tag);
1448 	}
1449 
1450 	intel_context_update_runtime(ce);
1451 	intel_engine_context_out(engine);
1452 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1453 	if (engine->fw_domain && !atomic_dec_return(&engine->fw_active))
1454 		intel_uncore_forcewake_put(engine->uncore, engine->fw_domain);
1455 	intel_gt_pm_put_async(engine->gt);
1456 
1457 	/*
1458 	 * If this is part of a virtual engine, its next request may
1459 	 * have been blocked waiting for access to the active context.
1460 	 * We have to kick all the siblings again in case we need to
1461 	 * switch (e.g. the next request is not runnable on this
1462 	 * engine). Hopefully, we will already have submitted the next
1463 	 * request before the tasklet runs and do not need to rebuild
1464 	 * each virtual tree and kick everyone again.
1465 	 */
1466 	if (ce->engine != engine)
1467 		kick_siblings(rq, ce);
1468 
1469 	intel_context_put(ce);
1470 }
1471 
1472 static inline void
1473 execlists_schedule_out(struct i915_request *rq)
1474 {
1475 	struct intel_context * const ce = rq->context;
1476 	struct intel_engine_cs *cur, *old;
1477 	u32 ccid;
1478 
1479 	trace_i915_request_out(rq);
1480 
1481 	ccid = rq->context->lrc.ccid;
1482 	old = READ_ONCE(ce->inflight);
1483 	do
1484 		cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1485 	while (!try_cmpxchg(&ce->inflight, &old, cur));
1486 	if (!cur)
1487 		__execlists_schedule_out(rq, old, ccid);
1488 
1489 	i915_request_put(rq);
1490 }
1491 
1492 static u64 execlists_update_context(struct i915_request *rq)
1493 {
1494 	struct intel_context *ce = rq->context;
1495 	u64 desc = ce->lrc.desc;
1496 	u32 tail, prev;
1497 
1498 	/*
1499 	 * WaIdleLiteRestore:bdw,skl
1500 	 *
1501 	 * We should never submit the context with the same RING_TAIL twice
1502 	 * just in case we submit an empty ring, which confuses the HW.
1503 	 *
1504 	 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1505 	 * the normal request to be able to always advance the RING_TAIL on
1506 	 * subsequent resubmissions (for lite restore). Should that fail us,
1507 	 * and we try and submit the same tail again, force the context
1508 	 * reload.
1509 	 *
1510 	 * If we need to return to a preempted context, we need to skip the
1511 	 * lite-restore and force it to reload the RING_TAIL. Otherwise, the
1512 	 * HW has a tendency to ignore us rewinding the TAIL to the end of
1513 	 * an earlier request.
1514 	 */
1515 	GEM_BUG_ON(ce->lrc_reg_state[CTX_RING_TAIL] != rq->ring->tail);
1516 	prev = rq->ring->tail;
1517 	tail = intel_ring_set_tail(rq->ring, rq->tail);
1518 	if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
1519 		desc |= CTX_DESC_FORCE_RESTORE;
1520 	ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1521 	rq->tail = rq->wa_tail;
1522 
1523 	/*
1524 	 * Make sure the context image is complete before we submit it to HW.
1525 	 *
1526 	 * Ostensibly, writes (including the WCB) should be flushed prior to
1527 	 * an uncached write such as our mmio register access, the empirical
1528 	 * evidence (esp. on Braswell) suggests that the WC write into memory
1529 	 * may not be visible to the HW prior to the completion of the UC
1530 	 * register write and that we may begin execution from the context
1531 	 * before its image is complete leading to invalid PD chasing.
1532 	 */
1533 	wmb();
1534 
1535 	ce->lrc.desc &= ~CTX_DESC_FORCE_RESTORE;
1536 	return desc;
1537 }
1538 
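/*
 * Write one context descriptor to the hardware: into the ExecList Submission
 * Queue slot for this port when a control register is present, otherwise by
 * pushing the upper then lower dword into the single ELSP register.
 */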
1539 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1540 {
1541 	if (execlists->ctrl_reg) {
1542 		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1543 		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1544 	} else {
1545 		writel(upper_32_bits(desc), execlists->submit_reg);
1546 		writel(lower_32_bits(desc), execlists->submit_reg);
1547 	}
1548 }
1549 
1550 static __maybe_unused char *
1551 dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq)
1552 {
1553 	if (!rq)
1554 		return "";
1555 
1556 	snprintf(buf, buflen, "%sccid:%x %llx:%lld%s prio %d",
1557 		 prefix,
1558 		 rq->context->lrc.ccid,
1559 		 rq->fence.context, rq->fence.seqno,
1560 		 i915_request_completed(rq) ? "!" :
1561 		 i915_request_started(rq) ? "*" :
1562 		 "",
1563 		 rq_prio(rq));
1564 
1565 	return buf;
1566 }
1567 
1568 static __maybe_unused void
1569 trace_ports(const struct intel_engine_execlists *execlists,
1570 	    const char *msg,
1571 	    struct i915_request * const *ports)
1572 {
1573 	const struct intel_engine_cs *engine =
1574 		container_of(execlists, typeof(*engine), execlists);
1575 	char __maybe_unused p0[40], p1[40];
1576 
1577 	if (!ports[0])
1578 		return;
1579 
1580 	ENGINE_TRACE(engine, "%s { %s%s }\n", msg,
1581 		     dump_port(p0, sizeof(p0), "", ports[0]),
1582 		     dump_port(p1, sizeof(p1), ", ", ports[1]));
1583 }
1584 
1585 static inline bool
1586 reset_in_progress(const struct intel_engine_execlists *execlists)
1587 {
1588 	return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1589 }
1590 
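/*
 * Validate the pending ELSP submission before it is written to the hardware:
 * each port must hold a unique context and ccid, nothing may follow a
 * sentinel request, and every context must still be pinned with its ring in
 * place.
 */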
1591 static __maybe_unused bool
1592 assert_pending_valid(const struct intel_engine_execlists *execlists,
1593 		     const char *msg)
1594 {
1595 	struct intel_engine_cs *engine =
1596 		container_of(execlists, typeof(*engine), execlists);
1597 	struct i915_request * const *port, *rq;
1598 	struct intel_context *ce = NULL;
1599 	bool sentinel = false;
1600 	u32 ccid = -1;
1601 
1602 	trace_ports(execlists, msg, execlists->pending);
1603 
1604 	/* We may be messing around with the lists during reset, lalala */
1605 	if (reset_in_progress(execlists))
1606 		return true;
1607 
1608 	if (!execlists->pending[0]) {
1609 		GEM_TRACE_ERR("%s: Nothing pending for promotion!\n",
1610 			      engine->name);
1611 		return false;
1612 	}
1613 
1614 	if (execlists->pending[execlists_num_ports(execlists)]) {
1615 		GEM_TRACE_ERR("%s: Excess pending[%d] for promotion!\n",
1616 			      engine->name, execlists_num_ports(execlists));
1617 		return false;
1618 	}
1619 
1620 	for (port = execlists->pending; (rq = *port); port++) {
1621 		unsigned long flags;
1622 		bool ok = true;
1623 
1624 		GEM_BUG_ON(!kref_read(&rq->fence.refcount));
1625 		GEM_BUG_ON(!i915_request_is_active(rq));
1626 
1627 		if (ce == rq->context) {
1628 			GEM_TRACE_ERR("%s: Dup context:%llx in pending[%zd]\n",
1629 				      engine->name,
1630 				      ce->timeline->fence_context,
1631 				      port - execlists->pending);
1632 			return false;
1633 		}
1634 		ce = rq->context;
1635 
1636 		if (ccid == ce->lrc.ccid) {
1637 			GEM_TRACE_ERR("%s: Dup ccid:%x context:%llx in pending[%zd]\n",
1638 				      engine->name,
1639 				      ccid, ce->timeline->fence_context,
1640 				      port - execlists->pending);
1641 			return false;
1642 		}
1643 		ccid = ce->lrc.ccid;
1644 
1645 		/*
1646 		 * Sentinels are supposed to be the last request so they flush
1647 		 * the current execution off the HW. Check that they are the only
1648 		 * request in the pending submission.
1649 		 */
1650 		if (sentinel) {
1651 			GEM_TRACE_ERR("%s: context:%llx after sentinel in pending[%zd]\n",
1652 				      engine->name,
1653 				      ce->timeline->fence_context,
1654 				      port - execlists->pending);
1655 			return false;
1656 		}
1657 		sentinel = i915_request_has_sentinel(rq);
1658 
1659 		/* Hold tightly onto the lock to prevent concurrent retires! */
1660 		if (!spin_trylock_irqsave(&rq->lock, flags))
1661 			continue;
1662 
1663 		if (i915_request_completed(rq))
1664 			goto unlock;
1665 
1666 		if (i915_active_is_idle(&ce->active) &&
1667 		    !intel_context_is_barrier(ce)) {
1668 			GEM_TRACE_ERR("%s: Inactive context:%llx in pending[%zd]\n",
1669 				      engine->name,
1670 				      ce->timeline->fence_context,
1671 				      port - execlists->pending);
1672 			ok = false;
1673 			goto unlock;
1674 		}
1675 
1676 		if (!i915_vma_is_pinned(ce->state)) {
1677 			GEM_TRACE_ERR("%s: Unpinned context:%llx in pending[%zd]\n",
1678 				      engine->name,
1679 				      ce->timeline->fence_context,
1680 				      port - execlists->pending);
1681 			ok = false;
1682 			goto unlock;
1683 		}
1684 
1685 		if (!i915_vma_is_pinned(ce->ring->vma)) {
1686 			GEM_TRACE_ERR("%s: Unpinned ring:%llx in pending[%zd]\n",
1687 				      engine->name,
1688 				      ce->timeline->fence_context,
1689 				      port - execlists->pending);
1690 			ok = false;
1691 			goto unlock;
1692 		}
1693 
1694 unlock:
1695 		spin_unlock_irqrestore(&rq->lock, flags);
1696 		if (!ok)
1697 			return false;
1698 	}
1699 
1700 	return ce;
1701 }
1702 
1703 static void execlists_submit_ports(struct intel_engine_cs *engine)
1704 {
1705 	struct intel_engine_execlists *execlists = &engine->execlists;
1706 	unsigned int n;
1707 
1708 	GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1709 
1710 	/*
1711 	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
1712 	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
1713 	 * not be relinquished until the device is idle (see
1714 	 * i915_gem_idle_work_handler()). As a precaution, we make sure
1715 	 * that all ELSP are drained i.e. we have processed the CSB,
1716 	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
1717 	 */
1718 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1719 
1720 	/*
1721 	 * ELSQ note: the submit queue is not cleared after being submitted
1722 	 * to the HW so we need to make sure we always clean it up. This is
1723 	 * currently ensured by the fact that we always write the same number
1724 	 * of elsq entries, keep this in mind before changing the loop below.
1725 	 */
1726 	for (n = execlists_num_ports(execlists); n--; ) {
1727 		struct i915_request *rq = execlists->pending[n];
1728 
1729 		write_desc(execlists,
1730 			   rq ? execlists_update_context(rq) : 0,
1731 			   n);
1732 	}
1733 
1734 	/* we need to manually load the submit queue */
1735 	if (execlists->ctrl_reg)
1736 		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1737 }
1738 
1739 static bool ctx_single_port_submission(const struct intel_context *ce)
1740 {
1741 	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1742 		intel_context_force_single_submission(ce));
1743 }
1744 
1745 static bool can_merge_ctx(const struct intel_context *prev,
1746 			  const struct intel_context *next)
1747 {
1748 	if (prev != next)
1749 		return false;
1750 
1751 	if (ctx_single_port_submission(prev))
1752 		return false;
1753 
1754 	return true;
1755 }
1756 
1757 static unsigned long i915_request_flags(const struct i915_request *rq)
1758 {
1759 	return READ_ONCE(rq->fence.flags);
1760 }
1761 
1762 static bool can_merge_rq(const struct i915_request *prev,
1763 			 const struct i915_request *next)
1764 {
1765 	GEM_BUG_ON(prev == next);
1766 	GEM_BUG_ON(!assert_priority_queue(prev, next));
1767 
1768 	/*
1769 	 * We do not submit known completed requests. Therefore if the next
1770 	 * request is already completed, we can pretend to merge it in
1771 	 * with the previous context (and we will skip updating the ELSP
1772 	 * and tracking). Thus hopefully keeping the ELSP full with active
1773 	 * contexts, despite the best efforts of preempt-to-busy to confuse
1774 	 * us.
1775 	 */
1776 	if (i915_request_completed(next))
1777 		return true;
1778 
1779 	if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) &
1780 		     (BIT(I915_FENCE_FLAG_NOPREEMPT) |
1781 		      BIT(I915_FENCE_FLAG_SENTINEL))))
1782 		return false;
1783 
1784 	if (!can_merge_ctx(prev->context, next->context))
1785 		return false;
1786 
1787 	GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
1788 	return true;
1789 }
1790 
1791 static void virtual_update_register_offsets(u32 *regs,
1792 					    struct intel_engine_cs *engine)
1793 {
1794 	set_offsets(regs, reg_offsets(engine), engine, false);
1795 }
1796 
1797 static bool virtual_matches(const struct virtual_engine *ve,
1798 			    const struct i915_request *rq,
1799 			    const struct intel_engine_cs *engine)
1800 {
1801 	const struct intel_engine_cs *inflight;
1802 
1803 	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1804 		return false;
1805 
1806 	/*
1807 	 * We track when the HW has completed saving the context image
1808 	 * (i.e. when we have seen the final CS event switching out of
1809 	 * the context) and must not overwrite the context image before
1810 	 * then. This restricts us to only using the active engine
1811 	 * while the previous virtualized request is inflight (so
1812 	 * we reuse the register offsets). This is a very small
1813 	 * hysteresis on the greedy selection algorithm.
1814 	 */
1815 	inflight = intel_context_inflight(&ve->context);
1816 	if (inflight && inflight != engine)
1817 		return false;
1818 
1819 	return true;
1820 }
1821 
1822 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve)
1823 {
1824 	/*
1825 	 * All the outstanding signals on ve->siblings[0] must have
1826 	 * been completed, just pending the interrupt handler. As those
1827 	 * signals still refer to the old sibling (via rq->engine), we must
1828 	 * transfer those to the old irq_worker to keep our locking
1829 	 * consistent.
1830 	 */
1831 	intel_engine_transfer_stale_breadcrumbs(ve->siblings[0], &ve->context);
1832 }
1833 
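/* Walk the dependency links of the requests waiting on @rq__ (its waiters). */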
1834 #define for_each_waiter(p__, rq__) \
1835 	list_for_each_entry_lockless(p__, \
1836 				     &(rq__)->sched.waiters_list, \
1837 				     wait_link)
1838 
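/* Walk the dependency links of the requests that @rq__ waits on (its signalers). */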
1839 #define for_each_signaler(p__, rq__) \
1840 	list_for_each_entry_rcu(p__, \
1841 				&(rq__)->sched.signalers_list, \
1842 				signal_link)
1843 
1844 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1845 {
1846 	LIST_HEAD(list);
1847 
1848 	/*
1849 	 * We want to move the interrupted request to the back of
1850 	 * the round-robin list (i.e. its priority level), but
1851 	 * in doing so, we must also move all in-flight requests that
1852 	 * were waiting for the interrupted request, so that they are
1853 	 * run after it again.
1854 	 */
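	/*
	 * For illustration: if requests A and B (B depending on A) from one
	 * context are in flight when A's timeslice expires, both A and B are
	 * pushed to the tail of A's priority level, so an independent context
	 * queued at the same priority gets a turn before A runs again.
	 */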
1855 	do {
1856 		struct i915_dependency *p;
1857 
1858 		GEM_BUG_ON(i915_request_is_active(rq));
1859 		list_move_tail(&rq->sched.link, pl);
1860 
1861 		for_each_waiter(p, rq) {
1862 			struct i915_request *w =
1863 				container_of(p->waiter, typeof(*w), sched);
1864 
1865 			if (p->flags & I915_DEPENDENCY_WEAK)
1866 				continue;
1867 
1868 			/* Leave semaphores spinning on the other engines */
1869 			if (w->engine != rq->engine)
1870 				continue;
1871 
1872 			/* No waiter should start before its signaler */
1873 			GEM_BUG_ON(i915_request_has_initial_breadcrumb(w) &&
1874 				   i915_request_started(w) &&
1875 				   !i915_request_completed(rq));
1876 
1877 			GEM_BUG_ON(i915_request_is_active(w));
1878 			if (!i915_request_is_ready(w))
1879 				continue;
1880 
1881 			if (rq_prio(w) < rq_prio(rq))
1882 				continue;
1883 
1884 			GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1885 			list_move_tail(&w->sched.link, &list);
1886 		}
1887 
1888 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1889 	} while (rq);
1890 }
1891 
1892 static void defer_active(struct intel_engine_cs *engine)
1893 {
1894 	struct i915_request *rq;
1895 
1896 	rq = __unwind_incomplete_requests(engine);
1897 	if (!rq)
1898 		return;
1899 
1900 	defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1901 }
1902 
1903 static bool
1904 need_timeslice(const struct intel_engine_cs *engine,
1905 	       const struct i915_request *rq,
1906 	       const struct rb_node *rb)
1907 {
1908 	int hint;
1909 
1910 	if (!intel_engine_has_timeslices(engine))
1911 		return false;
1912 
1913 	hint = engine->execlists.queue_priority_hint;
1914 
1915 	if (rb) {
1916 		const struct virtual_engine *ve =
1917 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1918 		const struct intel_engine_cs *inflight =
1919 			intel_context_inflight(&ve->context);
1920 
1921 		if (!inflight || inflight == engine) {
1922 			struct i915_request *next;
1923 
1924 			rcu_read_lock();
1925 			next = READ_ONCE(ve->request);
1926 			if (next)
1927 				hint = max(hint, rq_prio(next));
1928 			rcu_read_unlock();
1929 		}
1930 	}
1931 
1932 	if (!list_is_last(&rq->sched.link, &engine->active.requests))
1933 		hint = max(hint, rq_prio(list_next_entry(rq, sched.link)));
1934 
1935 	GEM_BUG_ON(hint >= I915_PRIORITY_UNPREEMPTABLE);
1936 	return hint >= effective_prio(rq);
1937 }
1938 
1939 static bool
1940 timeslice_yield(const struct intel_engine_execlists *el,
1941 		const struct i915_request *rq)
1942 {
1943 	/*
1944 	 * Once bitten, forever smitten!
1945 	 *
1946 	 * If the active context ever busy-waited on a semaphore,
1947 	 * it will be treated as a hog until the end of its timeslice (i.e.
1948 	 * until it is scheduled out and replaced by a new submission,
1949 	 * possibly even its own lite-restore). The HW only sends an interrupt
1950 	 * on the first miss, and we do not know if that semaphore has been
1951 	 * signaled, or even if it is now stuck on another semaphore. Play
1952 	 * safe, yield if it might be stuck -- it will be given a fresh
1953 	 * timeslice in the near future.
1954 	 */
1955 	return rq->context->lrc.ccid == READ_ONCE(el->yield);
1956 }
1957 
1958 static bool
1959 timeslice_expired(const struct intel_engine_execlists *el,
1960 		  const struct i915_request *rq)
1961 {
1962 	return timer_expired(&el->timer) || timeslice_yield(el, rq);
1963 }
1964 
1965 static int
1966 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1967 {
1968 	if (list_is_last(&rq->sched.link, &engine->active.requests))
1969 		return engine->execlists.queue_priority_hint;
1970 
1971 	return rq_prio(list_next_entry(rq, sched.link));
1972 }
1973 
1974 static inline unsigned long
1975 timeslice(const struct intel_engine_cs *engine)
1976 {
1977 	return READ_ONCE(engine->props.timeslice_duration_ms);
1978 }
1979 
1980 static unsigned long active_timeslice(const struct intel_engine_cs *engine)
1981 {
1982 	const struct intel_engine_execlists *execlists = &engine->execlists;
1983 	const struct i915_request *rq = *execlists->active;
1984 
1985 	if (!rq || i915_request_completed(rq))
1986 		return 0;
1987 
1988 	if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq))
1989 		return 0;
1990 
1991 	return timeslice(engine);
1992 }
1993 
1994 static void set_timeslice(struct intel_engine_cs *engine)
1995 {
1996 	unsigned long duration;
1997 
1998 	if (!intel_engine_has_timeslices(engine))
1999 		return;
2000 
2001 	duration = active_timeslice(engine);
2002 	ENGINE_TRACE(engine, "bump timeslicing, interval:%lu", duration);
2003 
2004 	set_timer_ms(&engine->execlists.timer, duration);
2005 }
2006 
2007 static void start_timeslice(struct intel_engine_cs *engine, int prio)
2008 {
2009 	struct intel_engine_execlists *execlists = &engine->execlists;
2010 	unsigned long duration;
2011 
2012 	if (!intel_engine_has_timeslices(engine))
2013 		return;
2014 
2015 	WRITE_ONCE(execlists->switch_priority_hint, prio);
2016 	if (prio == INT_MIN)
2017 		return;
2018 
2019 	if (timer_pending(&execlists->timer))
2020 		return;
2021 
2022 	duration = timeslice(engine);
2023 	ENGINE_TRACE(engine,
2024 		     "start timeslicing, prio:%d, interval:%lu",
2025 		     prio, duration);
2026 
2027 	set_timer_ms(&execlists->timer, duration);
2028 }
2029 
2030 static void record_preemption(struct intel_engine_execlists *execlists)
2031 {
2032 	(void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
2033 }
2034 
2035 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
2036 					    const struct i915_request *rq)
2037 {
2038 	if (!rq)
2039 		return 0;
2040 
2041 	/* Force a fast reset for terminated contexts (ignoring sysfs!) */
2042 	if (unlikely(intel_context_is_banned(rq->context)))
2043 		return 1;
2044 
2045 	return READ_ONCE(engine->props.preempt_timeout_ms);
2046 }
2047 
2048 static void set_preempt_timeout(struct intel_engine_cs *engine,
2049 				const struct i915_request *rq)
2050 {
2051 	if (!intel_engine_has_preempt_reset(engine))
2052 		return;
2053 
2054 	set_timer_ms(&engine->execlists.preempt,
2055 		     active_preempt_timeout(engine, rq));
2056 }
2057 
2058 static inline void clear_ports(struct i915_request **ports, int count)
2059 {
2060 	memset_p((void **)ports, NULL, count);
2061 }
2062 
2063 static inline void
2064 copy_ports(struct i915_request **dst, struct i915_request **src, int count)
2065 {
2066 	/* A memcpy_p() would be very useful here! */
2067 	while (count--)
2068 		WRITE_ONCE(*dst++, *src++); /* avoid write tearing */
2069 }
2070 
2071 static void execlists_dequeue(struct intel_engine_cs *engine)
2072 {
2073 	struct intel_engine_execlists * const execlists = &engine->execlists;
2074 	struct i915_request **port = execlists->pending;
2075 	struct i915_request ** const last_port = port + execlists->port_mask;
2076 	struct i915_request * const *active;
2077 	struct i915_request *last;
2078 	struct rb_node *rb;
2079 	bool submit = false;
2080 
2081 	/*
2082 	 * Hardware submission is through 2 ports. Conceptually each port
2083 	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
2084 	 * static for a context, and unique to each, so we only execute
2085 	 * requests belonging to a single context from each ring. RING_HEAD
2086 	 * is maintained by the CS in the context image, it marks the place
2087 	 * where it got up to last time, and through RING_TAIL we tell the CS
2088 	 * where we want to execute up to this time.
2089 	 *
2090 	 * In this list the requests are in order of execution. Consecutive
2091 	 * requests from the same context are adjacent in the ringbuffer. We
2092 	 * can combine these requests into a single RING_TAIL update:
2093 	 *
2094 	 *              RING_HEAD...req1...req2
2095 	 *                                    ^- RING_TAIL
2096 	 * since to execute req2 the CS must first execute req1.
2097 	 *
2098 	 * Our goal then is to point each port to the end of a consecutive
2099 	 * sequence of requests as being the optimal (fewest wake ups
2100 	 * and context switches) submission.
2101 	 */
2102 
2103 	for (rb = rb_first_cached(&execlists->virtual); rb; ) {
2104 		struct virtual_engine *ve =
2105 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2106 		struct i915_request *rq = READ_ONCE(ve->request);
2107 
2108 		if (!rq) { /* lazily cleanup after another engine handled rq */
2109 			rb_erase_cached(rb, &execlists->virtual);
2110 			RB_CLEAR_NODE(rb);
2111 			rb = rb_first_cached(&execlists->virtual);
2112 			continue;
2113 		}
2114 
2115 		if (!virtual_matches(ve, rq, engine)) {
2116 			rb = rb_next(rb);
2117 			continue;
2118 		}
2119 
2120 		break;
2121 	}
2122 
2123 	/*
2124 	 * If the queue is higher priority than the last
2125 	 * request in the currently active context, submit afresh.
2126 	 * We will resubmit again afterwards in case we need to split
2127 	 * the active context to interject the preemption request,
2128 	 * i.e. we will retrigger preemption following the ack in case
2129 	 * of trouble.
2130 	 */
2131 	active = READ_ONCE(execlists->active);
2132 
2133 	/*
2134 	 * In theory we can skip over completed contexts that have not
2135 	 * yet been processed by events (as those events are in flight):
2136 	 *
2137 	 * while ((last = *active) && i915_request_completed(last))
2138 	 *	active++;
2139 	 *
2140 	 * However, the GPU cannot handle this as it will ultimately
2141 	 * find itself trying to jump back into a context it has just
2142 	 * completed and barf.
2143 	 */
2144 
2145 	if ((last = *active)) {
2146 		if (need_preempt(engine, last, rb)) {
2147 			if (i915_request_completed(last)) {
2148 				tasklet_hi_schedule(&execlists->tasklet);
2149 				return;
2150 			}
2151 
2152 			ENGINE_TRACE(engine,
2153 				     "preempting last=%llx:%lld, prio=%d, hint=%d\n",
2154 				     last->fence.context,
2155 				     last->fence.seqno,
2156 				     last->sched.attr.priority,
2157 				     execlists->queue_priority_hint);
2158 			record_preemption(execlists);
2159 
2160 			/*
2161 			 * Don't let the RING_HEAD advance past the breadcrumb
2162 			 * as we unwind (and until we resubmit) so that we do
2163 			 * not accidentally tell it to go backwards.
2164 			 */
2165 			ring_set_paused(engine, 1);
2166 
2167 			/*
2168 			 * Note that we have not stopped the GPU at this point,
2169 			 * so we are unwinding the incomplete requests as they
2170 			 * remain inflight and so by the time we do complete
2171 			 * the preemption, some of the unwound requests may
2172 			 * complete!
2173 			 */
2174 			__unwind_incomplete_requests(engine);
2175 
2176 			last = NULL;
2177 		} else if (need_timeslice(engine, last, rb) &&
2178 			   timeslice_expired(execlists, last)) {
2179 			if (i915_request_completed(last)) {
2180 				tasklet_hi_schedule(&execlists->tasklet);
2181 				return;
2182 			}
2183 
2184 			ENGINE_TRACE(engine,
2185 				     "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n",
2186 				     last->fence.context,
2187 				     last->fence.seqno,
2188 				     last->sched.attr.priority,
2189 				     execlists->queue_priority_hint,
2190 				     yesno(timeslice_yield(execlists, last)));
2191 
2192 			ring_set_paused(engine, 1);
2193 			defer_active(engine);
2194 
2195 			/*
2196 			 * Unlike for preemption, if we rewind and continue
2197 			 * executing the same context as previously active,
2198 			 * the order of execution will remain the same and
2199 			 * the tail will only advance. We do not need to
2200 			 * force a full context restore, as a lite-restore
2201 			 * is sufficient to resample the monotonic TAIL.
2202 			 *
2203 			 * If we switch to any other context, similarly we
2204 			 * will not rewind TAIL of current context, and
2205 			 * normal save/restore will preserve state and allow
2206 			 * us to later continue executing the same request.
2207 			 */
2208 			last = NULL;
2209 		} else {
2210 			/*
2211 			 * Otherwise if we already have a request pending
2212 			 * for execution after the current one, we can
2213 			 * just wait until the next CS event before
2214 			 * queuing more. In either case we will force a
2215 			 * lite-restore preemption event, but if we wait
2216 			 * we hopefully coalesce several updates into a single
2217 			 * submission.
2218 			 */
2219 			if (!list_is_last(&last->sched.link,
2220 					  &engine->active.requests)) {
2221 				/*
2222 				 * Even if ELSP[1] is occupied and not worthy
2223 				 * of timeslices, our queue might be.
2224 				 */
2225 				start_timeslice(engine, queue_prio(execlists));
2226 				return;
2227 			}
2228 		}
2229 	}
2230 
2231 	while (rb) { /* XXX virtual is always taking precedence */
2232 		struct virtual_engine *ve =
2233 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2234 		struct i915_request *rq;
2235 
2236 		spin_lock(&ve->base.active.lock);
2237 
2238 		rq = ve->request;
2239 		if (unlikely(!rq)) { /* lost the race to a sibling */
2240 			spin_unlock(&ve->base.active.lock);
2241 			rb_erase_cached(rb, &execlists->virtual);
2242 			RB_CLEAR_NODE(rb);
2243 			rb = rb_first_cached(&execlists->virtual);
2244 			continue;
2245 		}
2246 
2247 		GEM_BUG_ON(rq != ve->request);
2248 		GEM_BUG_ON(rq->engine != &ve->base);
2249 		GEM_BUG_ON(rq->context != &ve->context);
2250 
2251 		if (rq_prio(rq) >= queue_prio(execlists)) {
2252 			if (!virtual_matches(ve, rq, engine)) {
2253 				spin_unlock(&ve->base.active.lock);
2254 				rb = rb_next(rb);
2255 				continue;
2256 			}
2257 
2258 			if (last && !can_merge_rq(last, rq)) {
2259 				spin_unlock(&ve->base.active.lock);
2260 				start_timeslice(engine, rq_prio(rq));
2261 				return; /* leave this for another sibling */
2262 			}
2263 
2264 			ENGINE_TRACE(engine,
2265 				     "virtual rq=%llx:%lld%s, new engine? %s\n",
2266 				     rq->fence.context,
2267 				     rq->fence.seqno,
2268 				     i915_request_completed(rq) ? "!" :
2269 				     i915_request_started(rq) ? "*" :
2270 				     "",
2271 				     yesno(engine != ve->siblings[0]));
2272 
2273 			WRITE_ONCE(ve->request, NULL);
2274 			WRITE_ONCE(ve->base.execlists.queue_priority_hint,
2275 				   INT_MIN);
2276 			rb_erase_cached(rb, &execlists->virtual);
2277 			RB_CLEAR_NODE(rb);
2278 
2279 			GEM_BUG_ON(!(rq->execution_mask & engine->mask));
2280 			WRITE_ONCE(rq->engine, engine);
2281 
2282 			if (engine != ve->siblings[0]) {
2283 				u32 *regs = ve->context.lrc_reg_state;
2284 				unsigned int n;
2285 
2286 				GEM_BUG_ON(READ_ONCE(ve->context.inflight));
2287 
2288 				if (!intel_engine_has_relative_mmio(engine))
2289 					virtual_update_register_offsets(regs,
2290 									engine);
2291 
2292 				if (!list_empty(&ve->context.signals))
2293 					virtual_xfer_breadcrumbs(ve);
2294 
2295 				/*
2296 				 * Move the bound engine to the top of the list
2297 				 * for future execution. We then kick this
2298 				 * tasklet first before checking others, so that
2299 				 * we preferentially reuse this set of bound
2300 				 * registers.
2301 				 */
2302 				for (n = 1; n < ve->num_siblings; n++) {
2303 					if (ve->siblings[n] == engine) {
2304 						swap(ve->siblings[n],
2305 						     ve->siblings[0]);
2306 						break;
2307 					}
2308 				}
2309 
2310 				GEM_BUG_ON(ve->siblings[0] != engine);
2311 			}
2312 
2313 			if (__i915_request_submit(rq)) {
2314 				submit = true;
2315 				last = rq;
2316 			}
2317 			i915_request_put(rq);
2318 
2319 			/*
2320 			 * Hmm, we have a bunch of virtual engine requests,
2321 			 * but the first one was already completed (thanks
2322 			 * preempt-to-busy!). Keep looking at the veng queue
2323 			 * until we have no more relevant requests (i.e.
2324 			 * the normal submit queue has higher priority).
2325 			 */
2326 			if (!submit) {
2327 				spin_unlock(&ve->base.active.lock);
2328 				rb = rb_first_cached(&execlists->virtual);
2329 				continue;
2330 			}
2331 		}
2332 
2333 		spin_unlock(&ve->base.active.lock);
2334 		break;
2335 	}
2336 
2337 	while ((rb = rb_first_cached(&execlists->queue))) {
2338 		struct i915_priolist *p = to_priolist(rb);
2339 		struct i915_request *rq, *rn;
2340 		int i;
2341 
2342 		priolist_for_each_request_consume(rq, rn, p, i) {
2343 			bool merge = true;
2344 
2345 			/*
2346 			 * Can we combine this request with the current port?
2347 			 * It has to be the same context/ringbuffer and not
2348 			 * have any exceptions (e.g. GVT saying never to
2349 			 * combine contexts).
2350 			 *
2351 			 * If we can combine the requests, we can execute both
2352 			 * by updating the RING_TAIL to point to the end of the
2353 			 * second request, and so we never need to tell the
2354 			 * hardware about the first.
2355 			 */
2356 			if (last && !can_merge_rq(last, rq)) {
2357 				/*
2358 				 * If we are on the second port and cannot
2359 				 * combine this request with the last, then we
2360 				 * are done.
2361 				 */
2362 				if (port == last_port)
2363 					goto done;
2364 
2365 				/*
2366 				 * We must not populate both ELSP[] with the
2367 				 * same LRCA, i.e. we must submit 2 different
2368 				 * contexts if we submit 2 ELSP.
2369 				 */
2370 				if (last->context == rq->context)
2371 					goto done;
2372 
2373 				if (i915_request_has_sentinel(last))
2374 					goto done;
2375 
2376 				/*
2377 				 * If GVT overrides us we only ever submit
2378 				 * port[0], leaving port[1] empty. Note that we
2379 				 * also have to be careful that we don't queue
2380 				 * the same context (even though a different
2381 				 * request) to the second port.
2382 				 */
2383 				if (ctx_single_port_submission(last->context) ||
2384 				    ctx_single_port_submission(rq->context))
2385 					goto done;
2386 
2387 				merge = false;
2388 			}
2389 
2390 			if (__i915_request_submit(rq)) {
2391 				if (!merge) {
2392 					*port = execlists_schedule_in(last, port - execlists->pending);
2393 					port++;
2394 					last = NULL;
2395 				}
2396 
2397 				GEM_BUG_ON(last &&
2398 					   !can_merge_ctx(last->context,
2399 							  rq->context));
2400 				GEM_BUG_ON(last &&
2401 					   i915_seqno_passed(last->fence.seqno,
2402 							     rq->fence.seqno));
2403 
2404 				submit = true;
2405 				last = rq;
2406 			}
2407 		}
2408 
2409 		rb_erase_cached(&p->node, &execlists->queue);
2410 		i915_priolist_free(p);
2411 	}
2412 
2413 done:
2414 	/*
2415 	 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2416 	 *
2417 	 * We choose the priority hint such that if we add a request of greater
2418 	 * priority than this, we kick the submission tasklet to decide on
2419 	 * the right order of submitting the requests to hardware. We must
2420 	 * also be prepared to reorder requests as they are in-flight on the
2421 	 * HW. We derive the priority hint then as the first "hole" in
2422 	 * the HW submission ports and if there are no available slots,
2423 	 * the priority of the lowest executing request, i.e. last.
2424 	 *
2425 	 * When we do receive a higher priority request ready to run from the
2426 	 * user, see queue_request(), the priority hint is bumped to that
2427 	 * request triggering preemption on the next dequeue (or subsequent
2428 	 * interrupt for secondary ports).
2429 	 */
2430 	execlists->queue_priority_hint = queue_prio(execlists);
2431 
2432 	if (submit) {
2433 		*port = execlists_schedule_in(last, port - execlists->pending);
2434 		execlists->switch_priority_hint =
2435 			switch_prio(engine, *execlists->pending);
2436 
2437 		/*
2438 		 * Skip if we ended up with exactly the same set of requests,
2439 		 * e.g. trying to timeslice a pair of ordered contexts
2440 		 */
2441 		if (!memcmp(active, execlists->pending,
2442 			    (port - execlists->pending + 1) * sizeof(*port))) {
2443 			do
2444 				execlists_schedule_out(fetch_and_zero(port));
2445 			while (port-- != execlists->pending);
2446 
2447 			goto skip_submit;
2448 		}
2449 		clear_ports(port + 1, last_port - port);
2450 
2451 		WRITE_ONCE(execlists->yield, -1);
2452 		set_preempt_timeout(engine, *active);
2453 		execlists_submit_ports(engine);
2454 	} else {
2455 		start_timeslice(engine, execlists->queue_priority_hint);
2456 skip_submit:
2457 		ring_set_paused(engine, 0);
2458 	}
2459 }
2460 
2461 static void
2462 cancel_port_requests(struct intel_engine_execlists * const execlists)
2463 {
2464 	struct i915_request * const *port;
2465 
2466 	for (port = execlists->pending; *port; port++)
2467 		execlists_schedule_out(*port);
2468 	clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2469 
2470 	/* Mark the end of active before we overwrite *active */
2471 	for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2472 		execlists_schedule_out(*port);
2473 	clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2474 
2475 	smp_wmb(); /* complete the seqlock for execlists_active() */
2476 	WRITE_ONCE(execlists->active, execlists->inflight);
2477 }
2478 
2479 static inline void
2480 invalidate_csb_entries(const u32 *first, const u32 *last)
2481 {
2482 	clflush((void *)first);
2483 	clflush((void *)last);
2484 }
2485 
2486 /*
2487  * Starting with Gen12, the status has a new format:
2488  *
2489  *     bit  0:     switched to new queue
2490  *     bit  1:     reserved
2491  *     bit  2:     semaphore wait mode (poll or signal), only valid when
2492  *                 switch detail is set to "wait on semaphore"
2493  *     bits 3-5:   engine class
2494  *     bits 6-11:  engine instance
2495  *     bits 12-14: reserved
2496  *     bits 15-25: sw context id of the lrc the GT switched to
2497  *     bits 26-31: sw counter of the lrc the GT switched to
2498  *     bits 32-35: context switch detail
2499  *                  - 0: ctx complete
2500  *                  - 1: wait on sync flip
2501  *                  - 2: wait on vblank
2502  *                  - 3: wait on scanline
2503  *                  - 4: wait on semaphore
2504  *                  - 5: context preempted (not on SEMAPHORE_WAIT or
2505  *                       WAIT_FOR_EVENT)
2506  *     bit  36:    reserved
2507  *     bits 37-43: wait detail (for switch detail 1 to 4)
2508  *     bits 44-46: reserved
2509  *     bits 47-57: sw context id of the lrc the GT switched away from
2510  *     bits 58-63: sw counter of the lrc the GT switched away from
2511  */
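/*
 * In practice (see gen12_csb_parse() below) only two pieces of the event
 * matter for our tracking: if the GT switched to a new queue, or if there
 * is no valid outgoing ("away") context, the event signals the promotion
 * of a new submission to the HW; otherwise it signals the completion of
 * the currently active context.
 */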
2512 static inline bool
2513 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2514 {
2515 	u32 lower_dw = csb[0];
2516 	u32 upper_dw = csb[1];
2517 	bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
2518 	bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
2519 	bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2520 
2521 	/*
2522 	 * The context switch detail is not guaranteed to be 5 when a preemption
2523 	 * occurs, so we can't just check for that. The check below works for
2524 	 * all the cases we care about, including preemptions of WAIT
2525 	 * instructions and lite-restore. Preempt-to-idle via the CTRL register
2526 	 * would require some extra handling, but we don't support that.
2527 	 */
2528 	if (!ctx_away_valid || new_queue) {
2529 		GEM_BUG_ON(!ctx_to_valid);
2530 		return true;
2531 	}
2532 
2533 	/*
2534 	 * switch detail = 5 is covered by the case above and we do not expect a
2535 	 * context switch on an unsuccessful wait instruction since we always
2536 	 * use polling mode.
2537 	 */
2538 	GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
2539 	return false;
2540 }
2541 
2542 static inline bool
2543 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2544 {
2545 	return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2546 }
2547 
2548 static void process_csb(struct intel_engine_cs *engine)
2549 {
2550 	struct intel_engine_execlists * const execlists = &engine->execlists;
2551 	const u32 * const buf = execlists->csb_status;
2552 	const u8 num_entries = execlists->csb_size;
2553 	u8 head, tail;
2554 
2555 	/*
2556 	 * As we modify our execlists state tracking we require exclusive
2557 	 * access. Either we are inside the tasklet, or the tasklet is disabled
2558 	 * and we assume that is only inside the reset paths and so serialised.
2559 	 */
2560 	GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2561 		   !reset_in_progress(execlists));
2562 	GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2563 
2564 	/*
2565 	 * Note that csb_write, csb_status may be either in HWSP or mmio.
2566 	 * When reading from the csb_write mmio register, we have to be
2567 	 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2568 	 * the low 4bits. As it happens we know the next 4bits are always
2569 	 * zero and so we can simply mask off the low u8 of the register
2570 	 * and treat it identically to reading from the HWSP (without having
2571 	 * to use explicit shifting and masking, and probably bifurcating
2572 	 * the code to handle the legacy mmio read).
2573 	 */
2574 	head = execlists->csb_head;
2575 	tail = READ_ONCE(*execlists->csb_write);
2576 	if (unlikely(head == tail))
2577 		return;
2578 
2579 	/*
2580 	 * We will consume all events from HW, or at least pretend to.
2581 	 *
2582 	 * The sequence of events from the HW is deterministic, and derived
2583 	 * from our writes to the ELSP, with a smidgen of variability for
2584 	 * the arrival of the asynchronous requests wrt the inflight
2585 	 * execution. If the HW sends an event that does not correspond with
2586 	 * the one we are expecting, we have to abandon all hope as we lose
2587 	 * all tracking of what the engine is actually executing. We will
2588 	 * only detect we are out of sequence with the HW when we get an
2589 	 * 'impossible' event because we have already drained our own
2590 	 * preemption/promotion queue. If this occurs, we know that we likely
2591 	 * lost track of execution earlier and must unwind and restart; the
2592 	 * simplest way is to stop processing the event queue and force the
2593 	 * engine to reset.
2594 	 */
2595 	execlists->csb_head = tail;
2596 	ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2597 
2598 	/*
2599 	 * Hopefully paired with a wmb() in HW!
2600 	 *
2601 	 * We must complete the read of the write pointer before any reads
2602 	 * from the CSB, so that we do not see stale values. Without an rmb
2603 	 * (lfence) the HW may speculatively perform the CSB[] reads *before*
2604 	 * we perform the READ_ONCE(*csb_write).
2605 	 */
2606 	rmb();
2607 	do {
2608 		bool promote;
2609 
2610 		if (++head == num_entries)
2611 			head = 0;
2612 
2613 		/*
2614 		 * We are flying near dragons again.
2615 		 *
2616 		 * We hold a reference to the request in execlist_port[]
2617 		 * but no more than that. We are operating in softirq
2618 		 * context and so cannot hold any mutex or sleep. That
2619 		 * prevents us stopping the requests we are processing
2620 		 * in port[] from being retired simultaneously (the
2621 		 * breadcrumb will be complete before we see the
2622 		 * context-switch). As we only hold the reference to the
2623 		 * request, any pointer chasing underneath the request
2624 		 * is subject to a potential use-after-free. Thus we
2625 		 * store all of the bookkeeping within port[] as
2626 		 * required, and avoid using unguarded pointers beneath
2627 		 * request itself. The same applies to the atomic
2628 		 * status notifier.
2629 		 */
2630 
2631 		ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2632 			     head, buf[2 * head + 0], buf[2 * head + 1]);
2633 
2634 		if (INTEL_GEN(engine->i915) >= 12)
2635 			promote = gen12_csb_parse(execlists, buf + 2 * head);
2636 		else
2637 			promote = gen8_csb_parse(execlists, buf + 2 * head);
2638 		if (promote) {
2639 			struct i915_request * const *old = execlists->active;
2640 
2641 			if (GEM_WARN_ON(!*execlists->pending)) {
2642 				execlists->error_interrupt |= ERROR_CSB;
2643 				break;
2644 			}
2645 
2646 			ring_set_paused(engine, 0);
2647 
2648 			/* Point active to the new ELSP; prevent overwriting */
2649 			WRITE_ONCE(execlists->active, execlists->pending);
2650 			smp_wmb(); /* notify execlists_active() */
2651 
2652 			/* cancel old inflight, prepare for switch */
2653 			trace_ports(execlists, "preempted", old);
2654 			while (*old)
2655 				execlists_schedule_out(*old++);
2656 
2657 			/* switch pending to inflight */
2658 			GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2659 			copy_ports(execlists->inflight,
2660 				   execlists->pending,
2661 				   execlists_num_ports(execlists));
2662 			smp_wmb(); /* complete the seqlock */
2663 			WRITE_ONCE(execlists->active, execlists->inflight);
2664 
2665 			WRITE_ONCE(execlists->pending[0], NULL);
2666 		} else {
2667 			if (GEM_WARN_ON(!*execlists->active)) {
2668 				execlists->error_interrupt |= ERROR_CSB;
2669 				break;
2670 			}
2671 
2672 			/* port0 completed, advanced to port1 */
2673 			trace_ports(execlists, "completed", execlists->active);
2674 
2675 			/*
2676 			 * We rely on the hardware being strongly
2677 			 * ordered, that the breadcrumb write is
2678 			 * coherent (visible from the CPU) before the
2679 			 * user interrupt is processed. One might assume
2680 			 * that the breadcrumb write, being ordered before the
2681 			 * user interrupt and the CS event for the context
2682 			 * switch, would therefore be visible before the CS
2683 			 * event itself...
2684 			 */
2685 			if (GEM_SHOW_DEBUG() &&
2686 			    !i915_request_completed(*execlists->active)) {
2687 				struct i915_request *rq = *execlists->active;
2688 				const u32 *regs __maybe_unused =
2689 					rq->context->lrc_reg_state;
2690 
2691 				ENGINE_TRACE(engine,
2692 					     "context completed before request!\n");
2693 				ENGINE_TRACE(engine,
2694 					     "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n",
2695 					     ENGINE_READ(engine, RING_START),
2696 					     ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR,
2697 					     ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR,
2698 					     ENGINE_READ(engine, RING_CTL),
2699 					     ENGINE_READ(engine, RING_MI_MODE));
2700 				ENGINE_TRACE(engine,
2701 					     "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ",
2702 					     i915_ggtt_offset(rq->ring->vma),
2703 					     rq->head, rq->tail,
2704 					     rq->fence.context,
2705 					     lower_32_bits(rq->fence.seqno),
2706 					     hwsp_seqno(rq));
2707 				ENGINE_TRACE(engine,
2708 					     "ctx:{start:%08x, head:%04x, tail:%04x}, ",
2709 					     regs[CTX_RING_START],
2710 					     regs[CTX_RING_HEAD],
2711 					     regs[CTX_RING_TAIL]);
2712 			}
2713 
2714 			execlists_schedule_out(*execlists->active++);
2715 
2716 			GEM_BUG_ON(execlists->active - execlists->inflight >
2717 				   execlists_num_ports(execlists));
2718 		}
2719 	} while (head != tail);
2720 
2721 	set_timeslice(engine);
2722 
2723 	/*
2724 	 * Gen11 has proven to fail wrt global observation point between
2725 	 * entry and tail update, failing on the ordering and thus
2726 	 * we see an old entry in the context status buffer.
2727 	 *
2728 	 * Forcibly evict the entries before the next GPU CSB update,
2729 	 * to increase the odds that we get fresh entries even with
2730 	 * non-working hardware. The cost of doing so comes out mostly
2731 	 * in the wash, as the hardware, working or not, will need to do
2732 	 * the invalidation beforehand anyway.
2733 	 */
2734 	invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2735 }
2736 
2737 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2738 {
2739 	lockdep_assert_held(&engine->active.lock);
2740 	if (!READ_ONCE(engine->execlists.pending[0])) {
2741 		rcu_read_lock(); /* protect peeking at execlists->active */
2742 		execlists_dequeue(engine);
2743 		rcu_read_unlock();
2744 	}
2745 }
2746 
2747 static void __execlists_hold(struct i915_request *rq)
2748 {
2749 	LIST_HEAD(list);
2750 
2751 	do {
2752 		struct i915_dependency *p;
2753 
2754 		if (i915_request_is_active(rq))
2755 			__i915_request_unsubmit(rq);
2756 
2757 		clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2758 		list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2759 		i915_request_set_hold(rq);
2760 		RQ_TRACE(rq, "on hold\n");
2761 
2762 		for_each_waiter(p, rq) {
2763 			struct i915_request *w =
2764 				container_of(p->waiter, typeof(*w), sched);
2765 
2766 			/* Leave semaphores spinning on the other engines */
2767 			if (w->engine != rq->engine)
2768 				continue;
2769 
2770 			if (!i915_request_is_ready(w))
2771 				continue;
2772 
2773 			if (i915_request_completed(w))
2774 				continue;
2775 
2776 			if (i915_request_on_hold(w))
2777 				continue;
2778 
2779 			list_move_tail(&w->sched.link, &list);
2780 		}
2781 
2782 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2783 	} while (rq);
2784 }
2785 
2786 static bool execlists_hold(struct intel_engine_cs *engine,
2787 			   struct i915_request *rq)
2788 {
2789 	spin_lock_irq(&engine->active.lock);
2790 
2791 	if (i915_request_completed(rq)) { /* too late! */
2792 		rq = NULL;
2793 		goto unlock;
2794 	}
2795 
2796 	if (rq->engine != engine) { /* preempted virtual engine */
2797 		struct virtual_engine *ve = to_virtual_engine(rq->engine);
2798 
2799 		/*
2800 		 * intel_context_inflight() is only protected by virtue
2801 		 * of process_csb() being called only by the tasklet (or
2802 		 * directly from inside reset while the tasklet is suspended).
2803 		 * Assert that neither of those are allowed to run while we
2804 		 * poke at the request queues.
2805 		 */
2806 		GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2807 
2808 		/*
2809 		 * An unsubmitted request along a virtual engine will
2810 		 * remain on the active (this) engine until we are able
2811 		 * to process the context switch away (and so mark the
2812 		 * context as no longer in flight). That cannot have happened
2813 		 * yet, otherwise we would not be hanging!
2814 		 */
2815 		spin_lock(&ve->base.active.lock);
2816 		GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2817 		GEM_BUG_ON(ve->request != rq);
2818 		ve->request = NULL;
2819 		spin_unlock(&ve->base.active.lock);
2820 		i915_request_put(rq);
2821 
2822 		rq->engine = engine;
2823 	}
2824 
2825 	/*
2826 	 * Transfer this request onto the hold queue to prevent it
2827 	 * being resubmitted to HW (and potentially completed) before we have
2828 	 * released it. Since we may have already submitted following
2829 	 * requests, we need to remove those as well.
2830 	 */
2831 	GEM_BUG_ON(i915_request_on_hold(rq));
2832 	GEM_BUG_ON(rq->engine != engine);
2833 	__execlists_hold(rq);
2834 	GEM_BUG_ON(list_empty(&engine->active.hold));
2835 
2836 unlock:
2837 	spin_unlock_irq(&engine->active.lock);
2838 	return rq;
2839 }
2840 
2841 static bool hold_request(const struct i915_request *rq)
2842 {
2843 	struct i915_dependency *p;
2844 	bool result = false;
2845 
2846 	/*
2847 	 * If one of our ancestors is on hold, we must also be on hold,
2848 	 * otherwise we will bypass it and execute before it.
2849 	 */
2850 	rcu_read_lock();
2851 	for_each_signaler(p, rq) {
2852 		const struct i915_request *s =
2853 			container_of(p->signaler, typeof(*s), sched);
2854 
2855 		if (s->engine != rq->engine)
2856 			continue;
2857 
2858 		result = i915_request_on_hold(s);
2859 		if (result)
2860 			break;
2861 	}
2862 	rcu_read_unlock();
2863 
2864 	return result;
2865 }
2866 
2867 static void __execlists_unhold(struct i915_request *rq)
2868 {
2869 	LIST_HEAD(list);
2870 
2871 	do {
2872 		struct i915_dependency *p;
2873 
2874 		RQ_TRACE(rq, "hold release\n");
2875 
2876 		GEM_BUG_ON(!i915_request_on_hold(rq));
2877 		GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2878 
2879 		i915_request_clear_hold(rq);
2880 		list_move_tail(&rq->sched.link,
2881 			       i915_sched_lookup_priolist(rq->engine,
2882 							  rq_prio(rq)));
2883 		set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2884 
2885 		/* Also release any children on this engine that are ready */
2886 		for_each_waiter(p, rq) {
2887 			struct i915_request *w =
2888 				container_of(p->waiter, typeof(*w), sched);
2889 
2890 			/* Propagate any change in error status */
2891 			if (rq->fence.error)
2892 				i915_request_set_error_once(w, rq->fence.error);
2893 
2894 			if (w->engine != rq->engine)
2895 				continue;
2896 
2897 			if (!i915_request_on_hold(w))
2898 				continue;
2899 
2900 			/* Check that no other parents are also on hold */
2901 			if (hold_request(w))
2902 				continue;
2903 
2904 			list_move_tail(&w->sched.link, &list);
2905 		}
2906 
2907 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2908 	} while (rq);
2909 }
2910 
2911 static void execlists_unhold(struct intel_engine_cs *engine,
2912 			     struct i915_request *rq)
2913 {
2914 	spin_lock_irq(&engine->active.lock);
2915 
2916 	/*
2917 	 * Move this request back to the priority queue, and all of its
2918 	 * children and grandchildren that were suspended along with it.
2919 	 */
2920 	__execlists_unhold(rq);
2921 
2922 	if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
2923 		engine->execlists.queue_priority_hint = rq_prio(rq);
2924 		tasklet_hi_schedule(&engine->execlists.tasklet);
2925 	}
2926 
2927 	spin_unlock_irq(&engine->active.lock);
2928 }
2929 
2930 struct execlists_capture {
2931 	struct work_struct work;
2932 	struct i915_request *rq;
2933 	struct i915_gpu_coredump *error;
2934 };
2935 
2936 static void execlists_capture_work(struct work_struct *work)
2937 {
2938 	struct execlists_capture *cap = container_of(work, typeof(*cap), work);
2939 	const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
2940 	struct intel_engine_cs *engine = cap->rq->engine;
2941 	struct intel_gt_coredump *gt = cap->error->gt;
2942 	struct intel_engine_capture_vma *vma;
2943 
2944 	/* Compress all the objects attached to the request, slow! */
2945 	vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
2946 	if (vma) {
2947 		struct i915_vma_compress *compress =
2948 			i915_vma_capture_prepare(gt);
2949 
2950 		intel_engine_coredump_add_vma(gt->engine, vma, compress);
2951 		i915_vma_capture_finish(gt, compress);
2952 	}
2953 
2954 	gt->simulated = gt->engine->simulated;
2955 	cap->error->simulated = gt->simulated;
2956 
2957 	/* Publish the error state, and announce it to the world */
2958 	i915_error_state_store(cap->error);
2959 	i915_gpu_coredump_put(cap->error);
2960 
2961 	/* Return this request and all that depend upon it for signaling */
2962 	execlists_unhold(engine, cap->rq);
2963 	i915_request_put(cap->rq);
2964 
2965 	kfree(cap);
2966 }
2967 
2968 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
2969 {
2970 	const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
2971 	struct execlists_capture *cap;
2972 
2973 	cap = kmalloc(sizeof(*cap), gfp);
2974 	if (!cap)
2975 		return NULL;
2976 
2977 	cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
2978 	if (!cap->error)
2979 		goto err_cap;
2980 
2981 	cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
2982 	if (!cap->error->gt)
2983 		goto err_gpu;
2984 
2985 	cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
2986 	if (!cap->error->gt->engine)
2987 		goto err_gt;
2988 
2989 	return cap;
2990 
2991 err_gt:
2992 	kfree(cap->error->gt);
2993 err_gpu:
2994 	kfree(cap->error);
2995 err_cap:
2996 	kfree(cap);
2997 	return NULL;
2998 }
2999 
3000 static struct i915_request *
3001 active_context(struct intel_engine_cs *engine, u32 ccid)
3002 {
3003 	const struct intel_engine_execlists * const el = &engine->execlists;
3004 	struct i915_request * const *port, *rq;
3005 
3006 	/*
3007 	 * Use the most recent result from process_csb(), but just in case
3008 	 * we trigger an error (via interrupt) before the first CS event has
3009 	 * been written, peek at the next submission.
3010 	 */
3011 
3012 	for (port = el->active; (rq = *port); port++) {
3013 		if (rq->context->lrc.ccid == ccid) {
3014 			ENGINE_TRACE(engine,
3015 				     "ccid found at active:%zd\n",
3016 				     port - el->active);
3017 			return rq;
3018 		}
3019 	}
3020 
3021 	for (port = el->pending; (rq = *port); port++) {
3022 		if (rq->context->lrc.ccid == ccid) {
3023 			ENGINE_TRACE(engine,
3024 				     "ccid found at pending:%zd\n",
3025 				     port - el->pending);
3026 			return rq;
3027 		}
3028 	}
3029 
3030 	ENGINE_TRACE(engine, "ccid:%x not found\n", ccid);
3031 	return NULL;
3032 }
3033 
3034 static u32 active_ccid(struct intel_engine_cs *engine)
3035 {
3036 	return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI);
3037 }
3038 
3039 static void execlists_capture(struct intel_engine_cs *engine)
3040 {
3041 	struct execlists_capture *cap;
3042 
3043 	if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
3044 		return;
3045 
3046 	/*
3047 	 * We need to _quickly_ capture the engine state before we reset.
3048 	 * We are inside an atomic section (softirq) here and we are delaying
3049 	 * the forced preemption event.
3050 	 */
3051 	cap = capture_regs(engine);
3052 	if (!cap)
3053 		return;
3054 
3055 	spin_lock_irq(&engine->active.lock);
3056 	cap->rq = active_context(engine, active_ccid(engine));
3057 	if (cap->rq) {
3058 		cap->rq = active_request(cap->rq->context->timeline, cap->rq);
3059 		cap->rq = i915_request_get_rcu(cap->rq);
3060 	}
3061 	spin_unlock_irq(&engine->active.lock);
3062 	if (!cap->rq)
3063 		goto err_free;
3064 
3065 	/*
3066 	 * Remove the request from the execlists queue, and take ownership
3067 	 * of the request. We pass it to our worker who will _slowly_ compress
3068 	 * all the pages the _user_ requested for debugging their batch, after
3069 	 * which we return it to the queue for signaling.
3070 	 *
3071 	 * By removing them from the execlists queue, we also remove the
3072 	 * requests from being processed by __unwind_incomplete_requests()
3073 	 * during the intel_engine_reset(), and so they will *not* be replayed
3074 	 * afterwards.
3075 	 *
3076 	 * Note that because we have not yet reset the engine at this point,
3077 	 * it is possible that the request we have identified as being
3078 	 * guilty did in fact complete and we will then hit an arbitration
3079 	 * point allowing the outstanding preemption to succeed. The likelihood
3080 	 * of that is very low (as capturing of the engine registers should be
3081 	 * fast enough to run inside an irq-off atomic section!), so we will
3082 	 * simply hold that request accountable for being non-preemptible
3083 	 * long enough to force the reset.
3084 	 */
3085 	if (!execlists_hold(engine, cap->rq))
3086 		goto err_rq;
3087 
3088 	INIT_WORK(&cap->work, execlists_capture_work);
3089 	schedule_work(&cap->work);
3090 	return;
3091 
3092 err_rq:
3093 	i915_request_put(cap->rq);
3094 err_free:
3095 	i915_gpu_coredump_put(cap->error);
3096 	kfree(cap);
3097 }
3098 
3099 static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
3100 {
3101 	const unsigned int bit = I915_RESET_ENGINE + engine->id;
3102 	unsigned long *lock = &engine->gt->reset.flags;
3103 
3104 	if (!intel_has_reset_engine(engine->gt))
3105 		return;
3106 
3107 	if (test_and_set_bit(bit, lock))
3108 		return;
3109 
3110 	ENGINE_TRACE(engine, "reset for %s\n", msg);
3111 
3112 	/* Mark this tasklet as disabled to avoid waiting for it to complete */
3113 	tasklet_disable_nosync(&engine->execlists.tasklet);
3114 
3115 	ring_set_paused(engine, 1); /* Freeze the current request in place */
3116 	execlists_capture(engine);
3117 	intel_engine_reset(engine, msg);
3118 
3119 	tasklet_enable(&engine->execlists.tasklet);
3120 	clear_and_wake_up_bit(bit, lock);
3121 }
3122 
3123 static bool preempt_timeout(const struct intel_engine_cs *const engine)
3124 {
3125 	const struct timer_list *t = &engine->execlists.preempt;
3126 
3127 	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
3128 		return false;
3129 
3130 	if (!timer_expired(t))
3131 		return false;
3132 
3133 	return READ_ONCE(engine->execlists.pending[0]);
3134 }
3135 
3136 /*
3137  * Check the unread Context Status Buffers and manage the submission of new
3138  * contexts to the ELSP accordingly.
3139  */
3140 static void execlists_submission_tasklet(unsigned long data)
3141 {
3142 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
3143 	bool timeout = preempt_timeout(engine);
3144 
3145 	process_csb(engine);
3146 
3147 	if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
3148 		const char *msg;
3149 
3150 		/* Generate the error message in priority wrt to the user! */
3151 		if (engine->execlists.error_interrupt & GENMASK(15, 0))
3152 			msg = "CS error"; /* thrown by a user payload */
3153 		else if (engine->execlists.error_interrupt & ERROR_CSB)
3154 			msg = "invalid CSB event";
3155 		else
3156 			msg = "internal error";
3157 
3158 		engine->execlists.error_interrupt = 0;
3159 		execlists_reset(engine, msg);
3160 	}
3161 
3162 	if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
3163 		unsigned long flags;
3164 
3165 		spin_lock_irqsave(&engine->active.lock, flags);
3166 		__execlists_submission_tasklet(engine);
3167 		spin_unlock_irqrestore(&engine->active.lock, flags);
3168 
3169 		/* Recheck after serialising with direct-submission */
3170 		if (unlikely(timeout && preempt_timeout(engine)))
3171 			execlists_reset(engine, "preemption time out");
3172 	}
3173 }
3174 
3175 static void __execlists_kick(struct intel_engine_execlists *execlists)
3176 {
3177 	/* Kick the tasklet for some interrupt coalescing and reset handling */
3178 	tasklet_hi_schedule(&execlists->tasklet);
3179 }
3180 
3181 #define execlists_kick(t, member) \
3182 	__execlists_kick(container_of(t, struct intel_engine_execlists, member))
3183 
3184 static void execlists_timeslice(struct timer_list *timer)
3185 {
3186 	execlists_kick(timer, timer);
3187 }
3188 
3189 static void execlists_preempt(struct timer_list *timer)
3190 {
3191 	execlists_kick(timer, preempt);
3192 }
3193 
3194 static void queue_request(struct intel_engine_cs *engine,
3195 			  struct i915_request *rq)
3196 {
3197 	GEM_BUG_ON(!list_empty(&rq->sched.link));
3198 	list_add_tail(&rq->sched.link,
3199 		      i915_sched_lookup_priolist(engine, rq_prio(rq)));
3200 	set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
3201 }
3202 
3203 static void __submit_queue_imm(struct intel_engine_cs *engine)
3204 {
3205 	struct intel_engine_execlists * const execlists = &engine->execlists;
3206 
3207 	if (reset_in_progress(execlists))
3208 		return; /* defer until we restart the engine following reset */
3209 
3210 	__execlists_submission_tasklet(engine);
3211 }
3212 
3213 static void submit_queue(struct intel_engine_cs *engine,
3214 			 const struct i915_request *rq)
3215 {
3216 	struct intel_engine_execlists *execlists = &engine->execlists;
3217 
3218 	if (rq_prio(rq) <= execlists->queue_priority_hint)
3219 		return;
3220 
3221 	execlists->queue_priority_hint = rq_prio(rq);
3222 	__submit_queue_imm(engine);
3223 }
3224 
3225 static bool ancestor_on_hold(const struct intel_engine_cs *engine,
3226 			     const struct i915_request *rq)
3227 {
3228 	GEM_BUG_ON(i915_request_on_hold(rq));
3229 	return !list_empty(&engine->active.hold) && hold_request(rq);
3230 }
3231 
3232 static void flush_csb(struct intel_engine_cs *engine)
3233 {
3234 	struct intel_engine_execlists *el = &engine->execlists;
3235 
3236 	if (READ_ONCE(el->pending[0]) && tasklet_trylock(&el->tasklet)) {
3237 		if (!reset_in_progress(el))
3238 			process_csb(engine);
3239 		tasklet_unlock(&el->tasklet);
3240 	}
3241 }
3242 
3243 static void execlists_submit_request(struct i915_request *request)
3244 {
3245 	struct intel_engine_cs *engine = request->engine;
3246 	unsigned long flags;
3247 
3248 	/* Hopefully we clear execlists->pending[] to let us through */
3249 	flush_csb(engine);
3250 
3251 	/* Will be called from irq-context when using foreign fences. */
3252 	spin_lock_irqsave(&engine->active.lock, flags);
3253 
3254 	if (unlikely(ancestor_on_hold(engine, request))) {
3255 		RQ_TRACE(request, "ancestor on hold\n");
3256 		list_add_tail(&request->sched.link, &engine->active.hold);
3257 		i915_request_set_hold(request);
3258 	} else {
3259 		queue_request(engine, request);
3260 
3261 		GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
3262 		GEM_BUG_ON(list_empty(&request->sched.link));
3263 
3264 		submit_queue(engine, request);
3265 	}
3266 
3267 	spin_unlock_irqrestore(&engine->active.lock, flags);
3268 }
3269 
3270 static void __execlists_context_fini(struct intel_context *ce)
3271 {
3272 	intel_ring_put(ce->ring);
3273 	i915_vma_put(ce->state);
3274 }
3275 
3276 static void execlists_context_destroy(struct kref *kref)
3277 {
3278 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
3279 
3280 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
3281 	GEM_BUG_ON(intel_context_is_pinned(ce));
3282 
3283 	if (ce->state)
3284 		__execlists_context_fini(ce);
3285 
3286 	intel_context_fini(ce);
3287 	intel_context_free(ce);
3288 }
3289 
3290 static void
3291 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
3292 {
3293 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3294 		return;
3295 
3296 	vaddr += engine->context_size;
3297 
3298 	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
3299 }
3300 
3301 static void
3302 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
3303 {
3304 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3305 		return;
3306 
3307 	vaddr += engine->context_size;
3308 
3309 	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
3310 		drm_err_once(&engine->i915->drm,
3311 			     "%s context redzone overwritten!\n",
3312 			     engine->name);
3313 }
3314 
3315 static void execlists_context_unpin(struct intel_context *ce)
3316 {
3317 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
3318 		      ce->engine);
3319 
3320 	i915_gem_object_unpin_map(ce->state->obj);
3321 }
3322 
3323 static u32 *
3324 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
3325 {
3326 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3327 		MI_SRM_LRM_GLOBAL_GTT |
3328 		MI_LRI_LRM_CS_MMIO;
3329 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3330 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3331 		CTX_TIMESTAMP * sizeof(u32);
3332 	*cs++ = 0;
3333 
3334 	*cs++ = MI_LOAD_REGISTER_REG |
3335 		MI_LRR_SOURCE_CS_MMIO |
3336 		MI_LRI_LRM_CS_MMIO;
3337 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3338 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3339 
3340 	*cs++ = MI_LOAD_REGISTER_REG |
3341 		MI_LRR_SOURCE_CS_MMIO |
3342 		MI_LRI_LRM_CS_MMIO;
3343 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3344 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3345 
3346 	return cs;
3347 }
3348 
3349 static u32 *
3350 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
3351 {
3352 	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
3353 
3354 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3355 		MI_SRM_LRM_GLOBAL_GTT |
3356 		MI_LRI_LRM_CS_MMIO;
3357 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3358 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3359 		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
3360 	*cs++ = 0;
3361 
3362 	return cs;
3363 }
3364 
3365 static u32 *
3366 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
3367 {
3368 	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
3369 
3370 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3371 		MI_SRM_LRM_GLOBAL_GTT |
3372 		MI_LRI_LRM_CS_MMIO;
3373 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3374 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3375 		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
3376 	*cs++ = 0;
3377 
3378 	*cs++ = MI_LOAD_REGISTER_REG |
3379 		MI_LRR_SOURCE_CS_MMIO |
3380 		MI_LRI_LRM_CS_MMIO;
3381 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3382 	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
3383 
3384 	return cs;
3385 }
3386 
3387 static u32 *
3388 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
3389 {
3390 	cs = gen12_emit_timestamp_wa(ce, cs);
3391 	cs = gen12_emit_cmd_buf_wa(ce, cs);
3392 	cs = gen12_emit_restore_scratch(ce, cs);
3393 
3394 	return cs;
3395 }
3396 
3397 static u32 *
3398 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
3399 {
3400 	cs = gen12_emit_timestamp_wa(ce, cs);
3401 	cs = gen12_emit_restore_scratch(ce, cs);
3402 
3403 	return cs;
3404 }
3405 
3406 static inline u32 context_wa_bb_offset(const struct intel_context *ce)
3407 {
3408 	return PAGE_SIZE * ce->wa_bb_page;
3409 }
3410 
3411 static u32 *context_indirect_bb(const struct intel_context *ce)
3412 {
3413 	void *ptr;
3414 
3415 	GEM_BUG_ON(!ce->wa_bb_page);
3416 
3417 	ptr = ce->lrc_reg_state;
3418 	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
3419 	ptr += context_wa_bb_offset(ce);
3420 
3421 	return ptr;
3422 }
3423 
3424 static void
3425 setup_indirect_ctx_bb(const struct intel_context *ce,
3426 		      const struct intel_engine_cs *engine,
3427 		      u32 *(*emit)(const struct intel_context *, u32 *))
3428 {
3429 	u32 * const start = context_indirect_bb(ce);
3430 	u32 *cs;
3431 
3432 	cs = emit(ce, start);
3433 	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
3434 	while ((unsigned long)cs % CACHELINE_BYTES)
3435 		*cs++ = MI_NOOP;
3436 
3437 	lrc_ring_setup_indirect_ctx(ce->lrc_reg_state, engine,
3438 				    i915_ggtt_offset(ce->state) +
3439 				    context_wa_bb_offset(ce),
3440 				    (cs - start) * sizeof(*cs));
3441 }
3442 
3443 static void
3444 __execlists_update_reg_state(const struct intel_context *ce,
3445 			     const struct intel_engine_cs *engine,
3446 			     u32 head)
3447 {
3448 	struct intel_ring *ring = ce->ring;
3449 	u32 *regs = ce->lrc_reg_state;
3450 
3451 	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
3452 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
3453 
3454 	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
3455 	regs[CTX_RING_HEAD] = head;
3456 	regs[CTX_RING_TAIL] = ring->tail;
3457 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
3458 
3459 	/* RPCS */
3460 	if (engine->class == RENDER_CLASS) {
3461 		regs[CTX_R_PWR_CLK_STATE] =
3462 			intel_sseu_make_rpcs(engine->gt, &ce->sseu);
3463 
3464 		i915_oa_init_reg_state(ce, engine);
3465 	}
3466 
3467 	if (ce->wa_bb_page) {
3468 		u32 *(*fn)(const struct intel_context *ce, u32 *cs);
3469 
3470 		fn = gen12_emit_indirect_ctx_xcs;
3471 		if (ce->engine->class == RENDER_CLASS)
3472 			fn = gen12_emit_indirect_ctx_rcs;
3473 
3474 		/* Mutually exclusive wrt to global indirect bb */
3475 		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
3476 		setup_indirect_ctx_bb(ce, engine, fn);
3477 	}
3478 }
3479 
3480 static int
3481 __execlists_context_pin(struct intel_context *ce,
3482 			struct intel_engine_cs *engine)
3483 {
3484 	void *vaddr;
3485 
3486 	GEM_BUG_ON(!ce->state);
3487 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3488 
3489 	vaddr = i915_gem_object_pin_map(ce->state->obj,
3490 					i915_coherent_map_type(engine->i915) |
3491 					I915_MAP_OVERRIDE);
3492 	if (IS_ERR(vaddr))
3493 		return PTR_ERR(vaddr);
3494 
3495 	ce->lrc.lrca = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
3496 	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
3497 	__execlists_update_reg_state(ce, engine, ce->ring->tail);
3498 
3499 	return 0;
3500 }
3501 
3502 static int execlists_context_pin(struct intel_context *ce)
3503 {
3504 	return __execlists_context_pin(ce, ce->engine);
3505 }
3506 
3507 static int execlists_context_alloc(struct intel_context *ce)
3508 {
3509 	return __execlists_context_alloc(ce, ce->engine);
3510 }
3511 
3512 static void execlists_context_reset(struct intel_context *ce)
3513 {
3514 	CE_TRACE(ce, "reset\n");
3515 	GEM_BUG_ON(!intel_context_is_pinned(ce));
3516 
3517 	intel_ring_reset(ce->ring, ce->ring->emit);
3518 
3519 	/* Scrub away the garbage */
3520 	execlists_init_reg_state(ce->lrc_reg_state,
3521 				 ce, ce->engine, ce->ring, true);
3522 	__execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
3523 
3524 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
3525 }
3526 
3527 static const struct intel_context_ops execlists_context_ops = {
3528 	.alloc = execlists_context_alloc,
3529 
3530 	.pin = execlists_context_pin,
3531 	.unpin = execlists_context_unpin,
3532 
3533 	.enter = intel_context_enter_engine,
3534 	.exit = intel_context_exit_engine,
3535 
3536 	.reset = execlists_context_reset,
3537 	.destroy = execlists_context_destroy,
3538 };
3539 
3540 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
3541 {
3542 	u32 *cs;
3543 
3544 	GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq));
3545 	if (!i915_request_timeline(rq)->has_initial_breadcrumb)
3546 		return 0;
3547 
3548 	cs = intel_ring_begin(rq, 6);
3549 	if (IS_ERR(cs))
3550 		return PTR_ERR(cs);
3551 
3552 	/*
3553 	 * Check if we have been preempted before we even get started.
3554 	 *
3555 	 * After this point i915_request_started() reports true, even if
3556 	 * we get preempted and so are no longer running.
3557 	 */
3558 	*cs++ = MI_ARB_CHECK;
3559 	*cs++ = MI_NOOP;
3560 
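	/*
	 * Mark the request as started by advancing the timeline HWSP to
	 * seqno - 1; i915_request_started() compares against this value.
	 */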
3561 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
3562 	*cs++ = i915_request_timeline(rq)->hwsp_offset;
3563 	*cs++ = 0;
3564 	*cs++ = rq->fence.seqno - 1;
3565 
3566 	intel_ring_advance(rq, cs);
3567 
3568 	/* Record the updated position of the request's payload */
3569 	rq->infix = intel_ring_offset(rq, cs);
3570 
3571 	__set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags);
3572 
3573 	return 0;
3574 }
3575 
3576 static int emit_pdps(struct i915_request *rq)
3577 {
3578 	const struct intel_engine_cs * const engine = rq->engine;
3579 	struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->context->vm);
3580 	int err, i;
3581 	u32 *cs;
3582 
3583 	GEM_BUG_ON(intel_vgpu_active(rq->engine->i915));
3584 
3585 	/*
3586 	 * Beware ye of the dragons, this sequence is magic!
3587 	 *
3588 	 * Small changes to this sequence can cause anything from
3589 	 * GPU hangs to forcewake errors and machine lockups!
3590 	 */
3591 
3592 	/* Flush any residual operations from the context load */
3593 	err = engine->emit_flush(rq, EMIT_FLUSH);
3594 	if (err)
3595 		return err;
3596 
3597 	/* Magic required to prevent forcewake errors! */
3598 	err = engine->emit_flush(rq, EMIT_INVALIDATE);
3599 	if (err)
3600 		return err;
3601 
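	/* One LRI header, 4 dwords (two reg/value pairs) per PDP, and a NOOP */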
3602 	cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2);
3603 	if (IS_ERR(cs))
3604 		return PTR_ERR(cs);
3605 
3606 	/* Ensure the LRIs have landed before we invalidate & continue */
3607 	*cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
3608 	for (i = GEN8_3LVL_PDPES; i--; ) {
3609 		const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
3610 		u32 base = engine->mmio_base;
3611 
3612 		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i));
3613 		*cs++ = upper_32_bits(pd_daddr);
3614 		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i));
3615 		*cs++ = lower_32_bits(pd_daddr);
3616 	}
3617 	*cs++ = MI_NOOP;
3618 
3619 	intel_ring_advance(rq, cs);
3620 
3621 	return 0;
3622 }
3623 
3624 static int execlists_request_alloc(struct i915_request *request)
3625 {
3626 	int ret;
3627 
3628 	GEM_BUG_ON(!intel_context_is_pinned(request->context));
3629 
3630 	/*
3631 	 * Flush enough space to reduce the likelihood of waiting after
3632 	 * we start building the request - in which case we will just
3633 	 * have to repeat work.
3634 	 */
3635 	request->reserved_space += EXECLISTS_REQUEST_SIZE;
3636 
3637 	/*
3638 	 * Note that after this point, we have committed to using
3639 	 * this request as it is being used to both track the
3640 	 * state of engine initialisation and liveness of the
3641 	 * golden renderstate above. Think twice before you try
3642 	 * to cancel/unwind this request now.
3643 	 */
3644 
3645 	if (!i915_vm_is_4lvl(request->context->vm)) {
3646 		ret = emit_pdps(request);
3647 		if (ret)
3648 			return ret;
3649 	}
3650 
3651 	/* Unconditionally invalidate GPU caches and TLBs. */
3652 	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3653 	if (ret)
3654 		return ret;
3655 
3656 	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3657 	return 0;
3658 }
3659 
3660 /*
3661  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the
3662  * PIPE_CONTROL instruction. This is required for the flush to happen correctly,
3663  * but there is a slight complication as this is applied in a WA batch where the
3664  * values are only initialized once, so we cannot read the register value at the
3665  * beginning and reuse it later; hence we save its value to memory, upload a
3666  * constant value with bit21 set and then restore it with the saved value.
3667  * To simplify the WA, the constant is formed from the default value of this
3668  * register. This shouldn't be a problem because we only modify it for a short
3669  * period and this batch is non-preemptible. We could of course use additional
3670  * instructions that read the actual value of the register at that time and set
3671  * our bit of interest, but that makes the WA more complicated.
3672  *
3673  * This WA is also required for Gen9 so extracting as a function avoids
3674  * code duplication.
3675  */
3676 static u32 *
3677 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3678 {
3679 	/* NB no one else is allowed to scribble over scratch + 256! */
3680 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3681 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3682 	*batch++ = intel_gt_scratch_offset(engine->gt,
3683 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3684 	*batch++ = 0;
3685 
3686 	*batch++ = MI_LOAD_REGISTER_IMM(1);
3687 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3688 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3689 
3690 	batch = gen8_emit_pipe_control(batch,
3691 				       PIPE_CONTROL_CS_STALL |
3692 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
3693 				       0);
3694 
3695 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3696 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3697 	*batch++ = intel_gt_scratch_offset(engine->gt,
3698 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3699 	*batch++ = 0;
3700 
3701 	return batch;
3702 }
3703 
3704 /*
3705  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
3706  * initialized at the beginning and shared across all contexts, but this field
3707  * helps us to have multiple batches at different offsets and select them based
3708  * on some criteria. At the moment this batch always starts at the beginning of
3709  * the page and we don't have multiple wa_ctx batch buffers.
3710  *
3711  * The number of WAs applied is not known at the beginning; we use this field
3712  * to return the number of DWORDs written.
3713  *
3714  * Note that this batch does not contain MI_BATCH_BUFFER_END, so it adds
3715  * NOOPs as padding to make it cacheline aligned.
3716  * MI_BATCH_BUFFER_END will be added to the per-ctx batch and the two together
3717  * make a complete batch buffer.
3718  */
3719 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3720 {
3721 	/* WaDisableCtxRestoreArbitration:bdw,chv */
3722 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3723 
3724 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
3725 	if (IS_BROADWELL(engine->i915))
3726 		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3727 
3728 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
3729 	/* Actual scratch location is at 128 bytes offset */
3730 	batch = gen8_emit_pipe_control(batch,
3731 				       PIPE_CONTROL_FLUSH_L3 |
3732 				       PIPE_CONTROL_STORE_DATA_INDEX |
3733 				       PIPE_CONTROL_CS_STALL |
3734 				       PIPE_CONTROL_QW_WRITE,
3735 				       LRC_PPHWSP_SCRATCH_ADDR);
3736 
3737 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3738 
3739 	/* Pad to end of cacheline */
3740 	while ((unsigned long)batch % CACHELINE_BYTES)
3741 		*batch++ = MI_NOOP;
3742 
3743 	/*
3744 	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
3745 	 * execution depends on the length specified in terms of cache lines
3746 	 * in the register CTX_RCS_INDIRECT_CTX
3747 	 */
3748 
3749 	return batch;
3750 }
3751 
3752 struct lri {
3753 	i915_reg_t reg;
3754 	u32 value;
3755 };
3756 
3757 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
3758 {
3759 	GEM_BUG_ON(!count || count > 63);
3760 
3761 	*batch++ = MI_LOAD_REGISTER_IMM(count);
3762 	do {
3763 		*batch++ = i915_mmio_reg_offset(lri->reg);
3764 		*batch++ = lri->value;
3765 	} while (lri++, --count);
3766 	*batch++ = MI_NOOP;
3767 
3768 	return batch;
3769 }
3770 
3771 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3772 {
3773 	static const struct lri lri[] = {
3774 		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
3775 		{
3776 			COMMON_SLICE_CHICKEN2,
3777 			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
3778 				       0),
3779 		},
3780 
3781 		/* BSpec: 11391 */
3782 		{
3783 			FF_SLICE_CHICKEN,
3784 			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
3785 				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
3786 		},
3787 
3788 		/* BSpec: 11299 */
3789 		{
3790 			_3D_CHICKEN3,
3791 			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
3792 				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
3793 		}
3794 	};
3795 
3796 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3797 
3798 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
3799 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3800 
3801 	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
3802 	batch = gen8_emit_pipe_control(batch,
3803 				       PIPE_CONTROL_FLUSH_L3 |
3804 				       PIPE_CONTROL_STORE_DATA_INDEX |
3805 				       PIPE_CONTROL_CS_STALL |
3806 				       PIPE_CONTROL_QW_WRITE,
3807 				       LRC_PPHWSP_SCRATCH_ADDR);
3808 
3809 	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
3810 
3811 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
3812 	if (HAS_POOLED_EU(engine->i915)) {
3813 		/*
3814 		 * EU pool configuration is set up along with the golden context
3815 		 * during context initialization. This value depends on the
3816 		 * device type (2x6 or 3x6) and needs to be updated based
3817 		 * on which subslice is disabled, especially for 2x6
3818 		 * devices. However, it is safe to load the default 3x6
3819 		 * configuration instead of masking off the corresponding
3820 		 * bits because the HW ignores the bits of a disabled
3821 		 * subslice and drops down to the appropriate config. See
3822 		 * render_state_setup() in i915_gem_render_state.c for the
3823 		 * possible configurations; to avoid duplication they are
3824 		 * not repeated here.
3825 		 */
3826 		*batch++ = GEN9_MEDIA_POOL_STATE;
3827 		*batch++ = GEN9_MEDIA_POOL_ENABLE;
3828 		*batch++ = 0x00777000;
3829 		*batch++ = 0;
3830 		*batch++ = 0;
3831 		*batch++ = 0;
3832 	}
3833 
3834 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3835 
3836 	/* Pad to end of cacheline */
3837 	while ((unsigned long)batch % CACHELINE_BYTES)
3838 		*batch++ = MI_NOOP;
3839 
3840 	return batch;
3841 }
3842 
3843 static u32 *
3844 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3845 {
3846 	int i;
3847 
3848 	/*
3849 	 * WaPipeControlBefore3DStateSamplePattern: cnl
3850 	 *
3851 	 * Ensure the engine is idle prior to programming a
3852 	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
3853 	 */
3854 	batch = gen8_emit_pipe_control(batch,
3855 				       PIPE_CONTROL_CS_STALL,
3856 				       0);
3857 	/*
3858 	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
3859 	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
3860 	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
3861 	 * confusing. Since gen8_emit_pipe_control() already advances the
3862 	 * batch by 6 dwords, we advance the other 10 here, completing a
3863 	 * cacheline. It's not clear if the workaround requires this padding
3864 	 * before other commands, or if it's just the regular padding we would
3865 	 * already have for the workaround bb, so leave it here for now.
3866 	 */
3867 	for (i = 0; i < 10; i++)
3868 		*batch++ = MI_NOOP;
3869 
3870 	/* Pad to end of cacheline */
3871 	while ((unsigned long)batch % CACHELINE_BYTES)
3872 		*batch++ = MI_NOOP;
3873 
3874 	return batch;
3875 }
3876 
3877 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
3878 
3879 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
3880 {
3881 	struct drm_i915_gem_object *obj;
3882 	struct i915_vma *vma;
3883 	int err;
3884 
3885 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
3886 	if (IS_ERR(obj))
3887 		return PTR_ERR(obj);
3888 
3889 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
3890 	if (IS_ERR(vma)) {
3891 		err = PTR_ERR(vma);
3892 		goto err;
3893 	}
3894 
3895 	err = i915_ggtt_pin(vma, 0, PIN_HIGH);
3896 	if (err)
3897 		goto err;
3898 
3899 	engine->wa_ctx.vma = vma;
3900 	return 0;
3901 
3902 err:
3903 	i915_gem_object_put(obj);
3904 	return err;
3905 }
3906 
3907 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
3908 {
3909 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
3910 }
3911 
3912 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
3913 
3914 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
3915 {
3916 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3917 	struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
3918 					    &wa_ctx->per_ctx };
3919 	wa_bb_func_t wa_bb_fn[2];
3920 	void *batch, *batch_ptr;
3921 	unsigned int i;
3922 	int ret;
3923 
3924 	if (engine->class != RENDER_CLASS)
3925 		return 0;
3926 
3927 	switch (INTEL_GEN(engine->i915)) {
3928 	case 12:
3929 	case 11:
3930 		return 0;
3931 	case 10:
3932 		wa_bb_fn[0] = gen10_init_indirectctx_bb;
3933 		wa_bb_fn[1] = NULL;
3934 		break;
3935 	case 9:
3936 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
3937 		wa_bb_fn[1] = NULL;
3938 		break;
3939 	case 8:
3940 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
3941 		wa_bb_fn[1] = NULL;
3942 		break;
3943 	default:
3944 		MISSING_CASE(INTEL_GEN(engine->i915));
3945 		return 0;
3946 	}
3947 
3948 	ret = lrc_setup_wa_ctx(engine);
3949 	if (ret) {
3950 		drm_dbg(&engine->i915->drm,
3951 			"Failed to setup context WA page: %d\n", ret);
3952 		return ret;
3953 	}
3954 
3955 	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
	if (IS_ERR(batch)) {
		lrc_destroy_wa_ctx(engine);
		return PTR_ERR(batch);
	}
3956 
3957 	/*
3958 	 * Emit the two workaround batch buffers, recording the offset from the
3959 	 * start of the workaround batch buffer object for each and their
3960 	 * respective sizes.
3961 	 */
3962 	batch_ptr = batch;
3963 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
3964 		wa_bb[i]->offset = batch_ptr - batch;
3965 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
3966 						  CACHELINE_BYTES))) {
3967 			ret = -EINVAL;
3968 			break;
3969 		}
3970 		if (wa_bb_fn[i])
3971 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
3972 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
3973 	}
3974 	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
3975 
3976 	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
3977 	__i915_gem_object_release_map(wa_ctx->vma->obj);
3978 	if (ret)
3979 		lrc_destroy_wa_ctx(engine);
3980 
3981 	return ret;
3982 }
3983 
3984 static void reset_csb_pointers(struct intel_engine_cs *engine)
3985 {
3986 	struct intel_engine_execlists * const execlists = &engine->execlists;
3987 	const unsigned int reset_value = execlists->csb_size - 1;
3988 
3989 	ring_set_paused(engine, 0);
3990 
3991 	/*
3992 	 * Sometimes Icelake forgets to reset its pointers on a GPU reset.
3993 	 * Bludgeon them with a mmio update to be sure.
3994 	 */
3995 	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
3996 		     0xffff << 16 | reset_value << 8 | reset_value);
3997 	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
3998 
3999 	/*
4000 	 * After a reset, the HW starts writing into CSB entry [0]. We
4001 	 * therefore have to set our HEAD pointer back one entry so that
4002 	 * the *first* entry we check is entry 0. To complicate this further,
4003 	 * as we don't wait for the first interrupt after reset, we have to
4004 	 * fake the HW write to point back to the last entry so that our
4005 	 * inline comparison of our cached head position against the last HW
4006 	 * write works even before the first interrupt.
4007 	 */
4008 	execlists->csb_head = reset_value;
4009 	WRITE_ONCE(*execlists->csb_write, reset_value);
4010 	wmb(); /* Make sure this is visible to HW (paranoia?) */
4011 
4012 	invalidate_csb_entries(&execlists->csb_status[0],
4013 			       &execlists->csb_status[reset_value]);
4014 
4015 	/* Once more for luck and our trusty paranoia */
4016 	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
4017 		     0xffff << 16 | reset_value << 8 | reset_value);
4018 	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
4019 
4020 	GEM_BUG_ON(READ_ONCE(*execlists->csb_write) != reset_value);
4021 }
4022 
4023 static void execlists_sanitize(struct intel_engine_cs *engine)
4024 {
4025 	/*
4026 	 * Poison residual state on resume, in case the suspend didn't!
4027 	 *
4028 	 * We have to assume that across suspend/resume (or other loss
4029 	 * of control) the contents of our pinned buffers have been
4030 	 * lost, replaced by garbage. Since this doesn't always happen,
4031 	 * let's poison such state so that we more quickly spot when
4032 	 * we falsely assume it has been preserved.
4033 	 */
4034 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4035 		memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE);
4036 
4037 	reset_csb_pointers(engine);
4038 
4039 	/*
4040 	 * The kernel_context HWSP is stored in the status_page. As above,
4041 	 * that may be lost on resume/initialisation, and so we need to
4042 	 * reset the value in the HWSP.
4043 	 */
4044 	intel_timeline_reset_seqno(engine->kernel_context->timeline);
4045 
4046 	/* And scrub the dirty cachelines for the HWSP */
4047 	clflush_cache_range(engine->status_page.addr, PAGE_SIZE);
4048 }
4049 
4050 static void enable_error_interrupt(struct intel_engine_cs *engine)
4051 {
4052 	u32 status;
4053 
4054 	engine->execlists.error_interrupt = 0;
4055 	ENGINE_WRITE(engine, RING_EMR, ~0u);
4056 	ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */
4057 
4058 	status = ENGINE_READ(engine, RING_ESR);
4059 	if (unlikely(status)) {
4060 		drm_err(&engine->i915->drm,
4061 			"engine '%s' resumed still in error: %08x\n",
4062 			engine->name, status);
4063 		__intel_gt_reset(engine->gt, engine->mask);
4064 	}
4065 
4066 	/*
4067 	 * On current gen8+, we have 2 signals to play with
4068 	 *
4069 	 * - I915_ERROR_INSTRUCTION (bit 0)
4070 	 *
4071 	 *    Generate an error if the command parser encounters an invalid
4072 	 *    instruction
4073 	 *
4074 	 *    This is a fatal error.
4075 	 *
4076 	 * - CP_PRIV (bit 2)
4077 	 *
4078 	 *    Generate an error on privilege violation (where the CP replaces
4079 	 *    the instruction with a no-op). This also fires for writes into
4080 	 *    read-only scratch pages.
4081 	 *
4082 	 *    This is a non-fatal error, parsing continues.
4083 	 *
4084 	 * * there are a few others defined for odd HW that we do not use
4085 	 *
4086 	 * Since CP_PRIV fires for cases where we have chosen to ignore the
4087 	 * error (as the HW is validating and suppressing the mistakes), we
4088 	 * only unmask the instruction error bit.
4089 	 */
4090 	ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION);
4091 }
4092 
4093 static void enable_execlists(struct intel_engine_cs *engine)
4094 {
4095 	u32 mode;
4096 
4097 	assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
4098 
4099 	intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
4100 
4101 	if (INTEL_GEN(engine->i915) >= 11)
4102 		mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
4103 	else
4104 		mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
4105 	ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
4106 
4107 	ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
4108 
4109 	ENGINE_WRITE_FW(engine,
4110 			RING_HWS_PGA,
4111 			i915_ggtt_offset(engine->status_page.vma));
4112 	ENGINE_POSTING_READ(engine, RING_HWS_PGA);
4113 
4114 	enable_error_interrupt(engine);
4115 
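	/* All SW context ID tags start off available for allocation */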
4116 	engine->context_tag = GENMASK(BITS_PER_LONG - 2, 0);
4117 }
4118 
4119 static bool unexpected_starting_state(struct intel_engine_cs *engine)
4120 {
4121 	bool unexpected = false;
4122 
4123 	if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
4124 		drm_dbg(&engine->i915->drm,
4125 			"STOP_RING still set in RING_MI_MODE\n");
4126 		unexpected = true;
4127 	}
4128 
4129 	return unexpected;
4130 }
4131 
4132 static int execlists_resume(struct intel_engine_cs *engine)
4133 {
4134 	intel_mocs_init_engine(engine);
4135 
4136 	intel_engine_reset_breadcrumbs(engine);
4137 
4138 	if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
4139 		struct drm_printer p = drm_debug_printer(__func__);
4140 
4141 		intel_engine_dump(engine, &p, NULL);
4142 	}
4143 
4144 	enable_execlists(engine);
4145 
4146 	return 0;
4147 }
4148 
4149 static void execlists_reset_prepare(struct intel_engine_cs *engine)
4150 {
4151 	struct intel_engine_execlists * const execlists = &engine->execlists;
4152 	unsigned long flags;
4153 
4154 	ENGINE_TRACE(engine, "depth<-%d\n",
4155 		     atomic_read(&execlists->tasklet.count));
4156 
4157 	/*
4158 	 * Prevent request submission to the hardware until we have
4159 	 * completed the reset in i915_gem_reset_finish(). If a request
4160 	 * is completed by one engine, it may then queue a request
4161 	 * to a second via its execlists->tasklet *just* as we are
4162 	 * calling engine->resume() and also writing the ELSP.
4163 	 * Turning off the execlists->tasklet until the reset is over
4164 	 * prevents the race.
4165 	 */
4166 	__tasklet_disable_sync_once(&execlists->tasklet);
4167 	GEM_BUG_ON(!reset_in_progress(execlists));
4168 
4169 	/* And flush any current direct submission. */
4170 	spin_lock_irqsave(&engine->active.lock, flags);
4171 	spin_unlock_irqrestore(&engine->active.lock, flags);
4172 
4173 	/*
4174 	 * We stop the engines; otherwise we might get a failed reset and
4175 	 * a dead gpu (on elk). Even a gpu as modern as kbl can suffer a
4176 	 * system hang if a batchbuffer is progressing when the reset is
4177 	 * issued, regardless of the READY_TO_RESET ack.
4178 	 * Thus we assume it is best to stop the engines on all gens
4179 	 * where we have a gpu reset.
4180 	 *
4181 	 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
4182 	 *
4183 	 * FIXME: Wa for more modern gens needs to be validated
4184 	 */
4185 	ring_set_paused(engine, 1);
4186 	intel_engine_stop_cs(engine);
4187 
4188 	engine->execlists.reset_ccid = active_ccid(engine);
4189 }
4190 
4191 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
4192 {
4193 	int x;
4194 
4195 	x = lrc_ring_mi_mode(engine);
4196 	if (x != -1) {
4197 		regs[x + 1] &= ~STOP_RING;
4198 		regs[x + 1] |= STOP_RING << 16;
4199 	}
4200 }
4201 
4202 static void __execlists_reset_reg_state(const struct intel_context *ce,
4203 					const struct intel_engine_cs *engine)
4204 {
4205 	u32 *regs = ce->lrc_reg_state;
4206 
4207 	__reset_stop_ring(regs, engine);
4208 }
4209 
4210 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
4211 {
4212 	struct intel_engine_execlists * const execlists = &engine->execlists;
4213 	struct intel_context *ce;
4214 	struct i915_request *rq;
4215 	u32 head;
4216 
4217 	mb(); /* paranoia: read the CSB pointers from after the reset */
4218 	clflush(execlists->csb_write);
4219 	mb();
4220 
4221 	process_csb(engine); /* drain preemption events */
4222 
4223 	/* Following the reset, we need to reload the CSB read/write pointers */
4224 	reset_csb_pointers(engine);
4225 
4226 	/*
4227 	 * Save the currently executing context; even if we completed
4228 	 * its request, it was still running at the time of the
4229 	 * reset and will have been clobbered.
4230 	 */
4231 	rq = active_context(engine, engine->execlists.reset_ccid);
4232 	if (!rq)
4233 		goto unwind;
4234 
4235 	ce = rq->context;
4236 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
4237 
4238 	if (i915_request_completed(rq)) {
4239 		/* Idle context; tidy up the ring so we can restart afresh */
4240 		head = intel_ring_wrap(ce->ring, rq->tail);
4241 		goto out_replay;
4242 	}
4243 
4244 	/* We still have requests in-flight; the engine should be active */
4245 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
4246 
4247 	/* Context has requests still in-flight; it should not be idle! */
4248 	GEM_BUG_ON(i915_active_is_idle(&ce->active));
4249 
4250 	rq = active_request(ce->timeline, rq);
4251 	head = intel_ring_wrap(ce->ring, rq->head);
4252 	GEM_BUG_ON(head == ce->ring->tail);
4253 
4254 	/*
4255 	 * If this request hasn't started yet, e.g. it is waiting on a
4256 	 * semaphore, we need to avoid skipping the request or else we
4257 	 * break the signaling chain. However, if the context is corrupt
4258 	 * the request will not restart and we will be stuck with a wedged
4259 	 * device. It is quite often the case that if we issue a reset
4260 	 * while the GPU is loading the context image, that the context
4261 	 * while the GPU is loading the context image, the context
4262 	 *
4263 	 * Otherwise, if we have not started yet, the request should replay
4264 	 * perfectly and we do not need to flag the result as being erroneous.
4265 	 */
4266 	if (!i915_request_started(rq))
4267 		goto out_replay;
4268 
4269 	/*
4270 	 * If the request was innocent, we leave the request in the ELSP
4271 	 * and will try to replay it on restarting. The context image may
4272 	 * have been corrupted by the reset, in which case we may have
4273 	 * to service a new GPU hang, but more likely we can continue on
4274 	 * without impact.
4275 	 *
4276 	 * If the request was guilty, we presume the context is corrupt
4277 	 * and have to at least restore the RING register in the context
4278 	 * image back to the expected values to skip over the guilty request.
4279 	 */
4280 	__i915_request_reset(rq, stalled);
4281 
4282 	/*
4283 	 * We want a simple context + ring to execute the breadcrumb update.
4284 	 * We cannot rely on the context being intact across the GPU hang,
4285 	 * so clear it and rebuild just what we need for the breadcrumb.
4286 	 * All pending requests for this context will be zapped, and any
4287 	 * future request will be after userspace has had the opportunity
4288 	 * to recreate its own state.
4289 	 */
4290 out_replay:
4291 	ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
4292 		     head, ce->ring->tail);
4293 	__execlists_reset_reg_state(ce, engine);
4294 	__execlists_update_reg_state(ce, engine, head);
4295 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
4296 
4297 unwind:
4298 	/* Push back any incomplete requests for replay after the reset. */
4299 	cancel_port_requests(execlists);
4300 	__unwind_incomplete_requests(engine);
4301 }
4302 
4303 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
4304 {
4305 	unsigned long flags;
4306 
4307 	ENGINE_TRACE(engine, "\n");
4308 
4309 	spin_lock_irqsave(&engine->active.lock, flags);
4310 
4311 	__execlists_reset(engine, stalled);
4312 
4313 	spin_unlock_irqrestore(&engine->active.lock, flags);
4314 }
4315 
4316 static void nop_submission_tasklet(unsigned long data)
4317 {
4318 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
4319 
4320 	/* The driver is wedged; don't process any more events. */
4321 	WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN);
4322 }
4323 
4324 static void execlists_reset_cancel(struct intel_engine_cs *engine)
4325 {
4326 	struct intel_engine_execlists * const execlists = &engine->execlists;
4327 	struct i915_request *rq, *rn;
4328 	struct rb_node *rb;
4329 	unsigned long flags;
4330 
4331 	ENGINE_TRACE(engine, "\n");
4332 
4333 	/*
4334 	 * Before we call engine->cancel_requests(), we should have exclusive
4335 	 * access to the submission state. This is arranged for us by the
4336 	 * caller disabling the interrupt generation, the tasklet and other
4337 	 * threads that may then access the same state, giving us a free hand
4338 	 * to reset state. However, we still need to let lockdep be aware that
4339 	 * we know this state may be accessed in hardirq context, so we
4340 	 * disable the irq around this manipulation and we want to keep
4341 	 * the spinlock focused on its duties and not accidentally conflate
4342 	 * coverage to the submission's irq state. (Similarly, although we
4343 	 * shouldn't need to disable irq around the manipulation of the
4344 	 * submission's irq state, we also wish to remind ourselves that
4345 	 * it is irq state.)
4346 	 */
4347 	spin_lock_irqsave(&engine->active.lock, flags);
4348 
4349 	__execlists_reset(engine, true);
4350 
4351 	/* Mark all executing requests as skipped. */
4352 	list_for_each_entry(rq, &engine->active.requests, sched.link)
4353 		mark_eio(rq);
4354 
4355 	/* Flush the queued requests to the timeline list (for retiring). */
4356 	while ((rb = rb_first_cached(&execlists->queue))) {
4357 		struct i915_priolist *p = to_priolist(rb);
4358 		int i;
4359 
4360 		priolist_for_each_request_consume(rq, rn, p, i) {
4361 			mark_eio(rq);
4362 			__i915_request_submit(rq);
4363 		}
4364 
4365 		rb_erase_cached(&p->node, &execlists->queue);
4366 		i915_priolist_free(p);
4367 	}
4368 
4369 	/* On-hold requests will be flushed to timeline upon their release */
4370 	list_for_each_entry(rq, &engine->active.hold, sched.link)
4371 		mark_eio(rq);
4372 
4373 	/* Cancel all attached virtual engines */
4374 	while ((rb = rb_first_cached(&execlists->virtual))) {
4375 		struct virtual_engine *ve =
4376 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
4377 
4378 		rb_erase_cached(rb, &execlists->virtual);
4379 		RB_CLEAR_NODE(rb);
4380 
4381 		spin_lock(&ve->base.active.lock);
4382 		rq = fetch_and_zero(&ve->request);
4383 		if (rq) {
4384 			mark_eio(rq);
4385 
4386 			rq->engine = engine;
4387 			__i915_request_submit(rq);
4388 			i915_request_put(rq);
4389 
4390 			ve->base.execlists.queue_priority_hint = INT_MIN;
4391 		}
4392 		spin_unlock(&ve->base.active.lock);
4393 	}
4394 
4395 	/* Remaining _unready_ requests will be nop'ed when submitted */
4396 
4397 	execlists->queue_priority_hint = INT_MIN;
4398 	execlists->queue = RB_ROOT_CACHED;
4399 
4400 	GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
4401 	execlists->tasklet.func = nop_submission_tasklet;
4402 
4403 	spin_unlock_irqrestore(&engine->active.lock, flags);
4404 }
4405 
4406 static void execlists_reset_finish(struct intel_engine_cs *engine)
4407 {
4408 	struct intel_engine_execlists * const execlists = &engine->execlists;
4409 
4410 	/*
4411 	 * After a GPU reset, we may have requests to replay. Do so now while
4412 	 * we still have the forcewake to be sure that the GPU is not allowed
4413 	 * to sleep before we restart and reload a context.
4414 	 */
4415 	GEM_BUG_ON(!reset_in_progress(execlists));
4416 	if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
4417 		execlists->tasklet.func(execlists->tasklet.data);
4418 
4419 	if (__tasklet_enable(&execlists->tasklet))
4420 		/* And kick in case we missed a new request submission. */
4421 		tasklet_hi_schedule(&execlists->tasklet);
4422 	ENGINE_TRACE(engine, "depth->%d\n",
4423 		     atomic_read(&execlists->tasklet.count));
4424 }
4425 
4426 static int gen8_emit_bb_start_noarb(struct i915_request *rq,
4427 				    u64 offset, u32 len,
4428 				    const unsigned int flags)
4429 {
4430 	u32 *cs;
4431 
4432 	cs = intel_ring_begin(rq, 4);
4433 	if (IS_ERR(cs))
4434 		return PTR_ERR(cs);
4435 
4436 	/*
4437 	 * WaDisableCtxRestoreArbitration:bdw,chv
4438 	 *
4439 	 * We don't need to perform MI_ARB_ENABLE as often as we do (in
4440 	 * particular on all the gens that do not need the w/a at all!); if we
4441 	 * took care to make sure that on every switch into this context
4442 	 * (both ordinary and for preemption) arbitration was enabled,
4443 	 * we would be fine. However, for gen8 there is another w/a that
4444 	 * requires us to not preempt inside GPGPU execution, so we keep
4445 	 * arbitration disabled for gen8 batches. Arbitration will be
4446 	 * re-enabled before we close the request
4447 	 * (engine->emit_fini_breadcrumb).
4448 	 */
4449 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4450 
4451 	/* FIXME(BDW+): Address space and security selectors. */
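	/* BIT(8) selects the PPGTT address space; secure batches execute from the GGTT */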
4452 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
4453 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4454 	*cs++ = lower_32_bits(offset);
4455 	*cs++ = upper_32_bits(offset);
4456 
4457 	intel_ring_advance(rq, cs);
4458 
4459 	return 0;
4460 }
4461 
4462 static int gen8_emit_bb_start(struct i915_request *rq,
4463 			      u64 offset, u32 len,
4464 			      const unsigned int flags)
4465 {
4466 	u32 *cs;
4467 
4468 	cs = intel_ring_begin(rq, 6);
4469 	if (IS_ERR(cs))
4470 		return PTR_ERR(cs);
4471 
4472 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4473 
4474 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
4475 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4476 	*cs++ = lower_32_bits(offset);
4477 	*cs++ = upper_32_bits(offset);
4478 
4479 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4480 	*cs++ = MI_NOOP;
4481 
4482 	intel_ring_advance(rq, cs);
4483 
4484 	return 0;
4485 }
4486 
4487 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
4488 {
4489 	ENGINE_WRITE(engine, RING_IMR,
4490 		     ~(engine->irq_enable_mask | engine->irq_keep_mask));
4491 	ENGINE_POSTING_READ(engine, RING_IMR);
4492 }
4493 
4494 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
4495 {
4496 	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
4497 }
4498 
4499 static int gen8_emit_flush(struct i915_request *request, u32 mode)
4500 {
4501 	u32 cmd, *cs;
4502 
4503 	cs = intel_ring_begin(request, 4);
4504 	if (IS_ERR(cs))
4505 		return PTR_ERR(cs);
4506 
4507 	cmd = MI_FLUSH_DW + 1;
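	/* +1 dword in the length field to cover the upper half of the 64b address */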
4508 
4509 	/* We always require a command barrier so that subsequent
4510 	 * commands, such as breadcrumb interrupts, are strictly ordered
4511 	 * wrt the contents of the write cache being flushed to memory
4512 	 * (and thus being coherent from the CPU).
4513 	 */
4514 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4515 
4516 	if (mode & EMIT_INVALIDATE) {
4517 		cmd |= MI_INVALIDATE_TLB;
4518 		if (request->engine->class == VIDEO_DECODE_CLASS)
4519 			cmd |= MI_INVALIDATE_BSD;
4520 	}
4521 
4522 	*cs++ = cmd;
4523 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4524 	*cs++ = 0; /* upper addr */
4525 	*cs++ = 0; /* value */
4526 	intel_ring_advance(request, cs);
4527 
4528 	return 0;
4529 }
4530 
4531 static int gen8_emit_flush_render(struct i915_request *request,
4532 				  u32 mode)
4533 {
4534 	bool vf_flush_wa = false, dc_flush_wa = false;
4535 	u32 *cs, flags = 0;
4536 	int len;
4537 
4538 	flags |= PIPE_CONTROL_CS_STALL;
4539 
4540 	if (mode & EMIT_FLUSH) {
4541 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4542 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4543 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4544 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4545 	}
4546 
4547 	if (mode & EMIT_INVALIDATE) {
4548 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4549 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4550 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4551 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4552 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4553 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4554 		flags |= PIPE_CONTROL_QW_WRITE;
4555 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4556 
4557 		/*
4558 		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
4559 		 * pipe control.
4560 		 */
4561 		if (IS_GEN(request->engine->i915, 9))
4562 			vf_flush_wa = true;
4563 
4564 		/* WaForGAMHang:kbl */
4565 		if (IS_KBL_REVID(request->engine->i915, 0, KBL_REVID_B0))
4566 			dc_flush_wa = true;
4567 	}
4568 
4569 	len = 6;
4570 
4571 	if (vf_flush_wa)
4572 		len += 6;
4573 
4574 	if (dc_flush_wa)
4575 		len += 12;
4576 
4577 	cs = intel_ring_begin(request, len);
4578 	if (IS_ERR(cs))
4579 		return PTR_ERR(cs);
4580 
4581 	if (vf_flush_wa)
4582 		cs = gen8_emit_pipe_control(cs, 0, 0);
4583 
4584 	if (dc_flush_wa)
4585 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
4586 					    0);
4587 
4588 	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4589 
4590 	if (dc_flush_wa)
4591 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
4592 
4593 	intel_ring_advance(request, cs);
4594 
4595 	return 0;
4596 }
4597 
4598 static int gen11_emit_flush_render(struct i915_request *request,
4599 				   u32 mode)
4600 {
4601 	if (mode & EMIT_FLUSH) {
4602 		u32 *cs;
4603 		u32 flags = 0;
4604 
4605 		flags |= PIPE_CONTROL_CS_STALL;
4606 
4607 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4608 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4609 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4610 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4611 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4612 		flags |= PIPE_CONTROL_QW_WRITE;
4613 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4614 
4615 		cs = intel_ring_begin(request, 6);
4616 		if (IS_ERR(cs))
4617 			return PTR_ERR(cs);
4618 
4619 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4620 		intel_ring_advance(request, cs);
4621 	}
4622 
4623 	if (mode & EMIT_INVALIDATE) {
4624 		u32 *cs;
4625 		u32 flags = 0;
4626 
4627 		flags |= PIPE_CONTROL_CS_STALL;
4628 
4629 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4630 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4631 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4632 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4633 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4634 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4635 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4636 		flags |= PIPE_CONTROL_QW_WRITE;
4637 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4638 
4639 		cs = intel_ring_begin(request, 6);
4640 		if (IS_ERR(cs))
4641 			return PTR_ERR(cs);
4642 
4643 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4644 		intel_ring_advance(request, cs);
4645 	}
4646 
4647 	return 0;
4648 }
4649 
4650 static u32 preparser_disable(bool state)
4651 {
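	/*
	 * Gen12 extends MI_ARB_CHECK with a pre-fetch disable: bit 8 is the
	 * mask enable and bit 0 the pre-fetch disable value itself.
	 */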
4652 	return MI_ARB_CHECK | 1 << 8 | state;
4653 }
4654 
4655 static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)
4656 {
4657 	static const i915_reg_t vd[] = {
4658 		GEN12_VD0_AUX_NV,
4659 		GEN12_VD1_AUX_NV,
4660 		GEN12_VD2_AUX_NV,
4661 		GEN12_VD3_AUX_NV,
4662 	};
4663 
4664 	static const i915_reg_t ve[] = {
4665 		GEN12_VE0_AUX_NV,
4666 		GEN12_VE1_AUX_NV,
4667 	};
4668 
4669 	if (engine->class == VIDEO_DECODE_CLASS)
4670 		return vd[engine->instance];
4671 
4672 	if (engine->class == VIDEO_ENHANCEMENT_CLASS)
4673 		return ve[engine->instance];
4674 
4675 	GEM_BUG_ON("unknown aux_inv_reg\n");
4676 
4677 	return INVALID_MMIO_REG;
4678 }
4679 
4680 static u32 *
4681 gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)
4682 {
4683 	*cs++ = MI_LOAD_REGISTER_IMM(1);
4684 	*cs++ = i915_mmio_reg_offset(inv_reg);
4685 	*cs++ = AUX_INV;
4686 	*cs++ = MI_NOOP;
4687 
4688 	return cs;
4689 }
4690 
4691 static int gen12_emit_flush_render(struct i915_request *request,
4692 				   u32 mode)
4693 {
4694 	if (mode & EMIT_FLUSH) {
4695 		u32 flags = 0;
4696 		u32 *cs;
4697 
4698 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4699 		flags |= PIPE_CONTROL_FLUSH_L3;
4700 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4701 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4702 		/* Wa_1409600907:tgl */
4703 		flags |= PIPE_CONTROL_DEPTH_STALL;
4704 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4705 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4706 
4707 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4708 		flags |= PIPE_CONTROL_QW_WRITE;
4709 
4710 		flags |= PIPE_CONTROL_CS_STALL;
4711 
4712 		cs = intel_ring_begin(request, 6);
4713 		if (IS_ERR(cs))
4714 			return PTR_ERR(cs);
4715 
4716 		cs = gen12_emit_pipe_control(cs,
4717 					     PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
4718 					     flags, LRC_PPHWSP_SCRATCH_ADDR);
4719 		intel_ring_advance(request, cs);
4720 	}
4721 
4722 	if (mode & EMIT_INVALIDATE) {
4723 		u32 flags = 0;
4724 		u32 *cs;
4725 
4726 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4727 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4728 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4729 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4730 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4731 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4732 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4733 
4734 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4735 		flags |= PIPE_CONTROL_QW_WRITE;
4736 
4737 		flags |= PIPE_CONTROL_CS_STALL;
4738 
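		/* 8 dwords for the pre-parser toggles + PIPE_CONTROL, 4 for the aux-table LRI */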
4739 		cs = intel_ring_begin(request, 8 + 4);
4740 		if (IS_ERR(cs))
4741 			return PTR_ERR(cs);
4742 
4743 		/*
4744 		 * Prevent the pre-parser from skipping past the TLB
4745 		 * invalidate and loading a stale page for the batch
4746 		 * buffer / request payload.
4747 		 */
4748 		*cs++ = preparser_disable(true);
4749 
4750 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4751 
4752 		/* hsdes: 1809175790 */
4753 		cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs);
4754 
4755 		*cs++ = preparser_disable(false);
4756 		intel_ring_advance(request, cs);
4757 	}
4758 
4759 	return 0;
4760 }
4761 
4762 static int gen12_emit_flush(struct i915_request *request, u32 mode)
4763 {
4764 	intel_engine_mask_t aux_inv = 0;
4765 	u32 cmd, *cs;
4766 
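	/* Aux-table invalidation is not emitted for the blitter, only for VD/VE */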
4767 	if (mode & EMIT_INVALIDATE)
4768 		aux_inv = request->engine->mask & ~BIT(BCS0);
4769 
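	/* 4 dwords of MI_FLUSH_DW, plus an LRI header, a reg/value pair per engine and a NOOP */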
4770 	cs = intel_ring_begin(request,
4771 			      4 + (aux_inv ? 2 * hweight8(aux_inv) + 2 : 0));
4772 	if (IS_ERR(cs))
4773 		return PTR_ERR(cs);
4774 
4775 	cmd = MI_FLUSH_DW + 1;
4776 
4777 	/* We always require a command barrier so that subsequent
4778 	 * commands, such as breadcrumb interrupts, are strictly ordered
4779 	 * wrt the contents of the write cache being flushed to memory
4780 	 * (and thus being coherent from the CPU).
4781 	 */
4782 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4783 
4784 	if (mode & EMIT_INVALIDATE) {
4785 		cmd |= MI_INVALIDATE_TLB;
4786 		if (request->engine->class == VIDEO_DECODE_CLASS)
4787 			cmd |= MI_INVALIDATE_BSD;
4788 	}
4789 
4790 	*cs++ = cmd;
4791 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4792 	*cs++ = 0; /* upper addr */
4793 	*cs++ = 0; /* value */
4794 
4795 	if (aux_inv) { /* hsdes: 1809175790 */
4796 		struct intel_engine_cs *engine;
4797 		unsigned int tmp;
4798 
4799 		*cs++ = MI_LOAD_REGISTER_IMM(hweight8(aux_inv));
4800 		for_each_engine_masked(engine, request->engine->gt,
4801 				       aux_inv, tmp) {
4802 			*cs++ = i915_mmio_reg_offset(aux_inv_reg(engine));
4803 			*cs++ = AUX_INV;
4804 		}
4805 		*cs++ = MI_NOOP;
4806 	}
4807 	intel_ring_advance(request, cs);
4808 
4809 	return 0;
4810 }
4811 
4812 static void assert_request_valid(struct i915_request *rq)
4813 {
4814 	struct intel_ring *ring __maybe_unused = rq->ring;
4815 
4816 	/* Can we unwind this request without appearing to go forwards? */
4817 	GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0);
4818 }
4819 
4820 /*
4821  * Reserve space for 2 NOOPs at the end of each request to be
4822  * used as a workaround for not being allowed to do lite
4823  * restore with HEAD==TAIL (WaIdleLiteRestore).
4824  */
4825 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4826 {
4827 	/* Ensure there's always at least one preemption point per-request. */
4828 	*cs++ = MI_ARB_CHECK;
4829 	*cs++ = MI_NOOP;
4830 	request->wa_tail = intel_ring_offset(request, cs);
4831 
4832 	/* Check that entire request is less than half the ring */
4833 	assert_request_valid(request);
4834 
4835 	return cs;
4836 }
4837 
4838 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
4839 {
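	/*
	 * Poll the preemption semaphore in the HWSP: ring_set_paused() arms it
	 * so that completed requests stall here while a preemption is resolved.
	 */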
4840 	*cs++ = MI_SEMAPHORE_WAIT |
4841 		MI_SEMAPHORE_GLOBAL_GTT |
4842 		MI_SEMAPHORE_POLL |
4843 		MI_SEMAPHORE_SAD_EQ_SDD;
4844 	*cs++ = 0;
4845 	*cs++ = intel_hws_preempt_address(request->engine);
4846 	*cs++ = 0;
4847 
4848 	return cs;
4849 }
4850 
4851 static __always_inline u32*
4852 gen8_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
4853 {
4854 	*cs++ = MI_USER_INTERRUPT;
4855 
4856 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4857 	if (intel_engine_has_semaphores(request->engine))
4858 		cs = emit_preempt_busywait(request, cs);
4859 
4860 	request->tail = intel_ring_offset(request, cs);
4861 	assert_ring_tail_valid(request->ring, request->tail);
4862 
4863 	return gen8_emit_wa_tail(request, cs);
4864 }
4865 
4866 static u32 *emit_xcs_breadcrumb(struct i915_request *request, u32 *cs)
4867 {
4868 	u32 addr = i915_request_active_timeline(request)->hwsp_offset;
4869 
4870 	return gen8_emit_ggtt_write(cs, request->fence.seqno, addr, 0);
4871 }
4872 
4873 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
4874 {
4875 	return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
4876 }
4877 
4878 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4879 {
4880 	cs = gen8_emit_pipe_control(cs,
4881 				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4882 				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4883 				    PIPE_CONTROL_DC_FLUSH_ENABLE,
4884 				    0);
4885 
4886 	/* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
4887 	cs = gen8_emit_ggtt_write_rcs(cs,
4888 				      request->fence.seqno,
4889 				      i915_request_active_timeline(request)->hwsp_offset,
4890 				      PIPE_CONTROL_FLUSH_ENABLE |
4891 				      PIPE_CONTROL_CS_STALL);
4892 
4893 	return gen8_emit_fini_breadcrumb_tail(request, cs);
4894 }
4895 
4896 static u32 *
4897 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4898 {
4899 	cs = gen8_emit_ggtt_write_rcs(cs,
4900 				      request->fence.seqno,
4901 				      i915_request_active_timeline(request)->hwsp_offset,
4902 				      PIPE_CONTROL_CS_STALL |
4903 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
4904 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4905 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4906 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
4907 				      PIPE_CONTROL_FLUSH_ENABLE);
4908 
4909 	return gen8_emit_fini_breadcrumb_tail(request, cs);
4910 }
4911 
4912 /*
4913  * Note that the CS instruction pre-parser will not stall on the breadcrumb
4914  * flush and will continue pre-fetching the instructions after it before the
4915  * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
4916  * BB_START/END instructions, so, even though we might pre-fetch the preamble
4917  * of the next request before the memory has been flushed, we're guaranteed that
4918  * we won't access the batch itself too early.
4919  * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
4920  * so, if the current request is modifying an instruction in the next request on
4921  * the same intel_context, we might pre-fetch and then execute the pre-update
4922  * instruction. To avoid this, the users of self-modifying code should either
4923  * disable the parser around the code emitting the memory writes, via a new flag
4924  * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
4925  * the in-kernel use-cases we've opted to use a separate context, see
4926  * reloc_gpu() as an example.
4927  * All the above applies only to the instructions themselves. Non-inline data
4928  * used by the instructions is not pre-fetched.
4929  */
4930 
4931 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
4932 {
4933 	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
4934 		MI_SEMAPHORE_GLOBAL_GTT |
4935 		MI_SEMAPHORE_POLL |
4936 		MI_SEMAPHORE_SAD_EQ_SDD;
4937 	*cs++ = 0;
4938 	*cs++ = intel_hws_preempt_address(request->engine);
4939 	*cs++ = 0;
4940 	*cs++ = 0;
4941 	*cs++ = MI_NOOP;
4942 
4943 	return cs;
4944 }
4945 
4946 static __always_inline u32*
4947 gen12_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
4948 {
4949 	*cs++ = MI_USER_INTERRUPT;
4950 
4951 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4952 	if (intel_engine_has_semaphores(request->engine))
4953 		cs = gen12_emit_preempt_busywait(request, cs);
4954 
4955 	request->tail = intel_ring_offset(request, cs);
4956 	assert_ring_tail_valid(request->ring, request->tail);
4957 
4958 	return gen8_emit_wa_tail(request, cs);
4959 }
4960 
4961 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
4962 {
4963 	return gen12_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
4964 }
4965 
4966 static u32 *
4967 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4968 {
4969 	cs = gen12_emit_ggtt_write_rcs(cs,
4970 				       request->fence.seqno,
4971 				       i915_request_active_timeline(request)->hwsp_offset,
4972 				       PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
4973 				       PIPE_CONTROL_CS_STALL |
4974 				       PIPE_CONTROL_TILE_CACHE_FLUSH |
4975 				       PIPE_CONTROL_FLUSH_L3 |
4976 				       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4977 				       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4978 				       /* Wa_1409600907:tgl */
4979 				       PIPE_CONTROL_DEPTH_STALL |
4980 				       PIPE_CONTROL_DC_FLUSH_ENABLE |
4981 				       PIPE_CONTROL_FLUSH_ENABLE);
4982 
4983 	return gen12_emit_fini_breadcrumb_tail(request, cs);
4984 }
4985 
4986 static void execlists_park(struct intel_engine_cs *engine)
4987 {
4988 	cancel_timer(&engine->execlists.timer);
4989 	cancel_timer(&engine->execlists.preempt);
4990 }
4991 
4992 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
4993 {
4994 	engine->submit_request = execlists_submit_request;
4995 	engine->schedule = i915_schedule;
4996 	engine->execlists.tasklet.func = execlists_submission_tasklet;
4997 
4998 	engine->reset.prepare = execlists_reset_prepare;
4999 	engine->reset.rewind = execlists_reset_rewind;
5000 	engine->reset.cancel = execlists_reset_cancel;
5001 	engine->reset.finish = execlists_reset_finish;
5002 
5003 	engine->park = execlists_park;
5004 	engine->unpark = NULL;
5005 
5006 	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
5007 	if (!intel_vgpu_active(engine->i915)) {
5008 		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
5009 		if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) {
5010 			engine->flags |= I915_ENGINE_HAS_PREEMPTION;
5011 			if (IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION))
5012 				engine->flags |= I915_ENGINE_HAS_TIMESLICES;
5013 		}
5014 	}
5015 
5016 	if (INTEL_GEN(engine->i915) >= 12)
5017 		engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
5018 
5019 	if (intel_engine_has_preemption(engine))
5020 		engine->emit_bb_start = gen8_emit_bb_start;
5021 	else
5022 		engine->emit_bb_start = gen8_emit_bb_start_noarb;
5023 }
5024 
5025 static void execlists_shutdown(struct intel_engine_cs *engine)
5026 {
5027 	/* Synchronise with residual timers and any softirq they raise */
5028 	del_timer_sync(&engine->execlists.timer);
5029 	del_timer_sync(&engine->execlists.preempt);
5030 	tasklet_kill(&engine->execlists.tasklet);
5031 }
5032 
5033 static void execlists_release(struct intel_engine_cs *engine)
5034 {
5035 	engine->sanitize = NULL; /* no longer in control, nothing to sanitize */
5036 
5037 	execlists_shutdown(engine);
5038 
5039 	intel_engine_cleanup_common(engine);
5040 	lrc_destroy_wa_ctx(engine);
5041 }
5042 
5043 static void
5044 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
5045 {
5046 	/* Default vfuncs which can be overridden by each engine. */
5047 
5048 	engine->resume = execlists_resume;
5049 
5050 	engine->cops = &execlists_context_ops;
5051 	engine->request_alloc = execlists_request_alloc;
5052 
5053 	engine->emit_flush = gen8_emit_flush;
5054 	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
5055 	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
5056 	if (INTEL_GEN(engine->i915) >= 12) {
5057 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
5058 		engine->emit_flush = gen12_emit_flush;
5059 	}
5060 	engine->set_default_submission = intel_execlists_set_default_submission;
5061 
5062 	if (INTEL_GEN(engine->i915) < 11) {
5063 		engine->irq_enable = gen8_logical_ring_enable_irq;
5064 		engine->irq_disable = gen8_logical_ring_disable_irq;
5065 	} else {
5066 		/*
5067 		 * TODO: On Gen11 the interrupt masks need to be clear
5068 		 * to allow C6 entry. Keep interrupts enabled
5069 		 * and take the hit of generating extra interrupts
5070 		 * until a more refined solution exists.
5071 		 */
5072 	}
5073 }
5074 
5075 static inline void
5076 logical_ring_default_irqs(struct intel_engine_cs *engine)
5077 {
5078 	unsigned int shift = 0;
5079 
5080 	if (INTEL_GEN(engine->i915) < 11) {
5081 		const u8 irq_shifts[] = {
5082 			[RCS0]  = GEN8_RCS_IRQ_SHIFT,
5083 			[BCS0]  = GEN8_BCS_IRQ_SHIFT,
5084 			[VCS0]  = GEN8_VCS0_IRQ_SHIFT,
5085 			[VCS1]  = GEN8_VCS1_IRQ_SHIFT,
5086 			[VECS0] = GEN8_VECS_IRQ_SHIFT,
5087 		};
5088 
5089 		shift = irq_shifts[engine->id];
5090 	}
5091 
5092 	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
5093 	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
5094 	engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
5095 	engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift;
5096 }
5097 
5098 static void rcs_submission_override(struct intel_engine_cs *engine)
5099 {
5100 	switch (INTEL_GEN(engine->i915)) {
5101 	case 12:
5102 		engine->emit_flush = gen12_emit_flush_render;
5103 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
5104 		break;
5105 	case 11:
5106 		engine->emit_flush = gen11_emit_flush_render;
5107 		engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
5108 		break;
5109 	default:
5110 		engine->emit_flush = gen8_emit_flush_render;
5111 		engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
5112 		break;
5113 	}
5114 }
5115 
5116 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
5117 {
5118 	struct intel_engine_execlists * const execlists = &engine->execlists;
5119 	struct drm_i915_private *i915 = engine->i915;
5120 	struct intel_uncore *uncore = engine->uncore;
5121 	u32 base = engine->mmio_base;
5122 
5123 	tasklet_init(&engine->execlists.tasklet,
5124 		     execlists_submission_tasklet, (unsigned long)engine);
5125 	timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
5126 	timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
5127 
5128 	logical_ring_default_vfuncs(engine);
5129 	logical_ring_default_irqs(engine);
5130 
5131 	if (engine->class == RENDER_CLASS)
5132 		rcs_submission_override(engine);
5133 
5134 	if (intel_init_workaround_bb(engine))
5135 		/*
5136 		 * We continue even if we fail to initialize the WA batch
5137 		 * because we only expect rare glitches and nothing
5138 		 * critical enough to prevent us from using the GPU.
5139 		 */
5140 		drm_err(&i915->drm, "WA batch buffer initialization failed\n");
5141 
5142 	if (HAS_LOGICAL_RING_ELSQ(i915)) {
5143 		execlists->submit_reg = uncore->regs +
5144 			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
5145 		execlists->ctrl_reg = uncore->regs +
5146 			i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
5147 	} else {
5148 		execlists->submit_reg = uncore->regs +
5149 			i915_mmio_reg_offset(RING_ELSP(base));
5150 	}
5151 
5152 	execlists->csb_status =
5153 		&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
5154 
5155 	execlists->csb_write =
5156 		&engine->status_page.addr[intel_hws_csb_write_index(i915)];
5157 
5158 	if (INTEL_GEN(i915) < 11)
5159 		execlists->csb_size = GEN8_CSB_ENTRIES;
5160 	else
5161 		execlists->csb_size = GEN11_CSB_ENTRIES;
5162 
5163 	if (INTEL_GEN(engine->i915) >= 11) {
5164 		execlists->ccid |= engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32);
5165 		execlists->ccid |= engine->class << (GEN11_ENGINE_CLASS_SHIFT - 32);
5166 	}
5167 
5168 	/* Finally, take ownership and responsibility for cleanup! */
5169 	engine->sanitize = execlists_sanitize;
5170 	engine->release = execlists_release;
5171 
5172 	return 0;
5173 }
5174 
5175 static void init_common_reg_state(u32 * const regs,
5176 				  const struct intel_engine_cs *engine,
5177 				  const struct intel_ring *ring,
5178 				  bool inhibit)
5179 {
5180 	u32 ctl;
5181 
5182 	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
5183 	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
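	/*
	 * If the image does not yet contain valid register state, inhibit
	 * the context restore so the HW does not load garbage.
	 */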
5184 	if (inhibit)
5185 		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
5186 	if (INTEL_GEN(engine->i915) < 11)
5187 		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
5188 					   CTX_CTRL_RS_CTX_ENABLE);
5189 	regs[CTX_CONTEXT_CONTROL] = ctl;
5190 
5191 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
5192 	regs[CTX_TIMESTAMP] = 0;
5193 }
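
/*
 * Illustrative sketch (not from the driver): the CTX_CONTEXT_CONTROL value
 * built above uses i915's "masked register" idiom, where the upper 16 bits
 * select which of the lower 16 bits a write actually updates. A minimal,
 * self-contained example of that idiom with locally defined EX_* helpers
 * and assumed example bits (the real macros live in i915_reg.h).
 */
#if 0
#include <stdio.h>

#define EX_MASKED_BIT_ENABLE(a)		(((a) << 16) | (a))	/* set bit + its mask */
#define EX_MASKED_BIT_DISABLE(a)	((a) << 16)		/* clear bit, set its mask */

#define EX_INHIBIT_SYN_CTX_SWITCH	(1u << 3)	/* assumed example bit */
#define EX_CTX_RESTORE_INHIBIT		(1u << 0)	/* assumed example bit */

int main(void)
{
	unsigned int ctl = EX_MASKED_BIT_ENABLE(EX_INHIBIT_SYN_CTX_SWITCH);

	ctl |= EX_MASKED_BIT_DISABLE(EX_CTX_RESTORE_INHIBIT);
	printf("ctl=%#x (mask in bits 31:16, values in bits 15:0)\n", ctl);
	return 0;
}
#endif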
5194 
5195 static void init_wa_bb_reg_state(u32 * const regs,
5196 				 const struct intel_engine_cs *engine)
5197 {
5198 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
5199 
5200 	if (wa_ctx->per_ctx.size) {
5201 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
5202 
5203 		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
5204 		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
5205 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
5206 	}
5207 
5208 	if (wa_ctx->indirect_ctx.size) {
5209 		lrc_ring_setup_indirect_ctx(regs, engine,
5210 					    i915_ggtt_offset(wa_ctx->vma) +
5211 					    wa_ctx->indirect_ctx.offset,
5212 					    wa_ctx->indirect_ctx.size);
5213 	}
5214 }
5215 
5216 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
5217 {
5218 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
5219 		/*
5220 		 * 64b PPGTT (48bit canonical): PDP0_DESCRIPTOR contains the base
5221 		 * address of the PML4; the other PDP descriptors are ignored.
5222 		 */
5223 		ASSIGN_CTX_PML4(ppgtt, regs);
5224 	} else {
5225 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
5226 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
5227 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
5228 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
5229 	}
5230 }
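
/*
 * Illustrative sketch (not from the driver): the ASSIGN_CTX_PML4 /
 * ASSIGN_CTX_PDP macros used above store 64-bit page-directory addresses
 * into the 32-bit register image as upper/lower dword pairs. A hedged,
 * self-contained example of that split; the ex_* helpers are invented for
 * illustration (the real macros live in intel_lrc_reg.h).
 */
#if 0
#include <stdint.h>
#include <stdio.h>

static inline uint32_t ex_upper_32_bits(uint64_t v) { return v >> 32; }
static inline uint32_t ex_lower_32_bits(uint64_t v) { return (uint32_t)v; }

int main(void)
{
	uint32_t regs[2] = { 0 };
	uint64_t pml4 = 0x0000000123456000ull;	/* assumed page-aligned address */

	regs[0] = ex_upper_32_bits(pml4);	/* ..._UDW slot */
	regs[1] = ex_lower_32_bits(pml4);	/* ..._LDW slot */
	printf("UDW=%#x LDW=%#x\n", (unsigned int)regs[0], (unsigned int)regs[1]);
	return 0;
}
#endif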
5231 
5232 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
5233 {
5234 	if (i915_is_ggtt(vm))
5235 		return i915_vm_to_ggtt(vm)->alias;
5236 	else
5237 		return i915_vm_to_ppgtt(vm);
5238 }
5239 
5240 static void execlists_init_reg_state(u32 *regs,
5241 				     const struct intel_context *ce,
5242 				     const struct intel_engine_cs *engine,
5243 				     const struct intel_ring *ring,
5244 				     bool inhibit)
5245 {
5246 	/*
5247 	 * A context is actually a big batch buffer with several
5248 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
5249 	 * values we are setting here are only for the first context restore:
5250 	 * on a subsequent save, the GPU will recreate this batch buffer with
5251 	 * new values (including the missing MI_LOAD_REGISTER_IMM commands we
5252 	 * skip here); see the command-stream sketch after this function.
5253 	 *
5254 	 * Must keep consistent with virtual_update_register_offsets().
5255 	 */
5256 	set_offsets(regs, reg_offsets(engine), engine, inhibit);
5257 
5258 	init_common_reg_state(regs, engine, ring, inhibit);
5259 	init_ppgtt_reg_state(regs, vm_alias(ce->vm));
5260 
5261 	init_wa_bb_reg_state(regs, engine);
5262 
5263 	__reset_stop_ring(regs, engine);
5264 }
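
/*
 * Illustrative sketch (not from the driver): as the comment above notes,
 * the register state is laid out as MI_LOAD_REGISTER_IMM streams -- a
 * command dword followed by (register offset, value) pairs. A hedged,
 * self-contained example of emitting such a stream; the EX_* encoding and
 * the register offsets are assumptions for illustration (the authoritative
 * command definitions live in intel_gpu_commands.h).
 */
#if 0
#include <stdint.h>
#include <stdio.h>

/* assumed encoding: opcode 0x22 in bits 28:23, dword length in the low bits */
#define EX_MI_LOAD_REGISTER_IMM(n)	((0x22u << 23) | (2 * (n) - 1))

static uint32_t *ex_emit_lri2(uint32_t *cs,
			      uint32_t reg0, uint32_t val0,
			      uint32_t reg1, uint32_t val1)
{
	*cs++ = EX_MI_LOAD_REGISTER_IMM(2);	/* header: two (reg, value) pairs */
	*cs++ = reg0; *cs++ = val0;
	*cs++ = reg1; *cs++ = val1;
	return cs;
}

int main(void)
{
	uint32_t buf[8], *end;

	/* hypothetical RING_HEAD/RING_TAIL offsets for some mmio base */
	end = ex_emit_lri2(buf, 0x2034, 0, 0x2030, 0);
	printf("emitted %zu dwords\n", (size_t)(end - buf));
	return 0;
}
#endif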
5265 
5266 static int
5267 populate_lr_context(struct intel_context *ce,
5268 		    struct drm_i915_gem_object *ctx_obj,
5269 		    struct intel_engine_cs *engine,
5270 		    struct intel_ring *ring)
5271 {
5272 	bool inhibit = true;
5273 	void *vaddr;
5274 
5275 	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
5276 	if (IS_ERR(vaddr)) {
5277 		drm_dbg(&engine->i915->drm, "Could not map object pages!\n");
5278 		return PTR_ERR(vaddr);
5279 	}
5280 
5281 	set_redzone(vaddr, engine);
5282 
5283 	if (engine->default_state) {
5284 		shmem_read(engine->default_state, 0,
5285 			   vaddr, engine->context_size);
5286 		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
5287 		inhibit = false;
5288 	}
5289 
5290 	/* Clear the ppHWSP (inc. per-context counters) */
5291 	memset(vaddr, 0, PAGE_SIZE);
5292 
5293 	/*
5294 	 * The second page of the context object contains registers which must
5295 	 * be set up prior to the first execution; see the layout sketch below.
5296 	 */
5297 	execlists_init_reg_state(vaddr + LRC_STATE_OFFSET,
5298 				 ce, engine, ring, inhibit);
5299 
5300 	__i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
5301 	i915_gem_object_unpin_map(ctx_obj);
5302 	return 0;
5303 }
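
/*
 * Illustrative sketch (not from the driver): a hedged picture of the
 * context-object layout populate_lr_context() relies on -- page 0 holds
 * the per-process HWSP (cleared above), the register state begins on the
 * second page, and __execlists_context_alloc() below may append a Gen12
 * wa_bb page and a debug-only redzone. Offsets here are illustrative only.
 */
#if 0
#include <stdio.h>

#define EX_PAGE_SIZE		4096u
#define EX_LRC_PPHWSP_OFFSET	0u			/* page 0: ppHWSP */
#define EX_LRC_STATE_OFFSET	(1u * EX_PAGE_SIZE)	/* page 1: register state */

int main(void)
{
	unsigned int context_size = 20u * EX_PAGE_SIZE;	/* assumed engine->context_size */

	printf("ppHWSP   @ %#x\n", EX_LRC_PPHWSP_OFFSET);
	printf("regstate @ %#x\n", EX_LRC_STATE_OFFSET);
	printf("wa_bb    @ page %u (Gen12 only)\n", context_size / EX_PAGE_SIZE);
	return 0;
}
#endif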
5304 
5305 static int __execlists_context_alloc(struct intel_context *ce,
5306 				     struct intel_engine_cs *engine)
5307 {
5308 	struct drm_i915_gem_object *ctx_obj;
5309 	struct intel_ring *ring;
5310 	struct i915_vma *vma;
5311 	u32 context_size;
5312 	int ret;
5313 
5314 	GEM_BUG_ON(ce->state);
5315 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
5316 
5317 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
5318 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
5319 
5320 	if (INTEL_GEN(engine->i915) == 12) {
5321 		ce->wa_bb_page = context_size / PAGE_SIZE;
5322 		context_size += PAGE_SIZE;
5323 	}
5324 
5325 	ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
5326 	if (IS_ERR(ctx_obj))
5327 		return PTR_ERR(ctx_obj);
5328 
5329 	vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
5330 	if (IS_ERR(vma)) {
5331 		ret = PTR_ERR(vma);
5332 		goto error_deref_obj;
5333 	}
5334 
5335 	if (!ce->timeline) {
5336 		struct intel_timeline *tl;
5337 		struct i915_vma *hwsp;
5338 
5339 		/*
5340 		 * Use the static global HWSP for the kernel context, and
5341 		 * a dynamically allocated cacheline for everyone else.
5342 		 */
5343 		hwsp = NULL;
5344 		if (unlikely(intel_context_is_barrier(ce)))
5345 			hwsp = engine->status_page.vma;
5346 
5347 		tl = intel_timeline_create(engine->gt, hwsp);
5348 		if (IS_ERR(tl)) {
5349 			ret = PTR_ERR(tl);
5350 			goto error_deref_obj;
5351 		}
5352 
5353 		ce->timeline = tl;
5354 	}
5355 
5356 	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
5357 	if (IS_ERR(ring)) {
5358 		ret = PTR_ERR(ring);
5359 		goto error_deref_obj;
5360 	}
5361 
5362 	ret = populate_lr_context(ce, ctx_obj, engine, ring);
5363 	if (ret) {
5364 		drm_dbg(&engine->i915->drm,
5365 			"Failed to populate LRC: %d\n", ret);
5366 		goto error_ring_free;
5367 	}
5368 
5369 	ce->ring = ring;
5370 	ce->state = vma;
5371 
5372 	return 0;
5373 
5374 error_ring_free:
5375 	intel_ring_put(ring);
5376 error_deref_obj:
5377 	i915_gem_object_put(ctx_obj);
5378 	return ret;
5379 }
5380 
5381 static struct list_head *virtual_queue(struct virtual_engine *ve)
5382 {
5383 	return &ve->base.execlists.default_priolist.requests[0];
5384 }
5385 
5386 static void virtual_context_destroy(struct kref *kref)
5387 {
5388 	struct virtual_engine *ve =
5389 		container_of(kref, typeof(*ve), context.ref);
5390 	unsigned int n;
5391 
5392 	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5393 	GEM_BUG_ON(ve->request);
5394 	GEM_BUG_ON(ve->context.inflight);
5395 
5396 	for (n = 0; n < ve->num_siblings; n++) {
5397 		struct intel_engine_cs *sibling = ve->siblings[n];
5398 		struct rb_node *node = &ve->nodes[sibling->id].rb;
5399 		unsigned long flags;
5400 
5401 		if (RB_EMPTY_NODE(node))
5402 			continue;
5403 
5404 		spin_lock_irqsave(&sibling->active.lock, flags);
5405 
5406 		/* Detachment is lazily performed in the execlists tasklet */
5407 		if (!RB_EMPTY_NODE(node))
5408 			rb_erase_cached(node, &sibling->execlists.virtual);
5409 
5410 		spin_unlock_irqrestore(&sibling->active.lock, flags);
5411 	}
5412 	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
5413 
5414 	if (ve->context.state)
5415 		__execlists_context_fini(&ve->context);
5416 	intel_context_fini(&ve->context);
5417 
5418 	intel_engine_free_request_pool(&ve->base);
5419 
5420 	kfree(ve->bonds);
5421 	kfree(ve);
5422 }
5423 
5424 static void virtual_engine_initial_hint(struct virtual_engine *ve)
5425 {
5426 	int swp;
5427 
5428 	/*
5429 	 * Pick a random sibling when starting, to help spread the load around.
5430 	 *
5431 	 * New contexts are typically created with exactly the same order
5432 	 * of siblings, and often started in batches. Due to the way we iterate
5433 	 * the array of siblings when submitting requests, sibling[0] is
5434 	 * prioritised for dequeuing. If we make sure that sibling[0] is
5435 	 * fairly randomised across the system, we help spread the load, as
5436 	 * the first engine we inspect differs each time.
5437 	 *
5438 	 * NB: This does not force us to execute on this engine; it will just
5439 	 * typically be the first we inspect for submission.
5440 	 */
5441 	swp = prandom_u32_max(ve->num_siblings);
5442 	if (swp)
5443 		swap(ve->siblings[swp], ve->siblings[0]);
5444 }
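
/*
 * Illustrative sketch (not from the driver): the initial hint above is
 * just "swap a uniformly chosen sibling into slot 0" so that different
 * virtual engines start their search on different physical engines. A
 * hedged, self-contained example of the same idea using the C library
 * RNG in place of prandom_u32_max().
 */
#if 0
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int main(void)
{
	const char *siblings[] = { "vcs0", "vcs1", "vcs2" };
	unsigned int n = sizeof(siblings) / sizeof(siblings[0]);
	unsigned int swp;

	srand((unsigned int)time(NULL));
	swp = (unsigned int)rand() % n;		/* pick a random slot */
	if (swp) {				/* swap it into slot 0 */
		const char *tmp = siblings[0];

		siblings[0] = siblings[swp];
		siblings[swp] = tmp;
	}
	printf("first engine inspected: %s\n", siblings[0]);
	return 0;
}
#endif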
5445 
5446 static int virtual_context_alloc(struct intel_context *ce)
5447 {
5448 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5449 
5450 	return __execlists_context_alloc(ce, ve->siblings[0]);
5451 }
5452 
5453 static int virtual_context_pin(struct intel_context *ce)
5454 {
5455 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5456 
5457 	/* Note: we must use a real engine class for setting up reg state */
5458 	return __execlists_context_pin(ce, ve->siblings[0]);
5459 }
5460 
5461 static void virtual_context_enter(struct intel_context *ce)
5462 {
5463 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5464 	unsigned int n;
5465 
5466 	for (n = 0; n < ve->num_siblings; n++)
5467 		intel_engine_pm_get(ve->siblings[n]);
5468 
5469 	intel_timeline_enter(ce->timeline);
5470 }
5471 
5472 static void virtual_context_exit(struct intel_context *ce)
5473 {
5474 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5475 	unsigned int n;
5476 
5477 	intel_timeline_exit(ce->timeline);
5478 
5479 	for (n = 0; n < ve->num_siblings; n++)
5480 		intel_engine_pm_put(ve->siblings[n]);
5481 }
5482 
5483 static const struct intel_context_ops virtual_context_ops = {
5484 	.alloc = virtual_context_alloc,
5485 
5486 	.pin = virtual_context_pin,
5487 	.unpin = execlists_context_unpin,
5488 
5489 	.enter = virtual_context_enter,
5490 	.exit = virtual_context_exit,
5491 
5492 	.destroy = virtual_context_destroy,
5493 };
5494 
5495 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
5496 {
5497 	struct i915_request *rq;
5498 	intel_engine_mask_t mask;
5499 
5500 	rq = READ_ONCE(ve->request);
5501 	if (!rq)
5502 		return 0;
5503 
5504 	/* The rq is ready for submission; rq->execution_mask is now stable. */
5505 	mask = rq->execution_mask;
5506 	if (unlikely(!mask)) {
5507 		/* Invalid selection, submit to a random engine in error */
5508 		i915_request_set_error_once(rq, -ENODEV);
5509 		mask = ve->siblings[0]->mask;
5510 	}
5511 
5512 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
5513 		     rq->fence.context, rq->fence.seqno,
5514 		     mask, ve->base.execlists.queue_priority_hint);
5515 
5516 	return mask;
5517 }
5518 
5519 static void virtual_submission_tasklet(unsigned long data)
5520 {
5521 	struct virtual_engine * const ve = (struct virtual_engine *)data;
5522 	const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint);
5523 	intel_engine_mask_t mask;
5524 	unsigned int n;
5525 
5526 	rcu_read_lock();
5527 	mask = virtual_submission_mask(ve);
5528 	rcu_read_unlock();
5529 	if (unlikely(!mask))
5530 		return;
5531 
5532 	local_irq_disable();
5533 	for (n = 0; n < ve->num_siblings; n++) {
5534 		struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]);
5535 		struct ve_node * const node = &ve->nodes[sibling->id];
5536 		struct rb_node **parent, *rb;
5537 		bool first;
5538 
5539 		if (!READ_ONCE(ve->request))
5540 			break; /* already handled by a sibling's tasklet */
5541 
5542 		if (unlikely(!(mask & sibling->mask))) {
5543 			if (!RB_EMPTY_NODE(&node->rb)) {
5544 				spin_lock(&sibling->active.lock);
5545 				rb_erase_cached(&node->rb,
5546 						&sibling->execlists.virtual);
5547 				RB_CLEAR_NODE(&node->rb);
5548 				spin_unlock(&sibling->active.lock);
5549 			}
5550 			continue;
5551 		}
5552 
5553 		spin_lock(&sibling->active.lock);
5554 
5555 		if (!RB_EMPTY_NODE(&node->rb)) {
5556 			/*
5557 			 * Cheat and avoid rebalancing the tree if we can
5558 			 * reuse this node in situ.
5559 			 */
5560 			first = rb_first_cached(&sibling->execlists.virtual) ==
5561 				&node->rb;
5562 			if (prio == node->prio || (prio > node->prio && first))
5563 				goto submit_engine;
5564 
5565 			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
5566 		}
5567 
5568 		rb = NULL;
5569 		first = true;
5570 		parent = &sibling->execlists.virtual.rb_root.rb_node;
5571 		while (*parent) {
5572 			struct ve_node *other;
5573 
5574 			rb = *parent;
5575 			other = rb_entry(rb, typeof(*other), rb);
5576 			if (prio > other->prio) {
5577 				parent = &rb->rb_left;
5578 			} else {
5579 				parent = &rb->rb_right;
5580 				first = false;
5581 			}
5582 		}
5583 
5584 		rb_link_node(&node->rb, rb, parent);
5585 		rb_insert_color_cached(&node->rb,
5586 				       &sibling->execlists.virtual,
5587 				       first);
5588 
5589 submit_engine:
5590 		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
5591 		node->prio = prio;
5592 		if (first && prio > sibling->execlists.queue_priority_hint)
5593 			tasklet_hi_schedule(&sibling->execlists.tasklet);
5594 
5595 		spin_unlock(&sibling->active.lock);
5596 	}
5597 	local_irq_enable();
5598 }
5599 
5600 static void virtual_submit_request(struct i915_request *rq)
5601 {
5602 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
5603 	struct i915_request *old;
5604 	unsigned long flags;
5605 
5606 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
5607 		     rq->fence.context,
5608 		     rq->fence.seqno);
5609 
5610 	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
5611 
5612 	spin_lock_irqsave(&ve->base.active.lock, flags);
5613 
5614 	old = ve->request;
5615 	if (old) { /* background completion event from preempt-to-busy */
5616 		GEM_BUG_ON(!i915_request_completed(old));
5617 		__i915_request_submit(old);
5618 		i915_request_put(old);
5619 	}
5620 
5621 	if (i915_request_completed(rq)) {
5622 		__i915_request_submit(rq);
5623 
5624 		ve->base.execlists.queue_priority_hint = INT_MIN;
5625 		ve->request = NULL;
5626 	} else {
5627 		ve->base.execlists.queue_priority_hint = rq_prio(rq);
5628 		ve->request = i915_request_get(rq);
5629 
5630 		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5631 		list_move_tail(&rq->sched.link, virtual_queue(ve));
5632 
5633 		tasklet_hi_schedule(&ve->base.execlists.tasklet);
5634 	}
5635 
5636 	spin_unlock_irqrestore(&ve->base.active.lock, flags);
5637 }
5638 
5639 static struct ve_bond *
5640 virtual_find_bond(struct virtual_engine *ve,
5641 		  const struct intel_engine_cs *master)
5642 {
5643 	int i;
5644 
5645 	for (i = 0; i < ve->num_bonds; i++) {
5646 		if (ve->bonds[i].master == master)
5647 			return &ve->bonds[i];
5648 	}
5649 
5650 	return NULL;
5651 }
5652 
5653 static void
5654 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
5655 {
5656 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
5657 	intel_engine_mask_t allowed, exec;
5658 	struct ve_bond *bond;
5659 
5660 	allowed = ~to_request(signal)->engine->mask;
5661 
5662 	bond = virtual_find_bond(ve, to_request(signal)->engine);
5663 	if (bond)
5664 		allowed &= bond->sibling_mask;
5665 
5666 	/* Restrict the bonded request to run on only the available engines */
5667 	exec = READ_ONCE(rq->execution_mask);
5668 	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
5669 		;
5670 
5671 	/* Prevent the master from being re-run on the bonded engines */
5672 	to_request(signal)->execution_mask &= ~allowed;
5673 }
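
/*
 * Illustrative sketch (not from the driver): the try_cmpxchg() loop above
 * atomically narrows the bonded request's execution_mask to the allowed
 * engines, retrying if the mask changed under us. A hedged, self-contained
 * example of the same lock-free read-modify-write pattern using C11
 * atomics; names are invented for the example.
 */
#if 0
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static void ex_restrict_mask(_Atomic uint32_t *mask, uint32_t allowed)
{
	uint32_t old = atomic_load(mask);

	/* On failure, 'old' is refreshed with the current value and we retry. */
	while (!atomic_compare_exchange_weak(mask, &old, old & allowed))
		;
}

int main(void)
{
	_Atomic uint32_t execution_mask = 0x0f;	/* assumed: four engines allowed */

	ex_restrict_mask(&execution_mask, 0x06);
	printf("restricted mask=%#x\n", (unsigned int)atomic_load(&execution_mask));
	return 0;
}
#endif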
5674 
5675 struct intel_context *
5676 intel_execlists_create_virtual(struct intel_engine_cs **siblings,
5677 			       unsigned int count)
5678 {
5679 	struct virtual_engine *ve;
5680 	unsigned int n;
5681 	int err;
5682 
5683 	if (count == 0)
5684 		return ERR_PTR(-EINVAL);
5685 
5686 	if (count == 1)
5687 		return intel_context_create(siblings[0]);
5688 
5689 	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
5690 	if (!ve)
5691 		return ERR_PTR(-ENOMEM);
5692 
5693 	ve->base.i915 = siblings[0]->i915;
5694 	ve->base.gt = siblings[0]->gt;
5695 	ve->base.uncore = siblings[0]->uncore;
5696 	ve->base.id = -1;
5697 
5698 	ve->base.class = OTHER_CLASS;
5699 	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
5700 	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5701 	ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5702 
5703 	/*
5704 	 * The decision on whether to submit a request using semaphores
5705 	 * depends on the saturated state of the engine. We only compute
5706 	 * this during HW submission of the request, and we need this
5707 	 * state to be globally applied to all requests being submitted
5708 	 * to this engine. Virtual engines encompass more than one physical
5709 	 * engine and so we cannot accurately tell in advance if one of those
5710 	 * engines is already saturated and so cannot afford to use a semaphore
5711 	 * and be pessimized in priority for doing so -- if we are the only
5712 	 * context using semaphores after all other clients have stopped, we
5713 	 * will be starved on the saturated system. Such a global switch for
5714 	 * semaphores is less than ideal, but alas is the current compromise.
5715 	 */
5716 	ve->base.saturated = ALL_ENGINES;
5717 
5718 	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
5719 
5720 	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
5721 	intel_engine_init_breadcrumbs(&ve->base);
5722 	intel_engine_init_execlists(&ve->base);
5723 	ve->base.breadcrumbs.irq_armed = true; /* fake HW, used for irq_work */
5724 
5725 	ve->base.cops = &virtual_context_ops;
5726 	ve->base.request_alloc = execlists_request_alloc;
5727 
5728 	ve->base.schedule = i915_schedule;
5729 	ve->base.submit_request = virtual_submit_request;
5730 	ve->base.bond_execute = virtual_bond_execute;
5731 
5732 	INIT_LIST_HEAD(virtual_queue(ve));
5733 	ve->base.execlists.queue_priority_hint = INT_MIN;
5734 	tasklet_init(&ve->base.execlists.tasklet,
5735 		     virtual_submission_tasklet,
5736 		     (unsigned long)ve);
5737 
5738 	intel_context_init(&ve->context, &ve->base);
5739 
5740 	for (n = 0; n < count; n++) {
5741 		struct intel_engine_cs *sibling = siblings[n];
5742 
5743 		GEM_BUG_ON(!is_power_of_2(sibling->mask));
5744 		if (sibling->mask & ve->base.mask) {
5745 			DRM_DEBUG("duplicate %s entry in load balancer\n",
5746 				  sibling->name);
5747 			err = -EINVAL;
5748 			goto err_put;
5749 		}
5750 
5751 		/*
5752 		 * The virtual engine implementation is tightly coupled to
5753 		 * the execlists backend -- we push requests directly
5754 		 * into a tree inside each physical engine. We could support
5755 		 * layering if we handled cloning the requests and
5756 		 * submitted a copy into each backend.
5757 		 */
5758 		if (sibling->execlists.tasklet.func !=
5759 		    execlists_submission_tasklet) {
5760 			err = -ENODEV;
5761 			goto err_put;
5762 		}
5763 
5764 		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
5765 		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
5766 
5767 		ve->siblings[ve->num_siblings++] = sibling;
5768 		ve->base.mask |= sibling->mask;
5769 
5770 		/*
5771 		 * All physical engines must be compatible for their emission
5772 		 * functions (as we build the instructions during request
5773 		 * construction and do not alter them before submission
5774 		 * on the physical engine). We use the engine class as a guide
5775 		 * here, although that could be refined.
5776 		 */
5777 		if (ve->base.class != OTHER_CLASS) {
5778 			if (ve->base.class != sibling->class) {
5779 				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
5780 					  sibling->class, ve->base.class);
5781 				err = -EINVAL;
5782 				goto err_put;
5783 			}
5784 			continue;
5785 		}
5786 
5787 		ve->base.class = sibling->class;
5788 		ve->base.uabi_class = sibling->uabi_class;
5789 		snprintf(ve->base.name, sizeof(ve->base.name),
5790 			 "v%dx%d", ve->base.class, count);
5791 		ve->base.context_size = sibling->context_size;
5792 
5793 		ve->base.emit_bb_start = sibling->emit_bb_start;
5794 		ve->base.emit_flush = sibling->emit_flush;
5795 		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
5796 		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
5797 		ve->base.emit_fini_breadcrumb_dw =
5798 			sibling->emit_fini_breadcrumb_dw;
5799 
5800 		ve->base.flags = sibling->flags;
5801 	}
5802 
5803 	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
5804 
5805 	virtual_engine_initial_hint(ve);
5806 	return &ve->context;
5807 
5808 err_put:
5809 	intel_context_put(&ve->context);
5810 	return ERR_PTR(err);
5811 }
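
/*
 * Illustrative usage sketch (not from the driver): build a virtual context
 * that load-balances across two sibling engines via the function defined
 * above. The wrapper name is invented for the example; the calls are the
 * ones implemented in this file.
 */
#if 0
static struct intel_context *
example_create_balanced_context(struct intel_engine_cs *vcs0,
				struct intel_engine_cs *vcs1)
{
	struct intel_engine_cs *siblings[] = { vcs0, vcs1 };

	/*
	 * Returns an ERR_PTR on failure (e.g. -EINVAL for duplicate or
	 * mismatched siblings); count == 1 degenerates to a plain context
	 * on the single engine. The caller eventually releases the context
	 * with intel_context_put().
	 */
	return intel_execlists_create_virtual(siblings, ARRAY_SIZE(siblings));
}
#endif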
5812 
5813 struct intel_context *
5814 intel_execlists_clone_virtual(struct intel_engine_cs *src)
5815 {
5816 	struct virtual_engine *se = to_virtual_engine(src);
5817 	struct intel_context *dst;
5818 
5819 	dst = intel_execlists_create_virtual(se->siblings,
5820 					     se->num_siblings);
5821 	if (IS_ERR(dst))
5822 		return dst;
5823 
5824 	if (se->num_bonds) {
5825 		struct virtual_engine *de = to_virtual_engine(dst->engine);
5826 
5827 		de->bonds = kmemdup(se->bonds,
5828 				    sizeof(*se->bonds) * se->num_bonds,
5829 				    GFP_KERNEL);
5830 		if (!de->bonds) {
5831 			intel_context_put(dst);
5832 			return ERR_PTR(-ENOMEM);
5833 		}
5834 
5835 		de->num_bonds = se->num_bonds;
5836 	}
5837 
5838 	return dst;
5839 }
5840 
5841 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
5842 				     const struct intel_engine_cs *master,
5843 				     const struct intel_engine_cs *sibling)
5844 {
5845 	struct virtual_engine *ve = to_virtual_engine(engine);
5846 	struct ve_bond *bond;
5847 	int n;
5848 
5849 	/* Sanity check the sibling is part of the virtual engine */
5850 	for (n = 0; n < ve->num_siblings; n++)
5851 		if (sibling == ve->siblings[n])
5852 			break;
5853 	if (n == ve->num_siblings)
5854 		return -EINVAL;
5855 
5856 	bond = virtual_find_bond(ve, master);
5857 	if (bond) {
5858 		bond->sibling_mask |= sibling->mask;
5859 		return 0;
5860 	}
5861 
5862 	bond = krealloc(ve->bonds,
5863 			sizeof(*bond) * (ve->num_bonds + 1),
5864 			GFP_KERNEL);
5865 	if (!bond)
5866 		return -ENOMEM;
5867 
5868 	bond[ve->num_bonds].master = master;
5869 	bond[ve->num_bonds].sibling_mask = sibling->mask;
5870 
5871 	ve->bonds = bond;
5872 	ve->num_bonds++;
5873 
5874 	return 0;
5875 }
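
/*
 * Illustrative usage sketch (not from the driver): attaching a bond tells
 * the virtual engine which sibling(s) a bonded request may run on once the
 * given master signals; virtual_bond_execute() above then narrows
 * rq->execution_mask accordingly. The wrapper name is invented for the
 * example.
 */
#if 0
static int example_bond_to_master(struct intel_engine_cs *virtual,
				  const struct intel_engine_cs *master,
				  const struct intel_engine_cs *sibling)
{
	/* Fails with -EINVAL if @sibling is not part of @virtual */
	return intel_virtual_engine_attach_bond(virtual, master, sibling);
}
#endif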
5876 
5877 struct intel_engine_cs *
5878 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
5879 				 unsigned int sibling)
5880 {
5881 	struct virtual_engine *ve = to_virtual_engine(engine);
5882 
5883 	if (sibling >= ve->num_siblings)
5884 		return NULL;
5885 
5886 	return ve->siblings[sibling];
5887 }
5888 
5889 void intel_execlists_show_requests(struct intel_engine_cs *engine,
5890 				   struct drm_printer *m,
5891 				   void (*show_request)(struct drm_printer *m,
5892 							struct i915_request *rq,
5893 							const char *prefix),
5894 				   unsigned int max)
5895 {
5896 	const struct intel_engine_execlists *execlists = &engine->execlists;
5897 	struct i915_request *rq, *last;
5898 	unsigned long flags;
5899 	unsigned int count;
5900 	struct rb_node *rb;
5901 
5902 	spin_lock_irqsave(&engine->active.lock, flags);
5903 
5904 	last = NULL;
5905 	count = 0;
5906 	list_for_each_entry(rq, &engine->active.requests, sched.link) {
5907 		if (count++ < max - 1)
5908 			show_request(m, rq, "\t\tE ");
5909 		else
5910 			last = rq;
5911 	}
5912 	if (last) {
5913 		if (count > max) {
5914 			drm_printf(m,
5915 				   "\t\t...skipping %d executing requests...\n",
5916 				   count - max);
5917 		}
5918 		show_request(m, last, "\t\tE ");
5919 	}
5920 
5921 	if (execlists->switch_priority_hint != INT_MIN)
5922 		drm_printf(m, "\t\tSwitch priority hint: %d\n",
5923 			   READ_ONCE(execlists->switch_priority_hint));
5924 	if (execlists->queue_priority_hint != INT_MIN)
5925 		drm_printf(m, "\t\tQueue priority hint: %d\n",
5926 			   READ_ONCE(execlists->queue_priority_hint));
5927 
5928 	last = NULL;
5929 	count = 0;
5930 	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
5931 		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
5932 		int i;
5933 
5934 		priolist_for_each_request(rq, p, i) {
5935 			if (count++ < max - 1)
5936 				show_request(m, rq, "\t\tQ ");
5937 			else
5938 				last = rq;
5939 		}
5940 	}
5941 	if (last) {
5942 		if (count > max) {
5943 			drm_printf(m,
5944 				   "\t\t...skipping %d queued requests...\n",
5945 				   count - max);
5946 		}
5947 		show_request(m, last, "\t\tQ ");
5948 	}
5949 
5950 	last = NULL;
5951 	count = 0;
5952 	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
5953 		struct virtual_engine *ve =
5954 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
5955 		struct i915_request *rq = READ_ONCE(ve->request);
5956 
5957 		if (rq) {
5958 			if (count++ < max - 1)
5959 				show_request(m, rq, "\t\tV ");
5960 			else
5961 				last = rq;
5962 		}
5963 	}
5964 	if (last) {
5965 		if (count > max) {
5966 			drm_printf(m,
5967 				   "\t\t...skipping %d virtual requests...\n",
5968 				   count - max);
5969 		}
5970 		show_request(m, last, "\t\tV ");
5971 	}
5972 
5973 	spin_unlock_irqrestore(&engine->active.lock, flags);
5974 }
5975 
5976 void intel_lr_context_reset(struct intel_engine_cs *engine,
5977 			    struct intel_context *ce,
5978 			    u32 head,
5979 			    bool scrub)
5980 {
5981 	GEM_BUG_ON(!intel_context_is_pinned(ce));
5982 
5983 	/*
5984 	 * We want a simple context + ring to execute the breadcrumb update.
5985 	 * We cannot rely on the context being intact across the GPU hang,
5986 	 * so clear it and rebuild just what we need for the breadcrumb.
5987 	 * All pending requests for this context will be zapped, and any
5988 	 * future request will be after userspace has had the opportunity
5989 	 * to recreate its own state.
5990 	 */
5991 	if (scrub)
5992 		restore_default_state(ce, engine);
5993 
5994 	/* Rerun the request; its payload has been neutered (if guilty). */
5995 	__execlists_update_reg_state(ce, engine, head);
5996 }
5997 
5998 bool
5999 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
6000 {
6001 	return engine->set_default_submission ==
6002 	       intel_execlists_set_default_submission;
6003 }
6004 
6005 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
6006 #include "selftest_lrc.c"
6007 #endif
6008