xref: /linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision 15a1fbdcfb519c2bd291ed01c6c94e0b89537a77)
1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Ben Widawsky <ben@bwidawsk.net>
25  *    Michel Thierry <michel.thierry@intel.com>
26  *    Thomas Daniel <thomas.daniel@intel.com>
27  *    Oscar Mateo <oscar.mateo@intel.com>
28  *
29  */
30 
31 /**
32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
33  *
34  * Motivation:
35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36  * These expanded contexts enable a number of new abilities, especially
37  * "Execlists" (also implemented in this file).
38  *
39  * One of the main differences with the legacy HW contexts is that logical
40  * ring contexts incorporate many more things into the context's state, like
41  * PDPs or ringbuffer control registers:
42  *
43  * The reason why PDPs are included in the context is straightforward: as
44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45  * contained there means you don't need to do a ppgtt->switch_mm yourself;
46  * instead, the GPU will do it for you on the context switch.
47  *
48  * But, what about the ringbuffer control registers (head, tail, etc.)?
49  * Shouldn't we just need a set of those per engine command streamer? This is
50  * where the name "Logical Rings" starts to make sense: by virtualizing the
51  * rings, the engine cs shifts to a new "ring buffer" with every context
52  * switch. When you want to submit a workload to the GPU you: A) choose your
53  * context, B) find its appropriate virtualized ring, C) write commands to it
54  * and then, finally, D) tell the GPU to switch to that context.
55  *
56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57  * to a context is via a context execution list, ergo "Execlists".
58  *
59  * LRC implementation:
60  * Regarding the creation of contexts, we have:
61  *
62  * - One global default context.
63  * - One local default context for each opened fd.
64  * - One local extra context for each context create ioctl call.
65  *
66  * Now that ringbuffers belong per-context (and not per-engine, like before)
67  * and that contexts are uniquely tied to a given engine (and not reusable,
68  * like before) we need:
69  *
70  * - One ringbuffer per-engine inside each context.
71  * - One backing object per-engine inside each context.
72  *
73  * The global default context starts its life with these new objects fully
74  * allocated and populated. The local default context for each opened fd is
75  * more complex, because we don't know at creation time which engine is going
76  * to use them. To handle this, we have implemented a deferred creation of LR
77  * contexts:
78  *
79  * The local context starts its life as a hollow or blank holder, that only
80  * gets populated for a given engine once we receive an execbuffer. If later
81  * on we receive another execbuffer ioctl for the same context but a different
82  * engine, we allocate/populate a new ringbuffer and context backing object and
83  * so on.
84  *
85  * Finally, regarding local contexts created using the ioctl call: as they are
86  * only allowed with the render ring, we can allocate & populate them right
87  * away (no need to defer anything, at least for now).
88  *
89  * Execlists implementation:
90  * Execlists are the new method by which, on gen8+ hardware, workloads are
91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92  * This method works as follows:
93  *
94  * When a request is committed, its commands (the BB start and any leading or
95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96  * for the appropriate context. The tail pointer in the hardware context is not
97  * updated at this time, but is instead kept by the driver in the ringbuffer
98  * structure. A structure representing this request is added to a request queue
99  * for the appropriate engine: this structure contains a copy of the context's
100  * tail after the request was written to the ring buffer and a pointer to the
101  * context itself.
102  *
103  * If the engine's request queue was empty before the request was added, the
104  * queue is processed immediately. Otherwise the queue will be processed during
105  * a context switch interrupt. In any case, elements on the queue will get sent
106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107  * globally unique 20-bit submission ID.
108  *
109  * When execution of a request completes, the GPU updates the context status
110  * buffer with a context complete event and generates a context switch interrupt.
111  * During the interrupt handling, the driver examines the events in the buffer:
112  * for each context complete event, if the announced ID matches that on the head
113  * of the request queue, then that request is retired and removed from the queue.
114  *
115  * After processing, if any requests were retired and the queue is not empty
116  * then a new execution list can be submitted. The two requests at the front of
117  * the queue are next to be submitted but since a context may not occur twice in
118  * an execution list, if subsequent requests have the same ID as the first then
119  * the two requests must be combined. This is done simply by discarding requests
120  * at the head of the queue until either only one request is left (in which case
121  * we use a NULL second context) or the first two requests have unique IDs.
122  *
123  * By always executing the first two requests in the queue the driver ensures
124  * that the GPU is kept as busy as possible. In the case where a single context
125  * completes but a second context is still executing, the request for this second
126  * context will be at the head of the queue when we remove the first one. This
127  * request will then be resubmitted along with a new request for a different context,
128  * which will cause the hardware to continue executing the second request and queue
129  * the new request (the GPU detects the condition of a context getting preempted
130  * with the same context and optimizes the context switch flow by not doing
131  * preemption, but just sampling the new tail pointer).
132  *
133  */
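
/*
 * An illustrative sketch (pseudocode only, not the driver's actual dequeue
 * loop, which lives in execlists_dequeue() below) of the ELSP pairing rule
 * described above:
 *
 *	port[0] = pop head of queue;
 *	while (queue not empty && head shares port[0]'s context)
 *		port[0] = pop head of queue;	(one merged RING_TAIL update)
 *	port[1] = queue empty ? NULL : pop head of queue;
 *	submit(port[0], port[1]);
 */
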
134 #include <linux/interrupt.h>
135 
136 #include "i915_drv.h"
137 #include "i915_perf.h"
138 #include "i915_trace.h"
139 #include "i915_vgpu.h"
140 #include "intel_context.h"
141 #include "intel_engine_pm.h"
142 #include "intel_gt.h"
143 #include "intel_gt_pm.h"
144 #include "intel_gt_requests.h"
145 #include "intel_lrc_reg.h"
146 #include "intel_mocs.h"
147 #include "intel_reset.h"
148 #include "intel_ring.h"
149 #include "intel_workarounds.h"
150 
151 #define RING_EXECLIST_QFULL		(1 << 0x2)
152 #define RING_EXECLIST1_VALID		(1 << 0x3)
153 #define RING_EXECLIST0_VALID		(1 << 0x4)
154 #define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
155 #define RING_EXECLIST1_ACTIVE		(1 << 0x11)
156 #define RING_EXECLIST0_ACTIVE		(1 << 0x12)
157 
158 #define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
159 #define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
160 #define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
161 #define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
162 #define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
163 #define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)
164 
165 #define GEN8_CTX_STATUS_COMPLETED_MASK \
166 	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
167 
168 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
169 
170 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
171 #define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
172 #define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
173 #define GEN12_IDLE_CTX_ID		0x7FF
174 #define GEN12_CSB_CTX_VALID(csb_dw) \
175 	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
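
/*
 * Worked example (illustrative value only): a CSB dword of 0x3ff8000 has
 * 0x7ff (GEN12_IDLE_CTX_ID) in bits 15-25, so GEN12_CSB_CTX_VALID()
 * evaluates to false -- that half of the context-switch event does not
 * refer to a valid context.
 */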
176 
177 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
178 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
179 
180 struct virtual_engine {
181 	struct intel_engine_cs base;
182 	struct intel_context context;
183 
184 	/*
185 	 * We allow only a single request through the virtual engine at a time
186 	 * (each request in the timeline waits for the completion fence of
187 	 * the previous before being submitted). By restricting ourselves to
188 	 * only submitting a single request, each request is placed on to a
189 	 * physical engine to maximise load spreading (by virtue of the late greedy
190 	 * scheduling -- each real engine takes the next available request
191 	 * upon idling).
192 	 */
193 	struct i915_request *request;
194 
195 	/*
196 	 * We keep a rbtree of available virtual engines inside each physical
197 	 * engine, sorted by priority. Here we preallocate the nodes we need
198 	 * for the virtual engine, indexed by physical_engine->id.
199 	 */
200 	struct ve_node {
201 		struct rb_node rb;
202 		int prio;
203 	} nodes[I915_NUM_ENGINES];
204 
205 	/*
206 	 * Keep track of bonded pairs -- restrictions upon our selection
207 	 * of physical engines any particular request may be submitted to.
208 	 * If we receive a submit-fence from a master engine, we will only
209 	 * use one of the sibling_mask physical engines.
210 	 */
211 	struct ve_bond {
212 		const struct intel_engine_cs *master;
213 		intel_engine_mask_t sibling_mask;
214 	} *bonds;
215 	unsigned int num_bonds;
216 
217 	/* And finally, which physical engines this virtual engine maps onto. */
218 	unsigned int num_siblings;
219 	struct intel_engine_cs *siblings[0];
220 };
221 
222 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
223 {
224 	GEM_BUG_ON(!intel_engine_is_virtual(engine));
225 	return container_of(engine, struct virtual_engine, base);
226 }
227 
228 static int __execlists_context_alloc(struct intel_context *ce,
229 				     struct intel_engine_cs *engine);
230 
231 static void execlists_init_reg_state(u32 *reg_state,
232 				     const struct intel_context *ce,
233 				     const struct intel_engine_cs *engine,
234 				     const struct intel_ring *ring,
235 				     bool close);
236 static void
237 __execlists_update_reg_state(const struct intel_context *ce,
238 			     const struct intel_engine_cs *engine,
239 			     u32 head);
240 
241 static void mark_eio(struct i915_request *rq)
242 {
243 	if (i915_request_completed(rq))
244 		return;
245 
246 	GEM_BUG_ON(i915_request_signaled(rq));
247 
248 	dma_fence_set_error(&rq->fence, -EIO);
249 	i915_request_mark_complete(rq);
250 }
251 
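/*
 * Walk back from rq along its timeline to find the oldest request that has
 * not yet completed, i.e. the point at which the engine was actually
 * executing; a reset will resume the context from that request's head.
 */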
252 static struct i915_request *
253 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
254 {
255 	struct i915_request *active = rq;
256 
257 	rcu_read_lock();
258 	list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
259 		if (i915_request_completed(rq))
260 			break;
261 
262 		active = rq;
263 	}
264 	rcu_read_unlock();
265 
266 	return active;
267 }
268 
269 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
270 {
271 	return (i915_ggtt_offset(engine->status_page.vma) +
272 		I915_GEM_HWS_PREEMPT_ADDR);
273 }
274 
275 static inline void
276 ring_set_paused(const struct intel_engine_cs *engine, int state)
277 {
278 	/*
279 	 * We inspect HWS_PREEMPT with a semaphore inside
280 	 * engine->emit_fini_breadcrumb. If the dword is true,
281 	 * the ring is paused as the semaphore will busywait
282 	 * until the dword is false.
283 	 */
284 	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
285 	if (state)
286 		wmb();
287 }
288 
289 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
290 {
291 	return rb_entry(rb, struct i915_priolist, node);
292 }
293 
294 static inline int rq_prio(const struct i915_request *rq)
295 {
296 	return rq->sched.attr.priority;
297 }
298 
299 static int effective_prio(const struct i915_request *rq)
300 {
301 	int prio = rq_prio(rq);
302 
303 	/*
304 	 * If this request is special and must not be interrupted at any
305 	 * cost, so be it. Note we are only checking the most recent request
306 	 * in the context and so may be masking an earlier vip request. It
307 	 * is hoped that under the conditions where nopreempt is used, this
308 	 * will not matter (i.e. all requests to that context will be
309 	 * nopreempt for as long as desired).
310 	 */
311 	if (i915_request_has_nopreempt(rq))
312 		prio = I915_PRIORITY_UNPREEMPTABLE;
313 
314 	/*
315 	 * On unwinding the active request, we give it a priority bump
316 	 * if it has completed waiting on any semaphore. If we know that
317 	 * the request has already started, we can prevent an unwanted
318 	 * preempt-to-idle cycle by taking that into account now.
319 	 */
320 	if (__i915_request_has_started(rq))
321 		prio |= I915_PRIORITY_NOSEMAPHORE;
322 
323 	/* Restrict mere WAIT boosts from triggering preemption */
324 	BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
325 	return prio | __NO_PREEMPTION;
326 }
327 
328 static int queue_prio(const struct intel_engine_execlists *execlists)
329 {
330 	struct i915_priolist *p;
331 	struct rb_node *rb;
332 
333 	rb = rb_first_cached(&execlists->queue);
334 	if (!rb)
335 		return INT_MIN;
336 
337 	/*
338 	 * As the priolist[] is inverted, with the highest priority in [0],
339 	 * we have to flip the index value to become priority.
340 	 */
341 	p = to_priolist(rb);
342 	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
343 }
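
/*
 * Worked example of the flip above, assuming I915_USER_PRIORITY_SHIFT is 2
 * here (two internal bump bits; see the priority definitions for the real
 * value): a bucket holding user priority 0 with only bit 0 of p->used set
 * (the highest internal level) yields ((0 + 1) << 2) - 1 == 3, while only
 * bit 3 set (the lowest level) yields 4 - 4 == 0, the plain user priority.
 */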
344 
345 static inline bool need_preempt(const struct intel_engine_cs *engine,
346 				const struct i915_request *rq,
347 				struct rb_node *rb)
348 {
349 	int last_prio;
350 
351 	if (!intel_engine_has_semaphores(engine))
352 		return false;
353 
354 	/*
355 	 * Check if the current priority hint merits a preemption attempt.
356 	 *
357 	 * We record the highest value priority we saw during rescheduling
358 	 * prior to this dequeue, therefore we know that if it is strictly
359 	 * less than the current tail of ELSP[0], we do not need to force
360 	 * a preempt-to-idle cycle.
361 	 *
362 	 * However, the priority hint is a mere hint that we may need to
363 	 * preempt. If that hint is stale or we may be trying to preempt
364 	 * ourselves, ignore the request.
365 	 *
366 	 * More naturally we would write
367 	 *      prio >= max(0, last);
368 	 * except that we wish to prevent triggering preemption at the same
369 	 * priority level: the task that is running should remain running
370 	 * to preserve FIFO ordering of dependencies.
371 	 */
372 	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
373 	if (engine->execlists.queue_priority_hint <= last_prio)
374 		return false;
375 
376 	/*
377 	 * Check against the first request in ELSP[1], it will, thanks to the
378 	 * power of PI, be the highest priority of that context.
379 	 */
380 	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
381 	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
382 		return true;
383 
384 	if (rb) {
385 		struct virtual_engine *ve =
386 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
387 		bool preempt = false;
388 
389 		if (engine == ve->siblings[0]) { /* only preempt one sibling */
390 			struct i915_request *next;
391 
392 			rcu_read_lock();
393 			next = READ_ONCE(ve->request);
394 			if (next)
395 				preempt = rq_prio(next) > last_prio;
396 			rcu_read_unlock();
397 		}
398 
399 		if (preempt)
400 			return preempt;
401 	}
402 
403 	/*
404 	 * If the inflight context did not trigger the preemption, then maybe
405 	 * it was the set of queued requests? Pick the highest priority in
406 	 * the queue (the first active priolist) and see if it deserves to be
407 	 * running instead of ELSP[0].
408 	 *
409 	 * The highest priority request in the queue cannot be either
410 	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
411 	 * context, its priority would not exceed ELSP[0] aka last_prio.
412 	 */
413 	return queue_prio(&engine->execlists) > last_prio;
414 }
415 
416 __maybe_unused static inline bool
417 assert_priority_queue(const struct i915_request *prev,
418 		      const struct i915_request *next)
419 {
420 	/*
421 	 * Without preemption, the prev may refer to the still active element
422 	 * which we refuse to let go.
423 	 *
424 	 * Even with preemption, there are times when we think it is better not
425 	 * to preempt and leave an ostensibly lower priority request in flight.
426 	 */
427 	if (i915_request_is_active(prev))
428 		return true;
429 
430 	return rq_prio(prev) >= rq_prio(next);
431 }
432 
433 /*
434  * The context descriptor encodes various attributes of a context,
435  * including its GTT address and some flags. Because it's fairly
436  * expensive to calculate, we'll just do it once and cache the result,
437  * which remains valid until the context is unpinned.
438  *
439  * This is what a descriptor looks like, from LSB to MSB::
440  *
441  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
442  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
443  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
444  *      bits 53-54:    mbz, reserved for use by hardware
445  *      bits 55-63:    group ID, currently unused and set to 0
446  *
447  * Starting from Gen11, the upper dword of the descriptor has a new format:
448  *
449  *      bits 32-36:    reserved
450  *      bits 37-47:    SW context ID
451  *      bits 48-53:    engine instance
452  *      bit 54:        mbz, reserved for use by hardware
453  *      bits 55-60:    SW counter
454  *      bits 61-63:    engine class
455  *
456  * engine info, SW context ID and SW counter need to form a unique number
457  * (Context ID) per lrc.
458  */
459 static u64
460 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
461 {
462 	u64 desc;
463 
464 	desc = INTEL_LEGACY_32B_CONTEXT;
465 	if (i915_vm_is_4lvl(ce->vm))
466 		desc = INTEL_LEGACY_64B_CONTEXT;
467 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
468 
469 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
470 	if (IS_GEN(engine->i915, 8))
471 		desc |= GEN8_CTX_L3LLC_COHERENT;
472 
473 	desc |= i915_ggtt_offset(ce->state); /* bits 12-31 */
474 	/*
475 	 * The following 32bits are copied into the OA reports (dword 2).
476 	 * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
477 	 * anything below.
478 	 */
479 	if (INTEL_GEN(engine->i915) >= 11) {
480 		desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
481 								/* bits 48-53 */
482 
483 		desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
484 								/* bits 61-63 */
485 	}
486 
487 	return desc;
488 }
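
/*
 * A worked example of the Gen11+ layout above (engine numbers illustrative):
 * for the second video decode engine (class 1, instance 1) the static part
 * computed here contributes (1ull << 48) | (1ull << 61) to the upper dword,
 * while the per-submission SW context ID is filled in later by
 * __execlists_schedule_in().
 */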
489 
490 static inline unsigned int dword_in_page(void *addr)
491 {
492 	return offset_in_page(addr) / sizeof(u32);
493 }
494 
495 static void set_offsets(u32 *regs,
496 			const u8 *data,
497 			const struct intel_engine_cs *engine,
498 			bool clear)
499 #define NOP(x) (BIT(7) | (x))
500 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
501 #define POSTED BIT(0)
502 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
503 #define REG16(x) \
504 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
505 	(((x) >> 2) & 0x7f)
506 #define END(x) 0, (x)
507 {
508 	const u32 base = engine->mmio_base;
509 
510 	while (*data) {
511 		u8 count, flags;
512 
513 		if (*data & BIT(7)) { /* skip */
514 			count = *data++ & ~BIT(7);
515 			if (clear)
516 				memset32(regs, MI_NOOP, count);
517 			regs += count;
518 			continue;
519 		}
520 
521 		count = *data & 0x3f;
522 		flags = *data >> 6;
523 		data++;
524 
525 		*regs = MI_LOAD_REGISTER_IMM(count);
526 		if (flags & POSTED)
527 			*regs |= MI_LRI_FORCE_POSTED;
528 		if (INTEL_GEN(engine->i915) >= 11)
529 			*regs |= MI_LRI_CS_MMIO;
530 		regs++;
531 
532 		GEM_BUG_ON(!count);
533 		do {
534 			u32 offset = 0;
535 			u8 v;
536 
537 			do {
538 				v = *data++;
539 				offset <<= 7;
540 				offset |= v & ~BIT(7);
541 			} while (v & BIT(7));
542 
543 			regs[0] = base + (offset << 2);
544 			if (clear)
545 				regs[1] = 0;
546 			regs += 2;
547 		} while (--count);
548 	}
549 
550 	if (clear) {
551 		u8 count = *++data;
552 
553 		/* Clear past the tail for HW access */
554 		GEM_BUG_ON(dword_in_page(regs) > count);
555 		memset32(regs, MI_NOOP, count - dword_in_page(regs));
556 
557 		/* Close the batch; used mainly by live_lrc_layout() */
558 		*regs = MI_BATCH_BUFFER_END;
559 		if (INTEL_GEN(engine->i915) >= 10)
560 			*regs |= BIT(0);
561 	}
562 }
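
/*
 * A worked decode of the table encoding (matching the macros above): the
 * opening of gen8_xcs_offsets below, NOP(1), LRI(11, 0), REG16(0x244),
 * REG(0x034), ..., is stored as the bytes 0x81, 0x0b, 0x81 0x11, 0x0d, ...
 * set_offsets() reads 0x81 as "skip one dword" (left as MI_NOOP when
 * clearing), 0x0b as MI_LOAD_REGISTER_IMM(11) with no flags, and then
 * rebuilds each register offset from the 7-bit groups that follow:
 * (0x01 << 7 | 0x11) << 2 == 0x244 and 0x0d << 2 == 0x034, both relative to
 * engine->mmio_base. The END(x) terminator supplies the dword count used to
 * pad the remainder of the state with MI_NOOP when clearing.
 */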
563 
564 static const u8 gen8_xcs_offsets[] = {
565 	NOP(1),
566 	LRI(11, 0),
567 	REG16(0x244),
568 	REG(0x034),
569 	REG(0x030),
570 	REG(0x038),
571 	REG(0x03c),
572 	REG(0x168),
573 	REG(0x140),
574 	REG(0x110),
575 	REG(0x11c),
576 	REG(0x114),
577 	REG(0x118),
578 
579 	NOP(9),
580 	LRI(9, 0),
581 	REG16(0x3a8),
582 	REG16(0x28c),
583 	REG16(0x288),
584 	REG16(0x284),
585 	REG16(0x280),
586 	REG16(0x27c),
587 	REG16(0x278),
588 	REG16(0x274),
589 	REG16(0x270),
590 
591 	NOP(13),
592 	LRI(2, 0),
593 	REG16(0x200),
594 	REG(0x028),
595 
596 	END(80)
597 };
598 
599 static const u8 gen9_xcs_offsets[] = {
600 	NOP(1),
601 	LRI(14, POSTED),
602 	REG16(0x244),
603 	REG(0x034),
604 	REG(0x030),
605 	REG(0x038),
606 	REG(0x03c),
607 	REG(0x168),
608 	REG(0x140),
609 	REG(0x110),
610 	REG(0x11c),
611 	REG(0x114),
612 	REG(0x118),
613 	REG(0x1c0),
614 	REG(0x1c4),
615 	REG(0x1c8),
616 
617 	NOP(3),
618 	LRI(9, POSTED),
619 	REG16(0x3a8),
620 	REG16(0x28c),
621 	REG16(0x288),
622 	REG16(0x284),
623 	REG16(0x280),
624 	REG16(0x27c),
625 	REG16(0x278),
626 	REG16(0x274),
627 	REG16(0x270),
628 
629 	NOP(13),
630 	LRI(1, POSTED),
631 	REG16(0x200),
632 
633 	NOP(13),
634 	LRI(44, POSTED),
635 	REG(0x028),
636 	REG(0x09c),
637 	REG(0x0c0),
638 	REG(0x178),
639 	REG(0x17c),
640 	REG16(0x358),
641 	REG(0x170),
642 	REG(0x150),
643 	REG(0x154),
644 	REG(0x158),
645 	REG16(0x41c),
646 	REG16(0x600),
647 	REG16(0x604),
648 	REG16(0x608),
649 	REG16(0x60c),
650 	REG16(0x610),
651 	REG16(0x614),
652 	REG16(0x618),
653 	REG16(0x61c),
654 	REG16(0x620),
655 	REG16(0x624),
656 	REG16(0x628),
657 	REG16(0x62c),
658 	REG16(0x630),
659 	REG16(0x634),
660 	REG16(0x638),
661 	REG16(0x63c),
662 	REG16(0x640),
663 	REG16(0x644),
664 	REG16(0x648),
665 	REG16(0x64c),
666 	REG16(0x650),
667 	REG16(0x654),
668 	REG16(0x658),
669 	REG16(0x65c),
670 	REG16(0x660),
671 	REG16(0x664),
672 	REG16(0x668),
673 	REG16(0x66c),
674 	REG16(0x670),
675 	REG16(0x674),
676 	REG16(0x678),
677 	REG16(0x67c),
678 	REG(0x068),
679 
680 	END(176)
681 };
682 
683 static const u8 gen12_xcs_offsets[] = {
684 	NOP(1),
685 	LRI(13, POSTED),
686 	REG16(0x244),
687 	REG(0x034),
688 	REG(0x030),
689 	REG(0x038),
690 	REG(0x03c),
691 	REG(0x168),
692 	REG(0x140),
693 	REG(0x110),
694 	REG(0x1c0),
695 	REG(0x1c4),
696 	REG(0x1c8),
697 	REG(0x180),
698 	REG16(0x2b4),
699 
700 	NOP(5),
701 	LRI(9, POSTED),
702 	REG16(0x3a8),
703 	REG16(0x28c),
704 	REG16(0x288),
705 	REG16(0x284),
706 	REG16(0x280),
707 	REG16(0x27c),
708 	REG16(0x278),
709 	REG16(0x274),
710 	REG16(0x270),
711 
712 	END(80)
713 };
714 
715 static const u8 gen8_rcs_offsets[] = {
716 	NOP(1),
717 	LRI(14, POSTED),
718 	REG16(0x244),
719 	REG(0x034),
720 	REG(0x030),
721 	REG(0x038),
722 	REG(0x03c),
723 	REG(0x168),
724 	REG(0x140),
725 	REG(0x110),
726 	REG(0x11c),
727 	REG(0x114),
728 	REG(0x118),
729 	REG(0x1c0),
730 	REG(0x1c4),
731 	REG(0x1c8),
732 
733 	NOP(3),
734 	LRI(9, POSTED),
735 	REG16(0x3a8),
736 	REG16(0x28c),
737 	REG16(0x288),
738 	REG16(0x284),
739 	REG16(0x280),
740 	REG16(0x27c),
741 	REG16(0x278),
742 	REG16(0x274),
743 	REG16(0x270),
744 
745 	NOP(13),
746 	LRI(1, 0),
747 	REG(0x0c8),
748 
749 	END(80)
750 };
751 
752 static const u8 gen9_rcs_offsets[] = {
753 	NOP(1),
754 	LRI(14, POSTED),
755 	REG16(0x244),
756 	REG(0x34),
757 	REG(0x30),
758 	REG(0x38),
759 	REG(0x3c),
760 	REG(0x168),
761 	REG(0x140),
762 	REG(0x110),
763 	REG(0x11c),
764 	REG(0x114),
765 	REG(0x118),
766 	REG(0x1c0),
767 	REG(0x1c4),
768 	REG(0x1c8),
769 
770 	NOP(3),
771 	LRI(9, POSTED),
772 	REG16(0x3a8),
773 	REG16(0x28c),
774 	REG16(0x288),
775 	REG16(0x284),
776 	REG16(0x280),
777 	REG16(0x27c),
778 	REG16(0x278),
779 	REG16(0x274),
780 	REG16(0x270),
781 
782 	NOP(13),
783 	LRI(1, 0),
784 	REG(0xc8),
785 
786 	NOP(13),
787 	LRI(44, POSTED),
788 	REG(0x28),
789 	REG(0x9c),
790 	REG(0xc0),
791 	REG(0x178),
792 	REG(0x17c),
793 	REG16(0x358),
794 	REG(0x170),
795 	REG(0x150),
796 	REG(0x154),
797 	REG(0x158),
798 	REG16(0x41c),
799 	REG16(0x600),
800 	REG16(0x604),
801 	REG16(0x608),
802 	REG16(0x60c),
803 	REG16(0x610),
804 	REG16(0x614),
805 	REG16(0x618),
806 	REG16(0x61c),
807 	REG16(0x620),
808 	REG16(0x624),
809 	REG16(0x628),
810 	REG16(0x62c),
811 	REG16(0x630),
812 	REG16(0x634),
813 	REG16(0x638),
814 	REG16(0x63c),
815 	REG16(0x640),
816 	REG16(0x644),
817 	REG16(0x648),
818 	REG16(0x64c),
819 	REG16(0x650),
820 	REG16(0x654),
821 	REG16(0x658),
822 	REG16(0x65c),
823 	REG16(0x660),
824 	REG16(0x664),
825 	REG16(0x668),
826 	REG16(0x66c),
827 	REG16(0x670),
828 	REG16(0x674),
829 	REG16(0x678),
830 	REG16(0x67c),
831 	REG(0x68),
832 
833 	END(176)
834 };
835 
836 static const u8 gen11_rcs_offsets[] = {
837 	NOP(1),
838 	LRI(15, POSTED),
839 	REG16(0x244),
840 	REG(0x034),
841 	REG(0x030),
842 	REG(0x038),
843 	REG(0x03c),
844 	REG(0x168),
845 	REG(0x140),
846 	REG(0x110),
847 	REG(0x11c),
848 	REG(0x114),
849 	REG(0x118),
850 	REG(0x1c0),
851 	REG(0x1c4),
852 	REG(0x1c8),
853 	REG(0x180),
854 
855 	NOP(1),
856 	LRI(9, POSTED),
857 	REG16(0x3a8),
858 	REG16(0x28c),
859 	REG16(0x288),
860 	REG16(0x284),
861 	REG16(0x280),
862 	REG16(0x27c),
863 	REG16(0x278),
864 	REG16(0x274),
865 	REG16(0x270),
866 
867 	LRI(1, POSTED),
868 	REG(0x1b0),
869 
870 	NOP(10),
871 	LRI(1, 0),
872 	REG(0x0c8),
873 
874 	END(80)
875 };
876 
877 static const u8 gen12_rcs_offsets[] = {
878 	NOP(1),
879 	LRI(13, POSTED),
880 	REG16(0x244),
881 	REG(0x034),
882 	REG(0x030),
883 	REG(0x038),
884 	REG(0x03c),
885 	REG(0x168),
886 	REG(0x140),
887 	REG(0x110),
888 	REG(0x1c0),
889 	REG(0x1c4),
890 	REG(0x1c8),
891 	REG(0x180),
892 	REG16(0x2b4),
893 
894 	NOP(5),
895 	LRI(9, POSTED),
896 	REG16(0x3a8),
897 	REG16(0x28c),
898 	REG16(0x288),
899 	REG16(0x284),
900 	REG16(0x280),
901 	REG16(0x27c),
902 	REG16(0x278),
903 	REG16(0x274),
904 	REG16(0x270),
905 
906 	LRI(3, POSTED),
907 	REG(0x1b0),
908 	REG16(0x5a8),
909 	REG16(0x5ac),
910 
911 	NOP(6),
912 	LRI(1, 0),
913 	REG(0x0c8),
914 
915 	END(80)
916 };
917 
918 #undef END
919 #undef REG16
920 #undef REG
921 #undef LRI
922 #undef NOP
923 
924 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
925 {
926 	/*
927 	 * The gen12+ lists only have the registers we program in the basic
928 	 * default state. We rely on the context image using relative
929 	 * addressing to automatically fix up the register state between the
930 	 * physical engines for the virtual engine.
931 	 */
932 	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
933 		   !intel_engine_has_relative_mmio(engine));
934 
935 	if (engine->class == RENDER_CLASS) {
936 		if (INTEL_GEN(engine->i915) >= 12)
937 			return gen12_rcs_offsets;
938 		else if (INTEL_GEN(engine->i915) >= 11)
939 			return gen11_rcs_offsets;
940 		else if (INTEL_GEN(engine->i915) >= 9)
941 			return gen9_rcs_offsets;
942 		else
943 			return gen8_rcs_offsets;
944 	} else {
945 		if (INTEL_GEN(engine->i915) >= 12)
946 			return gen12_xcs_offsets;
947 		else if (INTEL_GEN(engine->i915) >= 9)
948 			return gen9_xcs_offsets;
949 		else
950 			return gen8_xcs_offsets;
951 	}
952 }
953 
954 static struct i915_request *
955 __unwind_incomplete_requests(struct intel_engine_cs *engine)
956 {
957 	struct i915_request *rq, *rn, *active = NULL;
958 	struct list_head *uninitialized_var(pl);
959 	int prio = I915_PRIORITY_INVALID;
960 
961 	lockdep_assert_held(&engine->active.lock);
962 
963 	list_for_each_entry_safe_reverse(rq, rn,
964 					 &engine->active.requests,
965 					 sched.link) {
966 		if (i915_request_completed(rq))
967 			continue; /* XXX */
968 
969 		__i915_request_unsubmit(rq);
970 
971 		/*
972 		 * Push the request back into the queue for later resubmission.
973 		 * If this request is not native to this physical engine (i.e.
974 		 * it came from a virtual source), push it back onto the virtual
975 		 * engine so that it can be moved across onto another physical
976 		 * engine as load dictates.
977 		 */
978 		if (likely(rq->execution_mask == engine->mask)) {
979 			GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
980 			if (rq_prio(rq) != prio) {
981 				prio = rq_prio(rq);
982 				pl = i915_sched_lookup_priolist(engine, prio);
983 			}
984 			GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
985 
986 			list_move(&rq->sched.link, pl);
987 			set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
988 
989 			active = rq;
990 		} else {
991 			struct intel_engine_cs *owner = rq->context->engine;
992 
993 			/*
994 			 * Decouple the virtual breadcrumb before moving it
995 			 * back to the virtual engine -- we don't want the
996 			 * request to complete in the background and try
997 			 * and cancel the breadcrumb on the virtual engine
998 			 * (instead of the old engine where it is linked)!
999 			 */
1000 			if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
1001 				     &rq->fence.flags)) {
1002 				spin_lock_nested(&rq->lock,
1003 						 SINGLE_DEPTH_NESTING);
1004 				i915_request_cancel_breadcrumb(rq);
1005 				spin_unlock(&rq->lock);
1006 			}
1007 			rq->engine = owner;
1008 			owner->submit_request(rq);
1009 			active = NULL;
1010 		}
1011 	}
1012 
1013 	return active;
1014 }
1015 
1016 struct i915_request *
1017 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1018 {
1019 	struct intel_engine_cs *engine =
1020 		container_of(execlists, typeof(*engine), execlists);
1021 
1022 	return __unwind_incomplete_requests(engine);
1023 }
1024 
1025 static inline void
1026 execlists_context_status_change(struct i915_request *rq, unsigned long status)
1027 {
1028 	/*
1029 	 * This is currently only used when GVT-g is enabled. When GVT-g is disabled,
1030 	 * the compiler should eliminate this function as dead code.
1031 	 */
1032 	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1033 		return;
1034 
1035 	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1036 				   status, rq);
1037 }
1038 
1039 static void intel_engine_context_in(struct intel_engine_cs *engine)
1040 {
1041 	unsigned long flags;
1042 
1043 	if (READ_ONCE(engine->stats.enabled) == 0)
1044 		return;
1045 
1046 	write_seqlock_irqsave(&engine->stats.lock, flags);
1047 
1048 	if (engine->stats.enabled > 0) {
1049 		if (engine->stats.active++ == 0)
1050 			engine->stats.start = ktime_get();
1051 		GEM_BUG_ON(engine->stats.active == 0);
1052 	}
1053 
1054 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1055 }
1056 
1057 static void intel_engine_context_out(struct intel_engine_cs *engine)
1058 {
1059 	unsigned long flags;
1060 
1061 	if (READ_ONCE(engine->stats.enabled) == 0)
1062 		return;
1063 
1064 	write_seqlock_irqsave(&engine->stats.lock, flags);
1065 
1066 	if (engine->stats.enabled > 0) {
1067 		ktime_t last;
1068 
1069 		if (engine->stats.active && --engine->stats.active == 0) {
1070 			/*
1071 			 * Decrement the active context count and, in case the GPU
1072 			 * is now idle, add the elapsed time to the running total.
1073 			 */
1074 			last = ktime_sub(ktime_get(), engine->stats.start);
1075 
1076 			engine->stats.total = ktime_add(engine->stats.total,
1077 							last);
1078 		} else if (engine->stats.active == 0) {
1079 			/*
1080 			 * After turning on engine stats, context out might be
1081 			 * the first event in which case we account from the
1082 			 * time stats gathering was turned on.
1083 			 */
1084 			last = ktime_sub(ktime_get(), engine->stats.enabled_at);
1085 
1086 			engine->stats.total = ktime_add(engine->stats.total,
1087 							last);
1088 		}
1089 	}
1090 
1091 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1092 }
1093 
1094 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
1095 {
1096 	if (INTEL_GEN(engine->i915) >= 12)
1097 		return 0x60;
1098 	else if (INTEL_GEN(engine->i915) >= 9)
1099 		return 0x54;
1100 	else if (engine->class == RENDER_CLASS)
1101 		return 0x58;
1102 	else
1103 		return -1;
1104 }
1105 
1106 static void
1107 execlists_check_context(const struct intel_context *ce,
1108 			const struct intel_engine_cs *engine)
1109 {
1110 	const struct intel_ring *ring = ce->ring;
1111 	u32 *regs = ce->lrc_reg_state;
1112 	bool valid = true;
1113 	int x;
1114 
1115 	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1116 		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1117 		       engine->name,
1118 		       regs[CTX_RING_START],
1119 		       i915_ggtt_offset(ring->vma));
1120 		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1121 		valid = false;
1122 	}
1123 
1124 	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1125 	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1126 		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1127 		       engine->name,
1128 		       regs[CTX_RING_CTL],
1129 		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1130 		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1131 		valid = false;
1132 	}
1133 
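	/*
	 * RING_MI_MODE is a masked register: the upper 16 bits of a write
	 * select which of the lower 16 bits take effect. A context image
	 * that sets STOP_RING in both the value and its mask would halt the
	 * ring on restore, hence the (value & (value >> 16)) test below;
	 * the fixup clears the value bit and sets the mask bit so that the
	 * restore explicitly clears STOP_RING.
	 */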
1134 	x = lrc_ring_mi_mode(engine);
1135 	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1136 		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1137 		       engine->name, regs[x + 1]);
1138 		regs[x + 1] &= ~STOP_RING;
1139 		regs[x + 1] |= STOP_RING << 16;
1140 		valid = false;
1141 	}
1142 
1143 	WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
1144 }
1145 
1146 static void restore_default_state(struct intel_context *ce,
1147 				  struct intel_engine_cs *engine)
1148 {
1149 	u32 *regs = ce->lrc_reg_state;
1150 
1151 	if (engine->pinned_default_state)
1152 		memcpy(regs, /* skip restoring the vanilla PPHWSP */
1153 		       engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
1154 		       engine->context_size - PAGE_SIZE);
1155 
1156 	execlists_init_reg_state(regs, ce, engine, ce->ring, false);
1157 }
1158 
1159 static void reset_active(struct i915_request *rq,
1160 			 struct intel_engine_cs *engine)
1161 {
1162 	struct intel_context * const ce = rq->context;
1163 	u32 head;
1164 
1165 	/*
1166 	 * The executing context has been cancelled. We want to prevent
1167 	 * further execution along this context and propagate the error on
1168 	 * to anything depending on its results.
1169 	 *
1170 	 * In __i915_request_submit(), we apply the -EIO and remove the
1171 	 * requests' payloads for any banned requests. But first, we must
1172 	 * rewind the context back to the start of the incomplete request so
1173 	 * that we do not jump back into the middle of the batch.
1174 	 *
1175 	 * We preserve the breadcrumbs and semaphores of the incomplete
1176 	 * requests so that inter-timeline dependencies (i.e other timelines)
1177 	 * remain correctly ordered. And we defer to __i915_request_submit()
1178 	 * so that all asynchronous waits are correctly handled.
1179 	 */
1180 	ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1181 		     rq->fence.context, rq->fence.seqno);
1182 
1183 	/* On resubmission of the active request, payload will be scrubbed */
1184 	if (i915_request_completed(rq))
1185 		head = rq->tail;
1186 	else
1187 		head = active_request(ce->timeline, rq)->head;
1188 	head = intel_ring_wrap(ce->ring, head);
1189 
1190 	/* Scrub the context image to prevent replaying the previous batch */
1191 	restore_default_state(ce, engine);
1192 	__execlists_update_reg_state(ce, engine, head);
1193 
1194 	/* We've switched away, so this should be a no-op, but intent matters */
1195 	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
1196 }
1197 
1198 static u32 intel_context_get_runtime(const struct intel_context *ce)
1199 {
1200 	/*
1201 	 * We can use either ppHWSP[16] which is recorded before the context
1202 	 * switch (and so excludes the cost of context switches) or use the
1203 	 * value from the context image itself, which is saved/restored earlier
1204 	 * and so includes the cost of the save.
1205 	 */
1206 	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
1207 }
1208 
1209 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1210 {
1211 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1212 	ce->runtime.num_underflow += dt < 0;
1213 	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1214 #endif
1215 }
1216 
1217 static void intel_context_update_runtime(struct intel_context *ce)
1218 {
1219 	u32 old;
1220 	s32 dt;
1221 
1222 	if (intel_context_is_barrier(ce))
1223 		return;
1224 
1225 	old = ce->runtime.last;
1226 	ce->runtime.last = intel_context_get_runtime(ce);
1227 	dt = ce->runtime.last - old;
1228 
1229 	if (unlikely(dt <= 0)) {
1230 		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1231 			 old, ce->runtime.last, dt);
1232 		st_update_runtime_underflow(ce, dt);
1233 		return;
1234 	}
1235 
1236 	ewma_runtime_add(&ce->runtime.avg, dt);
1237 	ce->runtime.total += dt;
1238 }
1239 
1240 static inline struct intel_engine_cs *
1241 __execlists_schedule_in(struct i915_request *rq)
1242 {
1243 	struct intel_engine_cs * const engine = rq->engine;
1244 	struct intel_context * const ce = rq->context;
1245 
1246 	intel_context_get(ce);
1247 
1248 	if (unlikely(intel_context_is_banned(ce)))
1249 		reset_active(rq, engine);
1250 
1251 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1252 		execlists_check_context(ce, engine);
1253 
1254 	ce->lrc_desc &= ~GENMASK_ULL(47, 37);
1255 	if (ce->tag) {
1256 		/* Use a fixed tag for OA and friends */
1257 		ce->lrc_desc |= (u64)ce->tag << 32;
1258 	} else {
1259 		/* We don't need a strict matching tag, just different values */
1260 		ce->lrc_desc |=
1261 			(u64)(++engine->context_tag % NUM_CONTEXT_TAG) <<
1262 			GEN11_SW_CTX_ID_SHIFT;
1263 		BUILD_BUG_ON(NUM_CONTEXT_TAG > GEN12_MAX_CONTEXT_HW_ID);
1264 	}
1265 
1266 	__intel_gt_pm_get(engine->gt);
1267 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1268 	intel_engine_context_in(engine);
1269 
1270 	return engine;
1271 }
1272 
1273 static inline struct i915_request *
1274 execlists_schedule_in(struct i915_request *rq, int idx)
1275 {
1276 	struct intel_context * const ce = rq->context;
1277 	struct intel_engine_cs *old;
1278 
1279 	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1280 	trace_i915_request_in(rq, idx);
1281 
1282 	old = READ_ONCE(ce->inflight);
1283 	do {
1284 		if (!old) {
1285 			WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1286 			break;
1287 		}
1288 	} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1289 
1290 	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1291 	return i915_request_get(rq);
1292 }
1293 
1294 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1295 {
1296 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1297 	struct i915_request *next = READ_ONCE(ve->request);
1298 
1299 	if (next && next->execution_mask & ~rq->execution_mask)
1300 		tasklet_schedule(&ve->base.execlists.tasklet);
1301 }
1302 
1303 static inline void
1304 __execlists_schedule_out(struct i915_request *rq,
1305 			 struct intel_engine_cs * const engine)
1306 {
1307 	struct intel_context * const ce = rq->context;
1308 
1309 	/*
1310 	 * NB process_csb() is not under the engine->active.lock and hence
1311 	 * schedule_out can race with schedule_in meaning that we should
1312 	 * refrain from doing non-trivial work here.
1313 	 */
1314 
1315 	/*
1316 	 * If we have just completed this context, the engine may now be
1317 	 * idle and we want to re-enter powersaving.
1318 	 */
1319 	if (list_is_last(&rq->link, &ce->timeline->requests) &&
1320 	    i915_request_completed(rq))
1321 		intel_engine_add_retire(engine, ce->timeline);
1322 
1323 	intel_context_update_runtime(ce);
1324 	intel_engine_context_out(engine);
1325 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1326 	intel_gt_pm_put_async(engine->gt);
1327 
1328 	/*
1329 	 * If this is part of a virtual engine, its next request may
1330 	 * have been blocked waiting for access to the active context.
1331 	 * We have to kick all the siblings again in case we need to
1332 	 * switch (e.g. the next request is not runnable on this
1333 	 * engine). Hopefully, we will already have submitted the next
1334 	 * request before the tasklet runs and do not need to rebuild
1335 	 * each virtual tree and kick everyone again.
1336 	 */
1337 	if (ce->engine != engine)
1338 		kick_siblings(rq, ce);
1339 
1340 	intel_context_put(ce);
1341 }
1342 
1343 static inline void
1344 execlists_schedule_out(struct i915_request *rq)
1345 {
1346 	struct intel_context * const ce = rq->context;
1347 	struct intel_engine_cs *cur, *old;
1348 
1349 	trace_i915_request_out(rq);
1350 
1351 	old = READ_ONCE(ce->inflight);
1352 	do
1353 		cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1354 	while (!try_cmpxchg(&ce->inflight, &old, cur));
1355 	if (!cur)
1356 		__execlists_schedule_out(rq, old);
1357 
1358 	i915_request_put(rq);
1359 }
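
/*
 * A note on the ce->inflight encoding used by the helpers above (a sketch of
 * the scheme, not a spec of the bit width beyond the 2 bits passed to
 * ptr_unmask_bits): the low bits of the engine pointer double as a count of
 * how many additional requests for this context are in flight. The first
 * submission stores the bare engine pointer, each further request does
 * ptr_inc(), each completion does ptr_dec(), and only when the count is
 * already zero does schedule_out clear the pointer and run
 * __execlists_schedule_out(). E.g. for three back-to-back requests:
 *
 *	NULL -> engine -> engine+1 -> engine+2 -> engine+1 -> engine -> NULL
 */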
1360 
1361 static u64 execlists_update_context(struct i915_request *rq)
1362 {
1363 	struct intel_context *ce = rq->context;
1364 	u64 desc = ce->lrc_desc;
1365 	u32 tail, prev;
1366 
1367 	/*
1368 	 * WaIdleLiteRestore:bdw,skl
1369 	 *
1370 	 * We should never submit the context with the same RING_TAIL twice
1371 	 * just in case we submit an empty ring, which confuses the HW.
1372 	 *
1373 	 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1374 	 * the normal request to be able to always advance the RING_TAIL on
1375 	 * subsequent resubmissions (for lite restore). Should that fail us,
1376 	 * and we try and submit the same tail again, force the context
1377 	 * reload.
1378 	 *
1379 	 * If we need to return to a preempted context, we need to skip the
1380 	 * lite-restore and force it to reload the RING_TAIL. Otherwise, the
1381 	 * HW has a tendency to ignore us rewinding the TAIL to the end of
1382 	 * an earlier request.
1383 	 */
1384 	tail = intel_ring_set_tail(rq->ring, rq->tail);
1385 	prev = ce->lrc_reg_state[CTX_RING_TAIL];
1386 	if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
1387 		desc |= CTX_DESC_FORCE_RESTORE;
1388 	ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1389 	rq->tail = rq->wa_tail;
1390 
1391 	/*
1392 	 * Make sure the context image is complete before we submit it to HW.
1393 	 *
1394 	 * Ostensibly, writes (including the WCB) should be flushed prior to
1395 	 * an uncached write such as our mmio register access, the empirical
1396 	 * evidence (esp. on Braswell) suggests that the WC write into memory
1397 	 * may not be visible to the HW prior to the completion of the UC
1398 	 * register write and that we may begin execution from the context
1399 	 * before its image is complete leading to invalid PD chasing.
1400 	 */
1401 	wmb();
1402 
1403 	ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE;
1404 	return desc;
1405 }
1406 
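/*
 * Write one 64b context descriptor to a submission port. With an ExecList
 * Submission Queue (execlists->ctrl_reg set), the writes merely stage the
 * entries and nothing is submitted until EL_CTRL_LOAD is written in
 * execlists_submit_ports(); on the older ELSP interface the mmio writes
 * themselves perform the submission, which is why the caller writes port 1
 * before port 0, upper dword before lower, with the final write triggering
 * the load.
 */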
1407 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1408 {
1409 	if (execlists->ctrl_reg) {
1410 		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1411 		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1412 	} else {
1413 		writel(upper_32_bits(desc), execlists->submit_reg);
1414 		writel(lower_32_bits(desc), execlists->submit_reg);
1415 	}
1416 }
1417 
1418 static __maybe_unused void
1419 trace_ports(const struct intel_engine_execlists *execlists,
1420 	    const char *msg,
1421 	    struct i915_request * const *ports)
1422 {
1423 	const struct intel_engine_cs *engine =
1424 		container_of(execlists, typeof(*engine), execlists);
1425 
1426 	if (!ports[0])
1427 		return;
1428 
1429 	ENGINE_TRACE(engine, "%s { %llx:%lld%s, %llx:%lld }\n", msg,
1430 		     ports[0]->fence.context,
1431 		     ports[0]->fence.seqno,
1432 		     i915_request_completed(ports[0]) ? "!" :
1433 		     i915_request_started(ports[0]) ? "*" :
1434 		     "",
1435 		     ports[1] ? ports[1]->fence.context : 0,
1436 		     ports[1] ? ports[1]->fence.seqno : 0);
1437 }
1438 
1439 static inline bool
1440 reset_in_progress(const struct intel_engine_execlists *execlists)
1441 {
1442 	return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1443 }
1444 
1445 static __maybe_unused bool
1446 assert_pending_valid(const struct intel_engine_execlists *execlists,
1447 		     const char *msg)
1448 {
1449 	struct i915_request * const *port, *rq;
1450 	struct intel_context *ce = NULL;
1451 
1452 	trace_ports(execlists, msg, execlists->pending);
1453 
1454 	/* We may be messing around with the lists during reset, lalala */
1455 	if (reset_in_progress(execlists))
1456 		return true;
1457 
1458 	if (!execlists->pending[0]) {
1459 		GEM_TRACE_ERR("Nothing pending for promotion!\n");
1460 		return false;
1461 	}
1462 
1463 	if (execlists->pending[execlists_num_ports(execlists)]) {
1464 		GEM_TRACE_ERR("Excess pending[%d] for promotion!\n",
1465 			      execlists_num_ports(execlists));
1466 		return false;
1467 	}
1468 
1469 	for (port = execlists->pending; (rq = *port); port++) {
1470 		unsigned long flags;
1471 		bool ok = true;
1472 
1473 		GEM_BUG_ON(!kref_read(&rq->fence.refcount));
1474 		GEM_BUG_ON(!i915_request_is_active(rq));
1475 
1476 		if (ce == rq->context) {
1477 			GEM_TRACE_ERR("Dup context:%llx in pending[%zd]\n",
1478 				      ce->timeline->fence_context,
1479 				      port - execlists->pending);
1480 			return false;
1481 		}
1482 		ce = rq->context;
1483 
1484 		/* Hold tightly onto the lock to prevent concurrent retires! */
1485 		if (!spin_trylock_irqsave(&rq->lock, flags))
1486 			continue;
1487 
1488 		if (i915_request_completed(rq))
1489 			goto unlock;
1490 
1491 		if (i915_active_is_idle(&ce->active) &&
1492 		    !intel_context_is_barrier(ce)) {
1493 			GEM_TRACE_ERR("Inactive context:%llx in pending[%zd]\n",
1494 				      ce->timeline->fence_context,
1495 				      port - execlists->pending);
1496 			ok = false;
1497 			goto unlock;
1498 		}
1499 
1500 		if (!i915_vma_is_pinned(ce->state)) {
1501 			GEM_TRACE_ERR("Unpinned context:%llx in pending[%zd]\n",
1502 				      ce->timeline->fence_context,
1503 				      port - execlists->pending);
1504 			ok = false;
1505 			goto unlock;
1506 		}
1507 
1508 		if (!i915_vma_is_pinned(ce->ring->vma)) {
1509 			GEM_TRACE_ERR("Unpinned ring:%llx in pending[%zd]\n",
1510 				      ce->timeline->fence_context,
1511 				      port - execlists->pending);
1512 			ok = false;
1513 			goto unlock;
1514 		}
1515 
1516 unlock:
1517 		spin_unlock_irqrestore(&rq->lock, flags);
1518 		if (!ok)
1519 			return false;
1520 	}
1521 
1522 	return ce;
1523 }
1524 
1525 static void execlists_submit_ports(struct intel_engine_cs *engine)
1526 {
1527 	struct intel_engine_execlists *execlists = &engine->execlists;
1528 	unsigned int n;
1529 
1530 	GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1531 
1532 	/*
1533 	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
1534 	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
1535 	 * not be relinquished until the device is idle (see
1536 	 * i915_gem_idle_work_handler()). As a precaution, we make sure
1537 	 * that all ELSP are drained i.e. we have processed the CSB,
1538 	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
1539 	 */
1540 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1541 
1542 	/*
1543 	 * ELSQ note: the submit queue is not cleared after being submitted
1544 	 * to the HW so we need to make sure we always clean it up. This is
1545 	 * currently ensured by the fact that we always write the same number
1546 	 * of elsq entries, keep this in mind before changing the loop below.
1547 	 */
1548 	for (n = execlists_num_ports(execlists); n--; ) {
1549 		struct i915_request *rq = execlists->pending[n];
1550 
1551 		write_desc(execlists,
1552 			   rq ? execlists_update_context(rq) : 0,
1553 			   n);
1554 	}
1555 
1556 	/* we need to manually load the submit queue */
1557 	if (execlists->ctrl_reg)
1558 		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1559 }
1560 
1561 static bool ctx_single_port_submission(const struct intel_context *ce)
1562 {
1563 	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1564 		intel_context_force_single_submission(ce));
1565 }
1566 
1567 static bool can_merge_ctx(const struct intel_context *prev,
1568 			  const struct intel_context *next)
1569 {
1570 	if (prev != next)
1571 		return false;
1572 
1573 	if (ctx_single_port_submission(prev))
1574 		return false;
1575 
1576 	return true;
1577 }
1578 
1579 static bool can_merge_rq(const struct i915_request *prev,
1580 			 const struct i915_request *next)
1581 {
1582 	GEM_BUG_ON(prev == next);
1583 	GEM_BUG_ON(!assert_priority_queue(prev, next));
1584 
1585 	/*
1586 	 * We do not submit known completed requests. Therefore if the next
1587 	 * request is already completed, we can pretend to merge it in
1588 	 * with the previous context (and we will skip updating the ELSP
1589 	 * and tracking). Thus hopefully keeping the ELSP full with active
1590 	 * contexts, despite the best efforts of preempt-to-busy to confuse
1591 	 * us.
1592 	 */
1593 	if (i915_request_completed(next))
1594 		return true;
1595 
1596 	if (unlikely((prev->fence.flags ^ next->fence.flags) &
1597 		     (BIT(I915_FENCE_FLAG_NOPREEMPT) |
1598 		      BIT(I915_FENCE_FLAG_SENTINEL))))
1599 		return false;
1600 
1601 	if (!can_merge_ctx(prev->context, next->context))
1602 		return false;
1603 
1604 	return true;
1605 }
1606 
1607 static void virtual_update_register_offsets(u32 *regs,
1608 					    struct intel_engine_cs *engine)
1609 {
1610 	set_offsets(regs, reg_offsets(engine), engine, false);
1611 }
1612 
1613 static bool virtual_matches(const struct virtual_engine *ve,
1614 			    const struct i915_request *rq,
1615 			    const struct intel_engine_cs *engine)
1616 {
1617 	const struct intel_engine_cs *inflight;
1618 
1619 	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1620 		return false;
1621 
1622 	/*
1623 	 * We track when the HW has completed saving the context image
1624 	 * (i.e. when we have seen the final CS event switching out of
1625 	 * the context) and must not overwrite the context image before
1626 	 * then. This restricts us to only using the active engine
1627 	 * while the previous virtualized request is inflight (so
1628 	 * we reuse the register offsets). This is a very small
1629 	 * hysteresis on the greedy selection algorithm.
1630 	 */
1631 	inflight = intel_context_inflight(&ve->context);
1632 	if (inflight && inflight != engine)
1633 		return false;
1634 
1635 	return true;
1636 }
1637 
1638 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
1639 				     struct intel_engine_cs *engine)
1640 {
1641 	struct intel_engine_cs *old = ve->siblings[0];
1642 
1643 	/* All unattached (rq->engine == old) must already be completed */
1644 
1645 	spin_lock(&old->breadcrumbs.irq_lock);
1646 	if (!list_empty(&ve->context.signal_link)) {
1647 		list_move_tail(&ve->context.signal_link,
1648 			       &engine->breadcrumbs.signalers);
1649 		intel_engine_signal_breadcrumbs(engine);
1650 	}
1651 	spin_unlock(&old->breadcrumbs.irq_lock);
1652 }
1653 
1654 static struct i915_request *
1655 last_active(const struct intel_engine_execlists *execlists)
1656 {
1657 	struct i915_request * const *last = READ_ONCE(execlists->active);
1658 
1659 	while (*last && i915_request_completed(*last))
1660 		last++;
1661 
1662 	return *last;
1663 }
1664 
1665 #define for_each_waiter(p__, rq__) \
1666 	list_for_each_entry_lockless(p__, \
1667 				     &(rq__)->sched.waiters_list, \
1668 				     wait_link)
1669 
1670 #define for_each_signaler(p__, rq__) \
1671 	list_for_each_entry_rcu(p__, \
1672 				&(rq__)->sched.signalers_list, \
1673 				signal_link)
1674 
1675 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1676 {
1677 	LIST_HEAD(list);
1678 
1679 	/*
1680 	 * We want to move the interrupted request to the back of
1681 	 * the round-robin list (i.e. its priority level), but
1682 	 * in doing so, we must then move all requests that were in
1683 	 * flight and were waiting for the interrupted request to
1684 	 * be run after it again.
1685 	 */
1686 	do {
1687 		struct i915_dependency *p;
1688 
1689 		GEM_BUG_ON(i915_request_is_active(rq));
1690 		list_move_tail(&rq->sched.link, pl);
1691 
1692 		for_each_waiter(p, rq) {
1693 			struct i915_request *w =
1694 				container_of(p->waiter, typeof(*w), sched);
1695 
1696 			/* Leave semaphores spinning on the other engines */
1697 			if (w->engine != rq->engine)
1698 				continue;
1699 
1700 			/* No waiter should start before its signaler */
1701 			GEM_BUG_ON(i915_request_started(w) &&
1702 				   !i915_request_completed(rq));
1703 
1704 			GEM_BUG_ON(i915_request_is_active(w));
1705 			if (!i915_request_is_ready(w))
1706 				continue;
1707 
1708 			if (rq_prio(w) < rq_prio(rq))
1709 				continue;
1710 
1711 			GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1712 			list_move_tail(&w->sched.link, &list);
1713 		}
1714 
1715 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1716 	} while (rq);
1717 }
1718 
1719 static void defer_active(struct intel_engine_cs *engine)
1720 {
1721 	struct i915_request *rq;
1722 
1723 	rq = __unwind_incomplete_requests(engine);
1724 	if (!rq)
1725 		return;
1726 
1727 	defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1728 }
1729 
1730 static bool
1731 need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
1732 {
1733 	int hint;
1734 
1735 	if (!intel_engine_has_timeslices(engine))
1736 		return false;
1737 
1738 	if (list_is_last(&rq->sched.link, &engine->active.requests))
1739 		return false;
1740 
1741 	hint = max(rq_prio(list_next_entry(rq, sched.link)),
1742 		   engine->execlists.queue_priority_hint);
1743 
1744 	return hint >= effective_prio(rq);
1745 }
1746 
1747 static int
1748 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1749 {
1750 	if (list_is_last(&rq->sched.link, &engine->active.requests))
1751 		return INT_MIN;
1752 
1753 	return rq_prio(list_next_entry(rq, sched.link));
1754 }
1755 
1756 static inline unsigned long
1757 timeslice(const struct intel_engine_cs *engine)
1758 {
1759 	return READ_ONCE(engine->props.timeslice_duration_ms);
1760 }
1761 
1762 static unsigned long
1763 active_timeslice(const struct intel_engine_cs *engine)
1764 {
1765 	const struct i915_request *rq = *engine->execlists.active;
1766 
1767 	if (!rq || i915_request_completed(rq))
1768 		return 0;
1769 
1770 	if (engine->execlists.switch_priority_hint < effective_prio(rq))
1771 		return 0;
1772 
1773 	return timeslice(engine);
1774 }
1775 
1776 static void set_timeslice(struct intel_engine_cs *engine)
1777 {
1778 	if (!intel_engine_has_timeslices(engine))
1779 		return;
1780 
1781 	set_timer_ms(&engine->execlists.timer, active_timeslice(engine));
1782 }
1783 
1784 static void record_preemption(struct intel_engine_execlists *execlists)
1785 {
1786 	(void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
1787 }
1788 
1789 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine)
1790 {
1791 	struct i915_request *rq;
1792 
1793 	rq = last_active(&engine->execlists);
1794 	if (!rq)
1795 		return 0;
1796 
1797 	/* Force a fast reset for terminated contexts (ignoring sysfs!) */
1798 	if (unlikely(intel_context_is_banned(rq->context)))
1799 		return 1;
1800 
1801 	return READ_ONCE(engine->props.preempt_timeout_ms);
1802 }
1803 
1804 static void set_preempt_timeout(struct intel_engine_cs *engine)
1805 {
1806 	if (!intel_engine_has_preempt_reset(engine))
1807 		return;
1808 
1809 	set_timer_ms(&engine->execlists.preempt,
1810 		     active_preempt_timeout(engine));
1811 }
1812 
1813 static inline void clear_ports(struct i915_request **ports, int count)
1814 {
1815 	memset_p((void **)ports, NULL, count);
1816 }
1817 
1818 static void execlists_dequeue(struct intel_engine_cs *engine)
1819 {
1820 	struct intel_engine_execlists * const execlists = &engine->execlists;
1821 	struct i915_request **port = execlists->pending;
1822 	struct i915_request ** const last_port = port + execlists->port_mask;
1823 	struct i915_request *last;
1824 	struct rb_node *rb;
1825 	bool submit = false;
1826 
1827 	/*
1828 	 * Hardware submission is through 2 ports. Conceptually each port
1829 	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
1830 	 * static for a context, and unique to each, so we only execute
1831 	 * requests belonging to a single context from each ring. RING_HEAD
1832 	 * is maintained by the CS in the context image, it marks the place
1833 	 * where it got up to last time, and through RING_TAIL we tell the CS
1834 	 * where we want to execute up to this time.
1835 	 *
1836 	 * In this list the requests are in order of execution. Consecutive
1837 	 * requests from the same context are adjacent in the ringbuffer. We
1838 	 * can combine these requests into a single RING_TAIL update:
1839 	 *
1840 	 *              RING_HEAD...req1...req2
1841 	 *                                    ^- RING_TAIL
1842 	 * since to execute req2 the CS must first execute req1.
1843 	 *
1844 	 * Our goal then is to point each port to the end of a consecutive
1845 	 * sequence of requests as being the most optimal (fewest wake ups
1846 	 * and context switches) submission.
1847 	 */
1848 
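	/*
	 * Find the first virtual engine request that can run on this
	 * engine, pruning any stale nodes (whose request has already
	 * been consumed by a sibling) as we go.
	 */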
1849 	for (rb = rb_first_cached(&execlists->virtual); rb; ) {
1850 		struct virtual_engine *ve =
1851 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1852 		struct i915_request *rq = READ_ONCE(ve->request);
1853 
1854 		if (!rq) { /* lazily clean up after another engine handled rq */
1855 			rb_erase_cached(rb, &execlists->virtual);
1856 			RB_CLEAR_NODE(rb);
1857 			rb = rb_first_cached(&execlists->virtual);
1858 			continue;
1859 		}
1860 
1861 		if (!virtual_matches(ve, rq, engine)) {
1862 			rb = rb_next(rb);
1863 			continue;
1864 		}
1865 
1866 		break;
1867 	}
1868 
1869 	/*
1870 	 * If the queue is higher priority than the last
1871 	 * request in the currently active context, submit afresh.
1872 	 * We will resubmit again afterwards in case we need to split
1873 	 * the active context to interject the preemption request,
1874 	 * i.e. we will retrigger preemption following the ack in case
1875 	 * of trouble.
1876 	 */
1877 	last = last_active(execlists);
1878 	if (last) {
1879 		if (need_preempt(engine, last, rb)) {
1880 			ENGINE_TRACE(engine,
1881 				     "preempting last=%llx:%lld, prio=%d, hint=%d\n",
1882 				     last->fence.context,
1883 				     last->fence.seqno,
1884 				     last->sched.attr.priority,
1885 				     execlists->queue_priority_hint);
1886 			record_preemption(execlists);
1887 
1888 			/*
1889 			 * Don't let the RING_HEAD advance past the breadcrumb
1890 			 * as we unwind (and until we resubmit) so that we do
1891 			 * not accidentally tell it to go backwards.
1892 			 */
1893 			ring_set_paused(engine, 1);
1894 
1895 			/*
1896 			 * Note that we have not stopped the GPU at this point,
1897 			 * so we are unwinding the incomplete requests as they
1898 			 * remain inflight and so by the time we do complete
1899 			 * the preemption, some of the unwound requests may
1900 			 * complete!
1901 			 */
1902 			__unwind_incomplete_requests(engine);
1903 
1904 			last = NULL;
1905 		} else if (need_timeslice(engine, last) &&
1906 			   timer_expired(&engine->execlists.timer)) {
1907 			ENGINE_TRACE(engine,
1908 				     "expired last=%llx:%lld, prio=%d, hint=%d\n",
1909 				     last->fence.context,
1910 				     last->fence.seqno,
1911 				     last->sched.attr.priority,
1912 				     execlists->queue_priority_hint);
1913 
1914 			ring_set_paused(engine, 1);
1915 			defer_active(engine);
1916 
1917 			/*
1918 			 * Unlike for preemption, if we rewind and continue
1919 			 * executing the same context as previously active,
1920 			 * the order of execution will remain the same and
1921 			 * the tail will only advance. We do not need to
1922 			 * force a full context restore, as a lite-restore
1923 			 * is sufficient to resample the monotonic TAIL.
1924 			 *
1925 			 * If we switch to any other context, similarly we
1926 			 * will not rewind TAIL of current context, and
1927 			 * normal save/restore will preserve state and allow
1928 			 * us to later continue executing the same request.
1929 			 */
1930 			last = NULL;
1931 		} else {
1932 			/*
1933 			 * Otherwise if we already have a request pending
1934 			 * for execution after the current one, we can
1935 			 * just wait until the next CS event before
1936 			 * queuing more. In either case we will force a
1937 			 * lite-restore preemption event, but if we wait
1938 			 * we hopefully coalesce several updates into a single
1939 			 * submission.
1940 			 */
1941 			if (!list_is_last(&last->sched.link,
1942 					  &engine->active.requests)) {
1943 				/*
1944 				 * Even if ELSP[1] is occupied and not worthy
1945 				 * of timeslices, our queue might be.
1946 				 */
1947 				if (!execlists->timer.expires &&
1948 				    need_timeslice(engine, last))
1949 					set_timer_ms(&execlists->timer,
1950 						     timeslice(engine));
1951 
1952 				return;
1953 			}
1954 		}
1955 	}
1956 
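	/*
	 * First consider the virtual engine request found above; it takes
	 * precedence over the normal priority queue if it is of equal or
	 * higher priority than the queue and can be coalesced with any
	 * request already selected as last.
	 */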
1957 	while (rb) { /* XXX virtual is always taking precedence */
1958 		struct virtual_engine *ve =
1959 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1960 		struct i915_request *rq;
1961 
1962 		spin_lock(&ve->base.active.lock);
1963 
1964 		rq = ve->request;
1965 		if (unlikely(!rq)) { /* lost the race to a sibling */
1966 			spin_unlock(&ve->base.active.lock);
1967 			rb_erase_cached(rb, &execlists->virtual);
1968 			RB_CLEAR_NODE(rb);
1969 			rb = rb_first_cached(&execlists->virtual);
1970 			continue;
1971 		}
1972 
1973 		GEM_BUG_ON(rq != ve->request);
1974 		GEM_BUG_ON(rq->engine != &ve->base);
1975 		GEM_BUG_ON(rq->context != &ve->context);
1976 
1977 		if (rq_prio(rq) >= queue_prio(execlists)) {
1978 			if (!virtual_matches(ve, rq, engine)) {
1979 				spin_unlock(&ve->base.active.lock);
1980 				rb = rb_next(rb);
1981 				continue;
1982 			}
1983 
1984 			if (last && !can_merge_rq(last, rq)) {
1985 				spin_unlock(&ve->base.active.lock);
1986 				return; /* leave this for another */
1987 			}
1988 
1989 			ENGINE_TRACE(engine,
1990 				     "virtual rq=%llx:%lld%s, new engine? %s\n",
1991 				     rq->fence.context,
1992 				     rq->fence.seqno,
1993 				     i915_request_completed(rq) ? "!" :
1994 				     i915_request_started(rq) ? "*" :
1995 				     "",
1996 				     yesno(engine != ve->siblings[0]));
1997 
1998 			ve->request = NULL;
1999 			ve->base.execlists.queue_priority_hint = INT_MIN;
2000 			rb_erase_cached(rb, &execlists->virtual);
2001 			RB_CLEAR_NODE(rb);
2002 
2003 			GEM_BUG_ON(!(rq->execution_mask & engine->mask));
2004 			rq->engine = engine;
2005 
2006 			if (engine != ve->siblings[0]) {
2007 				u32 *regs = ve->context.lrc_reg_state;
2008 				unsigned int n;
2009 
2010 				GEM_BUG_ON(READ_ONCE(ve->context.inflight));
2011 
2012 				if (!intel_engine_has_relative_mmio(engine))
2013 					virtual_update_register_offsets(regs,
2014 									engine);
2015 
2016 				if (!list_empty(&ve->context.signals))
2017 					virtual_xfer_breadcrumbs(ve, engine);
2018 
2019 				/*
2020 				 * Move the bound engine to the top of the list
2021 				 * for future execution. We then kick this
2022 				 * tasklet first before checking others, so that
2023 				 * we preferentially reuse this set of bound
2024 				 * registers.
2025 				 */
2026 				for (n = 1; n < ve->num_siblings; n++) {
2027 					if (ve->siblings[n] == engine) {
2028 						swap(ve->siblings[n],
2029 						     ve->siblings[0]);
2030 						break;
2031 					}
2032 				}
2033 
2034 				GEM_BUG_ON(ve->siblings[0] != engine);
2035 			}
2036 
2037 			if (__i915_request_submit(rq)) {
2038 				submit = true;
2039 				last = rq;
2040 			}
2041 			i915_request_put(rq);
2042 
2043 			/*
2044 			 * Hmm, we have a bunch of virtual engine requests,
2045 			 * but the first one was already completed (thanks
2046 			 * preempt-to-busy!). Keep looking at the veng queue
2047 			 * until we have no more relevant requests (i.e.
2048 			 * the normal submit queue has higher priority).
2049 			 */
2050 			if (!submit) {
2051 				spin_unlock(&ve->base.active.lock);
2052 				rb = rb_first_cached(&execlists->virtual);
2053 				continue;
2054 			}
2055 		}
2056 
2057 		spin_unlock(&ve->base.active.lock);
2058 		break;
2059 	}
2060 
2061 	while ((rb = rb_first_cached(&execlists->queue))) {
2062 		struct i915_priolist *p = to_priolist(rb);
2063 		struct i915_request *rq, *rn;
2064 		int i;
2065 
2066 		priolist_for_each_request_consume(rq, rn, p, i) {
2067 			bool merge = true;
2068 
2069 			/*
2070 			 * Can we combine this request with the current port?
2071 			 * It has to be the same context/ringbuffer and not
2072 			 * have any exceptions (e.g. GVT saying never to
2073 			 * combine contexts).
2074 			 *
2075 			 * If we can combine the requests, we can execute both
2076 			 * by updating the RING_TAIL to point to the end of the
2077 			 * second request, and so we never need to tell the
2078 			 * hardware about the first.
2079 			 */
2080 			if (last && !can_merge_rq(last, rq)) {
2081 				/*
2082 				 * If we are on the second port and cannot
2083 				 * combine this request with the last, then we
2084 				 * are done.
2085 				 */
2086 				if (port == last_port)
2087 					goto done;
2088 
2089 				/*
2090 				 * We must not populate both ELSP[] with the
2091 				 * same LRCA, i.e. we must submit 2 different
2092 				 * contexts if we submit 2 ELSP.
2093 				 */
2094 				if (last->context == rq->context)
2095 					goto done;
2096 
2097 				if (i915_request_has_sentinel(last))
2098 					goto done;
2099 
2100 				/*
2101 				 * If GVT overrides us we only ever submit
2102 				 * port[0], leaving port[1] empty. Note that we
2103 				 * also have to be careful that we don't queue
2104 				 * the same context (even though it is a different
2105 				 * request) to the second port.
2106 				 */
2107 				if (ctx_single_port_submission(last->context) ||
2108 				    ctx_single_port_submission(rq->context))
2109 					goto done;
2110 
2111 				merge = false;
2112 			}
2113 
2114 			if (__i915_request_submit(rq)) {
2115 				if (!merge) {
2116 					*port = execlists_schedule_in(last, port - execlists->pending);
2117 					port++;
2118 					last = NULL;
2119 				}
2120 
2121 				GEM_BUG_ON(last &&
2122 					   !can_merge_ctx(last->context,
2123 							  rq->context));
2124 
2125 				submit = true;
2126 				last = rq;
2127 			}
2128 		}
2129 
2130 		rb_erase_cached(&p->node, &execlists->queue);
2131 		i915_priolist_free(p);
2132 	}
2133 
2134 done:
2135 	/*
2136 	 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2137 	 *
2138 	 * We choose the priority hint such that if we add a request of greater
2139 	 * priority than this, we kick the submission tasklet to decide on
2140 	 * the right order of submitting the requests to hardware. We must
2141 	 * also be prepared to reorder requests as they are in-flight on the
2142 	 * HW. We derive the priority hint then as the first "hole" in
2143 	 * the HW submission ports and if there are no available slots,
2144 	 * the priority of the lowest executing request, i.e. last.
2145 	 *
2146 	 * When we do receive a higher priority request ready to run from the
2147 	 * user, see queue_request(), the priority hint is bumped to that
2148 	 * request triggering preemption on the next dequeue (or subsequent
2149 	 * interrupt for secondary ports).
2150 	 */
2151 	execlists->queue_priority_hint = queue_prio(execlists);
2152 
2153 	if (submit) {
2154 		*port = execlists_schedule_in(last, port - execlists->pending);
2155 		execlists->switch_priority_hint =
2156 			switch_prio(engine, *execlists->pending);
2157 
2158 		/*
2159 		 * Skip if we ended up with exactly the same set of requests,
2160 		 * e.g. trying to timeslice a pair of ordered contexts
2161 		 */
2162 		if (!memcmp(execlists->active, execlists->pending,
2163 			    (port - execlists->pending + 1) * sizeof(*port))) {
2164 			do
2165 				execlists_schedule_out(fetch_and_zero(port));
2166 			while (port-- != execlists->pending);
2167 
2168 			goto skip_submit;
2169 		}
2170 		clear_ports(port + 1, last_port - port);
2171 
2172 		execlists_submit_ports(engine);
2173 		set_preempt_timeout(engine);
2174 	} else {
2175 skip_submit:
2176 		ring_set_paused(engine, 0);
2177 	}
2178 }
2179 
2180 static void
2181 cancel_port_requests(struct intel_engine_execlists * const execlists)
2182 {
2183 	struct i915_request * const *port;
2184 
2185 	for (port = execlists->pending; *port; port++)
2186 		execlists_schedule_out(*port);
2187 	clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2188 
2189 	/* Mark the end of active before we overwrite *active */
2190 	for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2191 		execlists_schedule_out(*port);
2192 	clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2193 
2194 	WRITE_ONCE(execlists->active, execlists->inflight);
2195 }
2196 
2197 static inline void
2198 invalidate_csb_entries(const u32 *first, const u32 *last)
2199 {
2200 	clflush((void *)first);
2201 	clflush((void *)last);
2202 }
2203 
2204 /*
2205  * Starting with Gen12, the status has a new format:
2206  *
2207  *     bit  0:     switched to new queue
2208  *     bit  1:     reserved
2209  *     bit  2:     semaphore wait mode (poll or signal), only valid when
2210  *                 switch detail is set to "wait on semaphore"
2211  *     bits 3-5:   engine class
2212  *     bits 6-11:  engine instance
2213  *     bits 12-14: reserved
2214  *     bits 15-25: sw context id of the lrc the GT switched to
2215  *     bits 26-31: sw counter of the lrc the GT switched to
2216  *     bits 32-35: context switch detail
2217  *                  - 0: ctx complete
2218  *                  - 1: wait on sync flip
2219  *                  - 2: wait on vblank
2220  *                  - 3: wait on scanline
2221  *                  - 4: wait on semaphore
2222  *                  - 5: context preempted (not on SEMAPHORE_WAIT or
2223  *                       WAIT_FOR_EVENT)
2224  *     bit  36:    reserved
2225  *     bits 37-43: wait detail (for switch detail 1 to 4)
2226  *     bits 44-46: reserved
2227  *     bits 47-57: sw context id of the lrc the GT switched away from
2228  *     bits 58-63: sw counter of the lrc the GT switched away from
2229  */
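/*
 * Purely for illustration (this is not how the driver reads the CSB): given
 * the layout above, the "switched to" fields live in the low dword and could
 * be unpacked as, e.g.,
 *
 *	to_ctx_id  = (csb[0] >> 15) & 0x7ff;	(bits 15-25)
 *	to_counter = (csb[0] >> 26) & 0x3f;	(bits 26-31)
 *
 * where to_ctx_id/to_counter are just local names for the fields described
 * above. gen12_csb_parse() below only needs the validity and new-queue bits,
 * which it extracts with GEN12_CSB_CTX_VALID() and
 * GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE.
 */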
2230 static inline bool
2231 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2232 {
2233 	u32 lower_dw = csb[0];
2234 	u32 upper_dw = csb[1];
2235 	bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
2236 	bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
2237 	bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2238 
2239 	/*
2240 	 * The context switch detail is not guaranteed to be 5 when a preemption
2241 	 * occurs, so we can't just check for that. The check below works for
2242 	 * all the cases we care about, including preemptions of WAIT
2243 	 * instructions and lite-restore. Preempt-to-idle via the CTRL register
2244 	 * would require some extra handling, but we don't support that.
2245 	 */
2246 	if (!ctx_away_valid || new_queue) {
2247 		GEM_BUG_ON(!ctx_to_valid);
2248 		return true;
2249 	}
2250 
2251 	/*
2252 	 * switch detail = 5 is covered by the case above and we do not expect a
2253 	 * context switch on an unsuccessful wait instruction since we always
2254 	 * use polling mode.
2255 	 */
2256 	GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
2257 	return false;
2258 }
2259 
2260 static inline bool
2261 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2262 {
2263 	return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2264 }
2265 
2266 static void process_csb(struct intel_engine_cs *engine)
2267 {
2268 	struct intel_engine_execlists * const execlists = &engine->execlists;
2269 	const u32 * const buf = execlists->csb_status;
2270 	const u8 num_entries = execlists->csb_size;
2271 	u8 head, tail;
2272 
2273 	/*
2274 	 * As we modify our execlists state tracking we require exclusive
2275 	 * access. Either we are inside the tasklet, or the tasklet is disabled
2276 	 * and we assume that is only inside the reset paths and so serialised.
2277 	 */
2278 	GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2279 		   !reset_in_progress(execlists));
2280 	GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2281 
2282 	/*
2283 	 * Note that csb_write, csb_status may be either in HWSP or mmio.
2284 	 * When reading from the csb_write mmio register, we have to be
2285 	 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2286 	 * the low 4bits. As it happens we know the next 4bits are always
2287 	 * zero and so we can simply mask off the low u8 of the register
2288 	 * and treat it identically to reading from the HWSP (without having
2289 	 * to use explicit shifting and masking, and probably bifurcating
2290 	 * the code to handle the legacy mmio read).
2291 	 */
2292 	head = execlists->csb_head;
2293 	tail = READ_ONCE(*execlists->csb_write);
2294 	if (unlikely(head == tail))
2295 		return;
2296 
2297 	/*
2298 	 * Hopefully paired with a wmb() in HW!
2299 	 *
2300 	 * We must complete the read of the write pointer before any reads
2301 	 * from the CSB, so that we do not see stale values. Without an rmb
2302 	 * (lfence) the HW may speculatively perform the CSB[] reads *before*
2303 	 * we perform the READ_ONCE(*csb_write).
2304 	 */
2305 	rmb();
2306 
2307 	ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2308 	do {
2309 		bool promote;
2310 
2311 		if (++head == num_entries)
2312 			head = 0;
2313 
2314 		/*
2315 		 * We are flying near dragons again.
2316 		 *
2317 		 * We hold a reference to the request in execlist_port[]
2318 		 * but no more than that. We are operating in softirq
2319 		 * context and so cannot hold any mutex or sleep. That
2320 		 * means we cannot prevent the requests we are processing
2321 		 * in port[] from being retired simultaneously (the
2322 		 * breadcrumb will be complete before we see the
2323 		 * context-switch). As we only hold the reference to the
2324 		 * request, any pointer chasing underneath the request
2325 		 * is subject to a potential use-after-free. Thus we
2326 		 * store all of the bookkeeping within port[] as
2327 		 * required, and avoid using unguarded pointers beneath
2328 		 * request itself. The same applies to the atomic
2329 		 * status notifier.
2330 		 */
2331 
2332 		ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2333 			     head, buf[2 * head + 0], buf[2 * head + 1]);
2334 
2335 		if (INTEL_GEN(engine->i915) >= 12)
2336 			promote = gen12_csb_parse(execlists, buf + 2 * head);
2337 		else
2338 			promote = gen8_csb_parse(execlists, buf + 2 * head);
2339 		if (promote) {
2340 			struct i915_request * const *old = execlists->active;
2341 
2342 			GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2343 
2344 			ring_set_paused(engine, 0);
2345 
2346 			/* Point active to the new ELSP; prevent overwriting */
2347 			WRITE_ONCE(execlists->active, execlists->pending);
2348 
2349 			/* cancel old inflight, prepare for switch */
2350 			trace_ports(execlists, "preempted", old);
2351 			while (*old)
2352 				execlists_schedule_out(*old++);
2353 
2354 			/* switch pending to inflight */
2355 			WRITE_ONCE(execlists->active,
2356 				   memcpy(execlists->inflight,
2357 					  execlists->pending,
2358 					  execlists_num_ports(execlists) *
2359 					  sizeof(*execlists->pending)));
2360 
2361 			WRITE_ONCE(execlists->pending[0], NULL);
2362 		} else {
2363 			GEM_BUG_ON(!*execlists->active);
2364 
2365 			/* port0 completed, advanced to port1 */
2366 			trace_ports(execlists, "completed", execlists->active);
2367 
2368 			/*
2369 			 * We rely on the hardware being strongly
2370 			 * ordered, that the breadcrumb write is
2371 			 * coherent (visible from the CPU) before the
2372 			 * user interrupt and CSB is processed.
2373 			 */
2374 			if (GEM_SHOW_DEBUG() &&
2375 			    !i915_request_completed(*execlists->active) &&
2376 			    !reset_in_progress(execlists)) {
2377 				struct i915_request *rq __maybe_unused =
2378 					*execlists->active;
2379 				const u32 *regs __maybe_unused =
2380 					rq->context->lrc_reg_state;
2381 
2382 				ENGINE_TRACE(engine,
2383 					     "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n",
2384 					     ENGINE_READ(engine, RING_START),
2385 					     ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR,
2386 					     ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR,
2387 					     ENGINE_READ(engine, RING_CTL),
2388 					     ENGINE_READ(engine, RING_MI_MODE));
2389 				ENGINE_TRACE(engine,
2390 					     "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ",
2391 					     i915_ggtt_offset(rq->ring->vma),
2392 					     rq->head, rq->tail,
2393 					     rq->fence.context,
2394 					     lower_32_bits(rq->fence.seqno),
2395 					     hwsp_seqno(rq));
2396 				ENGINE_TRACE(engine,
2397 					     "ctx:{start:%08x, head:%04x, tail:%04x}, ",
2398 					     regs[CTX_RING_START],
2399 					     regs[CTX_RING_HEAD],
2400 					     regs[CTX_RING_TAIL]);
2401 
2402 				GEM_BUG_ON("context completed before request");
2403 			}
2404 
2405 			execlists_schedule_out(*execlists->active++);
2406 
2407 			GEM_BUG_ON(execlists->active - execlists->inflight >
2408 				   execlists_num_ports(execlists));
2409 		}
2410 	} while (head != tail);
2411 
2412 	execlists->csb_head = head;
2413 	set_timeslice(engine);
2414 
2415 	/*
2416 	 * Gen11 has proven to fail wrt the global observation point
2417 	 * between entry and tail update, failing on the ordering and
2418 	 * thus we see an old entry in the context status buffer.
2419 	 *
2420 	 * Forcibly evict the entries ahead of the next gpu csb update,
2421 	 * to increase the odds that we get fresh entries even on
2422 	 * non-working hardware. The cost of doing so comes out mostly
2423 	 * in the wash as hardware, working or not, will need to do the
2424 	 * invalidation before.
2425 	 */
2426 	invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2427 }
2428 
2429 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2430 {
2431 	lockdep_assert_held(&engine->active.lock);
2432 	if (!READ_ONCE(engine->execlists.pending[0])) {
2433 		rcu_read_lock(); /* protect peeking at execlists->active */
2434 		execlists_dequeue(engine);
2435 		rcu_read_unlock();
2436 	}
2437 }
2438 
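/*
 * Suspend rq and all of its ready waiters on this engine: unsubmit anything
 * already in flight and park the requests on the engine's hold list, where
 * they remain ineligible for submission until __execlists_unhold().
 */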
2439 static void __execlists_hold(struct i915_request *rq)
2440 {
2441 	LIST_HEAD(list);
2442 
2443 	do {
2444 		struct i915_dependency *p;
2445 
2446 		if (i915_request_is_active(rq))
2447 			__i915_request_unsubmit(rq);
2448 
2449 		clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2450 		list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2451 		i915_request_set_hold(rq);
2452 		RQ_TRACE(rq, "on hold\n");
2453 
2454 		for_each_waiter(p, rq) {
2455 			struct i915_request *w =
2456 				container_of(p->waiter, typeof(*w), sched);
2457 
2458 			/* Leave semaphores spinning on the other engines */
2459 			if (w->engine != rq->engine)
2460 				continue;
2461 
2462 			if (!i915_request_is_ready(w))
2463 				continue;
2464 
2465 			if (i915_request_completed(w))
2466 				continue;
2467 
2468 			if (i915_request_on_hold(w))
2469 				continue;
2470 
2471 			list_move_tail(&w->sched.link, &list);
2472 		}
2473 
2474 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2475 	} while (rq);
2476 }
2477 
2478 static bool execlists_hold(struct intel_engine_cs *engine,
2479 			   struct i915_request *rq)
2480 {
2481 	spin_lock_irq(&engine->active.lock);
2482 
2483 	if (i915_request_completed(rq)) { /* too late! */
2484 		rq = NULL;
2485 		goto unlock;
2486 	}
2487 
2488 	if (rq->engine != engine) { /* preempted virtual engine */
2489 		struct virtual_engine *ve = to_virtual_engine(rq->engine);
2490 
2491 		/*
2492 		 * intel_context_inflight() is only protected by virtue
2493 		 * of process_csb() being called only by the tasklet (or
2494 		 * directly from inside reset while the tasklet is suspended).
2495 		 * Assert that neither of those are allowed to run while we
2496 		 * poke at the request queues.
2497 		 */
2498 		GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2499 
2500 		/*
2501 		 * An unsubmitted request along a virtual engine will
2502 		 * remain on the active (this) engine until we are able
2503 		 * to process the context switch away (and so mark the
2504 		 * context as no longer in flight). That cannot have happened
2505 		 * yet, otherwise we would not be hanging!
2506 		 */
2507 		spin_lock(&ve->base.active.lock);
2508 		GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2509 		GEM_BUG_ON(ve->request != rq);
2510 		ve->request = NULL;
2511 		spin_unlock(&ve->base.active.lock);
2512 		i915_request_put(rq);
2513 
2514 		rq->engine = engine;
2515 	}
2516 
2517 	/*
2518 	 * Transfer this request onto the hold queue to prevent it
2519 	 * being resubmitted to HW (and potentially completed) before we have
2520 	 * released it. Since we may have already submitted following
2521 	 * requests, we need to remove those as well.
2522 	 */
2523 	GEM_BUG_ON(i915_request_on_hold(rq));
2524 	GEM_BUG_ON(rq->engine != engine);
2525 	__execlists_hold(rq);
2526 	GEM_BUG_ON(list_empty(&engine->active.hold));
2527 
2528 unlock:
2529 	spin_unlock_irq(&engine->active.lock);
2530 	return rq;
2531 }
2532 
2533 static bool hold_request(const struct i915_request *rq)
2534 {
2535 	struct i915_dependency *p;
2536 	bool result = false;
2537 
2538 	/*
2539 	 * If one of our ancestors is on hold, we must also be on hold,
2540 	 * otherwise we will bypass it and execute before it.
2541 	 */
2542 	rcu_read_lock();
2543 	for_each_signaler(p, rq) {
2544 		const struct i915_request *s =
2545 			container_of(p->signaler, typeof(*s), sched);
2546 
2547 		if (s->engine != rq->engine)
2548 			continue;
2549 
2550 		result = i915_request_on_hold(s);
2551 		if (result)
2552 			break;
2553 	}
2554 	rcu_read_unlock();
2555 
2556 	return result;
2557 }
2558 
2559 static void __execlists_unhold(struct i915_request *rq)
2560 {
2561 	LIST_HEAD(list);
2562 
2563 	do {
2564 		struct i915_dependency *p;
2565 
2566 		RQ_TRACE(rq, "hold release\n");
2567 
2568 		GEM_BUG_ON(!i915_request_on_hold(rq));
2569 		GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2570 
2571 		i915_request_clear_hold(rq);
2572 		list_move_tail(&rq->sched.link,
2573 			       i915_sched_lookup_priolist(rq->engine,
2574 							  rq_prio(rq)));
2575 		set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2576 
2577 		/* Also release any children on this engine that are ready */
2578 		for_each_waiter(p, rq) {
2579 			struct i915_request *w =
2580 				container_of(p->waiter, typeof(*w), sched);
2581 
2582 			if (w->engine != rq->engine)
2583 				continue;
2584 
2585 			if (!i915_request_on_hold(w))
2586 				continue;
2587 
2588 			/* Check that no other parents are also on hold */
2589 			if (hold_request(w))
2590 				continue;
2591 
2592 			list_move_tail(&w->sched.link, &list);
2593 		}
2594 
2595 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2596 	} while (rq);
2597 }
2598 
2599 static void execlists_unhold(struct intel_engine_cs *engine,
2600 			     struct i915_request *rq)
2601 {
2602 	spin_lock_irq(&engine->active.lock);
2603 
2604 	/*
2605 	 * Move this request back to the priority queue, and all of its
2606 	 * children and grandchildren that were suspended along with it.
2607 	 */
2608 	__execlists_unhold(rq);
2609 
2610 	if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
2611 		engine->execlists.queue_priority_hint = rq_prio(rq);
2612 		tasklet_hi_schedule(&engine->execlists.tasklet);
2613 	}
2614 
2615 	spin_unlock_irq(&engine->active.lock);
2616 }
2617 
2618 struct execlists_capture {
2619 	struct work_struct work;
2620 	struct i915_request *rq;
2621 	struct i915_gpu_coredump *error;
2622 };
2623 
2624 static void execlists_capture_work(struct work_struct *work)
2625 {
2626 	struct execlists_capture *cap = container_of(work, typeof(*cap), work);
2627 	const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
2628 	struct intel_engine_cs *engine = cap->rq->engine;
2629 	struct intel_gt_coredump *gt = cap->error->gt;
2630 	struct intel_engine_capture_vma *vma;
2631 
2632 	/* Compress all the objects attached to the request, slow! */
2633 	vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
2634 	if (vma) {
2635 		struct i915_vma_compress *compress =
2636 			i915_vma_capture_prepare(gt);
2637 
2638 		intel_engine_coredump_add_vma(gt->engine, vma, compress);
2639 		i915_vma_capture_finish(gt, compress);
2640 	}
2641 
2642 	gt->simulated = gt->engine->simulated;
2643 	cap->error->simulated = gt->simulated;
2644 
2645 	/* Publish the error state, and announce it to the world */
2646 	i915_error_state_store(cap->error);
2647 	i915_gpu_coredump_put(cap->error);
2648 
2649 	/* Return this request and all that depend upon it for signaling */
2650 	execlists_unhold(engine, cap->rq);
2651 	i915_request_put(cap->rq);
2652 
2653 	kfree(cap);
2654 }
2655 
2656 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
2657 {
2658 	const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
2659 	struct execlists_capture *cap;
2660 
2661 	cap = kmalloc(sizeof(*cap), gfp);
2662 	if (!cap)
2663 		return NULL;
2664 
2665 	cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
2666 	if (!cap->error)
2667 		goto err_cap;
2668 
2669 	cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
2670 	if (!cap->error->gt)
2671 		goto err_gpu;
2672 
2673 	cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
2674 	if (!cap->error->gt->engine)
2675 		goto err_gt;
2676 
2677 	return cap;
2678 
2679 err_gt:
2680 	kfree(cap->error->gt);
2681 err_gpu:
2682 	kfree(cap->error);
2683 err_cap:
2684 	kfree(cap);
2685 	return NULL;
2686 }
2687 
2688 static bool execlists_capture(struct intel_engine_cs *engine)
2689 {
2690 	struct execlists_capture *cap;
2691 
2692 	if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
2693 		return true;
2694 
2695 	/*
2696 	 * We need to _quickly_ capture the engine state before we reset.
2697 	 * We are inside an atomic section (softirq) here and we are delaying
2698 	 * the forced preemption event.
2699 	 */
2700 	cap = capture_regs(engine);
2701 	if (!cap)
2702 		return true;
2703 
2704 	spin_lock_irq(&engine->active.lock);
2705 	cap->rq = execlists_active(&engine->execlists);
2706 	if (cap->rq) {
2707 		cap->rq = active_request(cap->rq->context->timeline, cap->rq);
2708 		cap->rq = i915_request_get_rcu(cap->rq);
2709 	}
2710 	spin_unlock_irq(&engine->active.lock);
2711 	if (!cap->rq)
2712 		goto err_free;
2713 
2714 	/*
2715 	 * Remove the request from the execlists queue, and take ownership
2716 	 * of the request. We pass it to our worker who will _slowly_ compress
2717 	 * all the pages the _user_ requested for debugging their batch, after
2718 	 * which we return it to the queue for signaling.
2719 	 *
2720 	 * By removing them from the execlists queue, we also remove the
2721 	 * requests from being processed by __unwind_incomplete_requests()
2722 	 * during the intel_engine_reset(), and so they will *not* be replayed
2723 	 * afterwards.
2724 	 *
2725 	 * Note that because we have not yet reset the engine at this point,
2726 	 * it is possible that the request we have identified as being
2727 	 * guilty did in fact complete and we will then hit an arbitration
2728 	 * point allowing the outstanding preemption to succeed. The likelihood
2729 	 * of that is very low (as capturing of the engine registers should be
2730 	 * fast enough to run inside an irq-off atomic section!), so we will
2731 	 * simply hold that request accountable for being non-preemptible
2732 	 * long enough to force the reset.
2733 	 */
2734 	if (!execlists_hold(engine, cap->rq))
2735 		goto err_rq;
2736 
2737 	INIT_WORK(&cap->work, execlists_capture_work);
2738 	schedule_work(&cap->work);
2739 	return true;
2740 
2741 err_rq:
2742 	i915_request_put(cap->rq);
2743 err_free:
2744 	i915_gpu_coredump_put(cap->error);
2745 	kfree(cap);
2746 	return false;
2747 }
2748 
2749 static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
2750 {
2751 	const unsigned int bit = I915_RESET_ENGINE + engine->id;
2752 	unsigned long *lock = &engine->gt->reset.flags;
2753 
2754 	if (!intel_has_reset_engine(engine->gt))
2755 		return;
2756 
2757 	if (test_and_set_bit(bit, lock))
2758 		return;
2759 
2760 	ENGINE_TRACE(engine, "reset for %s\n", msg);
2761 
2762 	/* Mark this tasklet as disabled to avoid waiting for it to complete */
2763 	tasklet_disable_nosync(&engine->execlists.tasklet);
2764 
2765 	ring_set_paused(engine, 1); /* Freeze the current request in place */
2766 	if (execlists_capture(engine))
2767 		intel_engine_reset(engine, msg);
2768 	else
2769 		ring_set_paused(engine, 0);
2770 
2771 	tasklet_enable(&engine->execlists.tasklet);
2772 	clear_and_wake_up_bit(bit, lock);
2773 }
2774 
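/*
 * A forced preemption has timed out only if the preempt timer has expired
 * while a preemptive submission (execlists->pending[0]) is still waiting
 * to be acknowledged by the HW.
 */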
2775 static bool preempt_timeout(const struct intel_engine_cs *const engine)
2776 {
2777 	const struct timer_list *t = &engine->execlists.preempt;
2778 
2779 	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
2780 		return false;
2781 
2782 	if (!timer_expired(t))
2783 		return false;
2784 
2785 	return READ_ONCE(engine->execlists.pending[0]);
2786 }
2787 
2788 /*
2789  * Check the unread Context Status Buffers and manage the submission of new
2790  * contexts to the ELSP accordingly.
2791  */
2792 static void execlists_submission_tasklet(unsigned long data)
2793 {
2794 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
2795 	bool timeout = preempt_timeout(engine);
2796 
2797 	process_csb(engine);
2798 
2799 	if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
2800 		engine->execlists.error_interrupt = 0;
2801 		if (ENGINE_READ(engine, RING_ESR)) /* confirm the error */
2802 			execlists_reset(engine, "CS error");
2803 	}
2804 
2805 	if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
2806 		unsigned long flags;
2807 
2808 		spin_lock_irqsave(&engine->active.lock, flags);
2809 		__execlists_submission_tasklet(engine);
2810 		spin_unlock_irqrestore(&engine->active.lock, flags);
2811 
2812 		/* Recheck after serialising with direct-submission */
2813 		if (unlikely(timeout && preempt_timeout(engine)))
2814 			execlists_reset(engine, "preemption time out");
2815 	}
2816 }
2817 
2818 static void __execlists_kick(struct intel_engine_execlists *execlists)
2819 {
2820 	/* Kick the tasklet for some interrupt coalescing and reset handling */
2821 	tasklet_hi_schedule(&execlists->tasklet);
2822 }
2823 
2824 #define execlists_kick(t, member) \
2825 	__execlists_kick(container_of(t, struct intel_engine_execlists, member))
2826 
2827 static void execlists_timeslice(struct timer_list *timer)
2828 {
2829 	execlists_kick(timer, timer);
2830 }
2831 
2832 static void execlists_preempt(struct timer_list *timer)
2833 {
2834 	execlists_kick(timer, preempt);
2835 }
2836 
2837 static void queue_request(struct intel_engine_cs *engine,
2838 			  struct i915_request *rq)
2839 {
2840 	GEM_BUG_ON(!list_empty(&rq->sched.link));
2841 	list_add_tail(&rq->sched.link,
2842 		      i915_sched_lookup_priolist(engine, rq_prio(rq)));
2843 	set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2844 }
2845 
2846 static void __submit_queue_imm(struct intel_engine_cs *engine)
2847 {
2848 	struct intel_engine_execlists * const execlists = &engine->execlists;
2849 
2850 	if (reset_in_progress(execlists))
2851 		return; /* defer until we restart the engine following reset */
2852 
2853 	if (execlists->tasklet.func == execlists_submission_tasklet)
2854 		__execlists_submission_tasklet(engine);
2855 	else
2856 		tasklet_hi_schedule(&execlists->tasklet);
2857 }
2858 
2859 static void submit_queue(struct intel_engine_cs *engine,
2860 			 const struct i915_request *rq)
2861 {
2862 	struct intel_engine_execlists *execlists = &engine->execlists;
2863 
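	/*
	 * Only kick the tasklet if this request may be more important than
	 * anything already tracked by the priority hint; lower or equal
	 * priority requests will be picked up by the next natural dequeue.
	 */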
2864 	if (rq_prio(rq) <= execlists->queue_priority_hint)
2865 		return;
2866 
2867 	execlists->queue_priority_hint = rq_prio(rq);
2868 	__submit_queue_imm(engine);
2869 }
2870 
2871 static bool ancestor_on_hold(const struct intel_engine_cs *engine,
2872 			     const struct i915_request *rq)
2873 {
2874 	GEM_BUG_ON(i915_request_on_hold(rq));
2875 	return !list_empty(&engine->active.hold) && hold_request(rq);
2876 }
2877 
2878 static void execlists_submit_request(struct i915_request *request)
2879 {
2880 	struct intel_engine_cs *engine = request->engine;
2881 	unsigned long flags;
2882 
2883 	/* Will be called from irq-context when using foreign fences. */
2884 	spin_lock_irqsave(&engine->active.lock, flags);
2885 
2886 	if (unlikely(ancestor_on_hold(engine, request))) {
2887 		RQ_TRACE(request, "ancestor on hold\n");
2888 		list_add_tail(&request->sched.link, &engine->active.hold);
2889 		i915_request_set_hold(request);
2890 	} else {
2891 		queue_request(engine, request);
2892 
2893 		GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
2894 		GEM_BUG_ON(list_empty(&request->sched.link));
2895 
2896 		submit_queue(engine, request);
2897 	}
2898 
2899 	spin_unlock_irqrestore(&engine->active.lock, flags);
2900 }
2901 
2902 static void __execlists_context_fini(struct intel_context *ce)
2903 {
2904 	intel_ring_put(ce->ring);
2905 	i915_vma_put(ce->state);
2906 }
2907 
2908 static void execlists_context_destroy(struct kref *kref)
2909 {
2910 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
2911 
2912 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
2913 	GEM_BUG_ON(intel_context_is_pinned(ce));
2914 
2915 	if (ce->state)
2916 		__execlists_context_fini(ce);
2917 
2918 	intel_context_fini(ce);
2919 	intel_context_free(ce);
2920 }
2921 
2922 static void
2923 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
2924 {
2925 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2926 		return;
2927 
2928 	vaddr += engine->context_size;
2929 
2930 	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
2931 }
2932 
2933 static void
2934 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
2935 {
2936 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2937 		return;
2938 
2939 	vaddr += engine->context_size;
2940 
2941 	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
2942 		dev_err_once(engine->i915->drm.dev,
2943 			     "%s context redzone overwritten!\n",
2944 			     engine->name);
2945 }
2946 
2947 static void execlists_context_unpin(struct intel_context *ce)
2948 {
2949 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE,
2950 		      ce->engine);
2951 
2952 	i915_gem_object_unpin_map(ce->state->obj);
2953 }
2954 
2955 static void
2956 __execlists_update_reg_state(const struct intel_context *ce,
2957 			     const struct intel_engine_cs *engine,
2958 			     u32 head)
2959 {
2960 	struct intel_ring *ring = ce->ring;
2961 	u32 *regs = ce->lrc_reg_state;
2962 
2963 	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
2964 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
2965 
2966 	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
2967 	regs[CTX_RING_HEAD] = head;
2968 	regs[CTX_RING_TAIL] = ring->tail;
2969 
2970 	/* RPCS */
2971 	if (engine->class == RENDER_CLASS) {
2972 		regs[CTX_R_PWR_CLK_STATE] =
2973 			intel_sseu_make_rpcs(engine->i915, &ce->sseu);
2974 
2975 		i915_oa_init_reg_state(ce, engine);
2976 	}
2977 }
2978 
2979 static int
2980 __execlists_context_pin(struct intel_context *ce,
2981 			struct intel_engine_cs *engine)
2982 {
2983 	void *vaddr;
2984 
2985 	GEM_BUG_ON(!ce->state);
2986 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
2987 
2988 	vaddr = i915_gem_object_pin_map(ce->state->obj,
2989 					i915_coherent_map_type(engine->i915) |
2990 					I915_MAP_OVERRIDE);
2991 	if (IS_ERR(vaddr))
2992 		return PTR_ERR(vaddr);
2993 
2994 	ce->lrc_desc = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
2995 	ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
2996 	__execlists_update_reg_state(ce, engine, ce->ring->tail);
2997 
2998 	return 0;
2999 }
3000 
3001 static int execlists_context_pin(struct intel_context *ce)
3002 {
3003 	return __execlists_context_pin(ce, ce->engine);
3004 }
3005 
3006 static int execlists_context_alloc(struct intel_context *ce)
3007 {
3008 	return __execlists_context_alloc(ce, ce->engine);
3009 }
3010 
3011 static void execlists_context_reset(struct intel_context *ce)
3012 {
3013 	CE_TRACE(ce, "reset\n");
3014 	GEM_BUG_ON(!intel_context_is_pinned(ce));
3015 
3016 	intel_ring_reset(ce->ring, ce->ring->emit);
3017 
3018 	/* Scrub away the garbage */
3019 	execlists_init_reg_state(ce->lrc_reg_state,
3020 				 ce, ce->engine, ce->ring, true);
3021 	__execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
3022 
3023 	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
3024 }
3025 
3026 static const struct intel_context_ops execlists_context_ops = {
3027 	.alloc = execlists_context_alloc,
3028 
3029 	.pin = execlists_context_pin,
3030 	.unpin = execlists_context_unpin,
3031 
3032 	.enter = intel_context_enter_engine,
3033 	.exit = intel_context_exit_engine,
3034 
3035 	.reset = execlists_context_reset,
3036 	.destroy = execlists_context_destroy,
3037 };
3038 
3039 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
3040 {
3041 	u32 *cs;
3042 
3043 	if (!i915_request_timeline(rq)->has_initial_breadcrumb)
3044 		return 0;
3045 
3046 	cs = intel_ring_begin(rq, 6);
3047 	if (IS_ERR(cs))
3048 		return PTR_ERR(cs);
3049 
3050 	/*
3051 	 * Check if we have been preempted before we even get started.
3052 	 *
3053 	 * After this point i915_request_started() reports true, even if
3054 	 * we get preempted and so are no longer running.
3055 	 */
3056 	*cs++ = MI_ARB_CHECK;
3057 	*cs++ = MI_NOOP;
3058 
3059 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
3060 	*cs++ = i915_request_timeline(rq)->hwsp_offset;
3061 	*cs++ = 0;
3062 	*cs++ = rq->fence.seqno - 1;
3063 
3064 	intel_ring_advance(rq, cs);
3065 
3066 	/* Record the updated position of the request's payload */
3067 	rq->infix = intel_ring_offset(rq, cs);
3068 
3069 	return 0;
3070 }
3071 
3072 static int execlists_request_alloc(struct i915_request *request)
3073 {
3074 	int ret;
3075 
3076 	GEM_BUG_ON(!intel_context_is_pinned(request->context));
3077 
3078 	/*
3079 	 * Flush enough space to reduce the likelihood of waiting after
3080 	 * we start building the request - in which case we will just
3081 	 * have to repeat work.
3082 	 */
3083 	request->reserved_space += EXECLISTS_REQUEST_SIZE;
3084 
3085 	/*
3086 	 * Note that after this point, we have committed to using
3087 	 * this request as it is being used to both track the
3088 	 * state of engine initialisation and liveness of the
3089 	 * golden renderstate above. Think twice before you try
3090 	 * to cancel/unwind this request now.
3091 	 */
3092 
3093 	/* Unconditionally invalidate GPU caches and TLBs. */
3094 	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3095 	if (ret)
3096 		return ret;
3097 
3098 	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3099 	return 0;
3100 }
3101 
3102 /*
3103  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
3104  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
3105  * but there is a slight complication as this is applied in a WA batch where the
3106  * values are only initialized once, so we cannot take the register value at the
3107  * beginning and reuse it further; hence we save its value to memory, upload a
3108  * constant value with bit21 set and then restore it with the saved value.
3109  * To simplify the WA, a constant value is formed by using the default value
3110  * of this register. This shouldn't be a problem because we are only modifying
3111  * it for a short period and this batch is non-preemptible. We could of course
3112  * use additional instructions that read the actual value of the register
3113  * at that time and set our bit of interest, but that makes the WA more complicated.
3114  *
3115  * This WA is also required for Gen9 so extracting as a function avoids
3116  * code duplication.
3117  */
3118 static u32 *
3119 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3120 {
3121 	/* NB no one else is allowed to scribble over scratch + 256! */
3122 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3123 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3124 	*batch++ = intel_gt_scratch_offset(engine->gt,
3125 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3126 	*batch++ = 0;
3127 
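	/* Load the register with its default value plus the flush bit set */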
3128 	*batch++ = MI_LOAD_REGISTER_IMM(1);
3129 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3130 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3131 
3132 	batch = gen8_emit_pipe_control(batch,
3133 				       PIPE_CONTROL_CS_STALL |
3134 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
3135 				       0);
3136 
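	/* Restore the saved GEN8_L3SQCREG4 value from the scratch page */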
3137 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3138 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3139 	*batch++ = intel_gt_scratch_offset(engine->gt,
3140 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3141 	*batch++ = 0;
3142 
3143 	return batch;
3144 }
3145 
3146 /*
3147  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
3148  * initialized at the beginning and shared across all contexts but this field
3149  * helps us to have multiple batches at different offsets and select them based
3150  * on a criteria. At the moment this batch always start at the beginning of the page
3151  * and at this point we don't have multiple wa_ctx batch buffers.
3152  *
3153  * The number of WA applied are not known at the beginning; we use this field
3154  * to return the no of DWORDS written.
3155  *
3156  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
3157  * so it adds NOOPs as padding to make it cacheline aligned.
3158  * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
3159  * makes a complete batch buffer.
3160  */
3161 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3162 {
3163 	/* WaDisableCtxRestoreArbitration:bdw,chv */
3164 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3165 
3166 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
3167 	if (IS_BROADWELL(engine->i915))
3168 		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3169 
3170 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
3171 	/* Actual scratch location is at 128 bytes offset */
3172 	batch = gen8_emit_pipe_control(batch,
3173 				       PIPE_CONTROL_FLUSH_L3 |
3174 				       PIPE_CONTROL_STORE_DATA_INDEX |
3175 				       PIPE_CONTROL_CS_STALL |
3176 				       PIPE_CONTROL_QW_WRITE,
3177 				       LRC_PPHWSP_SCRATCH_ADDR);
3178 
3179 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3180 
3181 	/* Pad to end of cacheline */
3182 	while ((unsigned long)batch % CACHELINE_BYTES)
3183 		*batch++ = MI_NOOP;
3184 
3185 	/*
3186 	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
3187 	 * execution depends on the length specified in terms of cache lines
3188 	 * in the register CTX_RCS_INDIRECT_CTX
3189 	 */
3190 
3191 	return batch;
3192 }
3193 
3194 struct lri {
3195 	i915_reg_t reg;
3196 	u32 value;
3197 };
3198 
3199 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
3200 {
3201 	GEM_BUG_ON(!count || count > 63);
3202 
3203 	*batch++ = MI_LOAD_REGISTER_IMM(count);
3204 	do {
3205 		*batch++ = i915_mmio_reg_offset(lri->reg);
3206 		*batch++ = lri->value;
3207 	} while (lri++, --count);
3208 	*batch++ = MI_NOOP;
3209 
3210 	return batch;
3211 }
3212 
3213 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3214 {
3215 	static const struct lri lri[] = {
3216 		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
3217 		{
3218 			COMMON_SLICE_CHICKEN2,
3219 			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
3220 				       0),
3221 		},
3222 
3223 		/* BSpec: 11391 */
3224 		{
3225 			FF_SLICE_CHICKEN,
3226 			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
3227 				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
3228 		},
3229 
3230 		/* BSpec: 11299 */
3231 		{
3232 			_3D_CHICKEN3,
3233 			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
3234 				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
3235 		}
3236 	};
3237 
3238 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3239 
3240 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
3241 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3242 
3243 	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
3244 	batch = gen8_emit_pipe_control(batch,
3245 				       PIPE_CONTROL_FLUSH_L3 |
3246 				       PIPE_CONTROL_STORE_DATA_INDEX |
3247 				       PIPE_CONTROL_CS_STALL |
3248 				       PIPE_CONTROL_QW_WRITE,
3249 				       LRC_PPHWSP_SCRATCH_ADDR);
3250 
3251 	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
3252 
3253 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
3254 	if (HAS_POOLED_EU(engine->i915)) {
3255 		/*
3256 		 * EU pool configuration is setup along with golden context
3257 		 * during context initialization. This value depends on
3258 		 * device type (2x6 or 3x6) and needs to be updated based
3259 		 * on which subslice is disabled especially for 2x6
3260 		 * devices, however it is safe to load default
3261 		 * configuration of 3x6 device instead of masking off
3262 		 * corresponding bits because HW ignores bits of a disabled
3263 		 * subslice and drops down to appropriate config. Please
3264 		 * see render_state_setup() in i915_gem_render_state.c for
3265 		 * possible configurations, to avoid duplication they are
3266 		 * not shown here again.
3267 		 */
3268 		*batch++ = GEN9_MEDIA_POOL_STATE;
3269 		*batch++ = GEN9_MEDIA_POOL_ENABLE;
3270 		*batch++ = 0x00777000;
3271 		*batch++ = 0;
3272 		*batch++ = 0;
3273 		*batch++ = 0;
3274 	}
3275 
3276 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3277 
3278 	/* Pad to end of cacheline */
3279 	while ((unsigned long)batch % CACHELINE_BYTES)
3280 		*batch++ = MI_NOOP;
3281 
3282 	return batch;
3283 }
3284 
3285 static u32 *
3286 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3287 {
3288 	int i;
3289 
3290 	/*
3291 	 * WaPipeControlBefore3DStateSamplePattern: cnl
3292 	 *
3293 	 * Ensure the engine is idle prior to programming a
3294 	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
3295 	 */
3296 	batch = gen8_emit_pipe_control(batch,
3297 				       PIPE_CONTROL_CS_STALL,
3298 				       0);
3299 	/*
3300 	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
3301 	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
3302 	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
3303 	 * confusing. Since gen8_emit_pipe_control() already advances the
3304 	 * batch by 6 dwords, we advance the other 10 here, completing a
3305 	 * cacheline. It's not clear if the workaround requires this padding
3306 	 * before other commands, or if it's just the regular padding we would
3307 	 * already have for the workaround bb, so leave it here for now.
3308 	 */
3309 	for (i = 0; i < 10; i++)
3310 		*batch++ = MI_NOOP;
3311 
3312 	/* Pad to end of cacheline */
3313 	while ((unsigned long)batch % CACHELINE_BYTES)
3314 		*batch++ = MI_NOOP;
3315 
3316 	return batch;
3317 }
3318 
3319 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
3320 
3321 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
3322 {
3323 	struct drm_i915_gem_object *obj;
3324 	struct i915_vma *vma;
3325 	int err;
3326 
3327 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
3328 	if (IS_ERR(obj))
3329 		return PTR_ERR(obj);
3330 
3331 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
3332 	if (IS_ERR(vma)) {
3333 		err = PTR_ERR(vma);
3334 		goto err;
3335 	}
3336 
3337 	err = i915_ggtt_pin(vma, 0, PIN_HIGH);
3338 	if (err)
3339 		goto err;
3340 
3341 	engine->wa_ctx.vma = vma;
3342 	return 0;
3343 
3344 err:
3345 	i915_gem_object_put(obj);
3346 	return err;
3347 }
3348 
3349 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
3350 {
3351 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
3352 }
3353 
3354 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
3355 
3356 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
3357 {
3358 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3359 	struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
3360 					    &wa_ctx->per_ctx };
3361 	wa_bb_func_t wa_bb_fn[2];
3362 	struct page *page;
3363 	void *batch, *batch_ptr;
3364 	unsigned int i;
3365 	int ret;
3366 
3367 	if (engine->class != RENDER_CLASS)
3368 		return 0;
3369 
3370 	switch (INTEL_GEN(engine->i915)) {
3371 	case 12:
3372 	case 11:
3373 		return 0;
3374 	case 10:
3375 		wa_bb_fn[0] = gen10_init_indirectctx_bb;
3376 		wa_bb_fn[1] = NULL;
3377 		break;
3378 	case 9:
3379 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
3380 		wa_bb_fn[1] = NULL;
3381 		break;
3382 	case 8:
3383 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
3384 		wa_bb_fn[1] = NULL;
3385 		break;
3386 	default:
3387 		MISSING_CASE(INTEL_GEN(engine->i915));
3388 		return 0;
3389 	}
3390 
3391 	ret = lrc_setup_wa_ctx(engine);
3392 	if (ret) {
3393 		DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
3394 		return ret;
3395 	}
3396 
3397 	page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
3398 	batch = batch_ptr = kmap_atomic(page);
3399 
3400 	/*
3401 	 * Emit the two workaround batch buffers, recording the offset from the
3402 	 * start of the workaround batch buffer object for each and their
3403 	 * respective sizes.
3404 	 */
3405 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
3406 		wa_bb[i]->offset = batch_ptr - batch;
3407 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
3408 						  CACHELINE_BYTES))) {
3409 			ret = -EINVAL;
3410 			break;
3411 		}
3412 		if (wa_bb_fn[i])
3413 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
3414 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
3415 	}
3416 
3417 	BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
3418 
3419 	kunmap_atomic(batch);
3420 	if (ret)
3421 		lrc_destroy_wa_ctx(engine);
3422 
3423 	return ret;
3424 }
3425 
3426 static void enable_error_interrupt(struct intel_engine_cs *engine)
3427 {
3428 	u32 status;
3429 
3430 	engine->execlists.error_interrupt = 0;
3431 	ENGINE_WRITE(engine, RING_EMR, ~0u);
3432 	ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */
3433 
3434 	status = ENGINE_READ(engine, RING_ESR);
3435 	if (unlikely(status)) {
3436 		dev_err(engine->i915->drm.dev,
3437 			"engine '%s' resumed still in error: %08x\n",
3438 			engine->name, status);
3439 		__intel_gt_reset(engine->gt, engine->mask);
3440 	}
3441 
3442 	/*
3443 	 * On current gen8+, we have 2 signals to play with
3444 	 *
3445 	 * - I915_ERROR_INSTRUCTION (bit 0)
3446 	 *
3447 	 *    Generate an error if the command parser encounters an invalid
3448 	 *    instruction
3449 	 *
3450 	 *    This is a fatal error.
3451 	 *
3452 	 * - CP_PRIV (bit 2)
3453 	 *
3454 	 *    Generate an error on privilege violation (where the CP replaces
3455 	 *    the instruction with a no-op). This also fires for writes into
3456 	 *    read-only scratch pages.
3457 	 *
3458 	 *    This is a non-fatal error, parsing continues.
3459 	 *
3460 	 * * there are a few others defined for odd HW that we do not use
3461 	 *
3462 	 * Since CP_PRIV fires for cases where we have chosen to ignore the
3463 	 * error (as the HW is validating and suppressing the mistakes), we
3464 	 * only unmask the instruction error bit.
3465 	 */
3466 	ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION);
3467 }
3468 
3469 static void enable_execlists(struct intel_engine_cs *engine)
3470 {
3471 	u32 mode;
3472 
3473 	assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
3474 
3475 	intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
3476 
3477 	if (INTEL_GEN(engine->i915) >= 11)
3478 		mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
3479 	else
3480 		mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
3481 	ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
3482 
3483 	ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
3484 
3485 	ENGINE_WRITE_FW(engine,
3486 			RING_HWS_PGA,
3487 			i915_ggtt_offset(engine->status_page.vma));
3488 	ENGINE_POSTING_READ(engine, RING_HWS_PGA);
3489 
3490 	enable_error_interrupt(engine);
3491 
3492 	engine->context_tag = 0;
3493 }
3494 
3495 static bool unexpected_starting_state(struct intel_engine_cs *engine)
3496 {
3497 	bool unexpected = false;
3498 
3499 	if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
3500 		DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n");
3501 		unexpected = true;
3502 	}
3503 
3504 	return unexpected;
3505 }
3506 
3507 static int execlists_resume(struct intel_engine_cs *engine)
3508 {
3509 	intel_mocs_init_engine(engine);
3510 
3511 	intel_engine_reset_breadcrumbs(engine);
3512 
3513 	if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
3514 		struct drm_printer p = drm_debug_printer(__func__);
3515 
3516 		intel_engine_dump(engine, &p, NULL);
3517 	}
3518 
3519 	enable_execlists(engine);
3520 
3521 	return 0;
3522 }
3523 
3524 static void execlists_reset_prepare(struct intel_engine_cs *engine)
3525 {
3526 	struct intel_engine_execlists * const execlists = &engine->execlists;
3527 	unsigned long flags;
3528 
3529 	ENGINE_TRACE(engine, "depth<-%d\n",
3530 		     atomic_read(&execlists->tasklet.count));
3531 
3532 	/*
3533 	 * Prevent request submission to the hardware until we have
3534 	 * completed the reset in i915_gem_reset_finish(). If a request
3535 	 * is completed by one engine, it may then queue a request
3536 	 * to a second via its execlists->tasklet *just* as we are
3537 	 * calling engine->resume() and also writing the ELSP.
3538 	 * Turning off the execlists->tasklet until the reset is over
3539 	 * prevents the race.
3540 	 */
3541 	__tasklet_disable_sync_once(&execlists->tasklet);
3542 	GEM_BUG_ON(!reset_in_progress(execlists));
3543 
3544 	/* And flush any current direct submission. */
3545 	spin_lock_irqsave(&engine->active.lock, flags);
3546 	spin_unlock_irqrestore(&engine->active.lock, flags);
3547 
3548 	/*
3549 	 * We stop the engines; otherwise we might get a failed reset and a
3550 	 * dead gpu (on elk). Even GPUs as modern as kbl can suffer a
3551 	 * system hang if a batchbuffer is still progressing when
3552 	 * the reset is issued, regardless of the READY_TO_RESET ack.
3553 	 * Thus assume it is best to stop the engines on all gens
3554 	 * where we have a gpu reset.
3555 	 *
3556 	 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
3557 	 *
3558 	 * FIXME: Wa for more modern gens needs to be validated
3559 	 */
3560 	intel_engine_stop_cs(engine);
3561 }
3562 
3563 static void reset_csb_pointers(struct intel_engine_cs *engine)
3564 {
3565 	struct intel_engine_execlists * const execlists = &engine->execlists;
3566 	const unsigned int reset_value = execlists->csb_size - 1;
3567 
3568 	ring_set_paused(engine, 0);
3569 
3570 	/*
3571 	 * After a reset, the HW starts writing into CSB entry [0]. We
3572 	 * therefore have to set our HEAD pointer back one entry so that
3573 	 * the *first* entry we check is entry 0. To complicate this further,
3574 	 * as we don't wait for the first interrupt after reset, we have to
3575 	 * fake the HW write to point back to the last entry so that our
3576 	 * inline comparison of our cached head position against the last HW
3577 	 * write works even before the first interrupt.
3578 	 */
3579 	execlists->csb_head = reset_value;
3580 	WRITE_ONCE(*execlists->csb_write, reset_value);
3581 	wmb(); /* Make sure this is visible to HW (paranoia?) */
3582 
3583 	/*
3584 	 * Sometimes Icelake forgets to reset its pointers on a GPU reset.
3585 	 * Bludgeon them with a mmio update to be sure.
3586 	 */
3587 	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
3588 		     reset_value << 8 | reset_value);
3589 	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
3590 
3591 	invalidate_csb_entries(&execlists->csb_status[0],
3592 			       &execlists->csb_status[reset_value]);
3593 }
3594 
3595 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
3596 {
3597 	int x;
3598 
3599 	x = lrc_ring_mi_mode(engine);
3600 	if (x != -1) {
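		/*
		 * regs[x] holds the RING_MI_MODE offset in the LRI list and
		 * regs[x + 1] its value. MI_MODE is a masked register, so
		 * set the STOP_RING mask bit (upper 16 bits) while clearing
		 * the value bit to force the restore to unstop the ring.
		 */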
3601 		regs[x + 1] &= ~STOP_RING;
3602 		regs[x + 1] |= STOP_RING << 16;
3603 	}
3604 }
3605 
3606 static void __execlists_reset_reg_state(const struct intel_context *ce,
3607 					const struct intel_engine_cs *engine)
3608 {
3609 	u32 *regs = ce->lrc_reg_state;
3610 
3611 	__reset_stop_ring(regs, engine);
3612 }
3613 
3614 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
3615 {
3616 	struct intel_engine_execlists * const execlists = &engine->execlists;
3617 	struct intel_context *ce;
3618 	struct i915_request *rq;
3619 	u32 head;
3620 
3621 	mb(); /* paranoia: read the CSB pointers from after the reset */
3622 	clflush(execlists->csb_write);
3623 	mb();
3624 
3625 	process_csb(engine); /* drain preemption events */
3626 
3627 	/* Following the reset, we need to reload the CSB read/write pointers */
3628 	reset_csb_pointers(engine);
3629 
3630 	/*
3631 	 * Save the currently executing context: even if we completed
3632 	 * its request, it was still running at the time of the
3633 	 * reset and will have been clobbered.
3634 	 */
3635 	rq = execlists_active(execlists);
3636 	if (!rq)
3637 		goto unwind;
3638 
3639 	/* We still have requests in-flight; the engine should be active */
3640 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
3641 
3642 	ce = rq->context;
3643 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3644 
3645 	if (i915_request_completed(rq)) {
3646 		/* Idle context; tidy up the ring so we can restart afresh */
3647 		head = intel_ring_wrap(ce->ring, rq->tail);
3648 		goto out_replay;
3649 	}
3650 
3651 	/* Context has requests still in-flight; it should not be idle! */
3652 	GEM_BUG_ON(i915_active_is_idle(&ce->active));
3653 	rq = active_request(ce->timeline, rq);
3654 	head = intel_ring_wrap(ce->ring, rq->head);
3655 	GEM_BUG_ON(head == ce->ring->tail);
3656 
3657 	/*
3658 	 * If this request hasn't started yet, e.g. it is waiting on a
3659 	 * semaphore, we need to avoid skipping the request or else we
3660 	 * break the signaling chain. However, if the context is corrupt
3661 	 * the request will not restart and we will be stuck with a wedged
3662 	 * device. It is quite often the case that if we issue a reset
3663 	 * while the GPU is loading the context image, the context
3664 	 * image becomes corrupt.
3665 	 *
3666 	 * Otherwise, if we have not started yet, the request should replay
3667 	 * perfectly and we do not need to flag the result as being erroneous.
3668 	 */
3669 	if (!i915_request_started(rq))
3670 		goto out_replay;
3671 
3672 	/*
3673 	 * If the request was innocent, we leave the request in the ELSP
3674 	 * and will try to replay it on restarting. The context image may
3675 	 * have been corrupted by the reset, in which case we may have
3676 	 * to service a new GPU hang, but more likely we can continue on
3677 	 * without impact.
3678 	 *
3679 	 * If the request was guilty, we presume the context is corrupt
3680 	 * and have to at least restore the RING register in the context
3681 	 * image back to the expected values to skip over the guilty request.
3682 	 */
3683 	__i915_request_reset(rq, stalled);
3684 	if (!stalled)
3685 		goto out_replay;
3686 
3687 	/*
3688 	 * We want a simple context + ring to execute the breadcrumb update.
3689 	 * We cannot rely on the context being intact across the GPU hang,
3690 	 * so clear it and rebuild just what we need for the breadcrumb.
3691 	 * All pending requests for this context will be zapped, and any
3692 	 * future request will be after userspace has had the opportunity
3693 	 * to recreate its own state.
3694 	 */
3695 	GEM_BUG_ON(!intel_context_is_pinned(ce));
3696 	restore_default_state(ce, engine);
3697 
3698 out_replay:
3699 	ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
3700 		     head, ce->ring->tail);
3701 	__execlists_reset_reg_state(ce, engine);
3702 	__execlists_update_reg_state(ce, engine, head);
3703 	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
3704 
3705 unwind:
3706 	/* Push back any incomplete requests for replay after the reset. */
3707 	cancel_port_requests(execlists);
3708 	__unwind_incomplete_requests(engine);
3709 }
3710 
3711 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
3712 {
3713 	unsigned long flags;
3714 
3715 	ENGINE_TRACE(engine, "\n");
3716 
3717 	spin_lock_irqsave(&engine->active.lock, flags);
3718 
3719 	__execlists_reset(engine, stalled);
3720 
3721 	spin_unlock_irqrestore(&engine->active.lock, flags);
3722 }
3723 
3724 static void nop_submission_tasklet(unsigned long data)
3725 {
3726 	/* The driver is wedged; don't process any more events. */
3727 }
3728 
3729 static void execlists_reset_cancel(struct intel_engine_cs *engine)
3730 {
3731 	struct intel_engine_execlists * const execlists = &engine->execlists;
3732 	struct i915_request *rq, *rn;
3733 	struct rb_node *rb;
3734 	unsigned long flags;
3735 
3736 	ENGINE_TRACE(engine, "\n");
3737 
3738 	/*
3739 	 * Before we call engine->cancel_requests(), we should have exclusive
3740 	 * access to the submission state. This is arranged for us by the
3741 	 * caller disabling the interrupt generation, the tasklet and other
3742 	 * threads that may then access the same state, giving us a free hand
3743 	 * to reset state. However, we still need to let lockdep be aware that
3744 	 * we know this state may be accessed in hardirq context, so we
3745 	 * disable the irq around this manipulation, and we want to keep
3746 	 * the spinlock focused on its duties rather than accidentally conflate
3747 	 * its coverage with the submission's irq state. (Similarly, although we
3748 	 * shouldn't need to disable irq around the manipulation of the
3749 	 * submission's irq state, we also wish to remind ourselves that
3750 	 * it is irq state.)
3751 	 */
3752 	spin_lock_irqsave(&engine->active.lock, flags);
3753 
3754 	__execlists_reset(engine, true);
3755 
3756 	/* Mark all executing requests as skipped. */
3757 	list_for_each_entry(rq, &engine->active.requests, sched.link)
3758 		mark_eio(rq);
3759 
3760 	/* Flush the queued requests to the timeline list (for retiring). */
3761 	while ((rb = rb_first_cached(&execlists->queue))) {
3762 		struct i915_priolist *p = to_priolist(rb);
3763 		int i;
3764 
3765 		priolist_for_each_request_consume(rq, rn, p, i) {
3766 			mark_eio(rq);
3767 			__i915_request_submit(rq);
3768 		}
3769 
3770 		rb_erase_cached(&p->node, &execlists->queue);
3771 		i915_priolist_free(p);
3772 	}
3773 
3774 	/* On-hold requests will be flushed to timeline upon their release */
3775 	list_for_each_entry(rq, &engine->active.hold, sched.link)
3776 		mark_eio(rq);
3777 
3778 	/* Cancel all attached virtual engines */
3779 	while ((rb = rb_first_cached(&execlists->virtual))) {
3780 		struct virtual_engine *ve =
3781 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
3782 
3783 		rb_erase_cached(rb, &execlists->virtual);
3784 		RB_CLEAR_NODE(rb);
3785 
3786 		spin_lock(&ve->base.active.lock);
3787 		rq = fetch_and_zero(&ve->request);
3788 		if (rq) {
3789 			mark_eio(rq);
3790 
3791 			rq->engine = engine;
3792 			__i915_request_submit(rq);
3793 			i915_request_put(rq);
3794 
3795 			ve->base.execlists.queue_priority_hint = INT_MIN;
3796 		}
3797 		spin_unlock(&ve->base.active.lock);
3798 	}
3799 
3800 	/* Remaining _unready_ requests will be nop'ed when submitted */
3801 
3802 	execlists->queue_priority_hint = INT_MIN;
3803 	execlists->queue = RB_ROOT_CACHED;
3804 
3805 	GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
3806 	execlists->tasklet.func = nop_submission_tasklet;
3807 
3808 	spin_unlock_irqrestore(&engine->active.lock, flags);
3809 }
3810 
3811 static void execlists_reset_finish(struct intel_engine_cs *engine)
3812 {
3813 	struct intel_engine_execlists * const execlists = &engine->execlists;
3814 
3815 	/*
3816 	 * After a GPU reset, we may have requests to replay. Do so now while
3817 	 * we still have the forcewake to be sure that the GPU is not allowed
3818 	 * to sleep before we restart and reload a context.
3819 	 */
3820 	GEM_BUG_ON(!reset_in_progress(execlists));
3821 	if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
3822 		execlists->tasklet.func(execlists->tasklet.data);
3823 
3824 	if (__tasklet_enable(&execlists->tasklet))
3825 		/* And kick in case we missed a new request submission. */
3826 		tasklet_hi_schedule(&execlists->tasklet);
3827 	ENGINE_TRACE(engine, "depth->%d\n",
3828 		     atomic_read(&execlists->tasklet.count));
3829 }
3830 
3831 static int gen8_emit_bb_start_noarb(struct i915_request *rq,
3832 				    u64 offset, u32 len,
3833 				    const unsigned int flags)
3834 {
3835 	u32 *cs;
3836 
3837 	cs = intel_ring_begin(rq, 4);
3838 	if (IS_ERR(cs))
3839 		return PTR_ERR(cs);
3840 
3841 	/*
3842 	 * WaDisableCtxRestoreArbitration:bdw,chv
3843 	 *
3844 	 * We don't need to perform MI_ARB_ENABLE as often as we do (in
3845 	 * particular on the gens that do not need the w/a at all!); if we
3846 	 * took care to make sure that on every switch into this context
3847 	 * (both ordinary and for preemption) arbitration was enabled,
3848 	 * we would be fine.  However, for gen8 there is another w/a that
3849 	 * requires us to not preempt inside GPGPU execution, so we keep
3850 	 * arbitration disabled for gen8 batches. Arbitration will be
3851 	 * re-enabled before we close the request
3852 	 * (engine->emit_fini_breadcrumb).
3853 	 */
3854 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3855 
3856 	/* FIXME(BDW+): Address space and security selectors. */
3857 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
3858 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3859 	*cs++ = lower_32_bits(offset);
3860 	*cs++ = upper_32_bits(offset);
3861 
3862 	intel_ring_advance(rq, cs);
3863 
3864 	return 0;
3865 }
3866 
3867 static int gen8_emit_bb_start(struct i915_request *rq,
3868 			      u64 offset, u32 len,
3869 			      const unsigned int flags)
3870 {
3871 	u32 *cs;
3872 
3873 	cs = intel_ring_begin(rq, 6);
3874 	if (IS_ERR(cs))
3875 		return PTR_ERR(cs);
3876 
3877 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3878 
3879 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
3880 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3881 	*cs++ = lower_32_bits(offset);
3882 	*cs++ = upper_32_bits(offset);
3883 
3884 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3885 	*cs++ = MI_NOOP;
3886 
3887 	intel_ring_advance(rq, cs);
3888 
3889 	return 0;
3890 }
3891 
3892 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
3893 {
3894 	ENGINE_WRITE(engine, RING_IMR,
3895 		     ~(engine->irq_enable_mask | engine->irq_keep_mask));
3896 	ENGINE_POSTING_READ(engine, RING_IMR);
3897 }
3898 
3899 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
3900 {
3901 	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
3902 }
3903 
3904 static int gen8_emit_flush(struct i915_request *request, u32 mode)
3905 {
3906 	u32 cmd, *cs;
3907 
3908 	cs = intel_ring_begin(request, 4);
3909 	if (IS_ERR(cs))
3910 		return PTR_ERR(cs);
3911 
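	/* +1 dword of length to carry the 64b post-sync address on gen8+ */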
3912 	cmd = MI_FLUSH_DW + 1;
3913 
3914 	/* We always require a command barrier so that subsequent
3915 	 * commands, such as breadcrumb interrupts, are strictly ordered
3916 	 * wrt the contents of the write cache being flushed to memory
3917 	 * (and thus being coherent from the CPU).
3918 	 */
3919 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
3920 
3921 	if (mode & EMIT_INVALIDATE) {
3922 		cmd |= MI_INVALIDATE_TLB;
3923 		if (request->engine->class == VIDEO_DECODE_CLASS)
3924 			cmd |= MI_INVALIDATE_BSD;
3925 	}
3926 
3927 	*cs++ = cmd;
3928 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
3929 	*cs++ = 0; /* upper addr */
3930 	*cs++ = 0; /* value */
3931 	intel_ring_advance(request, cs);
3932 
3933 	return 0;
3934 }
3935 
3936 static int gen8_emit_flush_render(struct i915_request *request,
3937 				  u32 mode)
3938 {
3939 	bool vf_flush_wa = false, dc_flush_wa = false;
3940 	u32 *cs, flags = 0;
3941 	int len;
3942 
3943 	flags |= PIPE_CONTROL_CS_STALL;
3944 
3945 	if (mode & EMIT_FLUSH) {
3946 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3947 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3948 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3949 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
3950 	}
3951 
3952 	if (mode & EMIT_INVALIDATE) {
3953 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
3954 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3955 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3956 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3957 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3958 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3959 		flags |= PIPE_CONTROL_QW_WRITE;
3960 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3961 
3962 		/*
3963 		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
3964 		 * pipe control.
3965 		 */
3966 		if (IS_GEN(request->i915, 9))
3967 			vf_flush_wa = true;
3968 
3969 		/* WaForGAMHang:kbl */
3970 		if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
3971 			dc_flush_wa = true;
3972 	}
3973 
3974 	len = 6;
3975 
3976 	if (vf_flush_wa)
3977 		len += 6;
3978 
3979 	if (dc_flush_wa)
3980 		len += 12;
3981 
3982 	cs = intel_ring_begin(request, len);
3983 	if (IS_ERR(cs))
3984 		return PTR_ERR(cs);
3985 
3986 	if (vf_flush_wa)
3987 		cs = gen8_emit_pipe_control(cs, 0, 0);
3988 
3989 	if (dc_flush_wa)
3990 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
3991 					    0);
3992 
3993 	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3994 
3995 	if (dc_flush_wa)
3996 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
3997 
3998 	intel_ring_advance(request, cs);
3999 
4000 	return 0;
4001 }
4002 
4003 static int gen11_emit_flush_render(struct i915_request *request,
4004 				   u32 mode)
4005 {
4006 	if (mode & EMIT_FLUSH) {
4007 		u32 *cs;
4008 		u32 flags = 0;
4009 
4010 		flags |= PIPE_CONTROL_CS_STALL;
4011 
4012 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4013 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4014 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4015 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4016 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4017 		flags |= PIPE_CONTROL_QW_WRITE;
4018 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4019 
4020 		cs = intel_ring_begin(request, 6);
4021 		if (IS_ERR(cs))
4022 			return PTR_ERR(cs);
4023 
4024 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4025 		intel_ring_advance(request, cs);
4026 	}
4027 
4028 	if (mode & EMIT_INVALIDATE) {
4029 		u32 *cs;
4030 		u32 flags = 0;
4031 
4032 		flags |= PIPE_CONTROL_CS_STALL;
4033 
4034 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4035 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4036 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4037 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4038 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4039 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4040 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4041 		flags |= PIPE_CONTROL_QW_WRITE;
4042 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4043 
4044 		cs = intel_ring_begin(request, 6);
4045 		if (IS_ERR(cs))
4046 			return PTR_ERR(cs);
4047 
4048 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4049 		intel_ring_advance(request, cs);
4050 	}
4051 
4052 	return 0;
4053 }
4054 
4055 static u32 preparser_disable(bool state)
4056 {
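	/*
	 * Gen12 overloads MI_ARB_CHECK as the pre-parser control: bit 8
	 * appears to act as the write-enable for the pre-fetch disable
	 * field, with bit 0 carrying the new disable state.
	 */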
4057 	return MI_ARB_CHECK | 1 << 8 | state;
4058 }
4059 
4060 static int gen12_emit_flush_render(struct i915_request *request,
4061 				   u32 mode)
4062 {
4063 	if (mode & EMIT_FLUSH) {
4064 		u32 flags = 0;
4065 		u32 *cs;
4066 
4067 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4068 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4069 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4070 		/* Wa_1409600907:tgl */
4071 		flags |= PIPE_CONTROL_DEPTH_STALL;
4072 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4073 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4074 		flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
4075 
4076 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4077 		flags |= PIPE_CONTROL_QW_WRITE;
4078 
4079 		flags |= PIPE_CONTROL_CS_STALL;
4080 
4081 		cs = intel_ring_begin(request, 6);
4082 		if (IS_ERR(cs))
4083 			return PTR_ERR(cs);
4084 
4085 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4086 		intel_ring_advance(request, cs);
4087 	}
4088 
4089 	if (mode & EMIT_INVALIDATE) {
4090 		u32 flags = 0;
4091 		u32 *cs;
4092 
4093 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4094 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4095 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4096 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4097 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4098 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4099 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4100 		flags |= PIPE_CONTROL_L3_RO_CACHE_INVALIDATE;
4101 
4102 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4103 		flags |= PIPE_CONTROL_QW_WRITE;
4104 
4105 		flags |= PIPE_CONTROL_CS_STALL;
4106 
4107 		cs = intel_ring_begin(request, 8);
4108 		if (IS_ERR(cs))
4109 			return PTR_ERR(cs);
4110 
4111 		/*
4112 		 * Prevent the pre-parser from skipping past the TLB
4113 		 * invalidate and loading a stale page for the batch
4114 		 * buffer / request payload.
4115 		 */
4116 		*cs++ = preparser_disable(true);
4117 
4118 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4119 
4120 		*cs++ = preparser_disable(false);
4121 		intel_ring_advance(request, cs);
4122 
4123 		/*
4124 		 * Wa_1604544889:tgl
4125 		 */
4126 		if (IS_TGL_REVID(request->i915, TGL_REVID_A0, TGL_REVID_A0)) {
4127 			flags = 0;
4128 			flags |= PIPE_CONTROL_CS_STALL;
4129 			flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
4130 
4131 			flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4132 			flags |= PIPE_CONTROL_QW_WRITE;
4133 
4134 			cs = intel_ring_begin(request, 6);
4135 			if (IS_ERR(cs))
4136 				return PTR_ERR(cs);
4137 
4138 			cs = gen8_emit_pipe_control(cs, flags,
4139 						    LRC_PPHWSP_SCRATCH_ADDR);
4140 			intel_ring_advance(request, cs);
4141 		}
4142 	}
4143 
4144 	return 0;
4145 }
4146 
4147 /*
4148  * Reserve space for 2 NOOPs at the end of each request to be
4149  * used as a workaround for not being allowed to do lite
4150  * restore with HEAD==TAIL (WaIdleLiteRestore).
4151  */
4152 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4153 {
4154 	/* Ensure there's always at least one preemption point per-request. */
4155 	*cs++ = MI_ARB_CHECK;
4156 	*cs++ = MI_NOOP;
4157 	request->wa_tail = intel_ring_offset(request, cs);
4158 
4159 	return cs;
4160 }
4161 
4162 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
4163 {
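	/*
	 * Spin on the preempt semaphore in the HWSP until it reads back as
	 * zero, i.e. until ring_set_paused() releases the engine after a
	 * preempt-to-busy cycle.
	 */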
4164 	*cs++ = MI_SEMAPHORE_WAIT |
4165 		MI_SEMAPHORE_GLOBAL_GTT |
4166 		MI_SEMAPHORE_POLL |
4167 		MI_SEMAPHORE_SAD_EQ_SDD;
4168 	*cs++ = 0;
4169 	*cs++ = intel_hws_preempt_address(request->engine);
4170 	*cs++ = 0;
4171 
4172 	return cs;
4173 }
4174 
4175 static __always_inline u32 *
4176 gen8_emit_fini_breadcrumb_footer(struct i915_request *request,
4177 				 u32 *cs)
4178 {
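	/*
	 * Close out the request: signal the user interrupt, re-enable
	 * arbitration (preemption points) and, where semaphores are
	 * available, emit the preempt-to-busy busywait.
	 */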
4179 	*cs++ = MI_USER_INTERRUPT;
4180 
4181 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4182 	if (intel_engine_has_semaphores(request->engine))
4183 		cs = emit_preempt_busywait(request, cs);
4184 
4185 	request->tail = intel_ring_offset(request, cs);
4186 	assert_ring_tail_valid(request->ring, request->tail);
4187 
4188 	return gen8_emit_wa_tail(request, cs);
4189 }
4190 
4191 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
4192 {
4193 	cs = gen8_emit_ggtt_write(cs,
4194 				  request->fence.seqno,
4195 				  i915_request_active_timeline(request)->hwsp_offset,
4196 				  0);
4197 
4198 	return gen8_emit_fini_breadcrumb_footer(request, cs);
4199 }
4200 
4201 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4202 {
4203 	cs = gen8_emit_pipe_control(cs,
4204 				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4205 				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4206 				    PIPE_CONTROL_DC_FLUSH_ENABLE,
4207 				    0);
4208 
4209 	/* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
4210 	cs = gen8_emit_ggtt_write_rcs(cs,
4211 				      request->fence.seqno,
4212 				      i915_request_active_timeline(request)->hwsp_offset,
4213 				      PIPE_CONTROL_FLUSH_ENABLE |
4214 				      PIPE_CONTROL_CS_STALL);
4215 
4216 	return gen8_emit_fini_breadcrumb_footer(request, cs);
4217 }
4218 
4219 static u32 *
4220 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4221 {
4222 	cs = gen8_emit_ggtt_write_rcs(cs,
4223 				      request->fence.seqno,
4224 				      i915_request_active_timeline(request)->hwsp_offset,
4225 				      PIPE_CONTROL_CS_STALL |
4226 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
4227 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4228 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4229 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
4230 				      PIPE_CONTROL_FLUSH_ENABLE);
4231 
4232 	return gen8_emit_fini_breadcrumb_footer(request, cs);
4233 }
4234 
4235 /*
4236  * Note that the CS instruction pre-parser will not stall on the breadcrumb
4237  * flush and will continue pre-fetching the instructions after it before the
4238  * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
4239  * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
4240  * of the next request before the memory has been flushed, we're guaranteed that
4241  * we won't access the batch itself too early.
4242  * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
4243  * so, if the current request is modifying an instruction in the next request on
4244  * the same intel_context, we might pre-fetch and then execute the pre-update
4245  * instruction. To avoid this, the users of self-modifying code should either
4246  * disable the parser around the code emitting the memory writes, via a new flag
4247  * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
4248  * the in-kernel use-cases we've opted to use a separate context, see
4249  * reloc_gpu() as an example.
4250  * All the above applies only to the instructions themselves. Non-inline data
4251  * used by the instructions is not pre-fetched.
4252  */
4253 
4254 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
4255 {
4256 	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
4257 		MI_SEMAPHORE_GLOBAL_GTT |
4258 		MI_SEMAPHORE_POLL |
4259 		MI_SEMAPHORE_SAD_EQ_SDD;
4260 	*cs++ = 0;
4261 	*cs++ = intel_hws_preempt_address(request->engine);
4262 	*cs++ = 0;
4263 	*cs++ = 0;
4264 	*cs++ = MI_NOOP;
4265 
4266 	return cs;
4267 }
4268 
4269 static __always_inline u32 *
4270 gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs)
4271 {
4272 	*cs++ = MI_USER_INTERRUPT;
4273 
4274 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4275 	if (intel_engine_has_semaphores(request->engine))
4276 		cs = gen12_emit_preempt_busywait(request, cs);
4277 
4278 	request->tail = intel_ring_offset(request, cs);
4279 	assert_ring_tail_valid(request->ring, request->tail);
4280 
4281 	return gen8_emit_wa_tail(request, cs);
4282 }
4283 
4284 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
4285 {
4286 	cs = gen8_emit_ggtt_write(cs,
4287 				  request->fence.seqno,
4288 				  i915_request_active_timeline(request)->hwsp_offset,
4289 				  0);
4290 
4291 	return gen12_emit_fini_breadcrumb_footer(request, cs);
4292 }
4293 
4294 static u32 *
4295 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4296 {
4297 	cs = gen8_emit_ggtt_write_rcs(cs,
4298 				      request->fence.seqno,
4299 				      i915_request_active_timeline(request)->hwsp_offset,
4300 				      PIPE_CONTROL_CS_STALL |
4301 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
4302 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4303 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4304 				      /* Wa_1409600907:tgl */
4305 				      PIPE_CONTROL_DEPTH_STALL |
4306 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
4307 				      PIPE_CONTROL_FLUSH_ENABLE |
4308 				      PIPE_CONTROL_HDC_PIPELINE_FLUSH);
4309 
4310 	return gen12_emit_fini_breadcrumb_footer(request, cs);
4311 }
4312 
4313 static void execlists_park(struct intel_engine_cs *engine)
4314 {
4315 	cancel_timer(&engine->execlists.timer);
4316 	cancel_timer(&engine->execlists.preempt);
4317 }
4318 
4319 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
4320 {
4321 	engine->submit_request = execlists_submit_request;
4322 	engine->schedule = i915_schedule;
4323 	engine->execlists.tasklet.func = execlists_submission_tasklet;
4324 
4325 	engine->reset.prepare = execlists_reset_prepare;
4326 	engine->reset.rewind = execlists_reset_rewind;
4327 	engine->reset.cancel = execlists_reset_cancel;
4328 	engine->reset.finish = execlists_reset_finish;
4329 
4330 	engine->park = execlists_park;
4331 	engine->unpark = NULL;
4332 
4333 	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
4334 	if (!intel_vgpu_active(engine->i915)) {
4335 		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
4336 		if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
4337 			engine->flags |= I915_ENGINE_HAS_PREEMPTION;
4338 	}
4339 
4340 	if (INTEL_GEN(engine->i915) >= 12)
4341 		engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
4342 
4343 	if (intel_engine_has_preemption(engine))
4344 		engine->emit_bb_start = gen8_emit_bb_start;
4345 	else
4346 		engine->emit_bb_start = gen8_emit_bb_start_noarb;
4347 }
4348 
4349 static void execlists_shutdown(struct intel_engine_cs *engine)
4350 {
4351 	/* Synchronise with residual timers and any softirq they raise */
4352 	del_timer_sync(&engine->execlists.timer);
4353 	del_timer_sync(&engine->execlists.preempt);
4354 	tasklet_kill(&engine->execlists.tasklet);
4355 }
4356 
4357 static void execlists_release(struct intel_engine_cs *engine)
4358 {
4359 	execlists_shutdown(engine);
4360 
4361 	intel_engine_cleanup_common(engine);
4362 	lrc_destroy_wa_ctx(engine);
4363 }
4364 
4365 static void
4366 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
4367 {
4368 	/* Default vfuncs which can be overridden by each engine. */
4369 
4370 	engine->resume = execlists_resume;
4371 
4372 	engine->cops = &execlists_context_ops;
4373 	engine->request_alloc = execlists_request_alloc;
4374 
4375 	engine->emit_flush = gen8_emit_flush;
4376 	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
4377 	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
4378 	if (INTEL_GEN(engine->i915) >= 12)
4379 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
4380 
4381 	engine->set_default_submission = intel_execlists_set_default_submission;
4382 
4383 	if (INTEL_GEN(engine->i915) < 11) {
4384 		engine->irq_enable = gen8_logical_ring_enable_irq;
4385 		engine->irq_disable = gen8_logical_ring_disable_irq;
4386 	} else {
4387 		/*
4388 		 * TODO: On Gen11 interrupt masks need to be clear
4389 		 * to allow C6 entry. Keep interrupts enabled
4390 		 * and take the hit of generating extra interrupts
4391 		 * until a more refined solution exists.
4392 		 */
4393 	}
4394 }
4395 
4396 static inline void
4397 logical_ring_default_irqs(struct intel_engine_cs *engine)
4398 {
4399 	unsigned int shift = 0;
4400 
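	/*
	 * Before gen11 the GT interrupt registers pack several engines into
	 * a shared dword, so each engine's bits sit at a per-engine shift;
	 * gen11+ uses per-class/instance registers and needs no shift.
	 */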
4401 	if (INTEL_GEN(engine->i915) < 11) {
4402 		const u8 irq_shifts[] = {
4403 			[RCS0]  = GEN8_RCS_IRQ_SHIFT,
4404 			[BCS0]  = GEN8_BCS_IRQ_SHIFT,
4405 			[VCS0]  = GEN8_VCS0_IRQ_SHIFT,
4406 			[VCS1]  = GEN8_VCS1_IRQ_SHIFT,
4407 			[VECS0] = GEN8_VECS_IRQ_SHIFT,
4408 		};
4409 
4410 		shift = irq_shifts[engine->id];
4411 	}
4412 
4413 	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
4414 	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
4415 	engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
4416 }
4417 
4418 static void rcs_submission_override(struct intel_engine_cs *engine)
4419 {
4420 	switch (INTEL_GEN(engine->i915)) {
4421 	case 12:
4422 		engine->emit_flush = gen12_emit_flush_render;
4423 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
4424 		break;
4425 	case 11:
4426 		engine->emit_flush = gen11_emit_flush_render;
4427 		engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
4428 		break;
4429 	default:
4430 		engine->emit_flush = gen8_emit_flush_render;
4431 		engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
4432 		break;
4433 	}
4434 }
4435 
4436 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
4437 {
4438 	struct intel_engine_execlists * const execlists = &engine->execlists;
4439 	struct drm_i915_private *i915 = engine->i915;
4440 	struct intel_uncore *uncore = engine->uncore;
4441 	u32 base = engine->mmio_base;
4442 
4443 	tasklet_init(&engine->execlists.tasklet,
4444 		     execlists_submission_tasklet, (unsigned long)engine);
4445 	timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
4446 	timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
4447 
4448 	logical_ring_default_vfuncs(engine);
4449 	logical_ring_default_irqs(engine);
4450 
4451 	if (engine->class == RENDER_CLASS)
4452 		rcs_submission_override(engine);
4453 
4454 	if (intel_init_workaround_bb(engine))
4455 		/*
4456 		 * We continue even if we fail to initialize the WA batch
4457 		 * because we only expect rare glitches, nothing critical
4458 		 * enough to prevent us from using the GPU.
4459 		 */
4460 		DRM_ERROR("WA batch buffer initialization failed\n");
4461 
4462 	if (HAS_LOGICAL_RING_ELSQ(i915)) {
4463 		execlists->submit_reg = uncore->regs +
4464 			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
4465 		execlists->ctrl_reg = uncore->regs +
4466 			i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
4467 	} else {
4468 		execlists->submit_reg = uncore->regs +
4469 			i915_mmio_reg_offset(RING_ELSP(base));
4470 	}
4471 
4472 	execlists->csb_status =
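	/*
	 * The context status buffer (CSB) entries and their write pointer
	 * are mirrored into the HWSP, so we read them from memory rather
	 * than via mmio.
	 */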
4473 		&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
4474 
4475 	execlists->csb_write =
4476 		&engine->status_page.addr[intel_hws_csb_write_index(i915)];
4477 
4478 	if (INTEL_GEN(i915) < 11)
4479 		execlists->csb_size = GEN8_CSB_ENTRIES;
4480 	else
4481 		execlists->csb_size = GEN11_CSB_ENTRIES;
4482 
4483 	reset_csb_pointers(engine);
4484 
4485 	/* Finally, take ownership and responsibility for cleanup! */
4486 	engine->release = execlists_release;
4487 
4488 	return 0;
4489 }
4490 
4491 static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine)
4492 {
4493 	u32 indirect_ctx_offset;
4494 
4495 	switch (INTEL_GEN(engine->i915)) {
4496 	default:
4497 		MISSING_CASE(INTEL_GEN(engine->i915));
4498 		/* fall through */
4499 	case 12:
4500 		indirect_ctx_offset =
4501 			GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4502 		break;
4503 	case 11:
4504 		indirect_ctx_offset =
4505 			GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4506 		break;
4507 	case 10:
4508 		indirect_ctx_offset =
4509 			GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4510 		break;
4511 	case 9:
4512 		indirect_ctx_offset =
4513 			GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4514 		break;
4515 	case 8:
4516 		indirect_ctx_offset =
4517 			GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4518 		break;
4519 	}
4520 
4521 	return indirect_ctx_offset;
4522 }
4523 
4525 static void init_common_reg_state(u32 * const regs,
4526 				  const struct intel_engine_cs *engine,
4527 				  const struct intel_ring *ring,
4528 				  bool inhibit)
4529 {
4530 	u32 ctl;
4531 
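	/*
	 * CTX_CONTEXT_CONTROL is a masked register: the _MASKED_BIT_*()
	 * helpers set the write-enable bits in the upper 16 bits alongside
	 * the requested values.
	 */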
4532 	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
4533 	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
4534 	if (inhibit)
4535 		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
4536 	if (INTEL_GEN(engine->i915) < 11)
4537 		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
4538 					   CTX_CTRL_RS_CTX_ENABLE);
4539 	regs[CTX_CONTEXT_CONTROL] = ctl;
4540 
4541 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
4542 }
4543 
4544 static void init_wa_bb_reg_state(u32 * const regs,
4545 				 const struct intel_engine_cs *engine,
4546 				 u32 pos_bb_per_ctx)
4547 {
4548 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
4549 
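	/*
	 * Bit 0 of BB_PER_CTX_PTR marks the per-context batch pointer as
	 * valid (hence the | 0x01), while the INDIRECT_CTX entry packs the
	 * batch address together with its size in cachelines.
	 */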
4550 	if (wa_ctx->per_ctx.size) {
4551 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
4552 
4553 		regs[pos_bb_per_ctx] =
4554 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
4555 	}
4556 
4557 	if (wa_ctx->indirect_ctx.size) {
4558 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
4559 
4560 		regs[pos_bb_per_ctx + 2] =
4561 			(ggtt_offset + wa_ctx->indirect_ctx.offset) |
4562 			(wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
4563 
4564 		regs[pos_bb_per_ctx + 4] =
4565 			intel_lr_indirect_ctx_offset(engine) << 6;
4566 	}
4567 }
4568 
4569 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
4570 {
4571 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
4572 		/* 64b PPGTT (48bit canonical)
4573 		 * PDP0_DESCRIPTOR contains the base address to PML4 and
4574 		 * other PDP Descriptors are ignored.
4575 		 */
4576 		ASSIGN_CTX_PML4(ppgtt, regs);
4577 	} else {
4578 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
4579 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
4580 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
4581 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
4582 	}
4583 }
4584 
4585 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
4586 {
4587 	if (i915_is_ggtt(vm))
4588 		return i915_vm_to_ggtt(vm)->alias;
4589 	else
4590 		return i915_vm_to_ppgtt(vm);
4591 }
4592 
4593 static void execlists_init_reg_state(u32 *regs,
4594 				     const struct intel_context *ce,
4595 				     const struct intel_engine_cs *engine,
4596 				     const struct intel_ring *ring,
4597 				     bool inhibit)
4598 {
4599 	/*
4600 	 * A context is actually a big batch buffer with several
4601 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
4602 	 * values we are setting here are only for the first context restore:
4603 	 * on a subsequent save, the GPU will recreate this batchbuffer with new
4604 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
4605 	 * we are not initializing here).
4606 	 *
4607 	 * Must keep consistent with virtual_update_register_offsets().
4608 	 */
4609 	set_offsets(regs, reg_offsets(engine), engine, inhibit);
4610 
4611 	init_common_reg_state(regs, engine, ring, inhibit);
4612 	init_ppgtt_reg_state(regs, vm_alias(ce->vm));
4613 
4614 	init_wa_bb_reg_state(regs, engine,
4615 			     INTEL_GEN(engine->i915) >= 12 ?
4616 			     GEN12_CTX_BB_PER_CTX_PTR :
4617 			     CTX_BB_PER_CTX_PTR);
4618 
4619 	__reset_stop_ring(regs, engine);
4620 }
4621 
4622 static int
4623 populate_lr_context(struct intel_context *ce,
4624 		    struct drm_i915_gem_object *ctx_obj,
4625 		    struct intel_engine_cs *engine,
4626 		    struct intel_ring *ring)
4627 {
4628 	bool inhibit = true;
4629 	void *vaddr;
4630 	int ret;
4631 
4632 	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
4633 	if (IS_ERR(vaddr)) {
4634 		ret = PTR_ERR(vaddr);
4635 		DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
4636 		return ret;
4637 	}
4638 
4639 	set_redzone(vaddr, engine);
4640 
4641 	if (engine->default_state) {
4642 		void *defaults;
4643 
4644 		defaults = i915_gem_object_pin_map(engine->default_state,
4645 						   I915_MAP_WB);
4646 		if (IS_ERR(defaults)) {
4647 			ret = PTR_ERR(defaults);
4648 			goto err_unpin_ctx;
4649 		}
4650 
4651 		memcpy(vaddr, defaults, engine->context_size);
4652 		i915_gem_object_unpin_map(engine->default_state);
4653 		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
4654 		inhibit = false;
4655 	}
4656 
4657 	/* Clear the ppHWSP (inc. per-context counters) */
4658 	memset(vaddr, 0, PAGE_SIZE);
4659 
4660 	/*
4661 	 * The second page of the context object contains some registers which
4662 	 * must be set up prior to the first execution.
4663 	 */
4664 	execlists_init_reg_state(vaddr + LRC_STATE_PN * PAGE_SIZE,
4665 				 ce, engine, ring, inhibit);
4666 
4667 	ret = 0;
4668 err_unpin_ctx:
4669 	__i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
4670 	i915_gem_object_unpin_map(ctx_obj);
4671 	return ret;
4672 }
4673 
4674 static int __execlists_context_alloc(struct intel_context *ce,
4675 				     struct intel_engine_cs *engine)
4676 {
4677 	struct drm_i915_gem_object *ctx_obj;
4678 	struct intel_ring *ring;
4679 	struct i915_vma *vma;
4680 	u32 context_size;
4681 	int ret;
4682 
4683 	GEM_BUG_ON(ce->state);
4684 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
4685 
4686 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4687 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
4688 
4689 	ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
4690 	if (IS_ERR(ctx_obj))
4691 		return PTR_ERR(ctx_obj);
4692 
4693 	vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
4694 	if (IS_ERR(vma)) {
4695 		ret = PTR_ERR(vma);
4696 		goto error_deref_obj;
4697 	}
4698 
4699 	if (!ce->timeline) {
4700 		struct intel_timeline *tl;
4701 		struct i915_vma *hwsp;
4702 
4703 		/*
4704 		 * Use the static global HWSP for the kernel context, and
4705 		 * a dynamically allocated cacheline for everyone else.
4706 		 */
4707 		hwsp = NULL;
4708 		if (unlikely(intel_context_is_barrier(ce)))
4709 			hwsp = engine->status_page.vma;
4710 
4711 		tl = intel_timeline_create(engine->gt, hwsp);
4712 		if (IS_ERR(tl)) {
4713 			ret = PTR_ERR(tl);
4714 			goto error_deref_obj;
4715 		}
4716 
4717 		ce->timeline = tl;
4718 	}
4719 
4720 	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
4721 	if (IS_ERR(ring)) {
4722 		ret = PTR_ERR(ring);
4723 		goto error_deref_obj;
4724 	}
4725 
4726 	ret = populate_lr_context(ce, ctx_obj, engine, ring);
4727 	if (ret) {
4728 		DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
4729 		goto error_ring_free;
4730 	}
4731 
4732 	ce->ring = ring;
4733 	ce->state = vma;
4734 
4735 	return 0;
4736 
4737 error_ring_free:
4738 	intel_ring_put(ring);
4739 error_deref_obj:
4740 	i915_gem_object_put(ctx_obj);
4741 	return ret;
4742 }
4743 
4744 static struct list_head *virtual_queue(struct virtual_engine *ve)
4745 {
4746 	return &ve->base.execlists.default_priolist.requests[0];
4747 }
4748 
4749 static void virtual_context_destroy(struct kref *kref)
4750 {
4751 	struct virtual_engine *ve =
4752 		container_of(kref, typeof(*ve), context.ref);
4753 	unsigned int n;
4754 
4755 	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
4756 	GEM_BUG_ON(ve->request);
4757 	GEM_BUG_ON(ve->context.inflight);
4758 
4759 	for (n = 0; n < ve->num_siblings; n++) {
4760 		struct intel_engine_cs *sibling = ve->siblings[n];
4761 		struct rb_node *node = &ve->nodes[sibling->id].rb;
4762 		unsigned long flags;
4763 
4764 		if (RB_EMPTY_NODE(node))
4765 			continue;
4766 
4767 		spin_lock_irqsave(&sibling->active.lock, flags);
4768 
4769 		/* Detachment is lazily performed in the execlists tasklet */
4770 		if (!RB_EMPTY_NODE(node))
4771 			rb_erase_cached(node, &sibling->execlists.virtual);
4772 
4773 		spin_unlock_irqrestore(&sibling->active.lock, flags);
4774 	}
4775 	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
4776 
4777 	if (ve->context.state)
4778 		__execlists_context_fini(&ve->context);
4779 	intel_context_fini(&ve->context);
4780 
4781 	kfree(ve->bonds);
4782 	kfree(ve);
4783 }
4784 
4785 static void virtual_engine_initial_hint(struct virtual_engine *ve)
4786 {
4787 	int swp;
4788 
4789 	/*
4790 	 * Pick a random sibling on starting to help spread the load around.
4791 	 *
4792 	 * New contexts are typically created with exactly the same order
4793 	 * of siblings, and often started in batches. Due to the way we iterate
4794 	 * the array of siblings when submitting requests, sibling[0] is
4795 	 * prioritised for dequeuing. If we make sure that sibling[0] is fairly
4796 	 * randomised across the system, we also help spread the load by the
4797 	 * first engine we inspect being different each time.
4798 	 *
4799 	 * NB This does not force us to execute on this engine, it will just
4800 	 * typically be the first we inspect for submission.
4801 	 */
4802 	swp = prandom_u32_max(ve->num_siblings);
4803 	if (!swp)
4804 		return;
4805 
4806 	swap(ve->siblings[swp], ve->siblings[0]);
4807 	if (!intel_engine_has_relative_mmio(ve->siblings[0]))
4808 		virtual_update_register_offsets(ve->context.lrc_reg_state,
4809 						ve->siblings[0]);
4810 }
4811 
4812 static int virtual_context_alloc(struct intel_context *ce)
4813 {
4814 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4815 
4816 	return __execlists_context_alloc(ce, ve->siblings[0]);
4817 }
4818 
4819 static int virtual_context_pin(struct intel_context *ce)
4820 {
4821 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4822 	int err;
4823 
4824 	/* Note: we must use a real engine class for setting up reg state */
4825 	err = __execlists_context_pin(ce, ve->siblings[0]);
4826 	if (err)
4827 		return err;
4828 
4829 	virtual_engine_initial_hint(ve);
4830 	return 0;
4831 }
4832 
4833 static void virtual_context_enter(struct intel_context *ce)
4834 {
4835 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4836 	unsigned int n;
4837 
4838 	for (n = 0; n < ve->num_siblings; n++)
4839 		intel_engine_pm_get(ve->siblings[n]);
4840 
4841 	intel_timeline_enter(ce->timeline);
4842 }
4843 
4844 static void virtual_context_exit(struct intel_context *ce)
4845 {
4846 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4847 	unsigned int n;
4848 
4849 	intel_timeline_exit(ce->timeline);
4850 
4851 	for (n = 0; n < ve->num_siblings; n++)
4852 		intel_engine_pm_put(ve->siblings[n]);
4853 }
4854 
4855 static const struct intel_context_ops virtual_context_ops = {
4856 	.alloc = virtual_context_alloc,
4857 
4858 	.pin = virtual_context_pin,
4859 	.unpin = execlists_context_unpin,
4860 
4861 	.enter = virtual_context_enter,
4862 	.exit = virtual_context_exit,
4863 
4864 	.destroy = virtual_context_destroy,
4865 };
4866 
4867 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
4868 {
4869 	struct i915_request *rq;
4870 	intel_engine_mask_t mask;
4871 
4872 	rq = READ_ONCE(ve->request);
4873 	if (!rq)
4874 		return 0;
4875 
4876 	/* The rq is ready for submission; rq->execution_mask is now stable. */
4877 	mask = rq->execution_mask;
4878 	if (unlikely(!mask)) {
4879 		/* Invalid selection, submit to a random engine in error */
4880 		i915_request_skip(rq, -ENODEV);
4881 		mask = ve->siblings[0]->mask;
4882 	}
4883 
4884 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
4885 		     rq->fence.context, rq->fence.seqno,
4886 		     mask, ve->base.execlists.queue_priority_hint);
4887 
4888 	return mask;
4889 }
4890 
4891 static void virtual_submission_tasklet(unsigned long data)
4892 {
4893 	struct virtual_engine * const ve = (struct virtual_engine *)data;
4894 	const int prio = ve->base.execlists.queue_priority_hint;
4895 	intel_engine_mask_t mask;
4896 	unsigned int n;
4897 
4898 	rcu_read_lock();
4899 	mask = virtual_submission_mask(ve);
4900 	rcu_read_unlock();
4901 	if (unlikely(!mask))
4902 		return;
4903 
4904 	local_irq_disable();
4905 	for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
4906 		struct intel_engine_cs *sibling = ve->siblings[n];
4907 		struct ve_node * const node = &ve->nodes[sibling->id];
4908 		struct rb_node **parent, *rb;
4909 		bool first;
4910 
4911 		if (unlikely(!(mask & sibling->mask))) {
4912 			if (!RB_EMPTY_NODE(&node->rb)) {
4913 				spin_lock(&sibling->active.lock);
4914 				rb_erase_cached(&node->rb,
4915 						&sibling->execlists.virtual);
4916 				RB_CLEAR_NODE(&node->rb);
4917 				spin_unlock(&sibling->active.lock);
4918 			}
4919 			continue;
4920 		}
4921 
4922 		spin_lock(&sibling->active.lock);
4923 
4924 		if (!RB_EMPTY_NODE(&node->rb)) {
4925 			/*
4926 			 * Cheat and avoid rebalancing the tree if we can
4927 			 * reuse this node in situ.
4928 			 */
4929 			first = rb_first_cached(&sibling->execlists.virtual) ==
4930 				&node->rb;
4931 			if (prio == node->prio || (prio > node->prio && first))
4932 				goto submit_engine;
4933 
4934 			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
4935 		}
4936 
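		/*
		 * Insert this node into the sibling's rb-tree of virtual
		 * requests, ordered by priority (highest leftmost), noting
		 * whether we become the new leftmost entry.
		 */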
4937 		rb = NULL;
4938 		first = true;
4939 		parent = &sibling->execlists.virtual.rb_root.rb_node;
4940 		while (*parent) {
4941 			struct ve_node *other;
4942 
4943 			rb = *parent;
4944 			other = rb_entry(rb, typeof(*other), rb);
4945 			if (prio > other->prio) {
4946 				parent = &rb->rb_left;
4947 			} else {
4948 				parent = &rb->rb_right;
4949 				first = false;
4950 			}
4951 		}
4952 
4953 		rb_link_node(&node->rb, rb, parent);
4954 		rb_insert_color_cached(&node->rb,
4955 				       &sibling->execlists.virtual,
4956 				       first);
4957 
4958 submit_engine:
4959 		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
4960 		node->prio = prio;
4961 		if (first && prio > sibling->execlists.queue_priority_hint) {
4962 			sibling->execlists.queue_priority_hint = prio;
4963 			tasklet_hi_schedule(&sibling->execlists.tasklet);
4964 		}
4965 
4966 		spin_unlock(&sibling->active.lock);
4967 	}
4968 	local_irq_enable();
4969 }
4970 
4971 static void virtual_submit_request(struct i915_request *rq)
4972 {
4973 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
4974 	struct i915_request *old;
4975 	unsigned long flags;
4976 
4977 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
4978 		     rq->fence.context,
4979 		     rq->fence.seqno);
4980 
4981 	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
4982 
4983 	spin_lock_irqsave(&ve->base.active.lock, flags);
4984 
4985 	old = ve->request;
4986 	if (old) { /* background completion event from preempt-to-busy */
4987 		GEM_BUG_ON(!i915_request_completed(old));
4988 		__i915_request_submit(old);
4989 		i915_request_put(old);
4990 	}
4991 
4992 	if (i915_request_completed(rq)) {
4993 		__i915_request_submit(rq);
4994 
4995 		ve->base.execlists.queue_priority_hint = INT_MIN;
4996 		ve->request = NULL;
4997 	} else {
4998 		ve->base.execlists.queue_priority_hint = rq_prio(rq);
4999 		ve->request = i915_request_get(rq);
5000 
5001 		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5002 		list_move_tail(&rq->sched.link, virtual_queue(ve));
5003 
5004 		tasklet_schedule(&ve->base.execlists.tasklet);
5005 	}
5006 
5007 	spin_unlock_irqrestore(&ve->base.active.lock, flags);
5008 }
5009 
5010 static struct ve_bond *
5011 virtual_find_bond(struct virtual_engine *ve,
5012 		  const struct intel_engine_cs *master)
5013 {
5014 	int i;
5015 
5016 	for (i = 0; i < ve->num_bonds; i++) {
5017 		if (ve->bonds[i].master == master)
5018 			return &ve->bonds[i];
5019 	}
5020 
5021 	return NULL;
5022 }
5023 
5024 static void
5025 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
5026 {
5027 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
5028 	intel_engine_mask_t allowed, exec;
5029 	struct ve_bond *bond;
5030 
5031 	allowed = ~to_request(signal)->engine->mask;
5032 
5033 	bond = virtual_find_bond(ve, to_request(signal)->engine);
5034 	if (bond)
5035 		allowed &= bond->sibling_mask;
5036 
5037 	/* Restrict the bonded request to run on only the available engines */
5038 	exec = READ_ONCE(rq->execution_mask);
5039 	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
5040 		;
5041 
5042 	/* Prevent the master from being re-run on the bonded engines */
5043 	to_request(signal)->execution_mask &= ~allowed;
5044 }
5045 
5046 struct intel_context *
5047 intel_execlists_create_virtual(struct intel_engine_cs **siblings,
5048 			       unsigned int count)
5049 {
5050 	struct virtual_engine *ve;
5051 	unsigned int n;
5052 	int err;
5053 
5054 	if (count == 0)
5055 		return ERR_PTR(-EINVAL);
5056 
5057 	if (count == 1)
5058 		return intel_context_create(siblings[0]);
5059 
5060 	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
5061 	if (!ve)
5062 		return ERR_PTR(-ENOMEM);
5063 
5064 	ve->base.i915 = siblings[0]->i915;
5065 	ve->base.gt = siblings[0]->gt;
5066 	ve->base.uncore = siblings[0]->uncore;
5067 	ve->base.id = -1;
5068 
5069 	ve->base.class = OTHER_CLASS;
5070 	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
5071 	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5072 	ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5073 
5074 	/*
5075 	 * The decision on whether to submit a request using semaphores
5076 	 * depends on the saturated state of the engine. We only compute
5077 	 * this during HW submission of the request, and we need for this
5078 	 * state to be globally applied to all requests being submitted
5079 	 * to this engine. Virtual engines encompass more than one physical
5080 	 * engine and so we cannot accurately tell in advance if one of those
5081 	 * engines is already saturated and so cannot afford to use a semaphore
5082 	 * and be pessimized in priority for doing so -- if we are the only
5083 	 * context using semaphores after all other clients have stopped, we
5084 	 * will be starved on the saturated system. Such a global switch for
5085 	 * semaphores is less than ideal, but alas is the current compromise.
5086 	 */
5087 	ve->base.saturated = ALL_ENGINES;
5088 
5089 	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
5090 
5091 	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
5092 	intel_engine_init_breadcrumbs(&ve->base);
5093 	intel_engine_init_execlists(&ve->base);
5094 
5095 	ve->base.cops = &virtual_context_ops;
5096 	ve->base.request_alloc = execlists_request_alloc;
5097 
5098 	ve->base.schedule = i915_schedule;
5099 	ve->base.submit_request = virtual_submit_request;
5100 	ve->base.bond_execute = virtual_bond_execute;
5101 
5102 	INIT_LIST_HEAD(virtual_queue(ve));
5103 	ve->base.execlists.queue_priority_hint = INT_MIN;
5104 	tasklet_init(&ve->base.execlists.tasklet,
5105 		     virtual_submission_tasklet,
5106 		     (unsigned long)ve);
5107 
5108 	intel_context_init(&ve->context, &ve->base);
5109 
5110 	for (n = 0; n < count; n++) {
5111 		struct intel_engine_cs *sibling = siblings[n];
5112 
5113 		GEM_BUG_ON(!is_power_of_2(sibling->mask));
5114 		if (sibling->mask & ve->base.mask) {
5115 			DRM_DEBUG("duplicate %s entry in load balancer\n",
5116 				  sibling->name);
5117 			err = -EINVAL;
5118 			goto err_put;
5119 		}
5120 
5121 		/*
5122 		 * The virtual engine implementation is tightly coupled to
5123 		 * the execlists backend -- we push requests directly
5124 		 * into a tree inside each physical engine. We could support
5125 		 * layering if we handle cloning of the requests and
5126 		 * submitting a copy into each backend.
5127 		 */
5128 		if (sibling->execlists.tasklet.func !=
5129 		    execlists_submission_tasklet) {
5130 			err = -ENODEV;
5131 			goto err_put;
5132 		}
5133 
5134 		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
5135 		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
5136 
5137 		ve->siblings[ve->num_siblings++] = sibling;
5138 		ve->base.mask |= sibling->mask;
5139 
5140 		/*
5141 		 * All physical engines must be compatible for their emission
5142 		 * functions (as we build the instructions during request
5143 		 * construction and do not alter them before submission
5144 		 * on the physical engine). We use the engine class as a guide
5145 		 * here, although that could be refined.
5146 		 */
5147 		if (ve->base.class != OTHER_CLASS) {
5148 			if (ve->base.class != sibling->class) {
5149 				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
5150 					  sibling->class, ve->base.class);
5151 				err = -EINVAL;
5152 				goto err_put;
5153 			}
5154 			continue;
5155 		}
5156 
5157 		ve->base.class = sibling->class;
5158 		ve->base.uabi_class = sibling->uabi_class;
5159 		snprintf(ve->base.name, sizeof(ve->base.name),
5160 			 "v%dx%d", ve->base.class, count);
5161 		ve->base.context_size = sibling->context_size;
5162 
5163 		ve->base.emit_bb_start = sibling->emit_bb_start;
5164 		ve->base.emit_flush = sibling->emit_flush;
5165 		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
5166 		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
5167 		ve->base.emit_fini_breadcrumb_dw =
5168 			sibling->emit_fini_breadcrumb_dw;
5169 
5170 		ve->base.flags = sibling->flags;
5171 	}
5172 
5173 	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
5174 
5175 	return &ve->context;
5176 
5177 err_put:
5178 	intel_context_put(&ve->context);
5179 	return ERR_PTR(err);
5180 }
5181 
5182 struct intel_context *
5183 intel_execlists_clone_virtual(struct intel_engine_cs *src)
5184 {
5185 	struct virtual_engine *se = to_virtual_engine(src);
5186 	struct intel_context *dst;
5187 
5188 	dst = intel_execlists_create_virtual(se->siblings,
5189 					     se->num_siblings);
5190 	if (IS_ERR(dst))
5191 		return dst;
5192 
5193 	if (se->num_bonds) {
5194 		struct virtual_engine *de = to_virtual_engine(dst->engine);
5195 
5196 		de->bonds = kmemdup(se->bonds,
5197 				    sizeof(*se->bonds) * se->num_bonds,
5198 				    GFP_KERNEL);
5199 		if (!de->bonds) {
5200 			intel_context_put(dst);
5201 			return ERR_PTR(-ENOMEM);
5202 		}
5203 
5204 		de->num_bonds = se->num_bonds;
5205 	}
5206 
5207 	return dst;
5208 }
5209 
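/*
 * intel_virtual_engine_attach_bond() records a bonding restriction on the
 * virtual engine: when a bonded submission is triggered by a request
 * running on @master, only siblings whose bits are set in the bond's
 * sibling_mask remain eligible, and this call adds @sibling to that mask
 * (creating the bond entry on first use).
 *
 * Illustrative sketch (hypothetical engines; @ve is assumed to be the
 * virtual engine and @vcs1 one of its physical siblings):
 *
 *	err = intel_virtual_engine_attach_bond(ve, master, vcs1);
 *	if (err)
 *		return err;
 */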
5210 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
5211 				     const struct intel_engine_cs *master,
5212 				     const struct intel_engine_cs *sibling)
5213 {
5214 	struct virtual_engine *ve = to_virtual_engine(engine);
5215 	struct ve_bond *bond;
5216 	int n;
5217 
5218 	/* Sanity check that the sibling is part of this virtual engine */
5219 	for (n = 0; n < ve->num_siblings; n++)
5220 		if (sibling == ve->siblings[n])
5221 			break;
5222 	if (n == ve->num_siblings)
5223 		return -EINVAL;
5224 
5225 	bond = virtual_find_bond(ve, master);
5226 	if (bond) {
5227 		bond->sibling_mask |= sibling->mask;
5228 		return 0;
5229 	}
5230 
5231 	bond = krealloc(ve->bonds,
5232 			sizeof(*bond) * (ve->num_bonds + 1),
5233 			GFP_KERNEL);
5234 	if (!bond)
5235 		return -ENOMEM;
5236 
5237 	bond[ve->num_bonds].master = master;
5238 	bond[ve->num_bonds].sibling_mask = sibling->mask;
5239 
5240 	ve->bonds = bond;
5241 	ve->num_bonds++;
5242 
5243 	return 0;
5244 }
5245 
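/*
 * intel_virtual_engine_get_sibling() returns the @sibling'th physical
 * engine backing the virtual engine, or NULL once the index runs past
 * ve->num_siblings, which makes it convenient for walking the whole set.
 *
 * Illustrative sketch (a simple enumeration loop):
 *
 *	unsigned int n = 0;
 *	struct intel_engine_cs *sibling;
 *
 *	while ((sibling = intel_virtual_engine_get_sibling(engine, n++)))
 *		pr_debug("sibling %u: %s\n", n - 1, sibling->name);
 */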
5246 struct intel_engine_cs *
5247 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
5248 				 unsigned int sibling)
5249 {
5250 	struct virtual_engine *ve = to_virtual_engine(engine);
5251 
5252 	if (sibling >= ve->num_siblings)
5253 		return NULL;
5254 
5255 	return ve->siblings[sibling];
5256 }
5257 
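/*
 * intel_execlists_show_requests() dumps the engine's requests into a
 * drm_printer for debugging: currently executing requests are prefixed
 * with "E", requests still waiting in the priority tree with "Q", and
 * requests parked on attached virtual engines with "V". At most @max
 * entries are printed per list, with any excess summarised by a
 * "...skipping N..." line before the final entry.
 */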
5258 void intel_execlists_show_requests(struct intel_engine_cs *engine,
5259 				   struct drm_printer *m,
5260 				   void (*show_request)(struct drm_printer *m,
5261 							struct i915_request *rq,
5262 							const char *prefix),
5263 				   unsigned int max)
5264 {
5265 	const struct intel_engine_execlists *execlists = &engine->execlists;
5266 	struct i915_request *rq, *last;
5267 	unsigned long flags;
5268 	unsigned int count;
5269 	struct rb_node *rb;
5270 
5271 	spin_lock_irqsave(&engine->active.lock, flags);
5272 
5273 	last = NULL;
5274 	count = 0;
5275 	list_for_each_entry(rq, &engine->active.requests, sched.link) {
5276 		if (count++ < max - 1)
5277 			show_request(m, rq, "\t\tE ");
5278 		else
5279 			last = rq;
5280 	}
5281 	if (last) {
5282 		if (count > max) {
5283 			drm_printf(m,
5284 				   "\t\t...skipping %d executing requests...\n",
5285 				   count - max);
5286 		}
5287 		show_request(m, last, "\t\tE ");
5288 	}
5289 
5290 	last = NULL;
5291 	count = 0;
5292 	if (execlists->queue_priority_hint != INT_MIN)
5293 		drm_printf(m, "\t\tQueue priority hint: %d\n",
5294 			   execlists->queue_priority_hint);
5295 	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
5296 		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
5297 		int i;
5298 
5299 		priolist_for_each_request(rq, p, i) {
5300 			if (count++ < max - 1)
5301 				show_request(m, rq, "\t\tQ ");
5302 			else
5303 				last = rq;
5304 		}
5305 	}
5306 	if (last) {
5307 		if (count > max) {
5308 			drm_printf(m,
5309 				   "\t\t...skipping %d queued requests...\n",
5310 				   count - max);
5311 		}
5312 		show_request(m, last, "\t\tQ ");
5313 	}
5314 
5315 	last = NULL;
5316 	count = 0;
5317 	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
5318 		struct virtual_engine *ve =
5319 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
5320 		struct i915_request *rq = READ_ONCE(ve->request);
5321 
5322 		if (rq) {
5323 			if (count++ < max - 1)
5324 				show_request(m, rq, "\t\tV ");
5325 			else
5326 				last = rq;
5327 		}
5328 	}
5329 	if (last) {
5330 		if (count > max) {
5331 			drm_printf(m,
5332 				   "\t\t...skipping %d virtual requests...\n",
5333 				   count - max);
5334 		}
5335 		show_request(m, last, "\t\tV ");
5336 	}
5337 
5338 	spin_unlock_irqrestore(&engine->active.lock, flags);
5339 }
5340 
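/*
 * intel_lr_context_reset() prepares a pinned context for re-submission
 * after a GPU reset: @head is the ring position to restart from, and
 * @scrub requests that the context image first be rebuilt from the
 * engine's default state in case the hang corrupted it.
 */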
5341 void intel_lr_context_reset(struct intel_engine_cs *engine,
5342 			    struct intel_context *ce,
5343 			    u32 head,
5344 			    bool scrub)
5345 {
5346 	GEM_BUG_ON(!intel_context_is_pinned(ce));
5347 
5348 	/*
5349 	 * We want a simple context + ring to execute the breadcrumb update.
5350 	 * We cannot rely on the context being intact across the GPU hang,
5351 	 * so clear it and rebuild just what we need for the breadcrumb.
5352 	 * All pending requests for this context will be zapped, and any
5353 	 * future request will be submitted only after userspace has had
5354 	 * the opportunity to recreate its own state.
5355 	 */
5356 	if (scrub)
5357 		restore_default_state(ce, engine);
5358 
5359 	/* Rerun the request; its payload has been neutered (if guilty). */
5360 	__execlists_update_reg_state(ce, engine, head);
5361 }
5362 
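/*
 * intel_engine_in_execlists_submission_mode() reports whether @engine is
 * currently driven by the execlists backend in this file, as opposed to
 * an alternative submission backend (e.g. the GuC), by checking which
 * set_default_submission hook is installed.
 */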
5363 bool
5364 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
5365 {
5366 	return engine->set_default_submission ==
5367 	       intel_execlists_set_default_submission;
5368 }
5369 
5370 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
5371 #include "selftest_lrc.c"
5372 #endif
5373