xref: /linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision 65c93628599dff4cd7cfb70130d1f6a2203731ea)
1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Ben Widawsky <ben@bwidawsk.net>
25  *    Michel Thierry <michel.thierry@intel.com>
26  *    Thomas Daniel <thomas.daniel@intel.com>
27  *    Oscar Mateo <oscar.mateo@intel.com>
28  *
29  */
30 
31 /**
32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
33  *
34  * Motivation:
35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36  * These expanded contexts enable a number of new abilities, especially
37  * "Execlists" (also implemented in this file).
38  *
39  * One of the main differences with the legacy HW contexts is that logical
40  * ring contexts incorporate many more things to the context's state, like
41  * PDPs or ringbuffer control registers:
42  *
43  * The reason why PDPs are included in the context is straightforward: as
44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45  * contained there mean you don't need to do a ppgtt->switch_mm yourself,
46  * instead, the GPU will do it for you on the context switch.
47  *
48  * But, what about the ringbuffer control registers (head, tail, etc..)?
49  * shouldn't we just need a set of those per engine command streamer? This is
50  * where the name "Logical Rings" starts to make sense: by virtualizing the
51  * rings, the engine cs shifts to a new "ring buffer" with every context
52  * switch. When you want to submit a workload to the GPU you: A) choose your
53  * context, B) find its appropriate virtualized ring, C) write commands to it
54  * and then, finally, D) tell the GPU to switch to that context.
55  *
56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57  * to a contexts is via a context execution list, ergo "Execlists".
58  *
59  * LRC implementation:
60  * Regarding the creation of contexts, we have:
61  *
62  * - One global default context.
63  * - One local default context for each opened fd.
64  * - One local extra context for each context create ioctl call.
65  *
66  * Now that ringbuffers belong per-context (and not per-engine, like before)
67  * and that contexts are uniquely tied to a given engine (and not reusable,
68  * like before) we need:
69  *
70  * - One ringbuffer per-engine inside each context.
71  * - One backing object per-engine inside each context.
72  *
73  * The global default context starts its life with these new objects fully
74  * allocated and populated. The local default context for each opened fd is
75  * more complex, because we don't know at creation time which engine is going
76  * to use them. To handle this, we have implemented a deferred creation of LR
77  * contexts:
78  *
79  * The local context starts its life as a hollow or blank holder, that only
80  * gets populated for a given engine once we receive an execbuffer. If later
81  * on we receive another execbuffer ioctl for the same context but a different
82  * engine, we allocate/populate a new ringbuffer and context backing object and
83  * so on.
84  *
85  * Finally, regarding local contexts created using the ioctl call: as they are
86  * only allowed with the render ring, we can allocate & populate them right
87  * away (no need to defer anything, at least for now).
88  *
89  * Execlists implementation:
90  * Execlists are the new method by which, on gen8+ hardware, workloads are
91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92  * This method works as follows:
93  *
94  * When a request is committed, its commands (the BB start and any leading or
95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96  * for the appropriate context. The tail pointer in the hardware context is not
97  * updated at this time, but instead, kept by the driver in the ringbuffer
98  * structure. A structure representing this request is added to a request queue
99  * for the appropriate engine: this structure contains a copy of the context's
100  * tail after the request was written to the ring buffer and a pointer to the
101  * context itself.
102  *
103  * If the engine's request queue was empty before the request was added, the
104  * queue is processed immediately. Otherwise the queue will be processed during
105  * a context switch interrupt. In any case, elements on the queue will get sent
106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107  * globally unique 20-bits submission ID.
108  *
109  * When execution of a request completes, the GPU updates the context status
110  * buffer with a context complete event and generates a context switch interrupt.
111  * During the interrupt handling, the driver examines the events in the buffer:
112  * for each context complete event, if the announced ID matches that on the head
113  * of the request queue, then that request is retired and removed from the queue.
114  *
115  * After processing, if any requests were retired and the queue is not empty
116  * then a new execution list can be submitted. The two requests at the front of
117  * the queue are next to be submitted but since a context may not occur twice in
118  * an execution list, if subsequent requests have the same ID as the first then
119  * the two requests must be combined. This is done simply by discarding requests
120  * at the head of the queue until either only one requests is left (in which case
121  * we use a NULL second context) or the first two requests have unique IDs.
122  *
123  * By always executing the first two requests in the queue the driver ensures
124  * that the GPU is kept as busy as possible. In the case where a single context
125  * completes but a second context is still executing, the request for this second
126  * context will be at the head of the queue when we remove the first one. This
127  * request will then be resubmitted along with a new request for a different context,
128  * which will cause the hardware to continue executing the second request and queue
129  * the new request (the GPU detects the condition of a context getting preempted
130  * with the same context and optimizes the context switch flow by not doing
131  * preemption, but just sampling the new tail pointer).
132  *
133  */
134 #include <linux/interrupt.h>
135 
136 #include "i915_drv.h"
137 #include "i915_perf.h"
138 #include "i915_trace.h"
139 #include "i915_vgpu.h"
140 #include "intel_context.h"
141 #include "intel_engine_pm.h"
142 #include "intel_gt.h"
143 #include "intel_gt_pm.h"
144 #include "intel_gt_requests.h"
145 #include "intel_lrc_reg.h"
146 #include "intel_mocs.h"
147 #include "intel_reset.h"
148 #include "intel_ring.h"
149 #include "intel_workarounds.h"
150 
151 #define RING_EXECLIST_QFULL		(1 << 0x2)
152 #define RING_EXECLIST1_VALID		(1 << 0x3)
153 #define RING_EXECLIST0_VALID		(1 << 0x4)
154 #define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
155 #define RING_EXECLIST1_ACTIVE		(1 << 0x11)
156 #define RING_EXECLIST0_ACTIVE		(1 << 0x12)
157 
158 #define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
159 #define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
160 #define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
161 #define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
162 #define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
163 #define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)
164 
165 #define GEN8_CTX_STATUS_COMPLETED_MASK \
166 	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
167 
168 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
169 
170 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
171 #define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
172 #define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
173 #define GEN12_IDLE_CTX_ID		0x7FF
174 #define GEN12_CSB_CTX_VALID(csb_dw) \
175 	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
176 
177 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
178 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
179 #define WA_TAIL_DWORDS 2
180 #define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
181 
182 struct virtual_engine {
183 	struct intel_engine_cs base;
184 	struct intel_context context;
185 
186 	/*
187 	 * We allow only a single request through the virtual engine at a time
188 	 * (each request in the timeline waits for the completion fence of
189 	 * the previous before being submitted). By restricting ourselves to
190 	 * only submitting a single request, each request is placed on to a
191 	 * physical to maximise load spreading (by virtue of the late greedy
192 	 * scheduling -- each real engine takes the next available request
193 	 * upon idling).
194 	 */
195 	struct i915_request *request;
196 
197 	/*
198 	 * We keep a rbtree of available virtual engines inside each physical
199 	 * engine, sorted by priority. Here we preallocate the nodes we need
200 	 * for the virtual engine, indexed by physical_engine->id.
201 	 */
202 	struct ve_node {
203 		struct rb_node rb;
204 		int prio;
205 	} nodes[I915_NUM_ENGINES];
206 
207 	/*
208 	 * Keep track of bonded pairs -- restrictions upon on our selection
209 	 * of physical engines any particular request may be submitted to.
210 	 * If we receive a submit-fence from a master engine, we will only
211 	 * use one of sibling_mask physical engines.
212 	 */
213 	struct ve_bond {
214 		const struct intel_engine_cs *master;
215 		intel_engine_mask_t sibling_mask;
216 	} *bonds;
217 	unsigned int num_bonds;
218 
219 	/* And finally, which physical engines this virtual engine maps onto. */
220 	unsigned int num_siblings;
221 	struct intel_engine_cs *siblings[0];
222 };
223 
224 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
225 {
226 	GEM_BUG_ON(!intel_engine_is_virtual(engine));
227 	return container_of(engine, struct virtual_engine, base);
228 }
229 
230 static int __execlists_context_alloc(struct intel_context *ce,
231 				     struct intel_engine_cs *engine);
232 
233 static void execlists_init_reg_state(u32 *reg_state,
234 				     const struct intel_context *ce,
235 				     const struct intel_engine_cs *engine,
236 				     const struct intel_ring *ring,
237 				     bool close);
238 static void
239 __execlists_update_reg_state(const struct intel_context *ce,
240 			     const struct intel_engine_cs *engine,
241 			     u32 head);
242 
243 static void mark_eio(struct i915_request *rq)
244 {
245 	if (i915_request_completed(rq))
246 		return;
247 
248 	GEM_BUG_ON(i915_request_signaled(rq));
249 
250 	dma_fence_set_error(&rq->fence, -EIO);
251 	i915_request_mark_complete(rq);
252 }
253 
254 static struct i915_request *
255 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
256 {
257 	struct i915_request *active = rq;
258 
259 	rcu_read_lock();
260 	list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
261 		if (i915_request_completed(rq))
262 			break;
263 
264 		active = rq;
265 	}
266 	rcu_read_unlock();
267 
268 	return active;
269 }
270 
271 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
272 {
273 	return (i915_ggtt_offset(engine->status_page.vma) +
274 		I915_GEM_HWS_PREEMPT_ADDR);
275 }
276 
277 static inline void
278 ring_set_paused(const struct intel_engine_cs *engine, int state)
279 {
280 	/*
281 	 * We inspect HWS_PREEMPT with a semaphore inside
282 	 * engine->emit_fini_breadcrumb. If the dword is true,
283 	 * the ring is paused as the semaphore will busywait
284 	 * until the dword is false.
285 	 */
286 	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
287 	if (state)
288 		wmb();
289 }
290 
291 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
292 {
293 	return rb_entry(rb, struct i915_priolist, node);
294 }
295 
296 static inline int rq_prio(const struct i915_request *rq)
297 {
298 	return rq->sched.attr.priority;
299 }
300 
301 static int effective_prio(const struct i915_request *rq)
302 {
303 	int prio = rq_prio(rq);
304 
305 	/*
306 	 * If this request is special and must not be interrupted at any
307 	 * cost, so be it. Note we are only checking the most recent request
308 	 * in the context and so may be masking an earlier vip request. It
309 	 * is hoped that under the conditions where nopreempt is used, this
310 	 * will not matter (i.e. all requests to that context will be
311 	 * nopreempt for as long as desired).
312 	 */
313 	if (i915_request_has_nopreempt(rq))
314 		prio = I915_PRIORITY_UNPREEMPTABLE;
315 
316 	/*
317 	 * On unwinding the active request, we give it a priority bump
318 	 * if it has completed waiting on any semaphore. If we know that
319 	 * the request has already started, we can prevent an unwanted
320 	 * preempt-to-idle cycle by taking that into account now.
321 	 */
322 	if (__i915_request_has_started(rq))
323 		prio |= I915_PRIORITY_NOSEMAPHORE;
324 
325 	/* Restrict mere WAIT boosts from triggering preemption */
326 	BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
327 	return prio | __NO_PREEMPTION;
328 }
329 
330 static int queue_prio(const struct intel_engine_execlists *execlists)
331 {
332 	struct i915_priolist *p;
333 	struct rb_node *rb;
334 
335 	rb = rb_first_cached(&execlists->queue);
336 	if (!rb)
337 		return INT_MIN;
338 
339 	/*
340 	 * As the priolist[] are inverted, with the highest priority in [0],
341 	 * we have to flip the index value to become priority.
342 	 */
343 	p = to_priolist(rb);
344 	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
345 }
346 
347 static inline bool need_preempt(const struct intel_engine_cs *engine,
348 				const struct i915_request *rq,
349 				struct rb_node *rb)
350 {
351 	int last_prio;
352 
353 	if (!intel_engine_has_semaphores(engine))
354 		return false;
355 
356 	/*
357 	 * Check if the current priority hint merits a preemption attempt.
358 	 *
359 	 * We record the highest value priority we saw during rescheduling
360 	 * prior to this dequeue, therefore we know that if it is strictly
361 	 * less than the current tail of ESLP[0], we do not need to force
362 	 * a preempt-to-idle cycle.
363 	 *
364 	 * However, the priority hint is a mere hint that we may need to
365 	 * preempt. If that hint is stale or we may be trying to preempt
366 	 * ourselves, ignore the request.
367 	 *
368 	 * More naturally we would write
369 	 *      prio >= max(0, last);
370 	 * except that we wish to prevent triggering preemption at the same
371 	 * priority level: the task that is running should remain running
372 	 * to preserve FIFO ordering of dependencies.
373 	 */
374 	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
375 	if (engine->execlists.queue_priority_hint <= last_prio)
376 		return false;
377 
378 	/*
379 	 * Check against the first request in ELSP[1], it will, thanks to the
380 	 * power of PI, be the highest priority of that context.
381 	 */
382 	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
383 	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
384 		return true;
385 
386 	if (rb) {
387 		struct virtual_engine *ve =
388 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
389 		bool preempt = false;
390 
391 		if (engine == ve->siblings[0]) { /* only preempt one sibling */
392 			struct i915_request *next;
393 
394 			rcu_read_lock();
395 			next = READ_ONCE(ve->request);
396 			if (next)
397 				preempt = rq_prio(next) > last_prio;
398 			rcu_read_unlock();
399 		}
400 
401 		if (preempt)
402 			return preempt;
403 	}
404 
405 	/*
406 	 * If the inflight context did not trigger the preemption, then maybe
407 	 * it was the set of queued requests? Pick the highest priority in
408 	 * the queue (the first active priolist) and see if it deserves to be
409 	 * running instead of ELSP[0].
410 	 *
411 	 * The highest priority request in the queue can not be either
412 	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
413 	 * context, it's priority would not exceed ELSP[0] aka last_prio.
414 	 */
415 	return queue_prio(&engine->execlists) > last_prio;
416 }
417 
418 __maybe_unused static inline bool
419 assert_priority_queue(const struct i915_request *prev,
420 		      const struct i915_request *next)
421 {
422 	/*
423 	 * Without preemption, the prev may refer to the still active element
424 	 * which we refuse to let go.
425 	 *
426 	 * Even with preemption, there are times when we think it is better not
427 	 * to preempt and leave an ostensibly lower priority request in flight.
428 	 */
429 	if (i915_request_is_active(prev))
430 		return true;
431 
432 	return rq_prio(prev) >= rq_prio(next);
433 }
434 
435 /*
436  * The context descriptor encodes various attributes of a context,
437  * including its GTT address and some flags. Because it's fairly
438  * expensive to calculate, we'll just do it once and cache the result,
439  * which remains valid until the context is unpinned.
440  *
441  * This is what a descriptor looks like, from LSB to MSB::
442  *
443  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
444  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
445  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
446  *      bits 53-54:    mbz, reserved for use by hardware
447  *      bits 55-63:    group ID, currently unused and set to 0
448  *
449  * Starting from Gen11, the upper dword of the descriptor has a new format:
450  *
451  *      bits 32-36:    reserved
452  *      bits 37-47:    SW context ID
453  *      bits 48:53:    engine instance
454  *      bit 54:        mbz, reserved for use by hardware
455  *      bits 55-60:    SW counter
456  *      bits 61-63:    engine class
457  *
458  * engine info, SW context ID and SW counter need to form a unique number
459  * (Context ID) per lrc.
460  */
461 static u64
462 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
463 {
464 	u64 desc;
465 
466 	desc = INTEL_LEGACY_32B_CONTEXT;
467 	if (i915_vm_is_4lvl(ce->vm))
468 		desc = INTEL_LEGACY_64B_CONTEXT;
469 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
470 
471 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
472 	if (IS_GEN(engine->i915, 8))
473 		desc |= GEN8_CTX_L3LLC_COHERENT;
474 
475 	desc |= i915_ggtt_offset(ce->state); /* bits 12-31 */
476 	/*
477 	 * The following 32bits are copied into the OA reports (dword 2).
478 	 * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
479 	 * anything below.
480 	 */
481 	if (INTEL_GEN(engine->i915) >= 11) {
482 		desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
483 								/* bits 48-53 */
484 
485 		desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
486 								/* bits 61-63 */
487 	}
488 
489 	return desc;
490 }
491 
492 static inline unsigned int dword_in_page(void *addr)
493 {
494 	return offset_in_page(addr) / sizeof(u32);
495 }
496 
497 static void set_offsets(u32 *regs,
498 			const u8 *data,
499 			const struct intel_engine_cs *engine,
500 			bool clear)
501 #define NOP(x) (BIT(7) | (x))
502 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
503 #define POSTED BIT(0)
504 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
505 #define REG16(x) \
506 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
507 	(((x) >> 2) & 0x7f)
508 #define END(x) 0, (x)
509 {
510 	const u32 base = engine->mmio_base;
511 
512 	while (*data) {
513 		u8 count, flags;
514 
515 		if (*data & BIT(7)) { /* skip */
516 			count = *data++ & ~BIT(7);
517 			if (clear)
518 				memset32(regs, MI_NOOP, count);
519 			regs += count;
520 			continue;
521 		}
522 
523 		count = *data & 0x3f;
524 		flags = *data >> 6;
525 		data++;
526 
527 		*regs = MI_LOAD_REGISTER_IMM(count);
528 		if (flags & POSTED)
529 			*regs |= MI_LRI_FORCE_POSTED;
530 		if (INTEL_GEN(engine->i915) >= 11)
531 			*regs |= MI_LRI_CS_MMIO;
532 		regs++;
533 
534 		GEM_BUG_ON(!count);
535 		do {
536 			u32 offset = 0;
537 			u8 v;
538 
539 			do {
540 				v = *data++;
541 				offset <<= 7;
542 				offset |= v & ~BIT(7);
543 			} while (v & BIT(7));
544 
545 			regs[0] = base + (offset << 2);
546 			if (clear)
547 				regs[1] = 0;
548 			regs += 2;
549 		} while (--count);
550 	}
551 
552 	if (clear) {
553 		u8 count = *++data;
554 
555 		/* Clear past the tail for HW access */
556 		GEM_BUG_ON(dword_in_page(regs) > count);
557 		memset32(regs, MI_NOOP, count - dword_in_page(regs));
558 
559 		/* Close the batch; used mainly by live_lrc_layout() */
560 		*regs = MI_BATCH_BUFFER_END;
561 		if (INTEL_GEN(engine->i915) >= 10)
562 			*regs |= BIT(0);
563 	}
564 }
565 
566 static const u8 gen8_xcs_offsets[] = {
567 	NOP(1),
568 	LRI(11, 0),
569 	REG16(0x244),
570 	REG(0x034),
571 	REG(0x030),
572 	REG(0x038),
573 	REG(0x03c),
574 	REG(0x168),
575 	REG(0x140),
576 	REG(0x110),
577 	REG(0x11c),
578 	REG(0x114),
579 	REG(0x118),
580 
581 	NOP(9),
582 	LRI(9, 0),
583 	REG16(0x3a8),
584 	REG16(0x28c),
585 	REG16(0x288),
586 	REG16(0x284),
587 	REG16(0x280),
588 	REG16(0x27c),
589 	REG16(0x278),
590 	REG16(0x274),
591 	REG16(0x270),
592 
593 	NOP(13),
594 	LRI(2, 0),
595 	REG16(0x200),
596 	REG(0x028),
597 
598 	END(80)
599 };
600 
601 static const u8 gen9_xcs_offsets[] = {
602 	NOP(1),
603 	LRI(14, POSTED),
604 	REG16(0x244),
605 	REG(0x034),
606 	REG(0x030),
607 	REG(0x038),
608 	REG(0x03c),
609 	REG(0x168),
610 	REG(0x140),
611 	REG(0x110),
612 	REG(0x11c),
613 	REG(0x114),
614 	REG(0x118),
615 	REG(0x1c0),
616 	REG(0x1c4),
617 	REG(0x1c8),
618 
619 	NOP(3),
620 	LRI(9, POSTED),
621 	REG16(0x3a8),
622 	REG16(0x28c),
623 	REG16(0x288),
624 	REG16(0x284),
625 	REG16(0x280),
626 	REG16(0x27c),
627 	REG16(0x278),
628 	REG16(0x274),
629 	REG16(0x270),
630 
631 	NOP(13),
632 	LRI(1, POSTED),
633 	REG16(0x200),
634 
635 	NOP(13),
636 	LRI(44, POSTED),
637 	REG(0x028),
638 	REG(0x09c),
639 	REG(0x0c0),
640 	REG(0x178),
641 	REG(0x17c),
642 	REG16(0x358),
643 	REG(0x170),
644 	REG(0x150),
645 	REG(0x154),
646 	REG(0x158),
647 	REG16(0x41c),
648 	REG16(0x600),
649 	REG16(0x604),
650 	REG16(0x608),
651 	REG16(0x60c),
652 	REG16(0x610),
653 	REG16(0x614),
654 	REG16(0x618),
655 	REG16(0x61c),
656 	REG16(0x620),
657 	REG16(0x624),
658 	REG16(0x628),
659 	REG16(0x62c),
660 	REG16(0x630),
661 	REG16(0x634),
662 	REG16(0x638),
663 	REG16(0x63c),
664 	REG16(0x640),
665 	REG16(0x644),
666 	REG16(0x648),
667 	REG16(0x64c),
668 	REG16(0x650),
669 	REG16(0x654),
670 	REG16(0x658),
671 	REG16(0x65c),
672 	REG16(0x660),
673 	REG16(0x664),
674 	REG16(0x668),
675 	REG16(0x66c),
676 	REG16(0x670),
677 	REG16(0x674),
678 	REG16(0x678),
679 	REG16(0x67c),
680 	REG(0x068),
681 
682 	END(176)
683 };
684 
685 static const u8 gen12_xcs_offsets[] = {
686 	NOP(1),
687 	LRI(13, POSTED),
688 	REG16(0x244),
689 	REG(0x034),
690 	REG(0x030),
691 	REG(0x038),
692 	REG(0x03c),
693 	REG(0x168),
694 	REG(0x140),
695 	REG(0x110),
696 	REG(0x1c0),
697 	REG(0x1c4),
698 	REG(0x1c8),
699 	REG(0x180),
700 	REG16(0x2b4),
701 
702 	NOP(5),
703 	LRI(9, POSTED),
704 	REG16(0x3a8),
705 	REG16(0x28c),
706 	REG16(0x288),
707 	REG16(0x284),
708 	REG16(0x280),
709 	REG16(0x27c),
710 	REG16(0x278),
711 	REG16(0x274),
712 	REG16(0x270),
713 
714 	END(80)
715 };
716 
717 static const u8 gen8_rcs_offsets[] = {
718 	NOP(1),
719 	LRI(14, POSTED),
720 	REG16(0x244),
721 	REG(0x034),
722 	REG(0x030),
723 	REG(0x038),
724 	REG(0x03c),
725 	REG(0x168),
726 	REG(0x140),
727 	REG(0x110),
728 	REG(0x11c),
729 	REG(0x114),
730 	REG(0x118),
731 	REG(0x1c0),
732 	REG(0x1c4),
733 	REG(0x1c8),
734 
735 	NOP(3),
736 	LRI(9, POSTED),
737 	REG16(0x3a8),
738 	REG16(0x28c),
739 	REG16(0x288),
740 	REG16(0x284),
741 	REG16(0x280),
742 	REG16(0x27c),
743 	REG16(0x278),
744 	REG16(0x274),
745 	REG16(0x270),
746 
747 	NOP(13),
748 	LRI(1, 0),
749 	REG(0x0c8),
750 
751 	END(80)
752 };
753 
754 static const u8 gen9_rcs_offsets[] = {
755 	NOP(1),
756 	LRI(14, POSTED),
757 	REG16(0x244),
758 	REG(0x34),
759 	REG(0x30),
760 	REG(0x38),
761 	REG(0x3c),
762 	REG(0x168),
763 	REG(0x140),
764 	REG(0x110),
765 	REG(0x11c),
766 	REG(0x114),
767 	REG(0x118),
768 	REG(0x1c0),
769 	REG(0x1c4),
770 	REG(0x1c8),
771 
772 	NOP(3),
773 	LRI(9, POSTED),
774 	REG16(0x3a8),
775 	REG16(0x28c),
776 	REG16(0x288),
777 	REG16(0x284),
778 	REG16(0x280),
779 	REG16(0x27c),
780 	REG16(0x278),
781 	REG16(0x274),
782 	REG16(0x270),
783 
784 	NOP(13),
785 	LRI(1, 0),
786 	REG(0xc8),
787 
788 	NOP(13),
789 	LRI(44, POSTED),
790 	REG(0x28),
791 	REG(0x9c),
792 	REG(0xc0),
793 	REG(0x178),
794 	REG(0x17c),
795 	REG16(0x358),
796 	REG(0x170),
797 	REG(0x150),
798 	REG(0x154),
799 	REG(0x158),
800 	REG16(0x41c),
801 	REG16(0x600),
802 	REG16(0x604),
803 	REG16(0x608),
804 	REG16(0x60c),
805 	REG16(0x610),
806 	REG16(0x614),
807 	REG16(0x618),
808 	REG16(0x61c),
809 	REG16(0x620),
810 	REG16(0x624),
811 	REG16(0x628),
812 	REG16(0x62c),
813 	REG16(0x630),
814 	REG16(0x634),
815 	REG16(0x638),
816 	REG16(0x63c),
817 	REG16(0x640),
818 	REG16(0x644),
819 	REG16(0x648),
820 	REG16(0x64c),
821 	REG16(0x650),
822 	REG16(0x654),
823 	REG16(0x658),
824 	REG16(0x65c),
825 	REG16(0x660),
826 	REG16(0x664),
827 	REG16(0x668),
828 	REG16(0x66c),
829 	REG16(0x670),
830 	REG16(0x674),
831 	REG16(0x678),
832 	REG16(0x67c),
833 	REG(0x68),
834 
835 	END(176)
836 };
837 
838 static const u8 gen11_rcs_offsets[] = {
839 	NOP(1),
840 	LRI(15, POSTED),
841 	REG16(0x244),
842 	REG(0x034),
843 	REG(0x030),
844 	REG(0x038),
845 	REG(0x03c),
846 	REG(0x168),
847 	REG(0x140),
848 	REG(0x110),
849 	REG(0x11c),
850 	REG(0x114),
851 	REG(0x118),
852 	REG(0x1c0),
853 	REG(0x1c4),
854 	REG(0x1c8),
855 	REG(0x180),
856 
857 	NOP(1),
858 	LRI(9, POSTED),
859 	REG16(0x3a8),
860 	REG16(0x28c),
861 	REG16(0x288),
862 	REG16(0x284),
863 	REG16(0x280),
864 	REG16(0x27c),
865 	REG16(0x278),
866 	REG16(0x274),
867 	REG16(0x270),
868 
869 	LRI(1, POSTED),
870 	REG(0x1b0),
871 
872 	NOP(10),
873 	LRI(1, 0),
874 	REG(0x0c8),
875 
876 	END(80)
877 };
878 
879 static const u8 gen12_rcs_offsets[] = {
880 	NOP(1),
881 	LRI(13, POSTED),
882 	REG16(0x244),
883 	REG(0x034),
884 	REG(0x030),
885 	REG(0x038),
886 	REG(0x03c),
887 	REG(0x168),
888 	REG(0x140),
889 	REG(0x110),
890 	REG(0x1c0),
891 	REG(0x1c4),
892 	REG(0x1c8),
893 	REG(0x180),
894 	REG16(0x2b4),
895 
896 	NOP(5),
897 	LRI(9, POSTED),
898 	REG16(0x3a8),
899 	REG16(0x28c),
900 	REG16(0x288),
901 	REG16(0x284),
902 	REG16(0x280),
903 	REG16(0x27c),
904 	REG16(0x278),
905 	REG16(0x274),
906 	REG16(0x270),
907 
908 	LRI(3, POSTED),
909 	REG(0x1b0),
910 	REG16(0x5a8),
911 	REG16(0x5ac),
912 
913 	NOP(6),
914 	LRI(1, 0),
915 	REG(0x0c8),
916 
917 	END(80)
918 };
919 
920 #undef END
921 #undef REG16
922 #undef REG
923 #undef LRI
924 #undef NOP
925 
926 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
927 {
928 	/*
929 	 * The gen12+ lists only have the registers we program in the basic
930 	 * default state. We rely on the context image using relative
931 	 * addressing to automatic fixup the register state between the
932 	 * physical engines for virtual engine.
933 	 */
934 	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
935 		   !intel_engine_has_relative_mmio(engine));
936 
937 	if (engine->class == RENDER_CLASS) {
938 		if (INTEL_GEN(engine->i915) >= 12)
939 			return gen12_rcs_offsets;
940 		else if (INTEL_GEN(engine->i915) >= 11)
941 			return gen11_rcs_offsets;
942 		else if (INTEL_GEN(engine->i915) >= 9)
943 			return gen9_rcs_offsets;
944 		else
945 			return gen8_rcs_offsets;
946 	} else {
947 		if (INTEL_GEN(engine->i915) >= 12)
948 			return gen12_xcs_offsets;
949 		else if (INTEL_GEN(engine->i915) >= 9)
950 			return gen9_xcs_offsets;
951 		else
952 			return gen8_xcs_offsets;
953 	}
954 }
955 
956 static struct i915_request *
957 __unwind_incomplete_requests(struct intel_engine_cs *engine)
958 {
959 	struct i915_request *rq, *rn, *active = NULL;
960 	struct list_head *uninitialized_var(pl);
961 	int prio = I915_PRIORITY_INVALID;
962 
963 	lockdep_assert_held(&engine->active.lock);
964 
965 	list_for_each_entry_safe_reverse(rq, rn,
966 					 &engine->active.requests,
967 					 sched.link) {
968 		if (i915_request_completed(rq))
969 			continue; /* XXX */
970 
971 		__i915_request_unsubmit(rq);
972 
973 		/*
974 		 * Push the request back into the queue for later resubmission.
975 		 * If this request is not native to this physical engine (i.e.
976 		 * it came from a virtual source), push it back onto the virtual
977 		 * engine so that it can be moved across onto another physical
978 		 * engine as load dictates.
979 		 */
980 		if (likely(rq->execution_mask == engine->mask)) {
981 			GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
982 			if (rq_prio(rq) != prio) {
983 				prio = rq_prio(rq);
984 				pl = i915_sched_lookup_priolist(engine, prio);
985 			}
986 			GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
987 
988 			list_move(&rq->sched.link, pl);
989 			set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
990 
991 			active = rq;
992 		} else {
993 			struct intel_engine_cs *owner = rq->context->engine;
994 
995 			/*
996 			 * Decouple the virtual breadcrumb before moving it
997 			 * back to the virtual engine -- we don't want the
998 			 * request to complete in the background and try
999 			 * and cancel the breadcrumb on the virtual engine
1000 			 * (instead of the old engine where it is linked)!
1001 			 */
1002 			if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
1003 				     &rq->fence.flags)) {
1004 				spin_lock_nested(&rq->lock,
1005 						 SINGLE_DEPTH_NESTING);
1006 				i915_request_cancel_breadcrumb(rq);
1007 				spin_unlock(&rq->lock);
1008 			}
1009 			rq->engine = owner;
1010 			owner->submit_request(rq);
1011 			active = NULL;
1012 		}
1013 	}
1014 
1015 	return active;
1016 }
1017 
1018 struct i915_request *
1019 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1020 {
1021 	struct intel_engine_cs *engine =
1022 		container_of(execlists, typeof(*engine), execlists);
1023 
1024 	return __unwind_incomplete_requests(engine);
1025 }
1026 
1027 static inline void
1028 execlists_context_status_change(struct i915_request *rq, unsigned long status)
1029 {
1030 	/*
1031 	 * Only used when GVT-g is enabled now. When GVT-g is disabled,
1032 	 * The compiler should eliminate this function as dead-code.
1033 	 */
1034 	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1035 		return;
1036 
1037 	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1038 				   status, rq);
1039 }
1040 
1041 static void intel_engine_context_in(struct intel_engine_cs *engine)
1042 {
1043 	unsigned long flags;
1044 
1045 	if (READ_ONCE(engine->stats.enabled) == 0)
1046 		return;
1047 
1048 	write_seqlock_irqsave(&engine->stats.lock, flags);
1049 
1050 	if (engine->stats.enabled > 0) {
1051 		if (engine->stats.active++ == 0)
1052 			engine->stats.start = ktime_get();
1053 		GEM_BUG_ON(engine->stats.active == 0);
1054 	}
1055 
1056 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1057 }
1058 
1059 static void intel_engine_context_out(struct intel_engine_cs *engine)
1060 {
1061 	unsigned long flags;
1062 
1063 	if (READ_ONCE(engine->stats.enabled) == 0)
1064 		return;
1065 
1066 	write_seqlock_irqsave(&engine->stats.lock, flags);
1067 
1068 	if (engine->stats.enabled > 0) {
1069 		ktime_t last;
1070 
1071 		if (engine->stats.active && --engine->stats.active == 0) {
1072 			/*
1073 			 * Decrement the active context count and in case GPU
1074 			 * is now idle add up to the running total.
1075 			 */
1076 			last = ktime_sub(ktime_get(), engine->stats.start);
1077 
1078 			engine->stats.total = ktime_add(engine->stats.total,
1079 							last);
1080 		} else if (engine->stats.active == 0) {
1081 			/*
1082 			 * After turning on engine stats, context out might be
1083 			 * the first event in which case we account from the
1084 			 * time stats gathering was turned on.
1085 			 */
1086 			last = ktime_sub(ktime_get(), engine->stats.enabled_at);
1087 
1088 			engine->stats.total = ktime_add(engine->stats.total,
1089 							last);
1090 		}
1091 	}
1092 
1093 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1094 }
1095 
1096 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
1097 {
1098 	if (INTEL_GEN(engine->i915) >= 12)
1099 		return 0x60;
1100 	else if (INTEL_GEN(engine->i915) >= 9)
1101 		return 0x54;
1102 	else if (engine->class == RENDER_CLASS)
1103 		return 0x58;
1104 	else
1105 		return -1;
1106 }
1107 
1108 static void
1109 execlists_check_context(const struct intel_context *ce,
1110 			const struct intel_engine_cs *engine)
1111 {
1112 	const struct intel_ring *ring = ce->ring;
1113 	u32 *regs = ce->lrc_reg_state;
1114 	bool valid = true;
1115 	int x;
1116 
1117 	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1118 		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1119 		       engine->name,
1120 		       regs[CTX_RING_START],
1121 		       i915_ggtt_offset(ring->vma));
1122 		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1123 		valid = false;
1124 	}
1125 
1126 	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1127 	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1128 		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1129 		       engine->name,
1130 		       regs[CTX_RING_CTL],
1131 		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1132 		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1133 		valid = false;
1134 	}
1135 
1136 	x = lrc_ring_mi_mode(engine);
1137 	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1138 		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1139 		       engine->name, regs[x + 1]);
1140 		regs[x + 1] &= ~STOP_RING;
1141 		regs[x + 1] |= STOP_RING << 16;
1142 		valid = false;
1143 	}
1144 
1145 	WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
1146 }
1147 
1148 static void restore_default_state(struct intel_context *ce,
1149 				  struct intel_engine_cs *engine)
1150 {
1151 	u32 *regs = ce->lrc_reg_state;
1152 
1153 	if (engine->pinned_default_state)
1154 		memcpy(regs, /* skip restoring the vanilla PPHWSP */
1155 		       engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
1156 		       engine->context_size - PAGE_SIZE);
1157 
1158 	execlists_init_reg_state(regs, ce, engine, ce->ring, false);
1159 }
1160 
1161 static void reset_active(struct i915_request *rq,
1162 			 struct intel_engine_cs *engine)
1163 {
1164 	struct intel_context * const ce = rq->context;
1165 	u32 head;
1166 
1167 	/*
1168 	 * The executing context has been cancelled. We want to prevent
1169 	 * further execution along this context and propagate the error on
1170 	 * to anything depending on its results.
1171 	 *
1172 	 * In __i915_request_submit(), we apply the -EIO and remove the
1173 	 * requests' payloads for any banned requests. But first, we must
1174 	 * rewind the context back to the start of the incomplete request so
1175 	 * that we do not jump back into the middle of the batch.
1176 	 *
1177 	 * We preserve the breadcrumbs and semaphores of the incomplete
1178 	 * requests so that inter-timeline dependencies (i.e other timelines)
1179 	 * remain correctly ordered. And we defer to __i915_request_submit()
1180 	 * so that all asynchronous waits are correctly handled.
1181 	 */
1182 	ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1183 		     rq->fence.context, rq->fence.seqno);
1184 
1185 	/* On resubmission of the active request, payload will be scrubbed */
1186 	if (i915_request_completed(rq))
1187 		head = rq->tail;
1188 	else
1189 		head = active_request(ce->timeline, rq)->head;
1190 	head = intel_ring_wrap(ce->ring, head);
1191 
1192 	/* Scrub the context image to prevent replaying the previous batch */
1193 	restore_default_state(ce, engine);
1194 	__execlists_update_reg_state(ce, engine, head);
1195 
1196 	/* We've switched away, so this should be a no-op, but intent matters */
1197 	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
1198 }
1199 
1200 static inline struct intel_engine_cs *
1201 __execlists_schedule_in(struct i915_request *rq)
1202 {
1203 	struct intel_engine_cs * const engine = rq->engine;
1204 	struct intel_context * const ce = rq->context;
1205 
1206 	intel_context_get(ce);
1207 
1208 	if (unlikely(intel_context_is_banned(ce)))
1209 		reset_active(rq, engine);
1210 
1211 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1212 		execlists_check_context(ce, engine);
1213 
1214 	if (ce->tag) {
1215 		/* Use a fixed tag for OA and friends */
1216 		ce->lrc_desc |= (u64)ce->tag << 32;
1217 	} else {
1218 		/* We don't need a strict matching tag, just different values */
1219 		ce->lrc_desc &= ~GENMASK_ULL(47, 37);
1220 		ce->lrc_desc |=
1221 			(u64)(++engine->context_tag % NUM_CONTEXT_TAG) <<
1222 			GEN11_SW_CTX_ID_SHIFT;
1223 		BUILD_BUG_ON(NUM_CONTEXT_TAG > GEN12_MAX_CONTEXT_HW_ID);
1224 	}
1225 
1226 	__intel_gt_pm_get(engine->gt);
1227 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1228 	intel_engine_context_in(engine);
1229 
1230 	return engine;
1231 }
1232 
1233 static inline struct i915_request *
1234 execlists_schedule_in(struct i915_request *rq, int idx)
1235 {
1236 	struct intel_context * const ce = rq->context;
1237 	struct intel_engine_cs *old;
1238 
1239 	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1240 	trace_i915_request_in(rq, idx);
1241 
1242 	old = READ_ONCE(ce->inflight);
1243 	do {
1244 		if (!old) {
1245 			WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1246 			break;
1247 		}
1248 	} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1249 
1250 	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1251 	return i915_request_get(rq);
1252 }
1253 
1254 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1255 {
1256 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1257 	struct i915_request *next = READ_ONCE(ve->request);
1258 
1259 	if (next && next->execution_mask & ~rq->execution_mask)
1260 		tasklet_schedule(&ve->base.execlists.tasklet);
1261 }
1262 
1263 static inline void
1264 __execlists_schedule_out(struct i915_request *rq,
1265 			 struct intel_engine_cs * const engine)
1266 {
1267 	struct intel_context * const ce = rq->context;
1268 
1269 	/*
1270 	 * NB process_csb() is not under the engine->active.lock and hence
1271 	 * schedule_out can race with schedule_in meaning that we should
1272 	 * refrain from doing non-trivial work here.
1273 	 */
1274 
1275 	/*
1276 	 * If we have just completed this context, the engine may now be
1277 	 * idle and we want to re-enter powersaving.
1278 	 */
1279 	if (list_is_last(&rq->link, &ce->timeline->requests) &&
1280 	    i915_request_completed(rq))
1281 		intel_engine_add_retire(engine, ce->timeline);
1282 
1283 	intel_engine_context_out(engine);
1284 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1285 	intel_gt_pm_put_async(engine->gt);
1286 
1287 	/*
1288 	 * If this is part of a virtual engine, its next request may
1289 	 * have been blocked waiting for access to the active context.
1290 	 * We have to kick all the siblings again in case we need to
1291 	 * switch (e.g. the next request is not runnable on this
1292 	 * engine). Hopefully, we will already have submitted the next
1293 	 * request before the tasklet runs and do not need to rebuild
1294 	 * each virtual tree and kick everyone again.
1295 	 */
1296 	if (ce->engine != engine)
1297 		kick_siblings(rq, ce);
1298 
1299 	intel_context_put(ce);
1300 }
1301 
1302 static inline void
1303 execlists_schedule_out(struct i915_request *rq)
1304 {
1305 	struct intel_context * const ce = rq->context;
1306 	struct intel_engine_cs *cur, *old;
1307 
1308 	trace_i915_request_out(rq);
1309 
1310 	old = READ_ONCE(ce->inflight);
1311 	do
1312 		cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1313 	while (!try_cmpxchg(&ce->inflight, &old, cur));
1314 	if (!cur)
1315 		__execlists_schedule_out(rq, old);
1316 
1317 	i915_request_put(rq);
1318 }
1319 
1320 static u64 execlists_update_context(struct i915_request *rq)
1321 {
1322 	struct intel_context *ce = rq->context;
1323 	u64 desc = ce->lrc_desc;
1324 	u32 tail, prev;
1325 
1326 	/*
1327 	 * WaIdleLiteRestore:bdw,skl
1328 	 *
1329 	 * We should never submit the context with the same RING_TAIL twice
1330 	 * just in case we submit an empty ring, which confuses the HW.
1331 	 *
1332 	 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1333 	 * the normal request to be able to always advance the RING_TAIL on
1334 	 * subsequent resubmissions (for lite restore). Should that fail us,
1335 	 * and we try and submit the same tail again, force the context
1336 	 * reload.
1337 	 *
1338 	 * If we need to return to a preempted context, we need to skip the
1339 	 * lite-restore and force it to reload the RING_TAIL. Otherwise, the
1340 	 * HW has a tendency to ignore us rewinding the TAIL to the end of
1341 	 * an earlier request.
1342 	 */
1343 	tail = intel_ring_set_tail(rq->ring, rq->tail);
1344 	prev = ce->lrc_reg_state[CTX_RING_TAIL];
1345 	if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
1346 		desc |= CTX_DESC_FORCE_RESTORE;
1347 	ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1348 	rq->tail = rq->wa_tail;
1349 
1350 	/*
1351 	 * Make sure the context image is complete before we submit it to HW.
1352 	 *
1353 	 * Ostensibly, writes (including the WCB) should be flushed prior to
1354 	 * an uncached write such as our mmio register access, the empirical
1355 	 * evidence (esp. on Braswell) suggests that the WC write into memory
1356 	 * may not be visible to the HW prior to the completion of the UC
1357 	 * register write and that we may begin execution from the context
1358 	 * before its image is complete leading to invalid PD chasing.
1359 	 */
1360 	wmb();
1361 
1362 	ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE;
1363 	return desc;
1364 }
1365 
1366 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1367 {
1368 	if (execlists->ctrl_reg) {
1369 		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1370 		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1371 	} else {
1372 		writel(upper_32_bits(desc), execlists->submit_reg);
1373 		writel(lower_32_bits(desc), execlists->submit_reg);
1374 	}
1375 }
1376 
1377 static __maybe_unused void
1378 trace_ports(const struct intel_engine_execlists *execlists,
1379 	    const char *msg,
1380 	    struct i915_request * const *ports)
1381 {
1382 	const struct intel_engine_cs *engine =
1383 		container_of(execlists, typeof(*engine), execlists);
1384 
1385 	if (!ports[0])
1386 		return;
1387 
1388 	ENGINE_TRACE(engine, "%s { %llx:%lld%s, %llx:%lld }\n", msg,
1389 		     ports[0]->fence.context,
1390 		     ports[0]->fence.seqno,
1391 		     i915_request_completed(ports[0]) ? "!" :
1392 		     i915_request_started(ports[0]) ? "*" :
1393 		     "",
1394 		     ports[1] ? ports[1]->fence.context : 0,
1395 		     ports[1] ? ports[1]->fence.seqno : 0);
1396 }
1397 
1398 static __maybe_unused bool
1399 assert_pending_valid(const struct intel_engine_execlists *execlists,
1400 		     const char *msg)
1401 {
1402 	struct i915_request * const *port, *rq;
1403 	struct intel_context *ce = NULL;
1404 
1405 	trace_ports(execlists, msg, execlists->pending);
1406 
1407 	if (!execlists->pending[0]) {
1408 		GEM_TRACE_ERR("Nothing pending for promotion!\n");
1409 		return false;
1410 	}
1411 
1412 	if (execlists->pending[execlists_num_ports(execlists)]) {
1413 		GEM_TRACE_ERR("Excess pending[%d] for promotion!\n",
1414 			      execlists_num_ports(execlists));
1415 		return false;
1416 	}
1417 
1418 	for (port = execlists->pending; (rq = *port); port++) {
1419 		unsigned long flags;
1420 		bool ok = true;
1421 
1422 		GEM_BUG_ON(!kref_read(&rq->fence.refcount));
1423 		GEM_BUG_ON(!i915_request_is_active(rq));
1424 
1425 		if (ce == rq->context) {
1426 			GEM_TRACE_ERR("Dup context:%llx in pending[%zd]\n",
1427 				      ce->timeline->fence_context,
1428 				      port - execlists->pending);
1429 			return false;
1430 		}
1431 		ce = rq->context;
1432 
1433 		/* Hold tightly onto the lock to prevent concurrent retires! */
1434 		if (!spin_trylock_irqsave(&rq->lock, flags))
1435 			continue;
1436 
1437 		if (i915_request_completed(rq))
1438 			goto unlock;
1439 
1440 		if (i915_active_is_idle(&ce->active) &&
1441 		    !intel_context_is_barrier(ce)) {
1442 			GEM_TRACE_ERR("Inactive context:%llx in pending[%zd]\n",
1443 				      ce->timeline->fence_context,
1444 				      port - execlists->pending);
1445 			ok = false;
1446 			goto unlock;
1447 		}
1448 
1449 		if (!i915_vma_is_pinned(ce->state)) {
1450 			GEM_TRACE_ERR("Unpinned context:%llx in pending[%zd]\n",
1451 				      ce->timeline->fence_context,
1452 				      port - execlists->pending);
1453 			ok = false;
1454 			goto unlock;
1455 		}
1456 
1457 		if (!i915_vma_is_pinned(ce->ring->vma)) {
1458 			GEM_TRACE_ERR("Unpinned ring:%llx in pending[%zd]\n",
1459 				      ce->timeline->fence_context,
1460 				      port - execlists->pending);
1461 			ok = false;
1462 			goto unlock;
1463 		}
1464 
1465 unlock:
1466 		spin_unlock_irqrestore(&rq->lock, flags);
1467 		if (!ok)
1468 			return false;
1469 	}
1470 
1471 	return ce;
1472 }
1473 
1474 static void execlists_submit_ports(struct intel_engine_cs *engine)
1475 {
1476 	struct intel_engine_execlists *execlists = &engine->execlists;
1477 	unsigned int n;
1478 
1479 	GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1480 
1481 	/*
1482 	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
1483 	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
1484 	 * not be relinquished until the device is idle (see
1485 	 * i915_gem_idle_work_handler()). As a precaution, we make sure
1486 	 * that all ELSP are drained i.e. we have processed the CSB,
1487 	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
1488 	 */
1489 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1490 
1491 	/*
1492 	 * ELSQ note: the submit queue is not cleared after being submitted
1493 	 * to the HW so we need to make sure we always clean it up. This is
1494 	 * currently ensured by the fact that we always write the same number
1495 	 * of elsq entries, keep this in mind before changing the loop below.
1496 	 */
1497 	for (n = execlists_num_ports(execlists); n--; ) {
1498 		struct i915_request *rq = execlists->pending[n];
1499 
1500 		write_desc(execlists,
1501 			   rq ? execlists_update_context(rq) : 0,
1502 			   n);
1503 	}
1504 
1505 	/* we need to manually load the submit queue */
1506 	if (execlists->ctrl_reg)
1507 		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1508 }
1509 
1510 static bool ctx_single_port_submission(const struct intel_context *ce)
1511 {
1512 	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1513 		intel_context_force_single_submission(ce));
1514 }
1515 
1516 static bool can_merge_ctx(const struct intel_context *prev,
1517 			  const struct intel_context *next)
1518 {
1519 	if (prev != next)
1520 		return false;
1521 
1522 	if (ctx_single_port_submission(prev))
1523 		return false;
1524 
1525 	return true;
1526 }
1527 
1528 static bool can_merge_rq(const struct i915_request *prev,
1529 			 const struct i915_request *next)
1530 {
1531 	GEM_BUG_ON(prev == next);
1532 	GEM_BUG_ON(!assert_priority_queue(prev, next));
1533 
1534 	/*
1535 	 * We do not submit known completed requests. Therefore if the next
1536 	 * request is already completed, we can pretend to merge it in
1537 	 * with the previous context (and we will skip updating the ELSP
1538 	 * and tracking). Thus hopefully keeping the ELSP full with active
1539 	 * contexts, despite the best efforts of preempt-to-busy to confuse
1540 	 * us.
1541 	 */
1542 	if (i915_request_completed(next))
1543 		return true;
1544 
1545 	if (unlikely((prev->fence.flags ^ next->fence.flags) &
1546 		     (BIT(I915_FENCE_FLAG_NOPREEMPT) |
1547 		      BIT(I915_FENCE_FLAG_SENTINEL))))
1548 		return false;
1549 
1550 	if (!can_merge_ctx(prev->context, next->context))
1551 		return false;
1552 
1553 	return true;
1554 }
1555 
1556 static void virtual_update_register_offsets(u32 *regs,
1557 					    struct intel_engine_cs *engine)
1558 {
1559 	set_offsets(regs, reg_offsets(engine), engine, false);
1560 }
1561 
1562 static bool virtual_matches(const struct virtual_engine *ve,
1563 			    const struct i915_request *rq,
1564 			    const struct intel_engine_cs *engine)
1565 {
1566 	const struct intel_engine_cs *inflight;
1567 
1568 	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1569 		return false;
1570 
1571 	/*
1572 	 * We track when the HW has completed saving the context image
1573 	 * (i.e. when we have seen the final CS event switching out of
1574 	 * the context) and must not overwrite the context image before
1575 	 * then. This restricts us to only using the active engine
1576 	 * while the previous virtualized request is inflight (so
1577 	 * we reuse the register offsets). This is a very small
1578 	 * hystersis on the greedy seelction algorithm.
1579 	 */
1580 	inflight = intel_context_inflight(&ve->context);
1581 	if (inflight && inflight != engine)
1582 		return false;
1583 
1584 	return true;
1585 }
1586 
1587 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
1588 				     struct intel_engine_cs *engine)
1589 {
1590 	struct intel_engine_cs *old = ve->siblings[0];
1591 
1592 	/* All unattached (rq->engine == old) must already be completed */
1593 
1594 	spin_lock(&old->breadcrumbs.irq_lock);
1595 	if (!list_empty(&ve->context.signal_link)) {
1596 		list_move_tail(&ve->context.signal_link,
1597 			       &engine->breadcrumbs.signalers);
1598 		intel_engine_signal_breadcrumbs(engine);
1599 	}
1600 	spin_unlock(&old->breadcrumbs.irq_lock);
1601 }
1602 
1603 static struct i915_request *
1604 last_active(const struct intel_engine_execlists *execlists)
1605 {
1606 	struct i915_request * const *last = READ_ONCE(execlists->active);
1607 
1608 	while (*last && i915_request_completed(*last))
1609 		last++;
1610 
1611 	return *last;
1612 }
1613 
1614 #define for_each_waiter(p__, rq__) \
1615 	list_for_each_entry_lockless(p__, \
1616 				     &(rq__)->sched.waiters_list, \
1617 				     wait_link)
1618 
1619 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1620 {
1621 	LIST_HEAD(list);
1622 
1623 	/*
1624 	 * We want to move the interrupted request to the back of
1625 	 * the round-robin list (i.e. its priority level), but
1626 	 * in doing so, we must then move all requests that were in
1627 	 * flight and were waiting for the interrupted request to
1628 	 * be run after it again.
1629 	 */
1630 	do {
1631 		struct i915_dependency *p;
1632 
1633 		GEM_BUG_ON(i915_request_is_active(rq));
1634 		list_move_tail(&rq->sched.link, pl);
1635 
1636 		for_each_waiter(p, rq) {
1637 			struct i915_request *w =
1638 				container_of(p->waiter, typeof(*w), sched);
1639 
1640 			/* Leave semaphores spinning on the other engines */
1641 			if (w->engine != rq->engine)
1642 				continue;
1643 
1644 			/* No waiter should start before its signaler */
1645 			GEM_BUG_ON(i915_request_started(w) &&
1646 				   !i915_request_completed(rq));
1647 
1648 			GEM_BUG_ON(i915_request_is_active(w));
1649 			if (!i915_request_is_ready(w))
1650 				continue;
1651 
1652 			if (rq_prio(w) < rq_prio(rq))
1653 				continue;
1654 
1655 			GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1656 			list_move_tail(&w->sched.link, &list);
1657 		}
1658 
1659 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1660 	} while (rq);
1661 }
1662 
1663 static void defer_active(struct intel_engine_cs *engine)
1664 {
1665 	struct i915_request *rq;
1666 
1667 	rq = __unwind_incomplete_requests(engine);
1668 	if (!rq)
1669 		return;
1670 
1671 	defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1672 }
1673 
1674 static bool
1675 need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
1676 {
1677 	int hint;
1678 
1679 	if (!intel_engine_has_timeslices(engine))
1680 		return false;
1681 
1682 	hint = engine->execlists.queue_priority_hint;
1683 	if (!list_is_last(&rq->sched.link, &engine->active.requests))
1684 		hint = max(hint, rq_prio(list_next_entry(rq, sched.link)));
1685 
1686 	return hint >= effective_prio(rq);
1687 }
1688 
1689 static int
1690 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1691 {
1692 	if (list_is_last(&rq->sched.link, &engine->active.requests))
1693 		return INT_MIN;
1694 
1695 	return rq_prio(list_next_entry(rq, sched.link));
1696 }
1697 
1698 static inline unsigned long
1699 timeslice(const struct intel_engine_cs *engine)
1700 {
1701 	return READ_ONCE(engine->props.timeslice_duration_ms);
1702 }
1703 
1704 static unsigned long
1705 active_timeslice(const struct intel_engine_cs *engine)
1706 {
1707 	const struct i915_request *rq = *engine->execlists.active;
1708 
1709 	if (!rq || i915_request_completed(rq))
1710 		return 0;
1711 
1712 	if (engine->execlists.switch_priority_hint < effective_prio(rq))
1713 		return 0;
1714 
1715 	return timeslice(engine);
1716 }
1717 
1718 static void set_timeslice(struct intel_engine_cs *engine)
1719 {
1720 	if (!intel_engine_has_timeslices(engine))
1721 		return;
1722 
1723 	set_timer_ms(&engine->execlists.timer, active_timeslice(engine));
1724 }
1725 
1726 static void start_timeslice(struct intel_engine_cs *engine)
1727 {
1728 	struct intel_engine_execlists *execlists = &engine->execlists;
1729 
1730 	execlists->switch_priority_hint = execlists->queue_priority_hint;
1731 
1732 	if (timer_pending(&execlists->timer))
1733 		return;
1734 
1735 	set_timer_ms(&execlists->timer, timeslice(engine));
1736 }
1737 
1738 static void record_preemption(struct intel_engine_execlists *execlists)
1739 {
1740 	(void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
1741 }
1742 
1743 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine)
1744 {
1745 	struct i915_request *rq;
1746 
1747 	rq = last_active(&engine->execlists);
1748 	if (!rq)
1749 		return 0;
1750 
1751 	/* Force a fast reset for terminated contexts (ignoring sysfs!) */
1752 	if (unlikely(intel_context_is_banned(rq->context)))
1753 		return 1;
1754 
1755 	return READ_ONCE(engine->props.preempt_timeout_ms);
1756 }
1757 
1758 static void set_preempt_timeout(struct intel_engine_cs *engine)
1759 {
1760 	if (!intel_engine_has_preempt_reset(engine))
1761 		return;
1762 
1763 	set_timer_ms(&engine->execlists.preempt,
1764 		     active_preempt_timeout(engine));
1765 }
1766 
1767 static inline void clear_ports(struct i915_request **ports, int count)
1768 {
1769 	memset_p((void **)ports, NULL, count);
1770 }
1771 
1772 static void execlists_dequeue(struct intel_engine_cs *engine)
1773 {
1774 	struct intel_engine_execlists * const execlists = &engine->execlists;
1775 	struct i915_request **port = execlists->pending;
1776 	struct i915_request ** const last_port = port + execlists->port_mask;
1777 	struct i915_request *last;
1778 	struct rb_node *rb;
1779 	bool submit = false;
1780 
1781 	/*
1782 	 * Hardware submission is through 2 ports. Conceptually each port
1783 	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
1784 	 * static for a context, and unique to each, so we only execute
1785 	 * requests belonging to a single context from each ring. RING_HEAD
1786 	 * is maintained by the CS in the context image, it marks the place
1787 	 * where it got up to last time, and through RING_TAIL we tell the CS
1788 	 * where we want to execute up to this time.
1789 	 *
1790 	 * In this list the requests are in order of execution. Consecutive
1791 	 * requests from the same context are adjacent in the ringbuffer. We
1792 	 * can combine these requests into a single RING_TAIL update:
1793 	 *
1794 	 *              RING_HEAD...req1...req2
1795 	 *                                    ^- RING_TAIL
1796 	 * since to execute req2 the CS must first execute req1.
1797 	 *
1798 	 * Our goal then is to point each port to the end of a consecutive
1799 	 * sequence of requests as being the most optimal (fewest wake ups
1800 	 * and context switches) submission.
1801 	 */
1802 
1803 	for (rb = rb_first_cached(&execlists->virtual); rb; ) {
1804 		struct virtual_engine *ve =
1805 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1806 		struct i915_request *rq = READ_ONCE(ve->request);
1807 
1808 		if (!rq) { /* lazily cleanup after another engine handled rq */
1809 			rb_erase_cached(rb, &execlists->virtual);
1810 			RB_CLEAR_NODE(rb);
1811 			rb = rb_first_cached(&execlists->virtual);
1812 			continue;
1813 		}
1814 
1815 		if (!virtual_matches(ve, rq, engine)) {
1816 			rb = rb_next(rb);
1817 			continue;
1818 		}
1819 
1820 		break;
1821 	}
1822 
1823 	/*
1824 	 * If the queue is higher priority than the last
1825 	 * request in the currently active context, submit afresh.
1826 	 * We will resubmit again afterwards in case we need to split
1827 	 * the active context to interject the preemption request,
1828 	 * i.e. we will retrigger preemption following the ack in case
1829 	 * of trouble.
1830 	 */
1831 	last = last_active(execlists);
1832 	if (last) {
1833 		if (need_preempt(engine, last, rb)) {
1834 			ENGINE_TRACE(engine,
1835 				     "preempting last=%llx:%lld, prio=%d, hint=%d\n",
1836 				     last->fence.context,
1837 				     last->fence.seqno,
1838 				     last->sched.attr.priority,
1839 				     execlists->queue_priority_hint);
1840 			record_preemption(execlists);
1841 
1842 			/*
1843 			 * Don't let the RING_HEAD advance past the breadcrumb
1844 			 * as we unwind (and until we resubmit) so that we do
1845 			 * not accidentally tell it to go backwards.
1846 			 */
1847 			ring_set_paused(engine, 1);
1848 
1849 			/*
1850 			 * Note that we have not stopped the GPU at this point,
1851 			 * so we are unwinding the incomplete requests as they
1852 			 * remain inflight and so by the time we do complete
1853 			 * the preemption, some of the unwound requests may
1854 			 * complete!
1855 			 */
1856 			__unwind_incomplete_requests(engine);
1857 
1858 			last = NULL;
1859 		} else if (need_timeslice(engine, last) &&
1860 			   timer_expired(&engine->execlists.timer)) {
1861 			ENGINE_TRACE(engine,
1862 				     "expired last=%llx:%lld, prio=%d, hint=%d\n",
1863 				     last->fence.context,
1864 				     last->fence.seqno,
1865 				     last->sched.attr.priority,
1866 				     execlists->queue_priority_hint);
1867 
1868 			ring_set_paused(engine, 1);
1869 			defer_active(engine);
1870 
1871 			/*
1872 			 * Unlike for preemption, if we rewind and continue
1873 			 * executing the same context as previously active,
1874 			 * the order of execution will remain the same and
1875 			 * the tail will only advance. We do not need to
1876 			 * force a full context restore, as a lite-restore
1877 			 * is sufficient to resample the monotonic TAIL.
1878 			 *
1879 			 * If we switch to any other context, similarly we
1880 			 * will not rewind TAIL of current context, and
1881 			 * normal save/restore will preserve state and allow
1882 			 * us to later continue executing the same request.
1883 			 */
1884 			last = NULL;
1885 		} else {
1886 			/*
1887 			 * Otherwise if we already have a request pending
1888 			 * for execution after the current one, we can
1889 			 * just wait until the next CS event before
1890 			 * queuing more. In either case we will force a
1891 			 * lite-restore preemption event, but if we wait
1892 			 * we hopefully coalesce several updates into a single
1893 			 * submission.
1894 			 */
1895 			if (!list_is_last(&last->sched.link,
1896 					  &engine->active.requests)) {
1897 				/*
1898 				 * Even if ELSP[1] is occupied and not worthy
1899 				 * of timeslices, our queue might be.
1900 				 */
1901 				start_timeslice(engine);
1902 				return;
1903 			}
1904 		}
1905 	}
1906 
1907 	while (rb) { /* XXX virtual is always taking precedence */
1908 		struct virtual_engine *ve =
1909 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1910 		struct i915_request *rq;
1911 
1912 		spin_lock(&ve->base.active.lock);
1913 
1914 		rq = ve->request;
1915 		if (unlikely(!rq)) { /* lost the race to a sibling */
1916 			spin_unlock(&ve->base.active.lock);
1917 			rb_erase_cached(rb, &execlists->virtual);
1918 			RB_CLEAR_NODE(rb);
1919 			rb = rb_first_cached(&execlists->virtual);
1920 			continue;
1921 		}
1922 
1923 		GEM_BUG_ON(rq != ve->request);
1924 		GEM_BUG_ON(rq->engine != &ve->base);
1925 		GEM_BUG_ON(rq->context != &ve->context);
1926 
1927 		if (rq_prio(rq) >= queue_prio(execlists)) {
1928 			if (!virtual_matches(ve, rq, engine)) {
1929 				spin_unlock(&ve->base.active.lock);
1930 				rb = rb_next(rb);
1931 				continue;
1932 			}
1933 
1934 			if (last && !can_merge_rq(last, rq)) {
1935 				spin_unlock(&ve->base.active.lock);
1936 				start_timeslice(engine);
1937 				return; /* leave this for another sibling */
1938 			}
1939 
1940 			ENGINE_TRACE(engine,
1941 				     "virtual rq=%llx:%lld%s, new engine? %s\n",
1942 				     rq->fence.context,
1943 				     rq->fence.seqno,
1944 				     i915_request_completed(rq) ? "!" :
1945 				     i915_request_started(rq) ? "*" :
1946 				     "",
1947 				     yesno(engine != ve->siblings[0]));
1948 
1949 			ve->request = NULL;
1950 			ve->base.execlists.queue_priority_hint = INT_MIN;
1951 			rb_erase_cached(rb, &execlists->virtual);
1952 			RB_CLEAR_NODE(rb);
1953 
1954 			GEM_BUG_ON(!(rq->execution_mask & engine->mask));
1955 			rq->engine = engine;
1956 
1957 			if (engine != ve->siblings[0]) {
1958 				u32 *regs = ve->context.lrc_reg_state;
1959 				unsigned int n;
1960 
1961 				GEM_BUG_ON(READ_ONCE(ve->context.inflight));
1962 
1963 				if (!intel_engine_has_relative_mmio(engine))
1964 					virtual_update_register_offsets(regs,
1965 									engine);
1966 
1967 				if (!list_empty(&ve->context.signals))
1968 					virtual_xfer_breadcrumbs(ve, engine);
1969 
1970 				/*
1971 				 * Move the bound engine to the top of the list
1972 				 * for future execution. We then kick this
1973 				 * tasklet first before checking others, so that
1974 				 * we preferentially reuse this set of bound
1975 				 * registers.
1976 				 */
1977 				for (n = 1; n < ve->num_siblings; n++) {
1978 					if (ve->siblings[n] == engine) {
1979 						swap(ve->siblings[n],
1980 						     ve->siblings[0]);
1981 						break;
1982 					}
1983 				}
1984 
1985 				GEM_BUG_ON(ve->siblings[0] != engine);
1986 			}
1987 
1988 			if (__i915_request_submit(rq)) {
1989 				submit = true;
1990 				last = rq;
1991 			}
1992 			i915_request_put(rq);
1993 
1994 			/*
1995 			 * Hmm, we have a bunch of virtual engine requests,
1996 			 * but the first one was already completed (thanks
1997 			 * preempt-to-busy!). Keep looking at the veng queue
1998 			 * until we have no more relevant requests (i.e.
1999 			 * the normal submit queue has higher priority).
2000 			 */
2001 			if (!submit) {
2002 				spin_unlock(&ve->base.active.lock);
2003 				rb = rb_first_cached(&execlists->virtual);
2004 				continue;
2005 			}
2006 		}
2007 
2008 		spin_unlock(&ve->base.active.lock);
2009 		break;
2010 	}
2011 
2012 	while ((rb = rb_first_cached(&execlists->queue))) {
2013 		struct i915_priolist *p = to_priolist(rb);
2014 		struct i915_request *rq, *rn;
2015 		int i;
2016 
2017 		priolist_for_each_request_consume(rq, rn, p, i) {
2018 			bool merge = true;
2019 
2020 			/*
2021 			 * Can we combine this request with the current port?
2022 			 * It has to be the same context/ringbuffer and not
2023 			 * have any exceptions (e.g. GVT saying never to
2024 			 * combine contexts).
2025 			 *
2026 			 * If we can combine the requests, we can execute both
2027 			 * by updating the RING_TAIL to point to the end of the
2028 			 * second request, and so we never need to tell the
2029 			 * hardware about the first.
2030 			 */
2031 			if (last && !can_merge_rq(last, rq)) {
2032 				/*
2033 				 * If we are on the second port and cannot
2034 				 * combine this request with the last, then we
2035 				 * are done.
2036 				 */
2037 				if (port == last_port)
2038 					goto done;
2039 
2040 				/*
2041 				 * We must not populate both ELSP[] with the
2042 				 * same LRCA, i.e. we must submit 2 different
2043 				 * contexts if we submit 2 ELSP.
2044 				 */
2045 				if (last->context == rq->context)
2046 					goto done;
2047 
2048 				if (i915_request_has_sentinel(last))
2049 					goto done;
2050 
2051 				/*
2052 				 * If GVT overrides us we only ever submit
2053 				 * port[0], leaving port[1] empty. Note that we
2054 				 * also have to be careful that we don't queue
2055 				 * the same context (even though a different
2056 				 * request) to the second port.
2057 				 */
2058 				if (ctx_single_port_submission(last->context) ||
2059 				    ctx_single_port_submission(rq->context))
2060 					goto done;
2061 
2062 				merge = false;
2063 			}
2064 
2065 			if (__i915_request_submit(rq)) {
2066 				if (!merge) {
2067 					*port = execlists_schedule_in(last, port - execlists->pending);
2068 					port++;
2069 					last = NULL;
2070 				}
2071 
2072 				GEM_BUG_ON(last &&
2073 					   !can_merge_ctx(last->context,
2074 							  rq->context));
2075 
2076 				submit = true;
2077 				last = rq;
2078 			}
2079 		}
2080 
2081 		rb_erase_cached(&p->node, &execlists->queue);
2082 		i915_priolist_free(p);
2083 	}
2084 
2085 done:
2086 	/*
2087 	 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2088 	 *
2089 	 * We choose the priority hint such that if we add a request of greater
2090 	 * priority than this, we kick the submission tasklet to decide on
2091 	 * the right order of submitting the requests to hardware. We must
2092 	 * also be prepared to reorder requests as they are in-flight on the
2093 	 * HW. We derive the priority hint then as the first "hole" in
2094 	 * the HW submission ports and if there are no available slots,
2095 	 * the priority of the lowest executing request, i.e. last.
2096 	 *
2097 	 * When we do receive a higher priority request ready to run from the
2098 	 * user, see queue_request(), the priority hint is bumped to that
2099 	 * request triggering preemption on the next dequeue (or subsequent
2100 	 * interrupt for secondary ports).
2101 	 */
2102 	execlists->queue_priority_hint = queue_prio(execlists);
2103 
2104 	if (submit) {
2105 		*port = execlists_schedule_in(last, port - execlists->pending);
2106 		execlists->switch_priority_hint =
2107 			switch_prio(engine, *execlists->pending);
2108 
2109 		/*
2110 		 * Skip if we ended up with exactly the same set of requests,
2111 		 * e.g. trying to timeslice a pair of ordered contexts
2112 		 */
2113 		if (!memcmp(execlists->active, execlists->pending,
2114 			    (port - execlists->pending + 1) * sizeof(*port))) {
2115 			do
2116 				execlists_schedule_out(fetch_and_zero(port));
2117 			while (port-- != execlists->pending);
2118 
2119 			goto skip_submit;
2120 		}
2121 		clear_ports(port + 1, last_port - port);
2122 
2123 		execlists_submit_ports(engine);
2124 		set_preempt_timeout(engine);
2125 	} else {
2126 skip_submit:
2127 		ring_set_paused(engine, 0);
2128 	}
2129 }
2130 
2131 static void
2132 cancel_port_requests(struct intel_engine_execlists * const execlists)
2133 {
2134 	struct i915_request * const *port;
2135 
2136 	for (port = execlists->pending; *port; port++)
2137 		execlists_schedule_out(*port);
2138 	clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2139 
2140 	/* Mark the end of active before we overwrite *active */
2141 	for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2142 		execlists_schedule_out(*port);
2143 	clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2144 
2145 	WRITE_ONCE(execlists->active, execlists->inflight);
2146 }
2147 
2148 static inline void
2149 invalidate_csb_entries(const u32 *first, const u32 *last)
2150 {
2151 	clflush((void *)first);
2152 	clflush((void *)last);
2153 }
2154 
2155 static inline bool
2156 reset_in_progress(const struct intel_engine_execlists *execlists)
2157 {
2158 	return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
2159 }
2160 
2161 /*
2162  * Starting with Gen12, the status has a new format:
2163  *
2164  *     bit  0:     switched to new queue
2165  *     bit  1:     reserved
2166  *     bit  2:     semaphore wait mode (poll or signal), only valid when
2167  *                 switch detail is set to "wait on semaphore"
2168  *     bits 3-5:   engine class
2169  *     bits 6-11:  engine instance
2170  *     bits 12-14: reserved
2171  *     bits 15-25: sw context id of the lrc the GT switched to
2172  *     bits 26-31: sw counter of the lrc the GT switched to
2173  *     bits 32-35: context switch detail
2174  *                  - 0: ctx complete
2175  *                  - 1: wait on sync flip
2176  *                  - 2: wait on vblank
2177  *                  - 3: wait on scanline
2178  *                  - 4: wait on semaphore
2179  *                  - 5: context preempted (not on SEMAPHORE_WAIT or
2180  *                       WAIT_FOR_EVENT)
2181  *     bit  36:    reserved
2182  *     bits 37-43: wait detail (for switch detail 1 to 4)
2183  *     bits 44-46: reserved
2184  *     bits 47-57: sw context id of the lrc the GT switched away from
2185  *     bits 58-63: sw counter of the lrc the GT switched away from
2186  */
2187 static inline bool
2188 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2189 {
2190 	u32 lower_dw = csb[0];
2191 	u32 upper_dw = csb[1];
2192 	bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
2193 	bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
2194 	bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2195 
2196 	/*
2197 	 * The context switch detail is not guaranteed to be 5 when a preemption
2198 	 * occurs, so we can't just check for that. The check below works for
2199 	 * all the cases we care about, including preemptions of WAIT
2200 	 * instructions and lite-restore. Preempt-to-idle via the CTRL register
2201 	 * would require some extra handling, but we don't support that.
2202 	 */
2203 	if (!ctx_away_valid || new_queue) {
2204 		GEM_BUG_ON(!ctx_to_valid);
2205 		return true;
2206 	}
2207 
2208 	/*
2209 	 * switch detail = 5 is covered by the case above and we do not expect a
2210 	 * context switch on an unsuccessful wait instruction since we always
2211 	 * use polling mode.
2212 	 */
2213 	GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
2214 	return false;
2215 }
2216 
2217 static inline bool
2218 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2219 {
2220 	return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2221 }
2222 
2223 static void process_csb(struct intel_engine_cs *engine)
2224 {
2225 	struct intel_engine_execlists * const execlists = &engine->execlists;
2226 	const u32 * const buf = execlists->csb_status;
2227 	const u8 num_entries = execlists->csb_size;
2228 	u8 head, tail;
2229 
2230 	/*
2231 	 * As we modify our execlists state tracking we require exclusive
2232 	 * access. Either we are inside the tasklet, or the tasklet is disabled
2233 	 * and we assume that is only inside the reset paths and so serialised.
2234 	 */
2235 	GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2236 		   !reset_in_progress(execlists));
2237 	GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2238 
2239 	/*
2240 	 * Note that csb_write, csb_status may be either in HWSP or mmio.
2241 	 * When reading from the csb_write mmio register, we have to be
2242 	 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2243 	 * the low 4bits. As it happens we know the next 4bits are always
2244 	 * zero and so we can simply masked off the low u8 of the register
2245 	 * and treat it identically to reading from the HWSP (without having
2246 	 * to use explicit shifting and masking, and probably bifurcating
2247 	 * the code to handle the legacy mmio read).
2248 	 */
2249 	head = execlists->csb_head;
2250 	tail = READ_ONCE(*execlists->csb_write);
2251 	ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2252 	if (unlikely(head == tail))
2253 		return;
2254 
2255 	/*
2256 	 * Hopefully paired with a wmb() in HW!
2257 	 *
2258 	 * We must complete the read of the write pointer before any reads
2259 	 * from the CSB, so that we do not see stale values. Without an rmb
2260 	 * (lfence) the HW may speculatively perform the CSB[] reads *before*
2261 	 * we perform the READ_ONCE(*csb_write).
2262 	 */
2263 	rmb();
2264 
2265 	do {
2266 		bool promote;
2267 
2268 		if (++head == num_entries)
2269 			head = 0;
2270 
2271 		/*
2272 		 * We are flying near dragons again.
2273 		 *
2274 		 * We hold a reference to the request in execlist_port[]
2275 		 * but no more than that. We are operating in softirq
2276 		 * context and so cannot hold any mutex or sleep. That
2277 		 * prevents us stopping the requests we are processing
2278 		 * in port[] from being retired simultaneously (the
2279 		 * breadcrumb will be complete before we see the
2280 		 * context-switch). As we only hold the reference to the
2281 		 * request, any pointer chasing underneath the request
2282 		 * is subject to a potential use-after-free. Thus we
2283 		 * store all of the bookkeeping within port[] as
2284 		 * required, and avoid using unguarded pointers beneath
2285 		 * request itself. The same applies to the atomic
2286 		 * status notifier.
2287 		 */
2288 
2289 		ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2290 			     head, buf[2 * head + 0], buf[2 * head + 1]);
2291 
2292 		if (INTEL_GEN(engine->i915) >= 12)
2293 			promote = gen12_csb_parse(execlists, buf + 2 * head);
2294 		else
2295 			promote = gen8_csb_parse(execlists, buf + 2 * head);
2296 		if (promote) {
2297 			struct i915_request * const *old = execlists->active;
2298 
2299 			/* Point active to the new ELSP; prevent overwriting */
2300 			WRITE_ONCE(execlists->active, execlists->pending);
2301 
2302 			if (!inject_preempt_hang(execlists))
2303 				ring_set_paused(engine, 0);
2304 
2305 			/* cancel old inflight, prepare for switch */
2306 			trace_ports(execlists, "preempted", old);
2307 			while (*old)
2308 				execlists_schedule_out(*old++);
2309 
2310 			/* switch pending to inflight */
2311 			GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2312 			WRITE_ONCE(execlists->active,
2313 				   memcpy(execlists->inflight,
2314 					  execlists->pending,
2315 					  execlists_num_ports(execlists) *
2316 					  sizeof(*execlists->pending)));
2317 
2318 			WRITE_ONCE(execlists->pending[0], NULL);
2319 		} else {
2320 			GEM_BUG_ON(!*execlists->active);
2321 
2322 			/* port0 completed, advanced to port1 */
2323 			trace_ports(execlists, "completed", execlists->active);
2324 
2325 			/*
2326 			 * We rely on the hardware being strongly
2327 			 * ordered, that the breadcrumb write is
2328 			 * coherent (visible from the CPU) before the
2329 			 * user interrupt and CSB is processed.
2330 			 */
2331 			GEM_BUG_ON(!i915_request_completed(*execlists->active) &&
2332 				   !reset_in_progress(execlists));
2333 			execlists_schedule_out(*execlists->active++);
2334 
2335 			GEM_BUG_ON(execlists->active - execlists->inflight >
2336 				   execlists_num_ports(execlists));
2337 		}
2338 	} while (head != tail);
2339 
2340 	execlists->csb_head = head;
2341 	set_timeslice(engine);
2342 
2343 	/*
2344 	 * Gen11 has proven to fail wrt global observation point between
2345 	 * entry and tail update, failing on the ordering and thus
2346 	 * we see an old entry in the context status buffer.
2347 	 *
2348 	 * Forcibly evict out entries for the next gpu csb update,
2349 	 * to increase the odds that we get a fresh entries with non
2350 	 * working hardware. The cost for doing so comes out mostly with
2351 	 * the wash as hardware, working or not, will need to do the
2352 	 * invalidation before.
2353 	 */
2354 	invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2355 }
2356 
2357 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2358 {
2359 	lockdep_assert_held(&engine->active.lock);
2360 	if (!engine->execlists.pending[0]) {
2361 		rcu_read_lock(); /* protect peeking at execlists->active */
2362 		execlists_dequeue(engine);
2363 		rcu_read_unlock();
2364 	}
2365 }
2366 
2367 static void __execlists_hold(struct i915_request *rq)
2368 {
2369 	LIST_HEAD(list);
2370 
2371 	do {
2372 		struct i915_dependency *p;
2373 
2374 		if (i915_request_is_active(rq))
2375 			__i915_request_unsubmit(rq);
2376 
2377 		RQ_TRACE(rq, "on hold\n");
2378 		clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2379 		list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2380 		i915_request_set_hold(rq);
2381 
2382 		list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
2383 			struct i915_request *w =
2384 				container_of(p->waiter, typeof(*w), sched);
2385 
2386 			/* Leave semaphores spinning on the other engines */
2387 			if (w->engine != rq->engine)
2388 				continue;
2389 
2390 			if (!i915_request_is_ready(w))
2391 				continue;
2392 
2393 			if (i915_request_completed(w))
2394 				continue;
2395 
2396 			if (i915_request_on_hold(rq))
2397 				continue;
2398 
2399 			list_move_tail(&w->sched.link, &list);
2400 		}
2401 
2402 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2403 	} while (rq);
2404 }
2405 
2406 static bool execlists_hold(struct intel_engine_cs *engine,
2407 			   struct i915_request *rq)
2408 {
2409 	spin_lock_irq(&engine->active.lock);
2410 
2411 	if (i915_request_completed(rq)) { /* too late! */
2412 		rq = NULL;
2413 		goto unlock;
2414 	}
2415 
2416 	if (rq->engine != engine) { /* preempted virtual engine */
2417 		struct virtual_engine *ve = to_virtual_engine(rq->engine);
2418 
2419 		/*
2420 		 * intel_context_inflight() is only protected by virtue
2421 		 * of process_csb() being called only by the tasklet (or
2422 		 * directly from inside reset while the tasklet is suspended).
2423 		 * Assert that neither of those are allowed to run while we
2424 		 * poke at the request queues.
2425 		 */
2426 		GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2427 
2428 		/*
2429 		 * An unsubmitted request along a virtual engine will
2430 		 * remain on the active (this) engine until we are able
2431 		 * to process the context switch away (and so mark the
2432 		 * context as no longer in flight). That cannot have happened
2433 		 * yet, otherwise we would not be hanging!
2434 		 */
2435 		spin_lock(&ve->base.active.lock);
2436 		GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2437 		GEM_BUG_ON(ve->request != rq);
2438 		ve->request = NULL;
2439 		spin_unlock(&ve->base.active.lock);
2440 		i915_request_put(rq);
2441 
2442 		rq->engine = engine;
2443 	}
2444 
2445 	/*
2446 	 * Transfer this request onto the hold queue to prevent it
2447 	 * being resumbitted to HW (and potentially completed) before we have
2448 	 * released it. Since we may have already submitted following
2449 	 * requests, we need to remove those as well.
2450 	 */
2451 	GEM_BUG_ON(i915_request_on_hold(rq));
2452 	GEM_BUG_ON(rq->engine != engine);
2453 	__execlists_hold(rq);
2454 
2455 unlock:
2456 	spin_unlock_irq(&engine->active.lock);
2457 	return rq;
2458 }
2459 
2460 static bool hold_request(const struct i915_request *rq)
2461 {
2462 	struct i915_dependency *p;
2463 
2464 	/*
2465 	 * If one of our ancestors is on hold, we must also be on hold,
2466 	 * otherwise we will bypass it and execute before it.
2467 	 */
2468 	list_for_each_entry(p, &rq->sched.signalers_list, signal_link) {
2469 		const struct i915_request *s =
2470 			container_of(p->signaler, typeof(*s), sched);
2471 
2472 		if (s->engine != rq->engine)
2473 			continue;
2474 
2475 		if (i915_request_on_hold(s))
2476 			return true;
2477 	}
2478 
2479 	return false;
2480 }
2481 
2482 static void __execlists_unhold(struct i915_request *rq)
2483 {
2484 	LIST_HEAD(list);
2485 
2486 	do {
2487 		struct i915_dependency *p;
2488 
2489 		GEM_BUG_ON(!i915_request_on_hold(rq));
2490 		GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2491 
2492 		i915_request_clear_hold(rq);
2493 		list_move_tail(&rq->sched.link,
2494 			       i915_sched_lookup_priolist(rq->engine,
2495 							  rq_prio(rq)));
2496 		set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2497 		RQ_TRACE(rq, "hold release\n");
2498 
2499 		/* Also release any children on this engine that are ready */
2500 		list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
2501 			struct i915_request *w =
2502 				container_of(p->waiter, typeof(*w), sched);
2503 
2504 			if (w->engine != rq->engine)
2505 				continue;
2506 
2507 			if (!i915_request_on_hold(rq))
2508 				continue;
2509 
2510 			/* Check that no other parents are also on hold */
2511 			if (hold_request(rq))
2512 				continue;
2513 
2514 			list_move_tail(&w->sched.link, &list);
2515 		}
2516 
2517 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2518 	} while (rq);
2519 }
2520 
2521 static void execlists_unhold(struct intel_engine_cs *engine,
2522 			     struct i915_request *rq)
2523 {
2524 	spin_lock_irq(&engine->active.lock);
2525 
2526 	/*
2527 	 * Move this request back to the priority queue, and all of its
2528 	 * children and grandchildren that were suspended along with it.
2529 	 */
2530 	__execlists_unhold(rq);
2531 
2532 	if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
2533 		engine->execlists.queue_priority_hint = rq_prio(rq);
2534 		tasklet_hi_schedule(&engine->execlists.tasklet);
2535 	}
2536 
2537 	spin_unlock_irq(&engine->active.lock);
2538 }
2539 
2540 struct execlists_capture {
2541 	struct work_struct work;
2542 	struct i915_request *rq;
2543 	struct i915_gpu_coredump *error;
2544 };
2545 
2546 static void execlists_capture_work(struct work_struct *work)
2547 {
2548 	struct execlists_capture *cap = container_of(work, typeof(*cap), work);
2549 	const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
2550 	struct intel_engine_cs *engine = cap->rq->engine;
2551 	struct intel_gt_coredump *gt = cap->error->gt;
2552 	struct intel_engine_capture_vma *vma;
2553 
2554 	/* Compress all the objects attached to the request, slow! */
2555 	vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
2556 	if (vma) {
2557 		struct i915_vma_compress *compress =
2558 			i915_vma_capture_prepare(gt);
2559 
2560 		intel_engine_coredump_add_vma(gt->engine, vma, compress);
2561 		i915_vma_capture_finish(gt, compress);
2562 	}
2563 
2564 	gt->simulated = gt->engine->simulated;
2565 	cap->error->simulated = gt->simulated;
2566 
2567 	/* Publish the error state, and announce it to the world */
2568 	i915_error_state_store(cap->error);
2569 	i915_gpu_coredump_put(cap->error);
2570 
2571 	/* Return this request and all that depend upon it for signaling */
2572 	execlists_unhold(engine, cap->rq);
2573 	i915_request_put(cap->rq);
2574 
2575 	kfree(cap);
2576 }
2577 
2578 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
2579 {
2580 	const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
2581 	struct execlists_capture *cap;
2582 
2583 	cap = kmalloc(sizeof(*cap), gfp);
2584 	if (!cap)
2585 		return NULL;
2586 
2587 	cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
2588 	if (!cap->error)
2589 		goto err_cap;
2590 
2591 	cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
2592 	if (!cap->error->gt)
2593 		goto err_gpu;
2594 
2595 	cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
2596 	if (!cap->error->gt->engine)
2597 		goto err_gt;
2598 
2599 	return cap;
2600 
2601 err_gt:
2602 	kfree(cap->error->gt);
2603 err_gpu:
2604 	kfree(cap->error);
2605 err_cap:
2606 	kfree(cap);
2607 	return NULL;
2608 }
2609 
2610 static bool execlists_capture(struct intel_engine_cs *engine)
2611 {
2612 	struct execlists_capture *cap;
2613 
2614 	if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
2615 		return true;
2616 
2617 	/*
2618 	 * We need to _quickly_ capture the engine state before we reset.
2619 	 * We are inside an atomic section (softirq) here and we are delaying
2620 	 * the forced preemption event.
2621 	 */
2622 	cap = capture_regs(engine);
2623 	if (!cap)
2624 		return true;
2625 
2626 	cap->rq = execlists_active(&engine->execlists);
2627 	GEM_BUG_ON(!cap->rq);
2628 
2629 	rcu_read_lock();
2630 	cap->rq = active_request(cap->rq->context->timeline, cap->rq);
2631 	cap->rq = i915_request_get_rcu(cap->rq);
2632 	rcu_read_unlock();
2633 	if (!cap->rq)
2634 		goto err_free;
2635 
2636 	/*
2637 	 * Remove the request from the execlists queue, and take ownership
2638 	 * of the request. We pass it to our worker who will _slowly_ compress
2639 	 * all the pages the _user_ requested for debugging their batch, after
2640 	 * which we return it to the queue for signaling.
2641 	 *
2642 	 * By removing them from the execlists queue, we also remove the
2643 	 * requests from being processed by __unwind_incomplete_requests()
2644 	 * during the intel_engine_reset(), and so they will *not* be replayed
2645 	 * afterwards.
2646 	 *
2647 	 * Note that because we have not yet reset the engine at this point,
2648 	 * it is possible for the request that we have identified as being
2649 	 * guilty, did in fact complete and we will then hit an arbitration
2650 	 * point allowing the outstanding preemption to succeed. The likelihood
2651 	 * of that is very low (as capturing of the engine registers should be
2652 	 * fast enough to run inside an irq-off atomic section!), so we will
2653 	 * simply hold that request accountable for being non-preemptible
2654 	 * long enough to force the reset.
2655 	 */
2656 	if (!execlists_hold(engine, cap->rq))
2657 		goto err_rq;
2658 
2659 	INIT_WORK(&cap->work, execlists_capture_work);
2660 	schedule_work(&cap->work);
2661 	return true;
2662 
2663 err_rq:
2664 	i915_request_put(cap->rq);
2665 err_free:
2666 	i915_gpu_coredump_put(cap->error);
2667 	kfree(cap);
2668 	return false;
2669 }
2670 
2671 static noinline void preempt_reset(struct intel_engine_cs *engine)
2672 {
2673 	const unsigned int bit = I915_RESET_ENGINE + engine->id;
2674 	unsigned long *lock = &engine->gt->reset.flags;
2675 
2676 	if (i915_modparams.reset < 3)
2677 		return;
2678 
2679 	if (test_and_set_bit(bit, lock))
2680 		return;
2681 
2682 	/* Mark this tasklet as disabled to avoid waiting for it to complete */
2683 	tasklet_disable_nosync(&engine->execlists.tasklet);
2684 
2685 	ENGINE_TRACE(engine, "preempt timeout %lu+%ums\n",
2686 		     READ_ONCE(engine->props.preempt_timeout_ms),
2687 		     jiffies_to_msecs(jiffies - engine->execlists.preempt.expires));
2688 
2689 	ring_set_paused(engine, 1); /* Freeze the current request in place */
2690 	if (execlists_capture(engine))
2691 		intel_engine_reset(engine, "preemption time out");
2692 	else
2693 		ring_set_paused(engine, 0);
2694 
2695 	tasklet_enable(&engine->execlists.tasklet);
2696 	clear_and_wake_up_bit(bit, lock);
2697 }
2698 
2699 static bool preempt_timeout(const struct intel_engine_cs *const engine)
2700 {
2701 	const struct timer_list *t = &engine->execlists.preempt;
2702 
2703 	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
2704 		return false;
2705 
2706 	if (!timer_expired(t))
2707 		return false;
2708 
2709 	return READ_ONCE(engine->execlists.pending[0]);
2710 }
2711 
2712 /*
2713  * Check the unread Context Status Buffers and manage the submission of new
2714  * contexts to the ELSP accordingly.
2715  */
2716 static void execlists_submission_tasklet(unsigned long data)
2717 {
2718 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
2719 	bool timeout = preempt_timeout(engine);
2720 
2721 	process_csb(engine);
2722 	if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
2723 		unsigned long flags;
2724 
2725 		spin_lock_irqsave(&engine->active.lock, flags);
2726 		__execlists_submission_tasklet(engine);
2727 		spin_unlock_irqrestore(&engine->active.lock, flags);
2728 
2729 		/* Recheck after serialising with direct-submission */
2730 		if (timeout && preempt_timeout(engine))
2731 			preempt_reset(engine);
2732 	}
2733 }
2734 
2735 static void __execlists_kick(struct intel_engine_execlists *execlists)
2736 {
2737 	/* Kick the tasklet for some interrupt coalescing and reset handling */
2738 	tasklet_hi_schedule(&execlists->tasklet);
2739 }
2740 
2741 #define execlists_kick(t, member) \
2742 	__execlists_kick(container_of(t, struct intel_engine_execlists, member))
2743 
2744 static void execlists_timeslice(struct timer_list *timer)
2745 {
2746 	execlists_kick(timer, timer);
2747 }
2748 
2749 static void execlists_preempt(struct timer_list *timer)
2750 {
2751 	execlists_kick(timer, preempt);
2752 }
2753 
2754 static void queue_request(struct intel_engine_cs *engine,
2755 			  struct i915_request *rq)
2756 {
2757 	GEM_BUG_ON(!list_empty(&rq->sched.link));
2758 	list_add_tail(&rq->sched.link,
2759 		      i915_sched_lookup_priolist(engine, rq_prio(rq)));
2760 	set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2761 }
2762 
2763 static void __submit_queue_imm(struct intel_engine_cs *engine)
2764 {
2765 	struct intel_engine_execlists * const execlists = &engine->execlists;
2766 
2767 	if (reset_in_progress(execlists))
2768 		return; /* defer until we restart the engine following reset */
2769 
2770 	if (execlists->tasklet.func == execlists_submission_tasklet)
2771 		__execlists_submission_tasklet(engine);
2772 	else
2773 		tasklet_hi_schedule(&execlists->tasklet);
2774 }
2775 
2776 static void submit_queue(struct intel_engine_cs *engine,
2777 			 const struct i915_request *rq)
2778 {
2779 	struct intel_engine_execlists *execlists = &engine->execlists;
2780 
2781 	if (rq_prio(rq) <= execlists->queue_priority_hint)
2782 		return;
2783 
2784 	execlists->queue_priority_hint = rq_prio(rq);
2785 	__submit_queue_imm(engine);
2786 }
2787 
2788 static bool ancestor_on_hold(const struct intel_engine_cs *engine,
2789 			     const struct i915_request *rq)
2790 {
2791 	GEM_BUG_ON(i915_request_on_hold(rq));
2792 	return !list_empty(&engine->active.hold) && hold_request(rq);
2793 }
2794 
2795 static void execlists_submit_request(struct i915_request *request)
2796 {
2797 	struct intel_engine_cs *engine = request->engine;
2798 	unsigned long flags;
2799 
2800 	/* Will be called from irq-context when using foreign fences. */
2801 	spin_lock_irqsave(&engine->active.lock, flags);
2802 
2803 	if (unlikely(ancestor_on_hold(engine, request))) {
2804 		list_add_tail(&request->sched.link, &engine->active.hold);
2805 		i915_request_set_hold(request);
2806 	} else {
2807 		queue_request(engine, request);
2808 
2809 		GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
2810 		GEM_BUG_ON(list_empty(&request->sched.link));
2811 
2812 		submit_queue(engine, request);
2813 	}
2814 
2815 	spin_unlock_irqrestore(&engine->active.lock, flags);
2816 }
2817 
2818 static void __execlists_context_fini(struct intel_context *ce)
2819 {
2820 	intel_ring_put(ce->ring);
2821 	i915_vma_put(ce->state);
2822 }
2823 
2824 static void execlists_context_destroy(struct kref *kref)
2825 {
2826 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
2827 
2828 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
2829 	GEM_BUG_ON(intel_context_is_pinned(ce));
2830 
2831 	if (ce->state)
2832 		__execlists_context_fini(ce);
2833 
2834 	intel_context_fini(ce);
2835 	intel_context_free(ce);
2836 }
2837 
2838 static void
2839 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
2840 {
2841 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2842 		return;
2843 
2844 	vaddr += engine->context_size;
2845 
2846 	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
2847 }
2848 
2849 static void
2850 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
2851 {
2852 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2853 		return;
2854 
2855 	vaddr += engine->context_size;
2856 
2857 	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
2858 		dev_err_once(engine->i915->drm.dev,
2859 			     "%s context redzone overwritten!\n",
2860 			     engine->name);
2861 }
2862 
2863 static void execlists_context_unpin(struct intel_context *ce)
2864 {
2865 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE,
2866 		      ce->engine);
2867 
2868 	i915_gem_object_unpin_map(ce->state->obj);
2869 }
2870 
2871 static void
2872 __execlists_update_reg_state(const struct intel_context *ce,
2873 			     const struct intel_engine_cs *engine,
2874 			     u32 head)
2875 {
2876 	struct intel_ring *ring = ce->ring;
2877 	u32 *regs = ce->lrc_reg_state;
2878 
2879 	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
2880 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
2881 
2882 	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
2883 	regs[CTX_RING_HEAD] = head;
2884 	regs[CTX_RING_TAIL] = ring->tail;
2885 
2886 	/* RPCS */
2887 	if (engine->class == RENDER_CLASS) {
2888 		regs[CTX_R_PWR_CLK_STATE] =
2889 			intel_sseu_make_rpcs(engine->i915, &ce->sseu);
2890 
2891 		i915_oa_init_reg_state(ce, engine);
2892 	}
2893 }
2894 
2895 static int
2896 __execlists_context_pin(struct intel_context *ce,
2897 			struct intel_engine_cs *engine)
2898 {
2899 	void *vaddr;
2900 
2901 	GEM_BUG_ON(!ce->state);
2902 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
2903 
2904 	vaddr = i915_gem_object_pin_map(ce->state->obj,
2905 					i915_coherent_map_type(engine->i915) |
2906 					I915_MAP_OVERRIDE);
2907 	if (IS_ERR(vaddr))
2908 		return PTR_ERR(vaddr);
2909 
2910 	ce->lrc_desc = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
2911 	ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
2912 	__execlists_update_reg_state(ce, engine, ce->ring->tail);
2913 
2914 	return 0;
2915 }
2916 
2917 static int execlists_context_pin(struct intel_context *ce)
2918 {
2919 	return __execlists_context_pin(ce, ce->engine);
2920 }
2921 
2922 static int execlists_context_alloc(struct intel_context *ce)
2923 {
2924 	return __execlists_context_alloc(ce, ce->engine);
2925 }
2926 
2927 static void execlists_context_reset(struct intel_context *ce)
2928 {
2929 	CE_TRACE(ce, "reset\n");
2930 	GEM_BUG_ON(!intel_context_is_pinned(ce));
2931 
2932 	/*
2933 	 * Because we emit WA_TAIL_DWORDS there may be a disparity
2934 	 * between our bookkeeping in ce->ring->head and ce->ring->tail and
2935 	 * that stored in context. As we only write new commands from
2936 	 * ce->ring->tail onwards, everything before that is junk. If the GPU
2937 	 * starts reading from its RING_HEAD from the context, it may try to
2938 	 * execute that junk and die.
2939 	 *
2940 	 * The contexts that are stilled pinned on resume belong to the
2941 	 * kernel, and are local to each engine. All other contexts will
2942 	 * have their head/tail sanitized upon pinning before use, so they
2943 	 * will never see garbage,
2944 	 *
2945 	 * So to avoid that we reset the context images upon resume. For
2946 	 * simplicity, we just zero everything out.
2947 	 */
2948 	intel_ring_reset(ce->ring, ce->ring->emit);
2949 
2950 	/* Scrub away the garbage */
2951 	execlists_init_reg_state(ce->lrc_reg_state,
2952 				 ce, ce->engine, ce->ring, true);
2953 	__execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
2954 
2955 	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
2956 }
2957 
2958 static const struct intel_context_ops execlists_context_ops = {
2959 	.alloc = execlists_context_alloc,
2960 
2961 	.pin = execlists_context_pin,
2962 	.unpin = execlists_context_unpin,
2963 
2964 	.enter = intel_context_enter_engine,
2965 	.exit = intel_context_exit_engine,
2966 
2967 	.reset = execlists_context_reset,
2968 	.destroy = execlists_context_destroy,
2969 };
2970 
2971 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
2972 {
2973 	u32 *cs;
2974 
2975 	GEM_BUG_ON(!i915_request_timeline(rq)->has_initial_breadcrumb);
2976 
2977 	cs = intel_ring_begin(rq, 6);
2978 	if (IS_ERR(cs))
2979 		return PTR_ERR(cs);
2980 
2981 	/*
2982 	 * Check if we have been preempted before we even get started.
2983 	 *
2984 	 * After this point i915_request_started() reports true, even if
2985 	 * we get preempted and so are no longer running.
2986 	 */
2987 	*cs++ = MI_ARB_CHECK;
2988 	*cs++ = MI_NOOP;
2989 
2990 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
2991 	*cs++ = i915_request_timeline(rq)->hwsp_offset;
2992 	*cs++ = 0;
2993 	*cs++ = rq->fence.seqno - 1;
2994 
2995 	intel_ring_advance(rq, cs);
2996 
2997 	/* Record the updated position of the request's payload */
2998 	rq->infix = intel_ring_offset(rq, cs);
2999 
3000 	return 0;
3001 }
3002 
3003 static int execlists_request_alloc(struct i915_request *request)
3004 {
3005 	int ret;
3006 
3007 	GEM_BUG_ON(!intel_context_is_pinned(request->context));
3008 
3009 	/*
3010 	 * Flush enough space to reduce the likelihood of waiting after
3011 	 * we start building the request - in which case we will just
3012 	 * have to repeat work.
3013 	 */
3014 	request->reserved_space += EXECLISTS_REQUEST_SIZE;
3015 
3016 	/*
3017 	 * Note that after this point, we have committed to using
3018 	 * this request as it is being used to both track the
3019 	 * state of engine initialisation and liveness of the
3020 	 * golden renderstate above. Think twice before you try
3021 	 * to cancel/unwind this request now.
3022 	 */
3023 
3024 	/* Unconditionally invalidate GPU caches and TLBs. */
3025 	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3026 	if (ret)
3027 		return ret;
3028 
3029 	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3030 	return 0;
3031 }
3032 
3033 /*
3034  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
3035  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
3036  * but there is a slight complication as this is applied in WA batch where the
3037  * values are only initialized once so we cannot take register value at the
3038  * beginning and reuse it further; hence we save its value to memory, upload a
3039  * constant value with bit21 set and then we restore it back with the saved value.
3040  * To simplify the WA, a constant value is formed by using the default value
3041  * of this register. This shouldn't be a problem because we are only modifying
3042  * it for a short period and this batch in non-premptible. We can ofcourse
3043  * use additional instructions that read the actual value of the register
3044  * at that time and set our bit of interest but it makes the WA complicated.
3045  *
3046  * This WA is also required for Gen9 so extracting as a function avoids
3047  * code duplication.
3048  */
3049 static u32 *
3050 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3051 {
3052 	/* NB no one else is allowed to scribble over scratch + 256! */
3053 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3054 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3055 	*batch++ = intel_gt_scratch_offset(engine->gt,
3056 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3057 	*batch++ = 0;
3058 
3059 	*batch++ = MI_LOAD_REGISTER_IMM(1);
3060 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3061 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3062 
3063 	batch = gen8_emit_pipe_control(batch,
3064 				       PIPE_CONTROL_CS_STALL |
3065 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
3066 				       0);
3067 
3068 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3069 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3070 	*batch++ = intel_gt_scratch_offset(engine->gt,
3071 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3072 	*batch++ = 0;
3073 
3074 	return batch;
3075 }
3076 
3077 /*
3078  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
3079  * initialized at the beginning and shared across all contexts but this field
3080  * helps us to have multiple batches at different offsets and select them based
3081  * on a criteria. At the moment this batch always start at the beginning of the page
3082  * and at this point we don't have multiple wa_ctx batch buffers.
3083  *
3084  * The number of WA applied are not known at the beginning; we use this field
3085  * to return the no of DWORDS written.
3086  *
3087  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
3088  * so it adds NOOPs as padding to make it cacheline aligned.
3089  * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
3090  * makes a complete batch buffer.
3091  */
3092 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3093 {
3094 	/* WaDisableCtxRestoreArbitration:bdw,chv */
3095 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3096 
3097 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
3098 	if (IS_BROADWELL(engine->i915))
3099 		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3100 
3101 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
3102 	/* Actual scratch location is at 128 bytes offset */
3103 	batch = gen8_emit_pipe_control(batch,
3104 				       PIPE_CONTROL_FLUSH_L3 |
3105 				       PIPE_CONTROL_STORE_DATA_INDEX |
3106 				       PIPE_CONTROL_CS_STALL |
3107 				       PIPE_CONTROL_QW_WRITE,
3108 				       LRC_PPHWSP_SCRATCH_ADDR);
3109 
3110 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3111 
3112 	/* Pad to end of cacheline */
3113 	while ((unsigned long)batch % CACHELINE_BYTES)
3114 		*batch++ = MI_NOOP;
3115 
3116 	/*
3117 	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
3118 	 * execution depends on the length specified in terms of cache lines
3119 	 * in the register CTX_RCS_INDIRECT_CTX
3120 	 */
3121 
3122 	return batch;
3123 }
3124 
3125 struct lri {
3126 	i915_reg_t reg;
3127 	u32 value;
3128 };
3129 
3130 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
3131 {
3132 	GEM_BUG_ON(!count || count > 63);
3133 
3134 	*batch++ = MI_LOAD_REGISTER_IMM(count);
3135 	do {
3136 		*batch++ = i915_mmio_reg_offset(lri->reg);
3137 		*batch++ = lri->value;
3138 	} while (lri++, --count);
3139 	*batch++ = MI_NOOP;
3140 
3141 	return batch;
3142 }
3143 
3144 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3145 {
3146 	static const struct lri lri[] = {
3147 		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
3148 		{
3149 			COMMON_SLICE_CHICKEN2,
3150 			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
3151 				       0),
3152 		},
3153 
3154 		/* BSpec: 11391 */
3155 		{
3156 			FF_SLICE_CHICKEN,
3157 			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
3158 				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
3159 		},
3160 
3161 		/* BSpec: 11299 */
3162 		{
3163 			_3D_CHICKEN3,
3164 			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
3165 				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
3166 		}
3167 	};
3168 
3169 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3170 
3171 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
3172 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3173 
3174 	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
3175 	batch = gen8_emit_pipe_control(batch,
3176 				       PIPE_CONTROL_FLUSH_L3 |
3177 				       PIPE_CONTROL_STORE_DATA_INDEX |
3178 				       PIPE_CONTROL_CS_STALL |
3179 				       PIPE_CONTROL_QW_WRITE,
3180 				       LRC_PPHWSP_SCRATCH_ADDR);
3181 
3182 	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
3183 
3184 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
3185 	if (HAS_POOLED_EU(engine->i915)) {
3186 		/*
3187 		 * EU pool configuration is setup along with golden context
3188 		 * during context initialization. This value depends on
3189 		 * device type (2x6 or 3x6) and needs to be updated based
3190 		 * on which subslice is disabled especially for 2x6
3191 		 * devices, however it is safe to load default
3192 		 * configuration of 3x6 device instead of masking off
3193 		 * corresponding bits because HW ignores bits of a disabled
3194 		 * subslice and drops down to appropriate config. Please
3195 		 * see render_state_setup() in i915_gem_render_state.c for
3196 		 * possible configurations, to avoid duplication they are
3197 		 * not shown here again.
3198 		 */
3199 		*batch++ = GEN9_MEDIA_POOL_STATE;
3200 		*batch++ = GEN9_MEDIA_POOL_ENABLE;
3201 		*batch++ = 0x00777000;
3202 		*batch++ = 0;
3203 		*batch++ = 0;
3204 		*batch++ = 0;
3205 	}
3206 
3207 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3208 
3209 	/* Pad to end of cacheline */
3210 	while ((unsigned long)batch % CACHELINE_BYTES)
3211 		*batch++ = MI_NOOP;
3212 
3213 	return batch;
3214 }
3215 
3216 static u32 *
3217 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3218 {
3219 	int i;
3220 
3221 	/*
3222 	 * WaPipeControlBefore3DStateSamplePattern: cnl
3223 	 *
3224 	 * Ensure the engine is idle prior to programming a
3225 	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
3226 	 */
3227 	batch = gen8_emit_pipe_control(batch,
3228 				       PIPE_CONTROL_CS_STALL,
3229 				       0);
3230 	/*
3231 	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
3232 	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
3233 	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
3234 	 * confusing. Since gen8_emit_pipe_control() already advances the
3235 	 * batch by 6 dwords, we advance the other 10 here, completing a
3236 	 * cacheline. It's not clear if the workaround requires this padding
3237 	 * before other commands, or if it's just the regular padding we would
3238 	 * already have for the workaround bb, so leave it here for now.
3239 	 */
3240 	for (i = 0; i < 10; i++)
3241 		*batch++ = MI_NOOP;
3242 
3243 	/* Pad to end of cacheline */
3244 	while ((unsigned long)batch % CACHELINE_BYTES)
3245 		*batch++ = MI_NOOP;
3246 
3247 	return batch;
3248 }
3249 
3250 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
3251 
3252 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
3253 {
3254 	struct drm_i915_gem_object *obj;
3255 	struct i915_vma *vma;
3256 	int err;
3257 
3258 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
3259 	if (IS_ERR(obj))
3260 		return PTR_ERR(obj);
3261 
3262 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
3263 	if (IS_ERR(vma)) {
3264 		err = PTR_ERR(vma);
3265 		goto err;
3266 	}
3267 
3268 	err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
3269 	if (err)
3270 		goto err;
3271 
3272 	engine->wa_ctx.vma = vma;
3273 	return 0;
3274 
3275 err:
3276 	i915_gem_object_put(obj);
3277 	return err;
3278 }
3279 
3280 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
3281 {
3282 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
3283 }
3284 
3285 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
3286 
3287 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
3288 {
3289 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3290 	struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
3291 					    &wa_ctx->per_ctx };
3292 	wa_bb_func_t wa_bb_fn[2];
3293 	struct page *page;
3294 	void *batch, *batch_ptr;
3295 	unsigned int i;
3296 	int ret;
3297 
3298 	if (engine->class != RENDER_CLASS)
3299 		return 0;
3300 
3301 	switch (INTEL_GEN(engine->i915)) {
3302 	case 12:
3303 	case 11:
3304 		return 0;
3305 	case 10:
3306 		wa_bb_fn[0] = gen10_init_indirectctx_bb;
3307 		wa_bb_fn[1] = NULL;
3308 		break;
3309 	case 9:
3310 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
3311 		wa_bb_fn[1] = NULL;
3312 		break;
3313 	case 8:
3314 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
3315 		wa_bb_fn[1] = NULL;
3316 		break;
3317 	default:
3318 		MISSING_CASE(INTEL_GEN(engine->i915));
3319 		return 0;
3320 	}
3321 
3322 	ret = lrc_setup_wa_ctx(engine);
3323 	if (ret) {
3324 		DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
3325 		return ret;
3326 	}
3327 
3328 	page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
3329 	batch = batch_ptr = kmap_atomic(page);
3330 
3331 	/*
3332 	 * Emit the two workaround batch buffers, recording the offset from the
3333 	 * start of the workaround batch buffer object for each and their
3334 	 * respective sizes.
3335 	 */
3336 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
3337 		wa_bb[i]->offset = batch_ptr - batch;
3338 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
3339 						  CACHELINE_BYTES))) {
3340 			ret = -EINVAL;
3341 			break;
3342 		}
3343 		if (wa_bb_fn[i])
3344 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
3345 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
3346 	}
3347 
3348 	BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
3349 
3350 	kunmap_atomic(batch);
3351 	if (ret)
3352 		lrc_destroy_wa_ctx(engine);
3353 
3354 	return ret;
3355 }
3356 
3357 static void enable_execlists(struct intel_engine_cs *engine)
3358 {
3359 	u32 mode;
3360 
3361 	assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
3362 
3363 	intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
3364 
3365 	if (INTEL_GEN(engine->i915) >= 11)
3366 		mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
3367 	else
3368 		mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
3369 	ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
3370 
3371 	ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
3372 
3373 	ENGINE_WRITE_FW(engine,
3374 			RING_HWS_PGA,
3375 			i915_ggtt_offset(engine->status_page.vma));
3376 	ENGINE_POSTING_READ(engine, RING_HWS_PGA);
3377 
3378 	engine->context_tag = 0;
3379 }
3380 
3381 static bool unexpected_starting_state(struct intel_engine_cs *engine)
3382 {
3383 	bool unexpected = false;
3384 
3385 	if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
3386 		DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n");
3387 		unexpected = true;
3388 	}
3389 
3390 	return unexpected;
3391 }
3392 
3393 static int execlists_resume(struct intel_engine_cs *engine)
3394 {
3395 	intel_engine_apply_workarounds(engine);
3396 	intel_engine_apply_whitelist(engine);
3397 
3398 	intel_mocs_init_engine(engine);
3399 
3400 	intel_engine_reset_breadcrumbs(engine);
3401 
3402 	if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
3403 		struct drm_printer p = drm_debug_printer(__func__);
3404 
3405 		intel_engine_dump(engine, &p, NULL);
3406 	}
3407 
3408 	enable_execlists(engine);
3409 
3410 	return 0;
3411 }
3412 
3413 static void execlists_reset_prepare(struct intel_engine_cs *engine)
3414 {
3415 	struct intel_engine_execlists * const execlists = &engine->execlists;
3416 	unsigned long flags;
3417 
3418 	ENGINE_TRACE(engine, "depth<-%d\n",
3419 		     atomic_read(&execlists->tasklet.count));
3420 
3421 	/*
3422 	 * Prevent request submission to the hardware until we have
3423 	 * completed the reset in i915_gem_reset_finish(). If a request
3424 	 * is completed by one engine, it may then queue a request
3425 	 * to a second via its execlists->tasklet *just* as we are
3426 	 * calling engine->resume() and also writing the ELSP.
3427 	 * Turning off the execlists->tasklet until the reset is over
3428 	 * prevents the race.
3429 	 */
3430 	__tasklet_disable_sync_once(&execlists->tasklet);
3431 	GEM_BUG_ON(!reset_in_progress(execlists));
3432 
3433 	/* And flush any current direct submission. */
3434 	spin_lock_irqsave(&engine->active.lock, flags);
3435 	spin_unlock_irqrestore(&engine->active.lock, flags);
3436 
3437 	/*
3438 	 * We stop engines, otherwise we might get failed reset and a
3439 	 * dead gpu (on elk). Also as modern gpu as kbl can suffer
3440 	 * from system hang if batchbuffer is progressing when
3441 	 * the reset is issued, regardless of READY_TO_RESET ack.
3442 	 * Thus assume it is best to stop engines on all gens
3443 	 * where we have a gpu reset.
3444 	 *
3445 	 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
3446 	 *
3447 	 * FIXME: Wa for more modern gens needs to be validated
3448 	 */
3449 	intel_engine_stop_cs(engine);
3450 }
3451 
3452 static void reset_csb_pointers(struct intel_engine_cs *engine)
3453 {
3454 	struct intel_engine_execlists * const execlists = &engine->execlists;
3455 	const unsigned int reset_value = execlists->csb_size - 1;
3456 
3457 	ring_set_paused(engine, 0);
3458 
3459 	/*
3460 	 * After a reset, the HW starts writing into CSB entry [0]. We
3461 	 * therefore have to set our HEAD pointer back one entry so that
3462 	 * the *first* entry we check is entry 0. To complicate this further,
3463 	 * as we don't wait for the first interrupt after reset, we have to
3464 	 * fake the HW write to point back to the last entry so that our
3465 	 * inline comparison of our cached head position against the last HW
3466 	 * write works even before the first interrupt.
3467 	 */
3468 	execlists->csb_head = reset_value;
3469 	WRITE_ONCE(*execlists->csb_write, reset_value);
3470 	wmb(); /* Make sure this is visible to HW (paranoia?) */
3471 
3472 	/*
3473 	 * Sometimes Icelake forgets to reset its pointers on a GPU reset.
3474 	 * Bludgeon them with a mmio update to be sure.
3475 	 */
3476 	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
3477 		     reset_value << 8 | reset_value);
3478 	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
3479 
3480 	invalidate_csb_entries(&execlists->csb_status[0],
3481 			       &execlists->csb_status[reset_value]);
3482 }
3483 
3484 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
3485 {
3486 	int x;
3487 
3488 	x = lrc_ring_mi_mode(engine);
3489 	if (x != -1) {
3490 		regs[x + 1] &= ~STOP_RING;
3491 		regs[x + 1] |= STOP_RING << 16;
3492 	}
3493 }
3494 
3495 static void __execlists_reset_reg_state(const struct intel_context *ce,
3496 					const struct intel_engine_cs *engine)
3497 {
3498 	u32 *regs = ce->lrc_reg_state;
3499 
3500 	__reset_stop_ring(regs, engine);
3501 }
3502 
3503 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
3504 {
3505 	struct intel_engine_execlists * const execlists = &engine->execlists;
3506 	struct intel_context *ce;
3507 	struct i915_request *rq;
3508 	u32 head;
3509 
3510 	mb(); /* paranoia: read the CSB pointers from after the reset */
3511 	clflush(execlists->csb_write);
3512 	mb();
3513 
3514 	process_csb(engine); /* drain preemption events */
3515 
3516 	/* Following the reset, we need to reload the CSB read/write pointers */
3517 	reset_csb_pointers(engine);
3518 
3519 	/*
3520 	 * Save the currently executing context, even if we completed
3521 	 * its request, it was still running at the time of the
3522 	 * reset and will have been clobbered.
3523 	 */
3524 	rq = execlists_active(execlists);
3525 	if (!rq)
3526 		goto unwind;
3527 
3528 	/* We still have requests in-flight; the engine should be active */
3529 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
3530 
3531 	ce = rq->context;
3532 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3533 
3534 	if (i915_request_completed(rq)) {
3535 		/* Idle context; tidy up the ring so we can restart afresh */
3536 		head = intel_ring_wrap(ce->ring, rq->tail);
3537 		goto out_replay;
3538 	}
3539 
3540 	/* Context has requests still in-flight; it should not be idle! */
3541 	GEM_BUG_ON(i915_active_is_idle(&ce->active));
3542 	rq = active_request(ce->timeline, rq);
3543 	head = intel_ring_wrap(ce->ring, rq->head);
3544 	GEM_BUG_ON(head == ce->ring->tail);
3545 
3546 	/*
3547 	 * If this request hasn't started yet, e.g. it is waiting on a
3548 	 * semaphore, we need to avoid skipping the request or else we
3549 	 * break the signaling chain. However, if the context is corrupt
3550 	 * the request will not restart and we will be stuck with a wedged
3551 	 * device. It is quite often the case that if we issue a reset
3552 	 * while the GPU is loading the context image, that the context
3553 	 * image becomes corrupt.
3554 	 *
3555 	 * Otherwise, if we have not started yet, the request should replay
3556 	 * perfectly and we do not need to flag the result as being erroneous.
3557 	 */
3558 	if (!i915_request_started(rq))
3559 		goto out_replay;
3560 
3561 	/*
3562 	 * If the request was innocent, we leave the request in the ELSP
3563 	 * and will try to replay it on restarting. The context image may
3564 	 * have been corrupted by the reset, in which case we may have
3565 	 * to service a new GPU hang, but more likely we can continue on
3566 	 * without impact.
3567 	 *
3568 	 * If the request was guilty, we presume the context is corrupt
3569 	 * and have to at least restore the RING register in the context
3570 	 * image back to the expected values to skip over the guilty request.
3571 	 */
3572 	__i915_request_reset(rq, stalled);
3573 	if (!stalled)
3574 		goto out_replay;
3575 
3576 	/*
3577 	 * We want a simple context + ring to execute the breadcrumb update.
3578 	 * We cannot rely on the context being intact across the GPU hang,
3579 	 * so clear it and rebuild just what we need for the breadcrumb.
3580 	 * All pending requests for this context will be zapped, and any
3581 	 * future request will be after userspace has had the opportunity
3582 	 * to recreate its own state.
3583 	 */
3584 	GEM_BUG_ON(!intel_context_is_pinned(ce));
3585 	restore_default_state(ce, engine);
3586 
3587 out_replay:
3588 	ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
3589 		     head, ce->ring->tail);
3590 	__execlists_reset_reg_state(ce, engine);
3591 	__execlists_update_reg_state(ce, engine, head);
3592 	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
3593 
3594 unwind:
3595 	/* Push back any incomplete requests for replay after the reset. */
3596 	cancel_port_requests(execlists);
3597 	__unwind_incomplete_requests(engine);
3598 }
3599 
3600 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
3601 {
3602 	unsigned long flags;
3603 
3604 	ENGINE_TRACE(engine, "\n");
3605 
3606 	spin_lock_irqsave(&engine->active.lock, flags);
3607 
3608 	__execlists_reset(engine, stalled);
3609 
3610 	spin_unlock_irqrestore(&engine->active.lock, flags);
3611 }
3612 
3613 static void nop_submission_tasklet(unsigned long data)
3614 {
3615 	/* The driver is wedged; don't process any more events. */
3616 }
3617 
3618 static void execlists_reset_cancel(struct intel_engine_cs *engine)
3619 {
3620 	struct intel_engine_execlists * const execlists = &engine->execlists;
3621 	struct i915_request *rq, *rn;
3622 	struct rb_node *rb;
3623 	unsigned long flags;
3624 
3625 	ENGINE_TRACE(engine, "\n");
3626 
3627 	/*
3628 	 * Before we call engine->cancel_requests(), we should have exclusive
3629 	 * access to the submission state. This is arranged for us by the
3630 	 * caller disabling the interrupt generation, the tasklet and other
3631 	 * threads that may then access the same state, giving us a free hand
3632 	 * to reset state. However, we still need to let lockdep be aware that
3633 	 * we know this state may be accessed in hardirq context, so we
3634 	 * disable the irq around this manipulation and we want to keep
3635 	 * the spinlock focused on its duties and not accidentally conflate
3636 	 * coverage to the submission's irq state. (Similarly, although we
3637 	 * shouldn't need to disable irq around the manipulation of the
3638 	 * submission's irq state, we also wish to remind ourselves that
3639 	 * it is irq state.)
3640 	 */
3641 	spin_lock_irqsave(&engine->active.lock, flags);
3642 
3643 	__execlists_reset(engine, true);
3644 
3645 	/* Mark all executing requests as skipped. */
3646 	list_for_each_entry(rq, &engine->active.requests, sched.link)
3647 		mark_eio(rq);
3648 
3649 	/* Flush the queued requests to the timeline list (for retiring). */
3650 	while ((rb = rb_first_cached(&execlists->queue))) {
3651 		struct i915_priolist *p = to_priolist(rb);
3652 		int i;
3653 
3654 		priolist_for_each_request_consume(rq, rn, p, i) {
3655 			mark_eio(rq);
3656 			__i915_request_submit(rq);
3657 		}
3658 
3659 		rb_erase_cached(&p->node, &execlists->queue);
3660 		i915_priolist_free(p);
3661 	}
3662 
3663 	/* On-hold requests will be flushed to timeline upon their release */
3664 	list_for_each_entry(rq, &engine->active.hold, sched.link)
3665 		mark_eio(rq);
3666 
3667 	/* Cancel all attached virtual engines */
3668 	while ((rb = rb_first_cached(&execlists->virtual))) {
3669 		struct virtual_engine *ve =
3670 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
3671 
3672 		rb_erase_cached(rb, &execlists->virtual);
3673 		RB_CLEAR_NODE(rb);
3674 
3675 		spin_lock(&ve->base.active.lock);
3676 		rq = fetch_and_zero(&ve->request);
3677 		if (rq) {
3678 			mark_eio(rq);
3679 
3680 			rq->engine = engine;
3681 			__i915_request_submit(rq);
3682 			i915_request_put(rq);
3683 
3684 			ve->base.execlists.queue_priority_hint = INT_MIN;
3685 		}
3686 		spin_unlock(&ve->base.active.lock);
3687 	}
3688 
3689 	/* Remaining _unready_ requests will be nop'ed when submitted */
3690 
3691 	execlists->queue_priority_hint = INT_MIN;
3692 	execlists->queue = RB_ROOT_CACHED;
3693 
3694 	GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
3695 	execlists->tasklet.func = nop_submission_tasklet;
3696 
3697 	spin_unlock_irqrestore(&engine->active.lock, flags);
3698 }
3699 
3700 static void execlists_reset_finish(struct intel_engine_cs *engine)
3701 {
3702 	struct intel_engine_execlists * const execlists = &engine->execlists;
3703 
3704 	/*
3705 	 * After a GPU reset, we may have requests to replay. Do so now while
3706 	 * we still have the forcewake to be sure that the GPU is not allowed
3707 	 * to sleep before we restart and reload a context.
3708 	 */
3709 	GEM_BUG_ON(!reset_in_progress(execlists));
3710 	if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
3711 		execlists->tasklet.func(execlists->tasklet.data);
3712 
3713 	if (__tasklet_enable(&execlists->tasklet))
3714 		/* And kick in case we missed a new request submission. */
3715 		tasklet_hi_schedule(&execlists->tasklet);
3716 	ENGINE_TRACE(engine, "depth->%d\n",
3717 		     atomic_read(&execlists->tasklet.count));
3718 }
3719 
3720 static int gen8_emit_bb_start_noarb(struct i915_request *rq,
3721 				    u64 offset, u32 len,
3722 				    const unsigned int flags)
3723 {
3724 	u32 *cs;
3725 
3726 	cs = intel_ring_begin(rq, 4);
3727 	if (IS_ERR(cs))
3728 		return PTR_ERR(cs);
3729 
3730 	/*
3731 	 * WaDisableCtxRestoreArbitration:bdw,chv
3732 	 *
3733 	 * We don't need to perform MI_ARB_ENABLE as often as we do (in
3734 	 * particular all the gen that do not need the w/a at all!), if we
3735 	 * took care to make sure that on every switch into this context
3736 	 * (both ordinary and for preemption) that arbitrartion was enabled
3737 	 * we would be fine.  However, for gen8 there is another w/a that
3738 	 * requires us to not preempt inside GPGPU execution, so we keep
3739 	 * arbitration disabled for gen8 batches. Arbitration will be
3740 	 * re-enabled before we close the request
3741 	 * (engine->emit_fini_breadcrumb).
3742 	 */
3743 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3744 
3745 	/* FIXME(BDW+): Address space and security selectors. */
3746 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
3747 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3748 	*cs++ = lower_32_bits(offset);
3749 	*cs++ = upper_32_bits(offset);
3750 
3751 	intel_ring_advance(rq, cs);
3752 
3753 	return 0;
3754 }
3755 
3756 static int gen8_emit_bb_start(struct i915_request *rq,
3757 			      u64 offset, u32 len,
3758 			      const unsigned int flags)
3759 {
3760 	u32 *cs;
3761 
3762 	cs = intel_ring_begin(rq, 6);
3763 	if (IS_ERR(cs))
3764 		return PTR_ERR(cs);
3765 
3766 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3767 
3768 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
3769 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3770 	*cs++ = lower_32_bits(offset);
3771 	*cs++ = upper_32_bits(offset);
3772 
3773 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3774 	*cs++ = MI_NOOP;
3775 
3776 	intel_ring_advance(rq, cs);
3777 
3778 	return 0;
3779 }
3780 
3781 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
3782 {
3783 	ENGINE_WRITE(engine, RING_IMR,
3784 		     ~(engine->irq_enable_mask | engine->irq_keep_mask));
3785 	ENGINE_POSTING_READ(engine, RING_IMR);
3786 }
3787 
3788 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
3789 {
3790 	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
3791 }
3792 
3793 static int gen8_emit_flush(struct i915_request *request, u32 mode)
3794 {
3795 	u32 cmd, *cs;
3796 
3797 	cs = intel_ring_begin(request, 4);
3798 	if (IS_ERR(cs))
3799 		return PTR_ERR(cs);
3800 
3801 	cmd = MI_FLUSH_DW + 1;
3802 
3803 	/* We always require a command barrier so that subsequent
3804 	 * commands, such as breadcrumb interrupts, are strictly ordered
3805 	 * wrt the contents of the write cache being flushed to memory
3806 	 * (and thus being coherent from the CPU).
3807 	 */
3808 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
3809 
3810 	if (mode & EMIT_INVALIDATE) {
3811 		cmd |= MI_INVALIDATE_TLB;
3812 		if (request->engine->class == VIDEO_DECODE_CLASS)
3813 			cmd |= MI_INVALIDATE_BSD;
3814 	}
3815 
3816 	*cs++ = cmd;
3817 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
3818 	*cs++ = 0; /* upper addr */
3819 	*cs++ = 0; /* value */
3820 	intel_ring_advance(request, cs);
3821 
3822 	return 0;
3823 }
3824 
3825 static int gen8_emit_flush_render(struct i915_request *request,
3826 				  u32 mode)
3827 {
3828 	bool vf_flush_wa = false, dc_flush_wa = false;
3829 	u32 *cs, flags = 0;
3830 	int len;
3831 
3832 	flags |= PIPE_CONTROL_CS_STALL;
3833 
3834 	if (mode & EMIT_FLUSH) {
3835 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3836 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3837 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3838 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
3839 	}
3840 
3841 	if (mode & EMIT_INVALIDATE) {
3842 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
3843 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3844 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3845 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3846 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3847 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3848 		flags |= PIPE_CONTROL_QW_WRITE;
3849 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3850 
3851 		/*
3852 		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
3853 		 * pipe control.
3854 		 */
3855 		if (IS_GEN(request->i915, 9))
3856 			vf_flush_wa = true;
3857 
3858 		/* WaForGAMHang:kbl */
3859 		if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
3860 			dc_flush_wa = true;
3861 	}
3862 
3863 	len = 6;
3864 
3865 	if (vf_flush_wa)
3866 		len += 6;
3867 
3868 	if (dc_flush_wa)
3869 		len += 12;
3870 
3871 	cs = intel_ring_begin(request, len);
3872 	if (IS_ERR(cs))
3873 		return PTR_ERR(cs);
3874 
3875 	if (vf_flush_wa)
3876 		cs = gen8_emit_pipe_control(cs, 0, 0);
3877 
3878 	if (dc_flush_wa)
3879 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
3880 					    0);
3881 
3882 	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3883 
3884 	if (dc_flush_wa)
3885 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
3886 
3887 	intel_ring_advance(request, cs);
3888 
3889 	return 0;
3890 }
3891 
3892 static int gen11_emit_flush_render(struct i915_request *request,
3893 				   u32 mode)
3894 {
3895 	if (mode & EMIT_FLUSH) {
3896 		u32 *cs;
3897 		u32 flags = 0;
3898 
3899 		flags |= PIPE_CONTROL_CS_STALL;
3900 
3901 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
3902 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3903 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3904 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3905 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
3906 		flags |= PIPE_CONTROL_QW_WRITE;
3907 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3908 
3909 		cs = intel_ring_begin(request, 6);
3910 		if (IS_ERR(cs))
3911 			return PTR_ERR(cs);
3912 
3913 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3914 		intel_ring_advance(request, cs);
3915 	}
3916 
3917 	if (mode & EMIT_INVALIDATE) {
3918 		u32 *cs;
3919 		u32 flags = 0;
3920 
3921 		flags |= PIPE_CONTROL_CS_STALL;
3922 
3923 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
3924 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
3925 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3926 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3927 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3928 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3929 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3930 		flags |= PIPE_CONTROL_QW_WRITE;
3931 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3932 
3933 		cs = intel_ring_begin(request, 6);
3934 		if (IS_ERR(cs))
3935 			return PTR_ERR(cs);
3936 
3937 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3938 		intel_ring_advance(request, cs);
3939 	}
3940 
3941 	return 0;
3942 }
3943 
3944 static u32 preparser_disable(bool state)
3945 {
3946 	return MI_ARB_CHECK | 1 << 8 | state;
3947 }
3948 
3949 static int gen12_emit_flush_render(struct i915_request *request,
3950 				   u32 mode)
3951 {
3952 	if (mode & EMIT_FLUSH) {
3953 		u32 flags = 0;
3954 		u32 *cs;
3955 
3956 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
3957 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3958 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3959 		/* Wa_1409600907:tgl */
3960 		flags |= PIPE_CONTROL_DEPTH_STALL;
3961 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3962 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
3963 		flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
3964 
3965 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3966 		flags |= PIPE_CONTROL_QW_WRITE;
3967 
3968 		flags |= PIPE_CONTROL_CS_STALL;
3969 
3970 		cs = intel_ring_begin(request, 6);
3971 		if (IS_ERR(cs))
3972 			return PTR_ERR(cs);
3973 
3974 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3975 		intel_ring_advance(request, cs);
3976 	}
3977 
3978 	if (mode & EMIT_INVALIDATE) {
3979 		u32 flags = 0;
3980 		u32 *cs;
3981 
3982 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
3983 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
3984 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3985 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3986 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3987 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3988 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3989 		flags |= PIPE_CONTROL_L3_RO_CACHE_INVALIDATE;
3990 
3991 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3992 		flags |= PIPE_CONTROL_QW_WRITE;
3993 
3994 		flags |= PIPE_CONTROL_CS_STALL;
3995 
3996 		cs = intel_ring_begin(request, 8);
3997 		if (IS_ERR(cs))
3998 			return PTR_ERR(cs);
3999 
4000 		/*
4001 		 * Prevent the pre-parser from skipping past the TLB
4002 		 * invalidate and loading a stale page for the batch
4003 		 * buffer / request payload.
4004 		 */
4005 		*cs++ = preparser_disable(true);
4006 
4007 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4008 
4009 		*cs++ = preparser_disable(false);
4010 		intel_ring_advance(request, cs);
4011 
4012 		/*
4013 		 * Wa_1604544889:tgl
4014 		 */
4015 		if (IS_TGL_REVID(request->i915, TGL_REVID_A0, TGL_REVID_A0)) {
4016 			flags = 0;
4017 			flags |= PIPE_CONTROL_CS_STALL;
4018 			flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
4019 
4020 			flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4021 			flags |= PIPE_CONTROL_QW_WRITE;
4022 
4023 			cs = intel_ring_begin(request, 6);
4024 			if (IS_ERR(cs))
4025 				return PTR_ERR(cs);
4026 
4027 			cs = gen8_emit_pipe_control(cs, flags,
4028 						    LRC_PPHWSP_SCRATCH_ADDR);
4029 			intel_ring_advance(request, cs);
4030 		}
4031 	}
4032 
4033 	return 0;
4034 }
4035 
4036 /*
4037  * Reserve space for 2 NOOPs at the end of each request to be
4038  * used as a workaround for not being allowed to do lite
4039  * restore with HEAD==TAIL (WaIdleLiteRestore).
4040  */
4041 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4042 {
4043 	/* Ensure there's always at least one preemption point per-request. */
4044 	*cs++ = MI_ARB_CHECK;
4045 	*cs++ = MI_NOOP;
4046 	request->wa_tail = intel_ring_offset(request, cs);
4047 
4048 	return cs;
4049 }
4050 
4051 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
4052 {
4053 	*cs++ = MI_SEMAPHORE_WAIT |
4054 		MI_SEMAPHORE_GLOBAL_GTT |
4055 		MI_SEMAPHORE_POLL |
4056 		MI_SEMAPHORE_SAD_EQ_SDD;
4057 	*cs++ = 0;
4058 	*cs++ = intel_hws_preempt_address(request->engine);
4059 	*cs++ = 0;
4060 
4061 	return cs;
4062 }
4063 
4064 static __always_inline u32*
4065 gen8_emit_fini_breadcrumb_footer(struct i915_request *request,
4066 				 u32 *cs)
4067 {
4068 	*cs++ = MI_USER_INTERRUPT;
4069 
4070 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4071 	if (intel_engine_has_semaphores(request->engine))
4072 		cs = emit_preempt_busywait(request, cs);
4073 
4074 	request->tail = intel_ring_offset(request, cs);
4075 	assert_ring_tail_valid(request->ring, request->tail);
4076 
4077 	return gen8_emit_wa_tail(request, cs);
4078 }
4079 
4080 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
4081 {
4082 	cs = gen8_emit_ggtt_write(cs,
4083 				  request->fence.seqno,
4084 				  i915_request_active_timeline(request)->hwsp_offset,
4085 				  0);
4086 
4087 	return gen8_emit_fini_breadcrumb_footer(request, cs);
4088 }
4089 
4090 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4091 {
4092 	cs = gen8_emit_pipe_control(cs,
4093 				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4094 				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4095 				    PIPE_CONTROL_DC_FLUSH_ENABLE,
4096 				    0);
4097 
4098 	/* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
4099 	cs = gen8_emit_ggtt_write_rcs(cs,
4100 				      request->fence.seqno,
4101 				      i915_request_active_timeline(request)->hwsp_offset,
4102 				      PIPE_CONTROL_FLUSH_ENABLE |
4103 				      PIPE_CONTROL_CS_STALL);
4104 
4105 	return gen8_emit_fini_breadcrumb_footer(request, cs);
4106 }
4107 
4108 static u32 *
4109 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4110 {
4111 	cs = gen8_emit_ggtt_write_rcs(cs,
4112 				      request->fence.seqno,
4113 				      i915_request_active_timeline(request)->hwsp_offset,
4114 				      PIPE_CONTROL_CS_STALL |
4115 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
4116 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4117 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4118 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
4119 				      PIPE_CONTROL_FLUSH_ENABLE);
4120 
4121 	return gen8_emit_fini_breadcrumb_footer(request, cs);
4122 }
4123 
4124 /*
4125  * Note that the CS instruction pre-parser will not stall on the breadcrumb
4126  * flush and will continue pre-fetching the instructions after it before the
4127  * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
4128  * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
4129  * of the next request before the memory has been flushed, we're guaranteed that
4130  * we won't access the batch itself too early.
4131  * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
4132  * so, if the current request is modifying an instruction in the next request on
4133  * the same intel_context, we might pre-fetch and then execute the pre-update
4134  * instruction. To avoid this, the users of self-modifying code should either
4135  * disable the parser around the code emitting the memory writes, via a new flag
4136  * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
4137  * the in-kernel use-cases we've opted to use a separate context, see
4138  * reloc_gpu() as an example.
4139  * All the above applies only to the instructions themselves. Non-inline data
4140  * used by the instructions is not pre-fetched.
4141  */
4142 
4143 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
4144 {
4145 	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
4146 		MI_SEMAPHORE_GLOBAL_GTT |
4147 		MI_SEMAPHORE_POLL |
4148 		MI_SEMAPHORE_SAD_EQ_SDD;
4149 	*cs++ = 0;
4150 	*cs++ = intel_hws_preempt_address(request->engine);
4151 	*cs++ = 0;
4152 	*cs++ = 0;
4153 	*cs++ = MI_NOOP;
4154 
4155 	return cs;
4156 }
4157 
4158 static __always_inline u32*
4159 gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs)
4160 {
4161 	*cs++ = MI_USER_INTERRUPT;
4162 
4163 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4164 	if (intel_engine_has_semaphores(request->engine))
4165 		cs = gen12_emit_preempt_busywait(request, cs);
4166 
4167 	request->tail = intel_ring_offset(request, cs);
4168 	assert_ring_tail_valid(request->ring, request->tail);
4169 
4170 	return gen8_emit_wa_tail(request, cs);
4171 }
4172 
4173 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
4174 {
4175 	cs = gen8_emit_ggtt_write(cs,
4176 				  request->fence.seqno,
4177 				  i915_request_active_timeline(request)->hwsp_offset,
4178 				  0);
4179 
4180 	return gen12_emit_fini_breadcrumb_footer(request, cs);
4181 }
4182 
4183 static u32 *
4184 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4185 {
4186 	cs = gen8_emit_ggtt_write_rcs(cs,
4187 				      request->fence.seqno,
4188 				      i915_request_active_timeline(request)->hwsp_offset,
4189 				      PIPE_CONTROL_CS_STALL |
4190 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
4191 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4192 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4193 				      /* Wa_1409600907:tgl */
4194 				      PIPE_CONTROL_DEPTH_STALL |
4195 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
4196 				      PIPE_CONTROL_FLUSH_ENABLE |
4197 				      PIPE_CONTROL_HDC_PIPELINE_FLUSH);
4198 
4199 	return gen12_emit_fini_breadcrumb_footer(request, cs);
4200 }
4201 
4202 static void execlists_park(struct intel_engine_cs *engine)
4203 {
4204 	cancel_timer(&engine->execlists.timer);
4205 	cancel_timer(&engine->execlists.preempt);
4206 }
4207 
4208 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
4209 {
4210 	engine->submit_request = execlists_submit_request;
4211 	engine->schedule = i915_schedule;
4212 	engine->execlists.tasklet.func = execlists_submission_tasklet;
4213 
4214 	engine->reset.prepare = execlists_reset_prepare;
4215 	engine->reset.rewind = execlists_reset_rewind;
4216 	engine->reset.cancel = execlists_reset_cancel;
4217 	engine->reset.finish = execlists_reset_finish;
4218 
4219 	engine->park = execlists_park;
4220 	engine->unpark = NULL;
4221 
4222 	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
4223 	if (!intel_vgpu_active(engine->i915)) {
4224 		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
4225 		if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
4226 			engine->flags |= I915_ENGINE_HAS_PREEMPTION;
4227 	}
4228 
4229 	if (INTEL_GEN(engine->i915) >= 12)
4230 		engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
4231 
4232 	if (intel_engine_has_preemption(engine))
4233 		engine->emit_bb_start = gen8_emit_bb_start;
4234 	else
4235 		engine->emit_bb_start = gen8_emit_bb_start_noarb;
4236 }
4237 
4238 static void execlists_shutdown(struct intel_engine_cs *engine)
4239 {
4240 	/* Synchronise with residual timers and any softirq they raise */
4241 	del_timer_sync(&engine->execlists.timer);
4242 	del_timer_sync(&engine->execlists.preempt);
4243 	tasklet_kill(&engine->execlists.tasklet);
4244 }
4245 
4246 static void execlists_release(struct intel_engine_cs *engine)
4247 {
4248 	execlists_shutdown(engine);
4249 
4250 	intel_engine_cleanup_common(engine);
4251 	lrc_destroy_wa_ctx(engine);
4252 }
4253 
4254 static void
4255 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
4256 {
4257 	/* Default vfuncs which can be overriden by each engine. */
4258 
4259 	engine->resume = execlists_resume;
4260 
4261 	engine->cops = &execlists_context_ops;
4262 	engine->request_alloc = execlists_request_alloc;
4263 
4264 	engine->emit_flush = gen8_emit_flush;
4265 	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
4266 	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
4267 	if (INTEL_GEN(engine->i915) >= 12)
4268 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
4269 
4270 	engine->set_default_submission = intel_execlists_set_default_submission;
4271 
4272 	if (INTEL_GEN(engine->i915) < 11) {
4273 		engine->irq_enable = gen8_logical_ring_enable_irq;
4274 		engine->irq_disable = gen8_logical_ring_disable_irq;
4275 	} else {
4276 		/*
4277 		 * TODO: On Gen11 interrupt masks need to be clear
4278 		 * to allow C6 entry. Keep interrupts enabled at
4279 		 * and take the hit of generating extra interrupts
4280 		 * until a more refined solution exists.
4281 		 */
4282 	}
4283 }
4284 
4285 static inline void
4286 logical_ring_default_irqs(struct intel_engine_cs *engine)
4287 {
4288 	unsigned int shift = 0;
4289 
4290 	if (INTEL_GEN(engine->i915) < 11) {
4291 		const u8 irq_shifts[] = {
4292 			[RCS0]  = GEN8_RCS_IRQ_SHIFT,
4293 			[BCS0]  = GEN8_BCS_IRQ_SHIFT,
4294 			[VCS0]  = GEN8_VCS0_IRQ_SHIFT,
4295 			[VCS1]  = GEN8_VCS1_IRQ_SHIFT,
4296 			[VECS0] = GEN8_VECS_IRQ_SHIFT,
4297 		};
4298 
4299 		shift = irq_shifts[engine->id];
4300 	}
4301 
4302 	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
4303 	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
4304 }
4305 
4306 static void rcs_submission_override(struct intel_engine_cs *engine)
4307 {
4308 	switch (INTEL_GEN(engine->i915)) {
4309 	case 12:
4310 		engine->emit_flush = gen12_emit_flush_render;
4311 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
4312 		break;
4313 	case 11:
4314 		engine->emit_flush = gen11_emit_flush_render;
4315 		engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
4316 		break;
4317 	default:
4318 		engine->emit_flush = gen8_emit_flush_render;
4319 		engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
4320 		break;
4321 	}
4322 }
4323 
4324 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
4325 {
4326 	struct intel_engine_execlists * const execlists = &engine->execlists;
4327 	struct drm_i915_private *i915 = engine->i915;
4328 	struct intel_uncore *uncore = engine->uncore;
4329 	u32 base = engine->mmio_base;
4330 
4331 	tasklet_init(&engine->execlists.tasklet,
4332 		     execlists_submission_tasklet, (unsigned long)engine);
4333 	timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
4334 	timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
4335 
4336 	logical_ring_default_vfuncs(engine);
4337 	logical_ring_default_irqs(engine);
4338 
4339 	if (engine->class == RENDER_CLASS)
4340 		rcs_submission_override(engine);
4341 
4342 	if (intel_init_workaround_bb(engine))
4343 		/*
4344 		 * We continue even if we fail to initialize WA batch
4345 		 * because we only expect rare glitches but nothing
4346 		 * critical to prevent us from using GPU
4347 		 */
4348 		DRM_ERROR("WA batch buffer initialization failed\n");
4349 
4350 	if (HAS_LOGICAL_RING_ELSQ(i915)) {
4351 		execlists->submit_reg = uncore->regs +
4352 			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
4353 		execlists->ctrl_reg = uncore->regs +
4354 			i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
4355 	} else {
4356 		execlists->submit_reg = uncore->regs +
4357 			i915_mmio_reg_offset(RING_ELSP(base));
4358 	}
4359 
4360 	execlists->csb_status =
4361 		&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
4362 
4363 	execlists->csb_write =
4364 		&engine->status_page.addr[intel_hws_csb_write_index(i915)];
4365 
4366 	if (INTEL_GEN(i915) < 11)
4367 		execlists->csb_size = GEN8_CSB_ENTRIES;
4368 	else
4369 		execlists->csb_size = GEN11_CSB_ENTRIES;
4370 
4371 	reset_csb_pointers(engine);
4372 
4373 	/* Finally, take ownership and responsibility for cleanup! */
4374 	engine->release = execlists_release;
4375 
4376 	return 0;
4377 }
4378 
4379 static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine)
4380 {
4381 	u32 indirect_ctx_offset;
4382 
4383 	switch (INTEL_GEN(engine->i915)) {
4384 	default:
4385 		MISSING_CASE(INTEL_GEN(engine->i915));
4386 		/* fall through */
4387 	case 12:
4388 		indirect_ctx_offset =
4389 			GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4390 		break;
4391 	case 11:
4392 		indirect_ctx_offset =
4393 			GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4394 		break;
4395 	case 10:
4396 		indirect_ctx_offset =
4397 			GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4398 		break;
4399 	case 9:
4400 		indirect_ctx_offset =
4401 			GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4402 		break;
4403 	case 8:
4404 		indirect_ctx_offset =
4405 			GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4406 		break;
4407 	}
4408 
4409 	return indirect_ctx_offset;
4410 }
4411 
4412 
4413 static void init_common_reg_state(u32 * const regs,
4414 				  const struct intel_engine_cs *engine,
4415 				  const struct intel_ring *ring,
4416 				  bool inhibit)
4417 {
4418 	u32 ctl;
4419 
4420 	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
4421 	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
4422 	if (inhibit)
4423 		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
4424 	if (INTEL_GEN(engine->i915) < 11)
4425 		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
4426 					   CTX_CTRL_RS_CTX_ENABLE);
4427 	regs[CTX_CONTEXT_CONTROL] = ctl;
4428 
4429 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
4430 }
4431 
4432 static void init_wa_bb_reg_state(u32 * const regs,
4433 				 const struct intel_engine_cs *engine,
4434 				 u32 pos_bb_per_ctx)
4435 {
4436 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
4437 
4438 	if (wa_ctx->per_ctx.size) {
4439 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
4440 
4441 		regs[pos_bb_per_ctx] =
4442 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
4443 	}
4444 
4445 	if (wa_ctx->indirect_ctx.size) {
4446 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
4447 
4448 		regs[pos_bb_per_ctx + 2] =
4449 			(ggtt_offset + wa_ctx->indirect_ctx.offset) |
4450 			(wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
4451 
4452 		regs[pos_bb_per_ctx + 4] =
4453 			intel_lr_indirect_ctx_offset(engine) << 6;
4454 	}
4455 }
4456 
4457 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
4458 {
4459 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
4460 		/* 64b PPGTT (48bit canonical)
4461 		 * PDP0_DESCRIPTOR contains the base address to PML4 and
4462 		 * other PDP Descriptors are ignored.
4463 		 */
4464 		ASSIGN_CTX_PML4(ppgtt, regs);
4465 	} else {
4466 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
4467 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
4468 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
4469 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
4470 	}
4471 }
4472 
4473 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
4474 {
4475 	if (i915_is_ggtt(vm))
4476 		return i915_vm_to_ggtt(vm)->alias;
4477 	else
4478 		return i915_vm_to_ppgtt(vm);
4479 }
4480 
4481 static void execlists_init_reg_state(u32 *regs,
4482 				     const struct intel_context *ce,
4483 				     const struct intel_engine_cs *engine,
4484 				     const struct intel_ring *ring,
4485 				     bool inhibit)
4486 {
4487 	/*
4488 	 * A context is actually a big batch buffer with several
4489 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
4490 	 * values we are setting here are only for the first context restore:
4491 	 * on a subsequent save, the GPU will recreate this batchbuffer with new
4492 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
4493 	 * we are not initializing here).
4494 	 *
4495 	 * Must keep consistent with virtual_update_register_offsets().
4496 	 */
4497 	set_offsets(regs, reg_offsets(engine), engine, inhibit);
4498 
4499 	init_common_reg_state(regs, engine, ring, inhibit);
4500 	init_ppgtt_reg_state(regs, vm_alias(ce->vm));
4501 
4502 	init_wa_bb_reg_state(regs, engine,
4503 			     INTEL_GEN(engine->i915) >= 12 ?
4504 			     GEN12_CTX_BB_PER_CTX_PTR :
4505 			     CTX_BB_PER_CTX_PTR);
4506 
4507 	__reset_stop_ring(regs, engine);
4508 }
4509 
4510 static int
4511 populate_lr_context(struct intel_context *ce,
4512 		    struct drm_i915_gem_object *ctx_obj,
4513 		    struct intel_engine_cs *engine,
4514 		    struct intel_ring *ring)
4515 {
4516 	bool inhibit = true;
4517 	void *vaddr;
4518 	int ret;
4519 
4520 	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
4521 	if (IS_ERR(vaddr)) {
4522 		ret = PTR_ERR(vaddr);
4523 		DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
4524 		return ret;
4525 	}
4526 
4527 	set_redzone(vaddr, engine);
4528 
4529 	if (engine->default_state) {
4530 		void *defaults;
4531 
4532 		defaults = i915_gem_object_pin_map(engine->default_state,
4533 						   I915_MAP_WB);
4534 		if (IS_ERR(defaults)) {
4535 			ret = PTR_ERR(defaults);
4536 			goto err_unpin_ctx;
4537 		}
4538 
4539 		memcpy(vaddr, defaults, engine->context_size);
4540 		i915_gem_object_unpin_map(engine->default_state);
4541 		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
4542 		inhibit = false;
4543 	}
4544 
4545 	/* The second page of the context object contains some fields which must
4546 	 * be set up prior to the first execution. */
4547 	execlists_init_reg_state(vaddr + LRC_STATE_PN * PAGE_SIZE,
4548 				 ce, engine, ring, inhibit);
4549 
4550 	ret = 0;
4551 err_unpin_ctx:
4552 	__i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
4553 	i915_gem_object_unpin_map(ctx_obj);
4554 	return ret;
4555 }
4556 
4557 static int __execlists_context_alloc(struct intel_context *ce,
4558 				     struct intel_engine_cs *engine)
4559 {
4560 	struct drm_i915_gem_object *ctx_obj;
4561 	struct intel_ring *ring;
4562 	struct i915_vma *vma;
4563 	u32 context_size;
4564 	int ret;
4565 
4566 	GEM_BUG_ON(ce->state);
4567 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
4568 
4569 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4570 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
4571 
4572 	ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
4573 	if (IS_ERR(ctx_obj))
4574 		return PTR_ERR(ctx_obj);
4575 
4576 	vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
4577 	if (IS_ERR(vma)) {
4578 		ret = PTR_ERR(vma);
4579 		goto error_deref_obj;
4580 	}
4581 
4582 	if (!ce->timeline) {
4583 		struct intel_timeline *tl;
4584 
4585 		tl = intel_timeline_create(engine->gt, NULL);
4586 		if (IS_ERR(tl)) {
4587 			ret = PTR_ERR(tl);
4588 			goto error_deref_obj;
4589 		}
4590 
4591 		ce->timeline = tl;
4592 	}
4593 
4594 	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
4595 	if (IS_ERR(ring)) {
4596 		ret = PTR_ERR(ring);
4597 		goto error_deref_obj;
4598 	}
4599 
4600 	ret = populate_lr_context(ce, ctx_obj, engine, ring);
4601 	if (ret) {
4602 		DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
4603 		goto error_ring_free;
4604 	}
4605 
4606 	ce->ring = ring;
4607 	ce->state = vma;
4608 
4609 	return 0;
4610 
4611 error_ring_free:
4612 	intel_ring_put(ring);
4613 error_deref_obj:
4614 	i915_gem_object_put(ctx_obj);
4615 	return ret;
4616 }
4617 
4618 static struct list_head *virtual_queue(struct virtual_engine *ve)
4619 {
4620 	return &ve->base.execlists.default_priolist.requests[0];
4621 }
4622 
4623 static void virtual_context_destroy(struct kref *kref)
4624 {
4625 	struct virtual_engine *ve =
4626 		container_of(kref, typeof(*ve), context.ref);
4627 	unsigned int n;
4628 
4629 	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
4630 	GEM_BUG_ON(ve->request);
4631 	GEM_BUG_ON(ve->context.inflight);
4632 
4633 	for (n = 0; n < ve->num_siblings; n++) {
4634 		struct intel_engine_cs *sibling = ve->siblings[n];
4635 		struct rb_node *node = &ve->nodes[sibling->id].rb;
4636 		unsigned long flags;
4637 
4638 		if (RB_EMPTY_NODE(node))
4639 			continue;
4640 
4641 		spin_lock_irqsave(&sibling->active.lock, flags);
4642 
4643 		/* Detachment is lazily performed in the execlists tasklet */
4644 		if (!RB_EMPTY_NODE(node))
4645 			rb_erase_cached(node, &sibling->execlists.virtual);
4646 
4647 		spin_unlock_irqrestore(&sibling->active.lock, flags);
4648 	}
4649 	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
4650 
4651 	if (ve->context.state)
4652 		__execlists_context_fini(&ve->context);
4653 	intel_context_fini(&ve->context);
4654 
4655 	kfree(ve->bonds);
4656 	kfree(ve);
4657 }
4658 
4659 static void virtual_engine_initial_hint(struct virtual_engine *ve)
4660 {
4661 	int swp;
4662 
4663 	/*
4664 	 * Pick a random sibling on starting to help spread the load around.
4665 	 *
4666 	 * New contexts are typically created with exactly the same order
4667 	 * of siblings, and often started in batches. Due to the way we iterate
4668 	 * the array of sibling when submitting requests, sibling[0] is
4669 	 * prioritised for dequeuing. If we make sure that sibling[0] is fairly
4670 	 * randomised across the system, we also help spread the load by the
4671 	 * first engine we inspect being different each time.
4672 	 *
4673 	 * NB This does not force us to execute on this engine, it will just
4674 	 * typically be the first we inspect for submission.
4675 	 */
4676 	swp = prandom_u32_max(ve->num_siblings);
4677 	if (!swp)
4678 		return;
4679 
4680 	swap(ve->siblings[swp], ve->siblings[0]);
4681 	if (!intel_engine_has_relative_mmio(ve->siblings[0]))
4682 		virtual_update_register_offsets(ve->context.lrc_reg_state,
4683 						ve->siblings[0]);
4684 }
4685 
4686 static int virtual_context_alloc(struct intel_context *ce)
4687 {
4688 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4689 
4690 	return __execlists_context_alloc(ce, ve->siblings[0]);
4691 }
4692 
4693 static int virtual_context_pin(struct intel_context *ce)
4694 {
4695 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4696 	int err;
4697 
4698 	/* Note: we must use a real engine class for setting up reg state */
4699 	err = __execlists_context_pin(ce, ve->siblings[0]);
4700 	if (err)
4701 		return err;
4702 
4703 	virtual_engine_initial_hint(ve);
4704 	return 0;
4705 }
4706 
4707 static void virtual_context_enter(struct intel_context *ce)
4708 {
4709 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4710 	unsigned int n;
4711 
4712 	for (n = 0; n < ve->num_siblings; n++)
4713 		intel_engine_pm_get(ve->siblings[n]);
4714 
4715 	intel_timeline_enter(ce->timeline);
4716 }
4717 
4718 static void virtual_context_exit(struct intel_context *ce)
4719 {
4720 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4721 	unsigned int n;
4722 
4723 	intel_timeline_exit(ce->timeline);
4724 
4725 	for (n = 0; n < ve->num_siblings; n++)
4726 		intel_engine_pm_put(ve->siblings[n]);
4727 }
4728 
4729 static const struct intel_context_ops virtual_context_ops = {
4730 	.alloc = virtual_context_alloc,
4731 
4732 	.pin = virtual_context_pin,
4733 	.unpin = execlists_context_unpin,
4734 
4735 	.enter = virtual_context_enter,
4736 	.exit = virtual_context_exit,
4737 
4738 	.destroy = virtual_context_destroy,
4739 };
4740 
4741 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
4742 {
4743 	struct i915_request *rq;
4744 	intel_engine_mask_t mask;
4745 
4746 	rq = READ_ONCE(ve->request);
4747 	if (!rq)
4748 		return 0;
4749 
4750 	/* The rq is ready for submission; rq->execution_mask is now stable. */
4751 	mask = rq->execution_mask;
4752 	if (unlikely(!mask)) {
4753 		/* Invalid selection, submit to a random engine in error */
4754 		i915_request_skip(rq, -ENODEV);
4755 		mask = ve->siblings[0]->mask;
4756 	}
4757 
4758 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
4759 		     rq->fence.context, rq->fence.seqno,
4760 		     mask, ve->base.execlists.queue_priority_hint);
4761 
4762 	return mask;
4763 }
4764 
4765 static void virtual_submission_tasklet(unsigned long data)
4766 {
4767 	struct virtual_engine * const ve = (struct virtual_engine *)data;
4768 	const int prio = ve->base.execlists.queue_priority_hint;
4769 	intel_engine_mask_t mask;
4770 	unsigned int n;
4771 
4772 	rcu_read_lock();
4773 	mask = virtual_submission_mask(ve);
4774 	rcu_read_unlock();
4775 	if (unlikely(!mask))
4776 		return;
4777 
4778 	local_irq_disable();
4779 	for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
4780 		struct intel_engine_cs *sibling = ve->siblings[n];
4781 		struct ve_node * const node = &ve->nodes[sibling->id];
4782 		struct rb_node **parent, *rb;
4783 		bool first;
4784 
4785 		if (unlikely(!(mask & sibling->mask))) {
4786 			if (!RB_EMPTY_NODE(&node->rb)) {
4787 				spin_lock(&sibling->active.lock);
4788 				rb_erase_cached(&node->rb,
4789 						&sibling->execlists.virtual);
4790 				RB_CLEAR_NODE(&node->rb);
4791 				spin_unlock(&sibling->active.lock);
4792 			}
4793 			continue;
4794 		}
4795 
4796 		spin_lock(&sibling->active.lock);
4797 
4798 		if (!RB_EMPTY_NODE(&node->rb)) {
4799 			/*
4800 			 * Cheat and avoid rebalancing the tree if we can
4801 			 * reuse this node in situ.
4802 			 */
4803 			first = rb_first_cached(&sibling->execlists.virtual) ==
4804 				&node->rb;
4805 			if (prio == node->prio || (prio > node->prio && first))
4806 				goto submit_engine;
4807 
4808 			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
4809 		}
4810 
4811 		rb = NULL;
4812 		first = true;
4813 		parent = &sibling->execlists.virtual.rb_root.rb_node;
4814 		while (*parent) {
4815 			struct ve_node *other;
4816 
4817 			rb = *parent;
4818 			other = rb_entry(rb, typeof(*other), rb);
4819 			if (prio > other->prio) {
4820 				parent = &rb->rb_left;
4821 			} else {
4822 				parent = &rb->rb_right;
4823 				first = false;
4824 			}
4825 		}
4826 
4827 		rb_link_node(&node->rb, rb, parent);
4828 		rb_insert_color_cached(&node->rb,
4829 				       &sibling->execlists.virtual,
4830 				       first);
4831 
4832 submit_engine:
4833 		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
4834 		node->prio = prio;
4835 		if (first && prio > sibling->execlists.queue_priority_hint) {
4836 			sibling->execlists.queue_priority_hint = prio;
4837 			tasklet_hi_schedule(&sibling->execlists.tasklet);
4838 		}
4839 
4840 		spin_unlock(&sibling->active.lock);
4841 	}
4842 	local_irq_enable();
4843 }
4844 
4845 static void virtual_submit_request(struct i915_request *rq)
4846 {
4847 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
4848 	struct i915_request *old;
4849 	unsigned long flags;
4850 
4851 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
4852 		     rq->fence.context,
4853 		     rq->fence.seqno);
4854 
4855 	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
4856 
4857 	spin_lock_irqsave(&ve->base.active.lock, flags);
4858 
4859 	old = ve->request;
4860 	if (old) { /* background completion event from preempt-to-busy */
4861 		GEM_BUG_ON(!i915_request_completed(old));
4862 		__i915_request_submit(old);
4863 		i915_request_put(old);
4864 	}
4865 
4866 	if (i915_request_completed(rq)) {
4867 		__i915_request_submit(rq);
4868 
4869 		ve->base.execlists.queue_priority_hint = INT_MIN;
4870 		ve->request = NULL;
4871 	} else {
4872 		ve->base.execlists.queue_priority_hint = rq_prio(rq);
4873 		ve->request = i915_request_get(rq);
4874 
4875 		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
4876 		list_move_tail(&rq->sched.link, virtual_queue(ve));
4877 
4878 		tasklet_schedule(&ve->base.execlists.tasklet);
4879 	}
4880 
4881 	spin_unlock_irqrestore(&ve->base.active.lock, flags);
4882 }
4883 
4884 static struct ve_bond *
4885 virtual_find_bond(struct virtual_engine *ve,
4886 		  const struct intel_engine_cs *master)
4887 {
4888 	int i;
4889 
4890 	for (i = 0; i < ve->num_bonds; i++) {
4891 		if (ve->bonds[i].master == master)
4892 			return &ve->bonds[i];
4893 	}
4894 
4895 	return NULL;
4896 }
4897 
4898 static void
4899 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
4900 {
4901 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
4902 	intel_engine_mask_t allowed, exec;
4903 	struct ve_bond *bond;
4904 
4905 	allowed = ~to_request(signal)->engine->mask;
4906 
4907 	bond = virtual_find_bond(ve, to_request(signal)->engine);
4908 	if (bond)
4909 		allowed &= bond->sibling_mask;
4910 
4911 	/* Restrict the bonded request to run on only the available engines */
4912 	exec = READ_ONCE(rq->execution_mask);
4913 	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
4914 		;
4915 
4916 	/* Prevent the master from being re-run on the bonded engines */
4917 	to_request(signal)->execution_mask &= ~allowed;
4918 }
4919 
4920 struct intel_context *
4921 intel_execlists_create_virtual(struct intel_engine_cs **siblings,
4922 			       unsigned int count)
4923 {
4924 	struct virtual_engine *ve;
4925 	unsigned int n;
4926 	int err;
4927 
4928 	if (count == 0)
4929 		return ERR_PTR(-EINVAL);
4930 
4931 	if (count == 1)
4932 		return intel_context_create(siblings[0]);
4933 
4934 	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
4935 	if (!ve)
4936 		return ERR_PTR(-ENOMEM);
4937 
4938 	ve->base.i915 = siblings[0]->i915;
4939 	ve->base.gt = siblings[0]->gt;
4940 	ve->base.uncore = siblings[0]->uncore;
4941 	ve->base.id = -1;
4942 
4943 	ve->base.class = OTHER_CLASS;
4944 	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
4945 	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
4946 	ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
4947 
4948 	/*
4949 	 * The decision on whether to submit a request using semaphores
4950 	 * depends on the saturated state of the engine. We only compute
4951 	 * this during HW submission of the request, and we need for this
4952 	 * state to be globally applied to all requests being submitted
4953 	 * to this engine. Virtual engines encompass more than one physical
4954 	 * engine and so we cannot accurately tell in advance if one of those
4955 	 * engines is already saturated and so cannot afford to use a semaphore
4956 	 * and be pessimized in priority for doing so -- if we are the only
4957 	 * context using semaphores after all other clients have stopped, we
4958 	 * will be starved on the saturated system. Such a global switch for
4959 	 * semaphores is less than ideal, but alas is the current compromise.
4960 	 */
4961 	ve->base.saturated = ALL_ENGINES;
4962 
4963 	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
4964 
4965 	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
4966 	intel_engine_init_breadcrumbs(&ve->base);
4967 	intel_engine_init_execlists(&ve->base);
4968 
4969 	ve->base.cops = &virtual_context_ops;
4970 	ve->base.request_alloc = execlists_request_alloc;
4971 
4972 	ve->base.schedule = i915_schedule;
4973 	ve->base.submit_request = virtual_submit_request;
4974 	ve->base.bond_execute = virtual_bond_execute;
4975 
4976 	INIT_LIST_HEAD(virtual_queue(ve));
4977 	ve->base.execlists.queue_priority_hint = INT_MIN;
4978 	tasklet_init(&ve->base.execlists.tasklet,
4979 		     virtual_submission_tasklet,
4980 		     (unsigned long)ve);
4981 
4982 	intel_context_init(&ve->context, &ve->base);
4983 
4984 	for (n = 0; n < count; n++) {
4985 		struct intel_engine_cs *sibling = siblings[n];
4986 
4987 		GEM_BUG_ON(!is_power_of_2(sibling->mask));
4988 		if (sibling->mask & ve->base.mask) {
4989 			DRM_DEBUG("duplicate %s entry in load balancer\n",
4990 				  sibling->name);
4991 			err = -EINVAL;
4992 			goto err_put;
4993 		}
4994 
4995 		/*
4996 		 * The virtual engine implementation is tightly coupled to
4997 		 * the execlists backend -- we push out request directly
4998 		 * into a tree inside each physical engine. We could support
4999 		 * layering if we handle cloning of the requests and
5000 		 * submitting a copy into each backend.
5001 		 */
5002 		if (sibling->execlists.tasklet.func !=
5003 		    execlists_submission_tasklet) {
5004 			err = -ENODEV;
5005 			goto err_put;
5006 		}
5007 
5008 		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
5009 		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
5010 
5011 		ve->siblings[ve->num_siblings++] = sibling;
5012 		ve->base.mask |= sibling->mask;
5013 
5014 		/*
5015 		 * All physical engines must be compatible for their emission
5016 		 * functions (as we build the instructions during request
5017 		 * construction and do not alter them before submission
5018 		 * on the physical engine). We use the engine class as a guide
5019 		 * here, although that could be refined.
5020 		 */
5021 		if (ve->base.class != OTHER_CLASS) {
5022 			if (ve->base.class != sibling->class) {
5023 				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
5024 					  sibling->class, ve->base.class);
5025 				err = -EINVAL;
5026 				goto err_put;
5027 			}
5028 			continue;
5029 		}
5030 
5031 		ve->base.class = sibling->class;
5032 		ve->base.uabi_class = sibling->uabi_class;
5033 		snprintf(ve->base.name, sizeof(ve->base.name),
5034 			 "v%dx%d", ve->base.class, count);
5035 		ve->base.context_size = sibling->context_size;
5036 
5037 		ve->base.emit_bb_start = sibling->emit_bb_start;
5038 		ve->base.emit_flush = sibling->emit_flush;
5039 		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
5040 		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
5041 		ve->base.emit_fini_breadcrumb_dw =
5042 			sibling->emit_fini_breadcrumb_dw;
5043 
5044 		ve->base.flags = sibling->flags;
5045 	}
5046 
5047 	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
5048 
5049 	return &ve->context;
5050 
5051 err_put:
5052 	intel_context_put(&ve->context);
5053 	return ERR_PTR(err);
5054 }
5055 
5056 struct intel_context *
5057 intel_execlists_clone_virtual(struct intel_engine_cs *src)
5058 {
5059 	struct virtual_engine *se = to_virtual_engine(src);
5060 	struct intel_context *dst;
5061 
5062 	dst = intel_execlists_create_virtual(se->siblings,
5063 					     se->num_siblings);
5064 	if (IS_ERR(dst))
5065 		return dst;
5066 
5067 	if (se->num_bonds) {
5068 		struct virtual_engine *de = to_virtual_engine(dst->engine);
5069 
5070 		de->bonds = kmemdup(se->bonds,
5071 				    sizeof(*se->bonds) * se->num_bonds,
5072 				    GFP_KERNEL);
5073 		if (!de->bonds) {
5074 			intel_context_put(dst);
5075 			return ERR_PTR(-ENOMEM);
5076 		}
5077 
5078 		de->num_bonds = se->num_bonds;
5079 	}
5080 
5081 	return dst;
5082 }
5083 
5084 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
5085 				     const struct intel_engine_cs *master,
5086 				     const struct intel_engine_cs *sibling)
5087 {
5088 	struct virtual_engine *ve = to_virtual_engine(engine);
5089 	struct ve_bond *bond;
5090 	int n;
5091 
5092 	/* Sanity check the sibling is part of the virtual engine */
5093 	for (n = 0; n < ve->num_siblings; n++)
5094 		if (sibling == ve->siblings[n])
5095 			break;
5096 	if (n == ve->num_siblings)
5097 		return -EINVAL;
5098 
5099 	bond = virtual_find_bond(ve, master);
5100 	if (bond) {
5101 		bond->sibling_mask |= sibling->mask;
5102 		return 0;
5103 	}
5104 
5105 	bond = krealloc(ve->bonds,
5106 			sizeof(*bond) * (ve->num_bonds + 1),
5107 			GFP_KERNEL);
5108 	if (!bond)
5109 		return -ENOMEM;
5110 
5111 	bond[ve->num_bonds].master = master;
5112 	bond[ve->num_bonds].sibling_mask = sibling->mask;
5113 
5114 	ve->bonds = bond;
5115 	ve->num_bonds++;
5116 
5117 	return 0;
5118 }
5119 
5120 struct intel_engine_cs *
5121 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
5122 				 unsigned int sibling)
5123 {
5124 	struct virtual_engine *ve = to_virtual_engine(engine);
5125 
5126 	if (sibling >= ve->num_siblings)
5127 		return NULL;
5128 
5129 	return ve->siblings[sibling];
5130 }
5131 
5132 void intel_execlists_show_requests(struct intel_engine_cs *engine,
5133 				   struct drm_printer *m,
5134 				   void (*show_request)(struct drm_printer *m,
5135 							struct i915_request *rq,
5136 							const char *prefix),
5137 				   unsigned int max)
5138 {
5139 	const struct intel_engine_execlists *execlists = &engine->execlists;
5140 	struct i915_request *rq, *last;
5141 	unsigned long flags;
5142 	unsigned int count;
5143 	struct rb_node *rb;
5144 
5145 	spin_lock_irqsave(&engine->active.lock, flags);
5146 
5147 	last = NULL;
5148 	count = 0;
5149 	list_for_each_entry(rq, &engine->active.requests, sched.link) {
5150 		if (count++ < max - 1)
5151 			show_request(m, rq, "\t\tE ");
5152 		else
5153 			last = rq;
5154 	}
5155 	if (last) {
5156 		if (count > max) {
5157 			drm_printf(m,
5158 				   "\t\t...skipping %d executing requests...\n",
5159 				   count - max);
5160 		}
5161 		show_request(m, last, "\t\tE ");
5162 	}
5163 
5164 	last = NULL;
5165 	count = 0;
5166 	if (execlists->queue_priority_hint != INT_MIN)
5167 		drm_printf(m, "\t\tQueue priority hint: %d\n",
5168 			   execlists->queue_priority_hint);
5169 	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
5170 		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
5171 		int i;
5172 
5173 		priolist_for_each_request(rq, p, i) {
5174 			if (count++ < max - 1)
5175 				show_request(m, rq, "\t\tQ ");
5176 			else
5177 				last = rq;
5178 		}
5179 	}
5180 	if (last) {
5181 		if (count > max) {
5182 			drm_printf(m,
5183 				   "\t\t...skipping %d queued requests...\n",
5184 				   count - max);
5185 		}
5186 		show_request(m, last, "\t\tQ ");
5187 	}
5188 
5189 	last = NULL;
5190 	count = 0;
5191 	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
5192 		struct virtual_engine *ve =
5193 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
5194 		struct i915_request *rq = READ_ONCE(ve->request);
5195 
5196 		if (rq) {
5197 			if (count++ < max - 1)
5198 				show_request(m, rq, "\t\tV ");
5199 			else
5200 				last = rq;
5201 		}
5202 	}
5203 	if (last) {
5204 		if (count > max) {
5205 			drm_printf(m,
5206 				   "\t\t...skipping %d virtual requests...\n",
5207 				   count - max);
5208 		}
5209 		show_request(m, last, "\t\tV ");
5210 	}
5211 
5212 	spin_unlock_irqrestore(&engine->active.lock, flags);
5213 }
5214 
5215 void intel_lr_context_reset(struct intel_engine_cs *engine,
5216 			    struct intel_context *ce,
5217 			    u32 head,
5218 			    bool scrub)
5219 {
5220 	GEM_BUG_ON(!intel_context_is_pinned(ce));
5221 
5222 	/*
5223 	 * We want a simple context + ring to execute the breadcrumb update.
5224 	 * We cannot rely on the context being intact across the GPU hang,
5225 	 * so clear it and rebuild just what we need for the breadcrumb.
5226 	 * All pending requests for this context will be zapped, and any
5227 	 * future request will be after userspace has had the opportunity
5228 	 * to recreate its own state.
5229 	 */
5230 	if (scrub)
5231 		restore_default_state(ce, engine);
5232 
5233 	/* Rerun the request; its payload has been neutered (if guilty). */
5234 	__execlists_update_reg_state(ce, engine, head);
5235 }
5236 
5237 bool
5238 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
5239 {
5240 	return engine->set_default_submission ==
5241 	       intel_execlists_set_default_submission;
5242 }
5243 
5244 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
5245 #include "selftest_lrc.c"
5246 #endif
5247