xref: /linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision 2305f60b76110cb3e8658a4ae85d1f7eb0c66a5b)
1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Ben Widawsky <ben@bwidawsk.net>
25  *    Michel Thierry <michel.thierry@intel.com>
26  *    Thomas Daniel <thomas.daniel@intel.com>
27  *    Oscar Mateo <oscar.mateo@intel.com>
28  *
29  */
30 
31 /**
32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
33  *
34  * Motivation:
35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36  * These expanded contexts enable a number of new abilities, especially
37  * "Execlists" (also implemented in this file).
38  *
39  * One of the main differences from the legacy HW contexts is that logical
40  * ring contexts incorporate many more things into the context's state, like
41  * PDPs or ringbuffer control registers:
42  *
43  * The reason why PDPs are included in the context is straightforward: as
44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45  * contained there means you don't need to do a ppgtt->switch_mm yourself;
46  * instead, the GPU will do it for you on the context switch.
47  *
48  * But what about the ringbuffer control registers (head, tail, etc.)?
49  * Shouldn't we just need a set of those per engine command streamer? This is
50  * where the name "Logical Rings" starts to make sense: by virtualizing the
51  * rings, the engine cs shifts to a new "ring buffer" with every context
52  * switch. When you want to submit a workload to the GPU you: A) choose your
53  * context, B) find its appropriate virtualized ring, C) write commands to it
54  * and then, finally, D) tell the GPU to switch to that context.
55  *
56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57  * to a context is via a context execution list, ergo "Execlists".
58  *
59  * LRC implementation:
60  * Regarding the creation of contexts, we have:
61  *
62  * - One global default context.
63  * - One local default context for each opened fd.
64  * - One local extra context for each context create ioctl call.
65  *
66  * Now that ringbuffers belong per-context (and not per-engine, like before)
67  * and that contexts are uniquely tied to a given engine (and not reusable,
68  * like before), we need:
69  *
70  * - One ringbuffer per-engine inside each context.
71  * - One backing object per-engine inside each context.
72  *
73  * The global default context starts its life with these new objects fully
74  * allocated and populated. The local default context for each opened fd is
75  * more complex, because we don't know at creation time which engine is going
76  * to use them. To handle this, we have implemented a deferred creation of LR
77  * contexts:
78  *
79  * The local context starts its life as a hollow or blank holder, that only
80  * gets populated for a given engine once we receive an execbuffer. If later
81  * on we receive another execbuffer ioctl for the same context but a different
82  * engine, we allocate/populate a new ringbuffer and context backing object and
83  * so on.
84  *
85  * Finally, regarding local contexts created using the ioctl call: as they are
86  * only allowed with the render ring, we can allocate & populate them right
87  * away (no need to defer anything, at least for now).
88  *
89  * Execlists implementation:
90  * Execlists are the new method by which, on gen8+ hardware, workloads are
91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92  * This method works as follows:
93  *
94  * When a request is committed, its commands (the BB start and any leading or
95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96  * for the appropriate context. The tail pointer in the hardware context is not
97  * updated at this time, but instead, kept by the driver in the ringbuffer
98  * structure. A structure representing this request is added to a request queue
99  * for the appropriate engine: this structure contains a copy of the context's
100  * tail after the request was written to the ring buffer and a pointer to the
101  * context itself.
102  *
103  * If the engine's request queue was empty before the request was added, the
104  * queue is processed immediately. Otherwise the queue will be processed during
105  * a context switch interrupt. In any case, elements on the queue will get sent
106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107  * globally unique 20-bit submission ID.
108  *
109  * When execution of a request completes, the GPU updates the context status
110  * buffer with a context complete event and generates a context switch interrupt.
111  * During the interrupt handling, the driver examines the events in the buffer:
112  * for each context complete event, if the announced ID matches that on the head
113  * of the request queue, then that request is retired and removed from the queue.
114  *
115  * After processing, if any requests were retired and the queue is not empty
116  * then a new execution list can be submitted. The two requests at the front of
117  * the queue are next to be submitted but since a context may not occur twice in
118  * an execution list, if subsequent requests have the same ID as the first then
119  * the two requests must be combined. This is done simply by discarding requests
120  * at the head of the queue until either only one request is left (in which case
121  * we use a NULL second context) or the first two requests have unique IDs.
122  *
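 * As an illustration of the rule above (an invented example, not from the
 * hardware spec): with a queue of requests { A1, A2, B1 }, where the letter
 * denotes the context, A1 and A2 are coalesced into a single submission
 * whose tail covers both, so the execution list becomes ELSP[0] = context A
 * (up to A2's tail) and ELSP[1] = context B (B1).
 *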
123  * By always executing the first two requests in the queue the driver ensures
124  * that the GPU is kept as busy as possible. In the case where a single context
125  * completes but a second context is still executing, the request for this second
126  * context will be at the head of the queue when we remove the first one. This
127  * request will then be resubmitted along with a new request for a different context,
128  * which will cause the hardware to continue executing the second request and queue
129  * the new request (the GPU detects the condition of a context getting preempted
130  * with the same context and optimizes the context switch flow by not doing
131  * preemption, but just sampling the new tail pointer).
132  *
133  */
134 #include <linux/interrupt.h>
135 
136 #include "i915_drv.h"
137 #include "i915_perf.h"
138 #include "i915_trace.h"
139 #include "i915_vgpu.h"
140 #include "intel_context.h"
141 #include "intel_engine_pm.h"
142 #include "intel_gt.h"
143 #include "intel_gt_pm.h"
144 #include "intel_gt_requests.h"
145 #include "intel_lrc_reg.h"
146 #include "intel_mocs.h"
147 #include "intel_reset.h"
148 #include "intel_ring.h"
149 #include "intel_workarounds.h"
150 
151 #define RING_EXECLIST_QFULL		(1 << 0x2)
152 #define RING_EXECLIST1_VALID		(1 << 0x3)
153 #define RING_EXECLIST0_VALID		(1 << 0x4)
154 #define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
155 #define RING_EXECLIST1_ACTIVE		(1 << 0x11)
156 #define RING_EXECLIST0_ACTIVE		(1 << 0x12)
157 
158 #define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
159 #define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
160 #define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
161 #define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
162 #define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
163 #define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)
164 
165 #define GEN8_CTX_STATUS_COMPLETED_MASK \
166 	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
167 
168 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
169 
170 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
171 #define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
172 #define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
173 #define GEN12_IDLE_CTX_ID		0x7FF
174 #define GEN12_CSB_CTX_VALID(csb_dw) \
175 	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
176 
177 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
178 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
179 #define WA_TAIL_DWORDS 2
180 #define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
181 
182 struct virtual_engine {
183 	struct intel_engine_cs base;
184 	struct intel_context context;
185 
186 	/*
187 	 * We allow only a single request through the virtual engine at a time
188 	 * (each request in the timeline waits for the completion fence of
189 	 * the previous before being submitted). By restricting ourselves to
190 	 * only submitting a single request, each request is placed onto a
191 	 * physical engine to maximise load spreading (by virtue of the late greedy
192 	 * scheduling -- each real engine takes the next available request
193 	 * upon idling).
194 	 */
195 	struct i915_request *request;
196 
197 	/*
198 	 * We keep a rbtree of available virtual engines inside each physical
199 	 * engine, sorted by priority. Here we preallocate the nodes we need
200 	 * for the virtual engine, indexed by physical_engine->id.
201 	 */
202 	struct ve_node {
203 		struct rb_node rb;
204 		int prio;
205 	} nodes[I915_NUM_ENGINES];
206 
207 	/*
208 	 * Keep track of bonded pairs -- restrictions upon our selection
209 	 * of physical engines any particular request may be submitted to.
210 	 * If we receive a submit-fence from a master engine, we will only
211 	 * use one of the physical engines in sibling_mask.
212 	 */
213 	struct ve_bond {
214 		const struct intel_engine_cs *master;
215 		intel_engine_mask_t sibling_mask;
216 	} *bonds;
217 	unsigned int num_bonds;
218 
219 	/* And finally, which physical engines this virtual engine maps onto. */
220 	unsigned int num_siblings;
221 	struct intel_engine_cs *siblings[0];
222 };
223 
224 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
225 {
226 	GEM_BUG_ON(!intel_engine_is_virtual(engine));
227 	return container_of(engine, struct virtual_engine, base);
228 }
229 
230 static int __execlists_context_alloc(struct intel_context *ce,
231 				     struct intel_engine_cs *engine);
232 
233 static void execlists_init_reg_state(u32 *reg_state,
234 				     const struct intel_context *ce,
235 				     const struct intel_engine_cs *engine,
236 				     const struct intel_ring *ring,
237 				     bool close);
238 static void
239 __execlists_update_reg_state(const struct intel_context *ce,
240 			     const struct intel_engine_cs *engine);
241 
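/*
 * Mark an incomplete request as failed: set -EIO on its fence and flag it
 * complete so that any waiters are released without the payload running.
 */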
242 static void mark_eio(struct i915_request *rq)
243 {
244 	if (i915_request_completed(rq))
245 		return;
246 
247 	GEM_BUG_ON(i915_request_signaled(rq));
248 
249 	dma_fence_set_error(&rq->fence, -EIO);
250 	i915_request_mark_complete(rq);
251 }
252 
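/*
 * Walk backwards along @rq's timeline to find the oldest request that has
 * not yet completed, i.e. the point from which execution must resume.
 */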
253 static struct i915_request *
254 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
255 {
256 	struct i915_request *active = rq;
257 
258 	rcu_read_lock();
259 	list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
260 		if (i915_request_completed(rq))
261 			break;
262 
263 		active = rq;
264 	}
265 	rcu_read_unlock();
266 
267 	return active;
268 }
269 
270 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
271 {
272 	return (i915_ggtt_offset(engine->status_page.vma) +
273 		I915_GEM_HWS_PREEMPT_ADDR);
274 }
275 
276 static inline void
277 ring_set_paused(const struct intel_engine_cs *engine, int state)
278 {
279 	/*
280 	 * We inspect HWS_PREEMPT with a semaphore inside
281 	 * engine->emit_fini_breadcrumb. If the dword is true,
282 	 * the ring is paused as the semaphore will busywait
283 	 * until the dword is false.
284 	 */
285 	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
286 	if (state)
287 		wmb();
288 }
289 
290 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
291 {
292 	return rb_entry(rb, struct i915_priolist, node);
293 }
294 
295 static inline int rq_prio(const struct i915_request *rq)
296 {
297 	return rq->sched.attr.priority;
298 }
299 
300 static int effective_prio(const struct i915_request *rq)
301 {
302 	int prio = rq_prio(rq);
303 
304 	/*
305 	 * If this request is special and must not be interrupted at any
306 	 * cost, so be it. Note we are only checking the most recent request
307 	 * in the context and so may be masking an earlier vip request. It
308 	 * is hoped that under the conditions where nopreempt is used, this
309 	 * will not matter (i.e. all requests to that context will be
310 	 * nopreempt for as long as desired).
311 	 */
312 	if (i915_request_has_nopreempt(rq))
313 		prio = I915_PRIORITY_UNPREEMPTABLE;
314 
315 	/*
316 	 * On unwinding the active request, we give it a priority bump
317 	 * if it has completed waiting on any semaphore. If we know that
318 	 * the request has already started, we can prevent an unwanted
319 	 * preempt-to-idle cycle by taking that into account now.
320 	 */
321 	if (__i915_request_has_started(rq))
322 		prio |= I915_PRIORITY_NOSEMAPHORE;
323 
324 	/* Restrict mere WAIT boosts from triggering preemption */
325 	BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
326 	return prio | __NO_PREEMPTION;
327 }
328 
329 static int queue_prio(const struct intel_engine_execlists *execlists)
330 {
331 	struct i915_priolist *p;
332 	struct rb_node *rb;
333 
334 	rb = rb_first_cached(&execlists->queue);
335 	if (!rb)
336 		return INT_MIN;
337 
338 	/*
339 	 * As the priolist[] are inverted, with the highest priority in [0],
340 	 * we have to flip the index value to become priority.
341 	 */
342 	p = to_priolist(rb);
343 	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
344 }
345 
346 static inline bool need_preempt(const struct intel_engine_cs *engine,
347 				const struct i915_request *rq,
348 				struct rb_node *rb)
349 {
350 	int last_prio;
351 
352 	if (!intel_engine_has_semaphores(engine))
353 		return false;
354 
355 	/*
356 	 * Check if the current priority hint merits a preemption attempt.
357 	 *
358 	 * We record the highest value priority we saw during rescheduling
359 	 * prior to this dequeue, therefore we know that if it is strictly
360 	 * less than the current tail of ESLP[0], we do not need to force
361 	 * a preempt-to-idle cycle.
362 	 *
363 	 * However, the priority hint is a mere hint that we may need to
364 	 * preempt. If that hint is stale or we may be trying to preempt
365 	 * ourselves, ignore the request.
366 	 *
367 	 * More naturally we would write
368 	 *      prio >= max(0, last);
369 	 * except that we wish to prevent triggering preemption at the same
370 	 * priority level: the task that is running should remain running
371 	 * to preserve FIFO ordering of dependencies.
372 	 */
373 	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
374 	if (engine->execlists.queue_priority_hint <= last_prio)
375 		return false;
376 
377 	/*
378 	 * Check against the first request in ELSP[1]; it will, thanks to the
379 	 * power of PI, be the highest priority of that context.
380 	 */
381 	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
382 	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
383 		return true;
384 
385 	if (rb) {
386 		struct virtual_engine *ve =
387 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
388 		bool preempt = false;
389 
390 		if (engine == ve->siblings[0]) { /* only preempt one sibling */
391 			struct i915_request *next;
392 
393 			rcu_read_lock();
394 			next = READ_ONCE(ve->request);
395 			if (next)
396 				preempt = rq_prio(next) > last_prio;
397 			rcu_read_unlock();
398 		}
399 
400 		if (preempt)
401 			return preempt;
402 	}
403 
404 	/*
405 	 * If the inflight context did not trigger the preemption, then maybe
406 	 * it was the set of queued requests? Pick the highest priority in
407 	 * the queue (the first active priolist) and see if it deserves to be
408 	 * running instead of ELSP[0].
409 	 *
410 	 * The highest priority request in the queue cannot be either
411 	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
412 	 * context, its priority would not exceed ELSP[0] aka last_prio.
413 	 */
414 	return queue_prio(&engine->execlists) > last_prio;
415 }
416 
417 __maybe_unused static inline bool
418 assert_priority_queue(const struct i915_request *prev,
419 		      const struct i915_request *next)
420 {
421 	/*
422 	 * Without preemption, the prev may refer to the still active element
423 	 * which we refuse to let go.
424 	 *
425 	 * Even with preemption, there are times when we think it is better not
426 	 * to preempt and leave an ostensibly lower priority request in flight.
427 	 */
428 	if (i915_request_is_active(prev))
429 		return true;
430 
431 	return rq_prio(prev) >= rq_prio(next);
432 }
433 
434 /*
435  * The context descriptor encodes various attributes of a context,
436  * including its GTT address and some flags. Because it's fairly
437  * expensive to calculate, we'll just do it once and cache the result,
438  * which remains valid until the context is unpinned.
439  *
440  * This is what a descriptor looks like, from LSB to MSB::
441  *
442  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
443  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
444  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
445  *      bits 53-54:    mbz, reserved for use by hardware
446  *      bits 55-63:    group ID, currently unused and set to 0
447  *
448  * Starting from Gen11, the upper dword of the descriptor has a new format:
449  *
450  *      bits 32-36:    reserved
451  *      bits 37-47:    SW context ID
452  *      bits 48-53:    engine instance
453  *      bit 54:        mbz, reserved for use by hardware
454  *      bits 55-60:    SW counter
455  *      bits 61-63:    engine class
456  *
457  * engine info, SW context ID and SW counter need to form a unique number
458  * (Context ID) per lrc.
459  */
460 static u64
461 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
462 {
463 	u64 desc;
464 
465 	desc = INTEL_LEGACY_32B_CONTEXT;
466 	if (i915_vm_is_4lvl(ce->vm))
467 		desc = INTEL_LEGACY_64B_CONTEXT;
468 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
469 
470 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
471 	if (IS_GEN(engine->i915, 8))
472 		desc |= GEN8_CTX_L3LLC_COHERENT;
473 
474 	desc |= i915_ggtt_offset(ce->state); /* bits 12-31 */
475 	/*
476 	 * The following 32bits are copied into the OA reports (dword 2).
477 	 * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
478 	 * anything below.
479 	 */
480 	if (INTEL_GEN(engine->i915) >= 11) {
481 		desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
482 								/* bits 48-53 */
483 
484 		desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
485 								/* bits 61-63 */
486 	}
487 
488 	return desc;
489 }
490 
491 static inline unsigned int dword_in_page(void *addr)
492 {
493 	return offset_in_page(addr) / sizeof(u32);
494 }
495 
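/*
 * set_offsets() expands a compact, macro-encoded table describing the
 * register layout of a context image. Each entry is either NOP(n), which
 * skips n dwords, or LRI(count, flags) followed by 'count' register
 * offsets encoded by REG()/REG16() as a varint relative to the engine's
 * mmio base. The table is terminated by END(total), where 'total' is the
 * number of dwords in the page and is used to pad out the remainder with
 * MI_NOOP when clearing. For example, "LRI(1, POSTED), REG16(0x200)"
 * expands to a force-posted MI_LOAD_REGISTER_IMM(1) followed by the
 * register address mmio_base + 0x200 (and a zero value when clearing).
 */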
496 static void set_offsets(u32 *regs,
497 			const u8 *data,
498 			const struct intel_engine_cs *engine,
499 			bool clear)
500 #define NOP(x) (BIT(7) | (x))
501 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
502 #define POSTED BIT(0)
503 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
504 #define REG16(x) \
505 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
506 	(((x) >> 2) & 0x7f)
507 #define END(x) 0, (x)
508 {
509 	const u32 base = engine->mmio_base;
510 
511 	while (*data) {
512 		u8 count, flags;
513 
514 		if (*data & BIT(7)) { /* skip */
515 			count = *data++ & ~BIT(7);
516 			if (clear)
517 				memset32(regs, MI_NOOP, count);
518 			regs += count;
519 			continue;
520 		}
521 
522 		count = *data & 0x3f;
523 		flags = *data >> 6;
524 		data++;
525 
526 		*regs = MI_LOAD_REGISTER_IMM(count);
527 		if (flags & POSTED)
528 			*regs |= MI_LRI_FORCE_POSTED;
529 		if (INTEL_GEN(engine->i915) >= 11)
530 			*regs |= MI_LRI_CS_MMIO;
531 		regs++;
532 
533 		GEM_BUG_ON(!count);
534 		do {
535 			u32 offset = 0;
536 			u8 v;
537 
538 			do {
539 				v = *data++;
540 				offset <<= 7;
541 				offset |= v & ~BIT(7);
542 			} while (v & BIT(7));
543 
544 			regs[0] = base + (offset << 2);
545 			if (clear)
546 				regs[1] = 0;
547 			regs += 2;
548 		} while (--count);
549 	}
550 
551 	if (clear) {
552 		u8 count = *++data;
553 
554 		/* Clear past the tail for HW access */
555 		GEM_BUG_ON(dword_in_page(regs) > count);
556 		memset32(regs, MI_NOOP, count - dword_in_page(regs));
557 
558 		/* Close the batch; used mainly by live_lrc_layout() */
559 		*regs = MI_BATCH_BUFFER_END;
560 		if (INTEL_GEN(engine->i915) >= 10)
561 			*regs |= BIT(0);
562 	}
563 }
564 
565 static const u8 gen8_xcs_offsets[] = {
566 	NOP(1),
567 	LRI(11, 0),
568 	REG16(0x244),
569 	REG(0x034),
570 	REG(0x030),
571 	REG(0x038),
572 	REG(0x03c),
573 	REG(0x168),
574 	REG(0x140),
575 	REG(0x110),
576 	REG(0x11c),
577 	REG(0x114),
578 	REG(0x118),
579 
580 	NOP(9),
581 	LRI(9, 0),
582 	REG16(0x3a8),
583 	REG16(0x28c),
584 	REG16(0x288),
585 	REG16(0x284),
586 	REG16(0x280),
587 	REG16(0x27c),
588 	REG16(0x278),
589 	REG16(0x274),
590 	REG16(0x270),
591 
592 	NOP(13),
593 	LRI(2, 0),
594 	REG16(0x200),
595 	REG(0x028),
596 
597 	END(80)
598 };
599 
600 static const u8 gen9_xcs_offsets[] = {
601 	NOP(1),
602 	LRI(14, POSTED),
603 	REG16(0x244),
604 	REG(0x034),
605 	REG(0x030),
606 	REG(0x038),
607 	REG(0x03c),
608 	REG(0x168),
609 	REG(0x140),
610 	REG(0x110),
611 	REG(0x11c),
612 	REG(0x114),
613 	REG(0x118),
614 	REG(0x1c0),
615 	REG(0x1c4),
616 	REG(0x1c8),
617 
618 	NOP(3),
619 	LRI(9, POSTED),
620 	REG16(0x3a8),
621 	REG16(0x28c),
622 	REG16(0x288),
623 	REG16(0x284),
624 	REG16(0x280),
625 	REG16(0x27c),
626 	REG16(0x278),
627 	REG16(0x274),
628 	REG16(0x270),
629 
630 	NOP(13),
631 	LRI(1, POSTED),
632 	REG16(0x200),
633 
634 	NOP(13),
635 	LRI(44, POSTED),
636 	REG(0x028),
637 	REG(0x09c),
638 	REG(0x0c0),
639 	REG(0x178),
640 	REG(0x17c),
641 	REG16(0x358),
642 	REG(0x170),
643 	REG(0x150),
644 	REG(0x154),
645 	REG(0x158),
646 	REG16(0x41c),
647 	REG16(0x600),
648 	REG16(0x604),
649 	REG16(0x608),
650 	REG16(0x60c),
651 	REG16(0x610),
652 	REG16(0x614),
653 	REG16(0x618),
654 	REG16(0x61c),
655 	REG16(0x620),
656 	REG16(0x624),
657 	REG16(0x628),
658 	REG16(0x62c),
659 	REG16(0x630),
660 	REG16(0x634),
661 	REG16(0x638),
662 	REG16(0x63c),
663 	REG16(0x640),
664 	REG16(0x644),
665 	REG16(0x648),
666 	REG16(0x64c),
667 	REG16(0x650),
668 	REG16(0x654),
669 	REG16(0x658),
670 	REG16(0x65c),
671 	REG16(0x660),
672 	REG16(0x664),
673 	REG16(0x668),
674 	REG16(0x66c),
675 	REG16(0x670),
676 	REG16(0x674),
677 	REG16(0x678),
678 	REG16(0x67c),
679 	REG(0x068),
680 
681 	END(176)
682 };
683 
684 static const u8 gen12_xcs_offsets[] = {
685 	NOP(1),
686 	LRI(13, POSTED),
687 	REG16(0x244),
688 	REG(0x034),
689 	REG(0x030),
690 	REG(0x038),
691 	REG(0x03c),
692 	REG(0x168),
693 	REG(0x140),
694 	REG(0x110),
695 	REG(0x1c0),
696 	REG(0x1c4),
697 	REG(0x1c8),
698 	REG(0x180),
699 	REG16(0x2b4),
700 
701 	NOP(5),
702 	LRI(9, POSTED),
703 	REG16(0x3a8),
704 	REG16(0x28c),
705 	REG16(0x288),
706 	REG16(0x284),
707 	REG16(0x280),
708 	REG16(0x27c),
709 	REG16(0x278),
710 	REG16(0x274),
711 	REG16(0x270),
712 
713 	END(80)
714 };
715 
716 static const u8 gen8_rcs_offsets[] = {
717 	NOP(1),
718 	LRI(14, POSTED),
719 	REG16(0x244),
720 	REG(0x034),
721 	REG(0x030),
722 	REG(0x038),
723 	REG(0x03c),
724 	REG(0x168),
725 	REG(0x140),
726 	REG(0x110),
727 	REG(0x11c),
728 	REG(0x114),
729 	REG(0x118),
730 	REG(0x1c0),
731 	REG(0x1c4),
732 	REG(0x1c8),
733 
734 	NOP(3),
735 	LRI(9, POSTED),
736 	REG16(0x3a8),
737 	REG16(0x28c),
738 	REG16(0x288),
739 	REG16(0x284),
740 	REG16(0x280),
741 	REG16(0x27c),
742 	REG16(0x278),
743 	REG16(0x274),
744 	REG16(0x270),
745 
746 	NOP(13),
747 	LRI(1, 0),
748 	REG(0x0c8),
749 
750 	END(80)
751 };
752 
753 static const u8 gen9_rcs_offsets[] = {
754 	NOP(1),
755 	LRI(14, POSTED),
756 	REG16(0x244),
757 	REG(0x34),
758 	REG(0x30),
759 	REG(0x38),
760 	REG(0x3c),
761 	REG(0x168),
762 	REG(0x140),
763 	REG(0x110),
764 	REG(0x11c),
765 	REG(0x114),
766 	REG(0x118),
767 	REG(0x1c0),
768 	REG(0x1c4),
769 	REG(0x1c8),
770 
771 	NOP(3),
772 	LRI(9, POSTED),
773 	REG16(0x3a8),
774 	REG16(0x28c),
775 	REG16(0x288),
776 	REG16(0x284),
777 	REG16(0x280),
778 	REG16(0x27c),
779 	REG16(0x278),
780 	REG16(0x274),
781 	REG16(0x270),
782 
783 	NOP(13),
784 	LRI(1, 0),
785 	REG(0xc8),
786 
787 	NOP(13),
788 	LRI(44, POSTED),
789 	REG(0x28),
790 	REG(0x9c),
791 	REG(0xc0),
792 	REG(0x178),
793 	REG(0x17c),
794 	REG16(0x358),
795 	REG(0x170),
796 	REG(0x150),
797 	REG(0x154),
798 	REG(0x158),
799 	REG16(0x41c),
800 	REG16(0x600),
801 	REG16(0x604),
802 	REG16(0x608),
803 	REG16(0x60c),
804 	REG16(0x610),
805 	REG16(0x614),
806 	REG16(0x618),
807 	REG16(0x61c),
808 	REG16(0x620),
809 	REG16(0x624),
810 	REG16(0x628),
811 	REG16(0x62c),
812 	REG16(0x630),
813 	REG16(0x634),
814 	REG16(0x638),
815 	REG16(0x63c),
816 	REG16(0x640),
817 	REG16(0x644),
818 	REG16(0x648),
819 	REG16(0x64c),
820 	REG16(0x650),
821 	REG16(0x654),
822 	REG16(0x658),
823 	REG16(0x65c),
824 	REG16(0x660),
825 	REG16(0x664),
826 	REG16(0x668),
827 	REG16(0x66c),
828 	REG16(0x670),
829 	REG16(0x674),
830 	REG16(0x678),
831 	REG16(0x67c),
832 	REG(0x68),
833 
834 	END(176)
835 };
836 
837 static const u8 gen11_rcs_offsets[] = {
838 	NOP(1),
839 	LRI(15, POSTED),
840 	REG16(0x244),
841 	REG(0x034),
842 	REG(0x030),
843 	REG(0x038),
844 	REG(0x03c),
845 	REG(0x168),
846 	REG(0x140),
847 	REG(0x110),
848 	REG(0x11c),
849 	REG(0x114),
850 	REG(0x118),
851 	REG(0x1c0),
852 	REG(0x1c4),
853 	REG(0x1c8),
854 	REG(0x180),
855 
856 	NOP(1),
857 	LRI(9, POSTED),
858 	REG16(0x3a8),
859 	REG16(0x28c),
860 	REG16(0x288),
861 	REG16(0x284),
862 	REG16(0x280),
863 	REG16(0x27c),
864 	REG16(0x278),
865 	REG16(0x274),
866 	REG16(0x270),
867 
868 	LRI(1, POSTED),
869 	REG(0x1b0),
870 
871 	NOP(10),
872 	LRI(1, 0),
873 	REG(0x0c8),
874 
875 	END(80)
876 };
877 
878 static const u8 gen12_rcs_offsets[] = {
879 	NOP(1),
880 	LRI(13, POSTED),
881 	REG16(0x244),
882 	REG(0x034),
883 	REG(0x030),
884 	REG(0x038),
885 	REG(0x03c),
886 	REG(0x168),
887 	REG(0x140),
888 	REG(0x110),
889 	REG(0x1c0),
890 	REG(0x1c4),
891 	REG(0x1c8),
892 	REG(0x180),
893 	REG16(0x2b4),
894 
895 	NOP(5),
896 	LRI(9, POSTED),
897 	REG16(0x3a8),
898 	REG16(0x28c),
899 	REG16(0x288),
900 	REG16(0x284),
901 	REG16(0x280),
902 	REG16(0x27c),
903 	REG16(0x278),
904 	REG16(0x274),
905 	REG16(0x270),
906 
907 	LRI(3, POSTED),
908 	REG(0x1b0),
909 	REG16(0x5a8),
910 	REG16(0x5ac),
911 
912 	NOP(6),
913 	LRI(1, 0),
914 	REG(0x0c8),
915 
916 	END(80)
917 };
918 
919 #undef END
920 #undef REG16
921 #undef REG
922 #undef LRI
923 #undef NOP
924 
925 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
926 {
927 	/*
928 	 * The gen12+ lists only have the registers we program in the basic
929 	 * default state. We rely on the context image using relative
930 	 * addressing to automatically fix up the register state between the
931 	 * physical engines of a virtual engine.
932 	 */
933 	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
934 		   !intel_engine_has_relative_mmio(engine));
935 
936 	if (engine->class == RENDER_CLASS) {
937 		if (INTEL_GEN(engine->i915) >= 12)
938 			return gen12_rcs_offsets;
939 		else if (INTEL_GEN(engine->i915) >= 11)
940 			return gen11_rcs_offsets;
941 		else if (INTEL_GEN(engine->i915) >= 9)
942 			return gen9_rcs_offsets;
943 		else
944 			return gen8_rcs_offsets;
945 	} else {
946 		if (INTEL_GEN(engine->i915) >= 12)
947 			return gen12_xcs_offsets;
948 		else if (INTEL_GEN(engine->i915) >= 9)
949 			return gen9_xcs_offsets;
950 		else
951 			return gen8_xcs_offsets;
952 	}
953 }
954 
955 static struct i915_request *
956 __unwind_incomplete_requests(struct intel_engine_cs *engine)
957 {
958 	struct i915_request *rq, *rn, *active = NULL;
959 	struct list_head *uninitialized_var(pl);
960 	int prio = I915_PRIORITY_INVALID;
961 
962 	lockdep_assert_held(&engine->active.lock);
963 
964 	list_for_each_entry_safe_reverse(rq, rn,
965 					 &engine->active.requests,
966 					 sched.link) {
967 		if (i915_request_completed(rq))
968 			continue; /* XXX */
969 
970 		__i915_request_unsubmit(rq);
971 
972 		/*
973 		 * Push the request back into the queue for later resubmission.
974 		 * If this request is not native to this physical engine (i.e.
975 		 * it came from a virtual source), push it back onto the virtual
976 		 * engine so that it can be moved across onto another physical
977 		 * engine as load dictates.
978 		 */
979 		if (likely(rq->execution_mask == engine->mask)) {
980 			GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
981 			if (rq_prio(rq) != prio) {
982 				prio = rq_prio(rq);
983 				pl = i915_sched_lookup_priolist(engine, prio);
984 			}
985 			GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
986 
987 			list_move(&rq->sched.link, pl);
988 			set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
989 
990 			active = rq;
991 		} else {
992 			struct intel_engine_cs *owner = rq->context->engine;
993 
994 			/*
995 			 * Decouple the virtual breadcrumb before moving it
996 			 * back to the virtual engine -- we don't want the
997 			 * request to complete in the background and try
998 			 * and cancel the breadcrumb on the virtual engine
999 			 * (instead of the old engine where it is linked)!
1000 			 */
1001 			if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
1002 				     &rq->fence.flags)) {
1003 				spin_lock_nested(&rq->lock,
1004 						 SINGLE_DEPTH_NESTING);
1005 				i915_request_cancel_breadcrumb(rq);
1006 				spin_unlock(&rq->lock);
1007 			}
1008 			rq->engine = owner;
1009 			owner->submit_request(rq);
1010 			active = NULL;
1011 		}
1012 	}
1013 
1014 	return active;
1015 }
1016 
1017 struct i915_request *
1018 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1019 {
1020 	struct intel_engine_cs *engine =
1021 		container_of(execlists, typeof(*engine), execlists);
1022 
1023 	return __unwind_incomplete_requests(engine);
1024 }
1025 
1026 static inline void
1027 execlists_context_status_change(struct i915_request *rq, unsigned long status)
1028 {
1029 	/*
1030 	 * Only used when GVT-g is enabled now. When GVT-g is disabled,
1031 	 * the compiler should eliminate this function as dead code.
1032 	 */
1033 	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1034 		return;
1035 
1036 	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1037 				   status, rq);
1038 }
1039 
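/*
 * Engine busy-stats accounting: intel_engine_context_in/out() bracket the
 * time contexts spend executing on the engine. The first context in
 * records a start timestamp, and the last context out folds the elapsed
 * time into engine->stats.total.
 */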
1040 static void intel_engine_context_in(struct intel_engine_cs *engine)
1041 {
1042 	unsigned long flags;
1043 
1044 	if (READ_ONCE(engine->stats.enabled) == 0)
1045 		return;
1046 
1047 	write_seqlock_irqsave(&engine->stats.lock, flags);
1048 
1049 	if (engine->stats.enabled > 0) {
1050 		if (engine->stats.active++ == 0)
1051 			engine->stats.start = ktime_get();
1052 		GEM_BUG_ON(engine->stats.active == 0);
1053 	}
1054 
1055 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1056 }
1057 
1058 static void intel_engine_context_out(struct intel_engine_cs *engine)
1059 {
1060 	unsigned long flags;
1061 
1062 	if (READ_ONCE(engine->stats.enabled) == 0)
1063 		return;
1064 
1065 	write_seqlock_irqsave(&engine->stats.lock, flags);
1066 
1067 	if (engine->stats.enabled > 0) {
1068 		ktime_t last;
1069 
1070 		if (engine->stats.active && --engine->stats.active == 0) {
1071 			/*
1072 			 * We have decremented the active context count and the
1073 			 * engine is now idle; add the elapsed time to the total.
1074 			 */
1075 			last = ktime_sub(ktime_get(), engine->stats.start);
1076 
1077 			engine->stats.total = ktime_add(engine->stats.total,
1078 							last);
1079 		} else if (engine->stats.active == 0) {
1080 			/*
1081 			 * After turning on engine stats, context out might be
1082 			 * the first event in which case we account from the
1083 			 * time stats gathering was turned on.
1084 			 */
1085 			last = ktime_sub(ktime_get(), engine->stats.enabled_at);
1086 
1087 			engine->stats.total = ktime_add(engine->stats.total,
1088 							last);
1089 		}
1090 	}
1091 
1092 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1093 }
1094 
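/*
 * Return the dword index of the RING_MI_MODE entry within the LRC register
 * state for this engine/gen, or -1 if it is not part of the saved image.
 */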
1095 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
1096 {
1097 	if (INTEL_GEN(engine->i915) >= 12)
1098 		return 0x60;
1099 	else if (INTEL_GEN(engine->i915) >= 9)
1100 		return 0x54;
1101 	else if (engine->class == RENDER_CLASS)
1102 		return 0x58;
1103 	else
1104 		return -1;
1105 }
1106 
1107 static void
1108 execlists_check_context(const struct intel_context *ce,
1109 			const struct intel_engine_cs *engine)
1110 {
1111 	const struct intel_ring *ring = ce->ring;
1112 	u32 *regs = ce->lrc_reg_state;
1113 	bool valid = true;
1114 	int x;
1115 
1116 	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1117 		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1118 		       engine->name,
1119 		       regs[CTX_RING_START],
1120 		       i915_ggtt_offset(ring->vma));
1121 		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1122 		valid = false;
1123 	}
1124 
1125 	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1126 	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1127 		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1128 		       engine->name,
1129 		       regs[CTX_RING_CTL],
1130 		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1131 		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1132 		valid = false;
1133 	}
1134 
1135 	x = lrc_ring_mi_mode(engine);
1136 	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1137 		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1138 		       engine->name, regs[x + 1]);
1139 		regs[x + 1] &= ~STOP_RING;
1140 		regs[x + 1] |= STOP_RING << 16;
1141 		valid = false;
1142 	}
1143 
1144 	WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
1145 }
1146 
1147 static void restore_default_state(struct intel_context *ce,
1148 				  struct intel_engine_cs *engine)
1149 {
1150 	u32 *regs = ce->lrc_reg_state;
1151 
1152 	if (engine->pinned_default_state)
1153 		memcpy(regs, /* skip restoring the vanilla PPHWSP */
1154 		       engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
1155 		       engine->context_size - PAGE_SIZE);
1156 
1157 	execlists_init_reg_state(regs, ce, engine, ce->ring, false);
1158 }
1159 
1160 static void reset_active(struct i915_request *rq,
1161 			 struct intel_engine_cs *engine)
1162 {
1163 	struct intel_context * const ce = rq->context;
1164 	u32 head;
1165 
1166 	/*
1167 	 * The executing context has been cancelled. We want to prevent
1168 	 * further execution along this context and propagate the error on
1169 	 * to anything depending on its results.
1170 	 *
1171 	 * In __i915_request_submit(), we apply the -EIO and remove the
1172 	 * requests' payloads for any banned requests. But first, we must
1173 	 * rewind the context back to the start of the incomplete request so
1174 	 * that we do not jump back into the middle of the batch.
1175 	 *
1176 	 * We preserve the breadcrumbs and semaphores of the incomplete
1177 	 * requests so that inter-timeline dependencies (i.e. other timelines)
1178 	 * remain correctly ordered. And we defer to __i915_request_submit()
1179 	 * so that all asynchronous waits are correctly handled.
1180 	 */
1181 	ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1182 		     rq->fence.context, rq->fence.seqno);
1183 
1184 	/* On resubmission of the active request, payload will be scrubbed */
1185 	if (i915_request_completed(rq))
1186 		head = rq->tail;
1187 	else
1188 		head = active_request(ce->timeline, rq)->head;
1189 	ce->ring->head = intel_ring_wrap(ce->ring, head);
1190 	intel_ring_update_space(ce->ring);
1191 
1192 	/* Scrub the context image to prevent replaying the previous batch */
1193 	restore_default_state(ce, engine);
1194 	__execlists_update_reg_state(ce, engine);
1195 
1196 	/* We've switched away, so this should be a no-op, but intent matters */
1197 	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
1198 }
1199 
1200 static inline struct intel_engine_cs *
1201 __execlists_schedule_in(struct i915_request *rq)
1202 {
1203 	struct intel_engine_cs * const engine = rq->engine;
1204 	struct intel_context * const ce = rq->context;
1205 
1206 	intel_context_get(ce);
1207 
1208 	if (unlikely(intel_context_is_banned(ce)))
1209 		reset_active(rq, engine);
1210 
1211 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1212 		execlists_check_context(ce, engine);
1213 
1214 	if (ce->tag) {
1215 		/* Use a fixed tag for OA and friends */
1216 		ce->lrc_desc |= (u64)ce->tag << 32;
1217 	} else {
1218 		/* We don't need a strict matching tag, just different values */
1219 		ce->lrc_desc &= ~GENMASK_ULL(47, 37);
1220 		ce->lrc_desc |=
1221 			(u64)(++engine->context_tag % NUM_CONTEXT_TAG) <<
1222 			GEN11_SW_CTX_ID_SHIFT;
1223 		BUILD_BUG_ON(NUM_CONTEXT_TAG > GEN12_MAX_CONTEXT_HW_ID);
1224 	}
1225 
1226 	__intel_gt_pm_get(engine->gt);
1227 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1228 	intel_engine_context_in(engine);
1229 
1230 	return engine;
1231 }
1232 
1233 static inline struct i915_request *
1234 execlists_schedule_in(struct i915_request *rq, int idx)
1235 {
1236 	struct intel_context * const ce = rq->context;
1237 	struct intel_engine_cs *old;
1238 
1239 	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1240 	trace_i915_request_in(rq, idx);
1241 
1242 	old = READ_ONCE(ce->inflight);
1243 	do {
1244 		if (!old) {
1245 			WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1246 			break;
1247 		}
1248 	} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1249 
1250 	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1251 	return i915_request_get(rq);
1252 }
1253 
1254 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1255 {
1256 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1257 	struct i915_request *next = READ_ONCE(ve->request);
1258 
1259 	if (next && next->execution_mask & ~rq->execution_mask)
1260 		tasklet_schedule(&ve->base.execlists.tasklet);
1261 }
1262 
1263 static inline void
1264 __execlists_schedule_out(struct i915_request *rq,
1265 			 struct intel_engine_cs * const engine)
1266 {
1267 	struct intel_context * const ce = rq->context;
1268 
1269 	/*
1270 	 * NB process_csb() is not under the engine->active.lock and hence
1271 	 * schedule_out can race with schedule_in meaning that we should
1272 	 * refrain from doing non-trivial work here.
1273 	 */
1274 
1275 	/*
1276 	 * If we have just completed this context, the engine may now be
1277 	 * idle and we want to re-enter powersaving.
1278 	 */
1279 	if (list_is_last(&rq->link, &ce->timeline->requests) &&
1280 	    i915_request_completed(rq))
1281 		intel_engine_add_retire(engine, ce->timeline);
1282 
1283 	intel_engine_context_out(engine);
1284 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1285 	intel_gt_pm_put_async(engine->gt);
1286 
1287 	/*
1288 	 * If this is part of a virtual engine, its next request may
1289 	 * have been blocked waiting for access to the active context.
1290 	 * We have to kick all the siblings again in case we need to
1291 	 * switch (e.g. the next request is not runnable on this
1292 	 * engine). Hopefully, we will already have submitted the next
1293 	 * request before the tasklet runs and do not need to rebuild
1294 	 * each virtual tree and kick everyone again.
1295 	 */
1296 	if (ce->engine != engine)
1297 		kick_siblings(rq, ce);
1298 
1299 	intel_context_put(ce);
1300 }
1301 
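/*
 * ce->inflight stores the engine on which the context is currently running,
 * with its low bits counting any additional submissions of the same context
 * still in the ELSP; only when the last of them is scheduled out do we
 * perform the real context-out work.
 */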
1302 static inline void
1303 execlists_schedule_out(struct i915_request *rq)
1304 {
1305 	struct intel_context * const ce = rq->context;
1306 	struct intel_engine_cs *cur, *old;
1307 
1308 	trace_i915_request_out(rq);
1309 
1310 	old = READ_ONCE(ce->inflight);
1311 	do
1312 		cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1313 	while (!try_cmpxchg(&ce->inflight, &old, cur));
1314 	if (!cur)
1315 		__execlists_schedule_out(rq, old);
1316 
1317 	i915_request_put(rq);
1318 }
1319 
1320 static u64 execlists_update_context(struct i915_request *rq)
1321 {
1322 	struct intel_context *ce = rq->context;
1323 	u64 desc = ce->lrc_desc;
1324 	u32 tail;
1325 
1326 	/*
1327 	 * WaIdleLiteRestore:bdw,skl
1328 	 *
1329 	 * We should never submit the context with the same RING_TAIL twice
1330 	 * just in case we submit an empty ring, which confuses the HW.
1331 	 *
1332 	 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1333 	 * the normal request to be able to always advance the RING_TAIL on
1334 	 * subsequent resubmissions (for lite restore). Should that fail us,
1335 	 * and we try and submit the same tail again, force the context
1336 	 * reload.
1337 	 */
1338 	tail = intel_ring_set_tail(rq->ring, rq->tail);
1339 	if (unlikely(ce->lrc_reg_state[CTX_RING_TAIL] == tail))
1340 		desc |= CTX_DESC_FORCE_RESTORE;
1341 	ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1342 	rq->tail = rq->wa_tail;
1343 
1344 	/*
1345 	 * Make sure the context image is complete before we submit it to HW.
1346 	 *
1347 	 * Ostensibly, writes (including the WCB) should be flushed prior to
1348 	 * an uncached write such as our mmio register access, but the empirical
1349 	 * evidence (esp. on Braswell) suggests that the WC write into memory
1350 	 * may not be visible to the HW prior to the completion of the UC
1351 	 * register write and that we may begin execution from the context
1352 	 * before its image is complete leading to invalid PD chasing.
1353 	 */
1354 	wmb();
1355 
1356 	ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE;
1357 	return desc;
1358 }
1359 
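/*
 * Write one context descriptor into the given submission port: via the
 * submit queue (lower then upper dword per port) when a control register
 * is present, otherwise via the legacy ELSP where the upper dword must be
 * written first.
 */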
1360 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1361 {
1362 	if (execlists->ctrl_reg) {
1363 		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1364 		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1365 	} else {
1366 		writel(upper_32_bits(desc), execlists->submit_reg);
1367 		writel(lower_32_bits(desc), execlists->submit_reg);
1368 	}
1369 }
1370 
1371 static __maybe_unused void
1372 trace_ports(const struct intel_engine_execlists *execlists,
1373 	    const char *msg,
1374 	    struct i915_request * const *ports)
1375 {
1376 	const struct intel_engine_cs *engine =
1377 		container_of(execlists, typeof(*engine), execlists);
1378 
1379 	if (!ports[0])
1380 		return;
1381 
1382 	ENGINE_TRACE(engine, "%s { %llx:%lld%s, %llx:%lld }\n", msg,
1383 		     ports[0]->fence.context,
1384 		     ports[0]->fence.seqno,
1385 		     i915_request_completed(ports[0]) ? "!" :
1386 		     i915_request_started(ports[0]) ? "*" :
1387 		     "",
1388 		     ports[1] ? ports[1]->fence.context : 0,
1389 		     ports[1] ? ports[1]->fence.seqno : 0);
1390 }
1391 
1392 static __maybe_unused bool
1393 assert_pending_valid(const struct intel_engine_execlists *execlists,
1394 		     const char *msg)
1395 {
1396 	struct i915_request * const *port, *rq;
1397 	struct intel_context *ce = NULL;
1398 
1399 	trace_ports(execlists, msg, execlists->pending);
1400 
1401 	if (!execlists->pending[0]) {
1402 		GEM_TRACE_ERR("Nothing pending for promotion!\n");
1403 		return false;
1404 	}
1405 
1406 	if (execlists->pending[execlists_num_ports(execlists)]) {
1407 		GEM_TRACE_ERR("Excess pending[%d] for promotion!\n",
1408 			      execlists_num_ports(execlists));
1409 		return false;
1410 	}
1411 
1412 	for (port = execlists->pending; (rq = *port); port++) {
1413 		unsigned long flags;
1414 		bool ok = true;
1415 
1416 		GEM_BUG_ON(!kref_read(&rq->fence.refcount));
1417 		GEM_BUG_ON(!i915_request_is_active(rq));
1418 
1419 		if (ce == rq->context) {
1420 			GEM_TRACE_ERR("Dup context:%llx in pending[%zd]\n",
1421 				      ce->timeline->fence_context,
1422 				      port - execlists->pending);
1423 			return false;
1424 		}
1425 		ce = rq->context;
1426 
1427 		/* Hold tightly onto the lock to prevent concurrent retires! */
1428 		if (!spin_trylock_irqsave(&rq->lock, flags))
1429 			continue;
1430 
1431 		if (i915_request_completed(rq))
1432 			goto unlock;
1433 
1434 		if (i915_active_is_idle(&ce->active) &&
1435 		    !intel_context_is_barrier(ce)) {
1436 			GEM_TRACE_ERR("Inactive context:%llx in pending[%zd]\n",
1437 				      ce->timeline->fence_context,
1438 				      port - execlists->pending);
1439 			ok = false;
1440 			goto unlock;
1441 		}
1442 
1443 		if (!i915_vma_is_pinned(ce->state)) {
1444 			GEM_TRACE_ERR("Unpinned context:%llx in pending[%zd]\n",
1445 				      ce->timeline->fence_context,
1446 				      port - execlists->pending);
1447 			ok = false;
1448 			goto unlock;
1449 		}
1450 
1451 		if (!i915_vma_is_pinned(ce->ring->vma)) {
1452 			GEM_TRACE_ERR("Unpinned ring:%llx in pending[%zd]\n",
1453 				      ce->timeline->fence_context,
1454 				      port - execlists->pending);
1455 			ok = false;
1456 			goto unlock;
1457 		}
1458 
1459 unlock:
1460 		spin_unlock_irqrestore(&rq->lock, flags);
1461 		if (!ok)
1462 			return false;
1463 	}
1464 
1465 	return ce;
1466 }
1467 
1468 static void execlists_submit_ports(struct intel_engine_cs *engine)
1469 {
1470 	struct intel_engine_execlists *execlists = &engine->execlists;
1471 	unsigned int n;
1472 
1473 	GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1474 
1475 	/*
1476 	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
1477 	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
1478 	 * not be relinquished until the device is idle (see
1479 	 * i915_gem_idle_work_handler()). As a precaution, we make sure
1480 	 * that all ELSP are drained i.e. we have processed the CSB,
1481 	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
1482 	 */
1483 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1484 
1485 	/*
1486 	 * ELSQ note: the submit queue is not cleared after being submitted
1487 	 * to the HW so we need to make sure we always clean it up. This is
1488 	 * currently ensured by the fact that we always write the same number
1489 	 * of elsq entries; keep this in mind before changing the loop below.
1490 	 */
1491 	for (n = execlists_num_ports(execlists); n--; ) {
1492 		struct i915_request *rq = execlists->pending[n];
1493 
1494 		write_desc(execlists,
1495 			   rq ? execlists_update_context(rq) : 0,
1496 			   n);
1497 	}
1498 
1499 	/* we need to manually load the submit queue */
1500 	if (execlists->ctrl_reg)
1501 		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1502 }
1503 
1504 static bool ctx_single_port_submission(const struct intel_context *ce)
1505 {
1506 	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1507 		intel_context_force_single_submission(ce));
1508 }
1509 
1510 static bool can_merge_ctx(const struct intel_context *prev,
1511 			  const struct intel_context *next)
1512 {
1513 	if (prev != next)
1514 		return false;
1515 
1516 	if (ctx_single_port_submission(prev))
1517 		return false;
1518 
1519 	return true;
1520 }
1521 
1522 static bool can_merge_rq(const struct i915_request *prev,
1523 			 const struct i915_request *next)
1524 {
1525 	GEM_BUG_ON(prev == next);
1526 	GEM_BUG_ON(!assert_priority_queue(prev, next));
1527 
1528 	/*
1529 	 * We do not submit known completed requests. Therefore if the next
1530 	 * request is already completed, we can pretend to merge it in
1531 	 * with the previous context (and we will skip updating the ELSP
1532 	 * and tracking). Thus hopefully keeping the ELSP full with active
1533 	 * contexts, despite the best efforts of preempt-to-busy to confuse
1534 	 * us.
1535 	 */
1536 	if (i915_request_completed(next))
1537 		return true;
1538 
1539 	if (unlikely((prev->fence.flags ^ next->fence.flags) &
1540 		     (BIT(I915_FENCE_FLAG_NOPREEMPT) |
1541 		      BIT(I915_FENCE_FLAG_SENTINEL))))
1542 		return false;
1543 
1544 	if (!can_merge_ctx(prev->context, next->context))
1545 		return false;
1546 
1547 	return true;
1548 }
1549 
1550 static void virtual_update_register_offsets(u32 *regs,
1551 					    struct intel_engine_cs *engine)
1552 {
1553 	set_offsets(regs, reg_offsets(engine), engine, false);
1554 }
1555 
1556 static bool virtual_matches(const struct virtual_engine *ve,
1557 			    const struct i915_request *rq,
1558 			    const struct intel_engine_cs *engine)
1559 {
1560 	const struct intel_engine_cs *inflight;
1561 
1562 	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1563 		return false;
1564 
1565 	/*
1566 	 * We track when the HW has completed saving the context image
1567 	 * (i.e. when we have seen the final CS event switching out of
1568 	 * the context) and must not overwrite the context image before
1569 	 * then. This restricts us to only using the active engine
1570 	 * while the previous virtualized request is inflight (so
1571 	 * we reuse the register offsets). This is a very small
1572 	 * hysteresis on the greedy selection algorithm.
1573 	 */
1574 	inflight = intel_context_inflight(&ve->context);
1575 	if (inflight && inflight != engine)
1576 		return false;
1577 
1578 	return true;
1579 }
1580 
1581 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
1582 				     struct intel_engine_cs *engine)
1583 {
1584 	struct intel_engine_cs *old = ve->siblings[0];
1585 
1586 	/* All unattached (rq->engine == old) must already be completed */
1587 
1588 	spin_lock(&old->breadcrumbs.irq_lock);
1589 	if (!list_empty(&ve->context.signal_link)) {
1590 		list_move_tail(&ve->context.signal_link,
1591 			       &engine->breadcrumbs.signalers);
1592 		intel_engine_signal_breadcrumbs(engine);
1593 	}
1594 	spin_unlock(&old->breadcrumbs.irq_lock);
1595 }
1596 
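/*
 * Skip over any already-completed requests in the active ELSP ports and
 * return the request currently being executed (or NULL if none remain).
 */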
1597 static struct i915_request *
1598 last_active(const struct intel_engine_execlists *execlists)
1599 {
1600 	struct i915_request * const *last = READ_ONCE(execlists->active);
1601 
1602 	while (*last && i915_request_completed(*last))
1603 		last++;
1604 
1605 	return *last;
1606 }
1607 
1608 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1609 {
1610 	LIST_HEAD(list);
1611 
1612 	/*
1613 	 * We want to move the interrupted request to the back of
1614 	 * the round-robin list (i.e. its priority level), but
1615 	 * in doing so, we must also move all in-flight requests that
1616 	 * were waiting for the interrupted request so that they are
1617 	 * run after it again.
1618 	 */
1619 	do {
1620 		struct i915_dependency *p;
1621 
1622 		GEM_BUG_ON(i915_request_is_active(rq));
1623 		list_move_tail(&rq->sched.link, pl);
1624 
1625 		list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
1626 			struct i915_request *w =
1627 				container_of(p->waiter, typeof(*w), sched);
1628 
1629 			/* Leave semaphores spinning on the other engines */
1630 			if (w->engine != rq->engine)
1631 				continue;
1632 
1633 			/* No waiter should start before its signaler */
1634 			GEM_BUG_ON(i915_request_started(w) &&
1635 				   !i915_request_completed(rq));
1636 
1637 			GEM_BUG_ON(i915_request_is_active(w));
1638 			if (!i915_request_is_ready(w))
1639 				continue;
1640 
1641 			if (rq_prio(w) < rq_prio(rq))
1642 				continue;
1643 
1644 			GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1645 			list_move_tail(&w->sched.link, &list);
1646 		}
1647 
1648 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1649 	} while (rq);
1650 }
1651 
1652 static void defer_active(struct intel_engine_cs *engine)
1653 {
1654 	struct i915_request *rq;
1655 
1656 	rq = __unwind_incomplete_requests(engine);
1657 	if (!rq)
1658 		return;
1659 
1660 	defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1661 }
1662 
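/*
 * A timeslice is only needed if another ready request, either queued
 * behind the active one on this engine or waiting in the priority tree,
 * is of at least equal effective priority to the request now running.
 */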
1663 static bool
1664 need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
1665 {
1666 	int hint;
1667 
1668 	if (!intel_engine_has_timeslices(engine))
1669 		return false;
1670 
1671 	if (list_is_last(&rq->sched.link, &engine->active.requests))
1672 		return false;
1673 
1674 	hint = max(rq_prio(list_next_entry(rq, sched.link)),
1675 		   engine->execlists.queue_priority_hint);
1676 
1677 	return hint >= effective_prio(rq);
1678 }
1679 
1680 static int
1681 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1682 {
1683 	if (list_is_last(&rq->sched.link, &engine->active.requests))
1684 		return INT_MIN;
1685 
1686 	return rq_prio(list_next_entry(rq, sched.link));
1687 }
1688 
1689 static inline unsigned long
1690 timeslice(const struct intel_engine_cs *engine)
1691 {
1692 	return READ_ONCE(engine->props.timeslice_duration_ms);
1693 }
1694 
1695 static unsigned long
1696 active_timeslice(const struct intel_engine_cs *engine)
1697 {
1698 	const struct i915_request *rq = *engine->execlists.active;
1699 
1700 	if (!rq || i915_request_completed(rq))
1701 		return 0;
1702 
1703 	if (engine->execlists.switch_priority_hint < effective_prio(rq))
1704 		return 0;
1705 
1706 	return timeslice(engine);
1707 }
1708 
1709 static void set_timeslice(struct intel_engine_cs *engine)
1710 {
1711 	if (!intel_engine_has_timeslices(engine))
1712 		return;
1713 
1714 	set_timer_ms(&engine->execlists.timer, active_timeslice(engine));
1715 }
1716 
1717 static void record_preemption(struct intel_engine_execlists *execlists)
1718 {
1719 	(void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
1720 }
1721 
1722 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine)
1723 {
1724 	struct i915_request *rq;
1725 
1726 	rq = last_active(&engine->execlists);
1727 	if (!rq)
1728 		return 0;
1729 
1730 	/* Force a fast reset for terminated contexts (ignoring sysfs!) */
1731 	if (unlikely(intel_context_is_banned(rq->context)))
1732 		return 1;
1733 
1734 	return READ_ONCE(engine->props.preempt_timeout_ms);
1735 }
1736 
1737 static void set_preempt_timeout(struct intel_engine_cs *engine)
1738 {
1739 	if (!intel_engine_has_preempt_reset(engine))
1740 		return;
1741 
1742 	set_timer_ms(&engine->execlists.preempt,
1743 		     active_preempt_timeout(engine));
1744 }
1745 
1746 static inline void clear_ports(struct i915_request **ports, int count)
1747 {
1748 	memset_p((void **)ports, NULL, count);
1749 }
1750 
1751 static void execlists_dequeue(struct intel_engine_cs *engine)
1752 {
1753 	struct intel_engine_execlists * const execlists = &engine->execlists;
1754 	struct i915_request **port = execlists->pending;
1755 	struct i915_request ** const last_port = port + execlists->port_mask;
1756 	struct i915_request *last;
1757 	struct rb_node *rb;
1758 	bool submit = false;
1759 
1760 	/*
1761 	 * Hardware submission is through 2 ports. Conceptually each port
1762 	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
1763 	 * static for a context, and unique to each, so we only execute
1764 	 * requests belonging to a single context from each ring. RING_HEAD
1765 	 * is maintained by the CS in the context image, it marks the place
1766 	 * where it got up to last time, and through RING_TAIL we tell the CS
1767 	 * where we want to execute up to this time.
1768 	 *
1769 	 * In this list the requests are in order of execution. Consecutive
1770 	 * requests from the same context are adjacent in the ringbuffer. We
1771 	 * can combine these requests into a single RING_TAIL update:
1772 	 *
1773 	 *              RING_HEAD...req1...req2
1774 	 *                                    ^- RING_TAIL
1775 	 * since to execute req2 the CS must first execute req1.
1776 	 *
1777 	 * Our goal then is to point each port to the end of a consecutive
1778 	 * sequence of requests as being the most optimal (fewest wake ups
1779 	 * and context switches) submission.
1780 	 */
1781 
1782 	for (rb = rb_first_cached(&execlists->virtual); rb; ) {
1783 		struct virtual_engine *ve =
1784 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1785 		struct i915_request *rq = READ_ONCE(ve->request);
1786 
1787 		if (!rq) { /* lazily clean up after another engine handled rq */
1788 			rb_erase_cached(rb, &execlists->virtual);
1789 			RB_CLEAR_NODE(rb);
1790 			rb = rb_first_cached(&execlists->virtual);
1791 			continue;
1792 		}
1793 
1794 		if (!virtual_matches(ve, rq, engine)) {
1795 			rb = rb_next(rb);
1796 			continue;
1797 		}
1798 
1799 		break;
1800 	}
1801 
1802 	/*
1803 	 * If the queue is higher priority than the last
1804 	 * request in the currently active context, submit afresh.
1805 	 * We will resubmit again afterwards in case we need to split
1806 	 * the active context to interject the preemption request,
1807 	 * i.e. we will retrigger preemption following the ack in case
1808 	 * of trouble.
1809 	 */
1810 	last = last_active(execlists);
1811 	if (last) {
1812 		if (need_preempt(engine, last, rb)) {
1813 			ENGINE_TRACE(engine,
1814 				     "preempting last=%llx:%lld, prio=%d, hint=%d\n",
1815 				     last->fence.context,
1816 				     last->fence.seqno,
1817 				     last->sched.attr.priority,
1818 				     execlists->queue_priority_hint);
1819 			record_preemption(execlists);
1820 
1821 			/*
1822 			 * Don't let the RING_HEAD advance past the breadcrumb
1823 			 * as we unwind (and until we resubmit) so that we do
1824 			 * not accidentally tell it to go backwards.
1825 			 */
1826 			ring_set_paused(engine, 1);
1827 
1828 			/*
1829 			 * Note that we have not stopped the GPU at this point,
1830 			 * so we are unwinding the incomplete requests as they
1831 			 * remain inflight and so by the time we do complete
1832 			 * the preemption, some of the unwound requests may
1833 			 * complete!
1834 			 */
1835 			__unwind_incomplete_requests(engine);
1836 
1837 			/*
1838 			 * If we need to return to the preempted context, we
1839 			 * need to skip the lite-restore and force it to
1840 			 * reload the RING_TAIL. Otherwise, the HW has a
1841 			 * tendency to ignore us rewinding the TAIL to the
1842 			 * end of an earlier request.
1843 			 */
1844 			last->context->lrc_desc |= CTX_DESC_FORCE_RESTORE;
1845 			last = NULL;
1846 		} else if (need_timeslice(engine, last) &&
1847 			   timer_expired(&engine->execlists.timer)) {
1848 			ENGINE_TRACE(engine,
1849 				     "expired last=%llx:%lld, prio=%d, hint=%d\n",
1850 				     last->fence.context,
1851 				     last->fence.seqno,
1852 				     last->sched.attr.priority,
1853 				     execlists->queue_priority_hint);
1854 
1855 			ring_set_paused(engine, 1);
1856 			defer_active(engine);
1857 
1858 			/*
1859 			 * Unlike for preemption, if we rewind and continue
1860 			 * executing the same context as previously active,
1861 			 * the order of execution will remain the same and
1862 			 * the tail will only advance. We do not need to
1863 			 * force a full context restore, as a lite-restore
1864 			 * is sufficient to resample the monotonic TAIL.
1865 			 *
1866 			 * If we switch to any other context, similarly we
1867 			 * will not rewind TAIL of current context, and
1868 			 * normal save/restore will preserve state and allow
1869 			 * us to later continue executing the same request.
1870 			 */
1871 			last = NULL;
1872 		} else {
1873 			/*
1874 			 * Otherwise if we already have a request pending
1875 			 * for execution after the current one, we can
1876 			 * just wait until the next CS event before
1877 			 * queuing more. In either case we will force a
1878 			 * lite-restore preemption event, but if we wait
1879 			 * we hopefully coalesce several updates into a single
1880 			 * submission.
1881 			 */
1882 			if (!list_is_last(&last->sched.link,
1883 					  &engine->active.requests)) {
1884 				/*
1885 				 * Even if ELSP[1] is occupied and not worthy
1886 				 * of timeslices, our queue might be.
1887 				 */
1888 				if (!execlists->timer.expires &&
1889 				    need_timeslice(engine, last))
1890 					set_timer_ms(&execlists->timer,
1891 						     timeslice(engine));
1892 
1893 				return;
1894 			}
1895 		}
1896 	}
1897 
1898 	while (rb) { /* XXX virtual is always taking precedence */
1899 		struct virtual_engine *ve =
1900 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1901 		struct i915_request *rq;
1902 
1903 		spin_lock(&ve->base.active.lock);
1904 
1905 		rq = ve->request;
1906 		if (unlikely(!rq)) { /* lost the race to a sibling */
1907 			spin_unlock(&ve->base.active.lock);
1908 			rb_erase_cached(rb, &execlists->virtual);
1909 			RB_CLEAR_NODE(rb);
1910 			rb = rb_first_cached(&execlists->virtual);
1911 			continue;
1912 		}
1913 
1914 		GEM_BUG_ON(rq != ve->request);
1915 		GEM_BUG_ON(rq->engine != &ve->base);
1916 		GEM_BUG_ON(rq->context != &ve->context);
1917 
1918 		if (rq_prio(rq) >= queue_prio(execlists)) {
1919 			if (!virtual_matches(ve, rq, engine)) {
1920 				spin_unlock(&ve->base.active.lock);
1921 				rb = rb_next(rb);
1922 				continue;
1923 			}
1924 
1925 			if (last && !can_merge_rq(last, rq)) {
1926 				spin_unlock(&ve->base.active.lock);
1927 				return; /* leave this for another */
1928 			}
1929 
1930 			ENGINE_TRACE(engine,
1931 				     "virtual rq=%llx:%lld%s, new engine? %s\n",
1932 				     rq->fence.context,
1933 				     rq->fence.seqno,
1934 				     i915_request_completed(rq) ? "!" :
1935 				     i915_request_started(rq) ? "*" :
1936 				     "",
1937 				     yesno(engine != ve->siblings[0]));
1938 
1939 			ve->request = NULL;
1940 			ve->base.execlists.queue_priority_hint = INT_MIN;
1941 			rb_erase_cached(rb, &execlists->virtual);
1942 			RB_CLEAR_NODE(rb);
1943 
1944 			GEM_BUG_ON(!(rq->execution_mask & engine->mask));
1945 			rq->engine = engine;
1946 
1947 			if (engine != ve->siblings[0]) {
1948 				u32 *regs = ve->context.lrc_reg_state;
1949 				unsigned int n;
1950 
1951 				GEM_BUG_ON(READ_ONCE(ve->context.inflight));
1952 
1953 				if (!intel_engine_has_relative_mmio(engine))
1954 					virtual_update_register_offsets(regs,
1955 									engine);
1956 
1957 				if (!list_empty(&ve->context.signals))
1958 					virtual_xfer_breadcrumbs(ve, engine);
1959 
1960 				/*
1961 				 * Move the bound engine to the top of the list
1962 				 * for future execution. We then kick this
1963 				 * tasklet first before checking others, so that
1964 				 * we preferentially reuse this set of bound
1965 				 * registers.
1966 				 */
1967 				for (n = 1; n < ve->num_siblings; n++) {
1968 					if (ve->siblings[n] == engine) {
1969 						swap(ve->siblings[n],
1970 						     ve->siblings[0]);
1971 						break;
1972 					}
1973 				}
1974 
1975 				GEM_BUG_ON(ve->siblings[0] != engine);
1976 			}
1977 
1978 			if (__i915_request_submit(rq)) {
1979 				submit = true;
1980 				last = rq;
1981 			}
1982 			i915_request_put(rq);
1983 
1984 			/*
1985 			 * Hmm, we have a bunch of virtual engine requests,
1986 			 * but the first one was already completed (thanks
1987 			 * preempt-to-busy!). Keep looking at the veng queue
1988 			 * until we have no more relevant requests (i.e.
1989 			 * the normal submit queue has higher priority).
1990 			 */
1991 			if (!submit) {
1992 				spin_unlock(&ve->base.active.lock);
1993 				rb = rb_first_cached(&execlists->virtual);
1994 				continue;
1995 			}
1996 		}
1997 
1998 		spin_unlock(&ve->base.active.lock);
1999 		break;
2000 	}
2001 
2002 	while ((rb = rb_first_cached(&execlists->queue))) {
2003 		struct i915_priolist *p = to_priolist(rb);
2004 		struct i915_request *rq, *rn;
2005 		int i;
2006 
2007 		priolist_for_each_request_consume(rq, rn, p, i) {
2008 			bool merge = true;
2009 
2010 			/*
2011 			 * Can we combine this request with the current port?
2012 			 * It has to be the same context/ringbuffer and not
2013 			 * have any exceptions (e.g. GVT saying never to
2014 			 * combine contexts).
2015 			 *
2016 			 * If we can combine the requests, we can execute both
2017 			 * by updating the RING_TAIL to point to the end of the
2018 			 * second request, and so we never need to tell the
2019 			 * hardware about the first.
2020 			 */
2021 			if (last && !can_merge_rq(last, rq)) {
2022 				/*
2023 				 * If we are on the second port and cannot
2024 				 * combine this request with the last, then we
2025 				 * are done.
2026 				 */
2027 				if (port == last_port)
2028 					goto done;
2029 
2030 				/*
2031 				 * We must not populate both ELSP[] with the
2032 				 * same LRCA, i.e. we must submit 2 different
2033 				 * contexts if we submit 2 ELSP.
2034 				 */
2035 				if (last->context == rq->context)
2036 					goto done;
2037 
2038 				if (i915_request_has_sentinel(last))
2039 					goto done;
2040 
2041 				/*
2042 				 * If GVT overrides us we only ever submit
2043 				 * port[0], leaving port[1] empty. Note that we
2044 				 * also have to be careful that we don't queue
2045 				 * the same context (even though a different
2046 				 * request) to the second port.
2047 				 */
2048 				if (ctx_single_port_submission(last->context) ||
2049 				    ctx_single_port_submission(rq->context))
2050 					goto done;
2051 
2052 				merge = false;
2053 			}
2054 
2055 			if (__i915_request_submit(rq)) {
2056 				if (!merge) {
2057 					*port = execlists_schedule_in(last, port - execlists->pending);
2058 					port++;
2059 					last = NULL;
2060 				}
2061 
2062 				GEM_BUG_ON(last &&
2063 					   !can_merge_ctx(last->context,
2064 							  rq->context));
2065 
2066 				submit = true;
2067 				last = rq;
2068 			}
2069 		}
2070 
2071 		rb_erase_cached(&p->node, &execlists->queue);
2072 		i915_priolist_free(p);
2073 	}
2074 
2075 done:
2076 	/*
2077 	 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2078 	 *
2079 	 * We choose the priority hint such that if we add a request of greater
2080 	 * priority than this, we kick the submission tasklet to decide on
2081 	 * the right order of submitting the requests to hardware. We must
2082 	 * also be prepared to reorder requests as they are in-flight on the
2083 	 * HW. We derive the priority hint then as the first "hole" in
2084 	 * the HW submission ports and if there are no available slots,
2085 	 * the priority of the lowest executing request, i.e. last.
2086 	 *
2087 	 * When we do receive a higher priority request ready to run from the
2088 	 * user, see queue_request(), the priority hint is bumped to that
2089 	 * request triggering preemption on the next dequeue (or subsequent
2090 	 * interrupt for secondary ports).
2091 	 */
2092 	execlists->queue_priority_hint = queue_prio(execlists);
2093 
2094 	if (submit) {
2095 		*port = execlists_schedule_in(last, port - execlists->pending);
2096 		execlists->switch_priority_hint =
2097 			switch_prio(engine, *execlists->pending);
2098 
2099 		/*
2100 		 * Skip if we ended up with exactly the same set of requests,
2101 		 * e.g. trying to timeslice a pair of ordered contexts
2102 		 */
2103 		if (!memcmp(execlists->active, execlists->pending,
2104 			    (port - execlists->pending + 1) * sizeof(*port))) {
2105 			do
2106 				execlists_schedule_out(fetch_and_zero(port));
2107 			while (port-- != execlists->pending);
2108 
2109 			goto skip_submit;
2110 		}
2111 		clear_ports(port + 1, last_port - port);
2112 
2113 		execlists_submit_ports(engine);
2114 		set_preempt_timeout(engine);
2115 	} else {
2116 skip_submit:
2117 		ring_set_paused(engine, 0);
2118 	}
2119 }
2120 
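/*
 * Release every request referenced by the ELSP ports (both the pending
 * submission and the inflight set) and reset the bookkeeping so that
 * execlists->active points at an empty inflight array. Used on the
 * reset/cancellation paths, where the HW is no longer processing these
 * contexts.
 */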
2121 static void
2122 cancel_port_requests(struct intel_engine_execlists * const execlists)
2123 {
2124 	struct i915_request * const *port;
2125 
2126 	for (port = execlists->pending; *port; port++)
2127 		execlists_schedule_out(*port);
2128 	clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2129 
2130 	/* Mark the end of active before we overwrite *active */
2131 	for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2132 		execlists_schedule_out(*port);
2133 	clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2134 
2135 	WRITE_ONCE(execlists->active, execlists->inflight);
2136 }
2137 
2138 static inline void
2139 invalidate_csb_entries(const u32 *first, const u32 *last)
2140 {
2141 	clflush((void *)first);
2142 	clflush((void *)last);
2143 }
2144 
2145 static inline bool
2146 reset_in_progress(const struct intel_engine_execlists *execlists)
2147 {
2148 	return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
2149 }
2150 
2151 /*
2152  * Starting with Gen12, the status has a new format:
2153  *
2154  *     bit  0:     switched to new queue
2155  *     bit  1:     reserved
2156  *     bit  2:     semaphore wait mode (poll or signal), only valid when
2157  *                 switch detail is set to "wait on semaphore"
2158  *     bits 3-5:   engine class
2159  *     bits 6-11:  engine instance
2160  *     bits 12-14: reserved
2161  *     bits 15-25: sw context id of the lrc the GT switched to
2162  *     bits 26-31: sw counter of the lrc the GT switched to
2163  *     bits 32-35: context switch detail
2164  *                  - 0: ctx complete
2165  *                  - 1: wait on sync flip
2166  *                  - 2: wait on vblank
2167  *                  - 3: wait on scanline
2168  *                  - 4: wait on semaphore
2169  *                  - 5: context preempted (not on SEMAPHORE_WAIT or
2170  *                       WAIT_FOR_EVENT)
2171  *     bit  36:    reserved
2172  *     bits 37-43: wait detail (for switch detail 1 to 4)
2173  *     bits 44-46: reserved
2174  *     bits 47-57: sw context id of the lrc the GT switched away from
2175  *     bits 58-63: sw counter of the lrc the GT switched away from
2176  */
2177 static inline bool
2178 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2179 {
2180 	u32 lower_dw = csb[0];
2181 	u32 upper_dw = csb[1];
2182 	bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
2183 	bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
2184 	bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2185 
2186 	/*
2187 	 * The context switch detail is not guaranteed to be 5 when a preemption
2188 	 * occurs, so we can't just check for that. The check below works for
2189 	 * all the cases we care about, including preemptions of WAIT
2190 	 * instructions and lite-restore. Preempt-to-idle via the CTRL register
2191 	 * would require some extra handling, but we don't support that.
2192 	 */
2193 	if (!ctx_away_valid || new_queue) {
2194 		GEM_BUG_ON(!ctx_to_valid);
2195 		return true;
2196 	}
2197 
2198 	/*
2199 	 * switch detail = 5 is covered by the case above and we do not expect a
2200 	 * context switch on an unsuccessful wait instruction since we always
2201 	 * use polling mode.
2202 	 */
2203 	GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
2204 	return false;
2205 }
2206 
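/*
 * Prior to Gen12, a CSB event promotes the pending ELSP submission if
 * the engine either began executing a new context (idle -> active) or
 * acknowledged our preemption request (preempted).
 */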
2207 static inline bool
2208 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2209 {
2210 	return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2211 }
2212 
2213 static void process_csb(struct intel_engine_cs *engine)
2214 {
2215 	struct intel_engine_execlists * const execlists = &engine->execlists;
2216 	const u32 * const buf = execlists->csb_status;
2217 	const u8 num_entries = execlists->csb_size;
2218 	u8 head, tail;
2219 
2220 	/*
2221 	 * As we modify our execlists state tracking we require exclusive
2222 	 * access. Either we are inside the tasklet, or the tasklet is disabled
2223 	 * and we assume that is only inside the reset paths and so serialised.
2224 	 */
2225 	GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2226 		   !reset_in_progress(execlists));
2227 	GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2228 
2229 	/*
2230 	 * Note that csb_write, csb_status may be either in HWSP or mmio.
2231 	 * When reading from the csb_write mmio register, we have to be
2232 	 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2233 	 * the low 4 bits. As it happens we know the next 4 bits are always
2234 	 * zero and so we can simply mask off the low u8 of the register
2235 	 * and treat it identically to reading from the HWSP (without having
2236 	 * to use explicit shifting and masking, and probably bifurcating
2237 	 * the code to handle the legacy mmio read).
2238 	 */
2239 	head = execlists->csb_head;
2240 	tail = READ_ONCE(*execlists->csb_write);
2241 	ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2242 	if (unlikely(head == tail))
2243 		return;
2244 
2245 	/*
2246 	 * Hopefully paired with a wmb() in HW!
2247 	 *
2248 	 * We must complete the read of the write pointer before any reads
2249 	 * from the CSB, so that we do not see stale values. Without an rmb
2250 	 * (lfence) the HW may speculatively perform the CSB[] reads *before*
2251 	 * we perform the READ_ONCE(*csb_write).
2252 	 */
2253 	rmb();
2254 
2255 	do {
2256 		bool promote;
2257 
2258 		if (++head == num_entries)
2259 			head = 0;
2260 
2261 		/*
2262 		 * We are flying near dragons again.
2263 		 *
2264 		 * We hold a reference to the request in execlist_port[]
2265 		 * but no more than that. We are operating in softirq
2266 		 * context and so cannot hold any mutex or sleep. That
2267 		 * context and so cannot hold any mutex or sleep. That
2268 		 * prevents us from stopping the requests we are processing
2269 		 * breadcrumb will be complete before we see the
2270 		 * context-switch). As we only hold the reference to the
2271 		 * request, any pointer chasing underneath the request
2272 		 * is subject to a potential use-after-free. Thus we
2273 		 * store all of the bookkeeping within port[] as
2274 		 * required, and avoid using unguarded pointers beneath
2275 		 * request itself. The same applies to the atomic
2276 		 * status notifier.
2277 		 */
2278 
2279 		ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2280 			     head, buf[2 * head + 0], buf[2 * head + 1]);
2281 
2282 		if (INTEL_GEN(engine->i915) >= 12)
2283 			promote = gen12_csb_parse(execlists, buf + 2 * head);
2284 		else
2285 			promote = gen8_csb_parse(execlists, buf + 2 * head);
2286 		if (promote) {
2287 			struct i915_request * const *old = execlists->active;
2288 
2289 			/* Point active to the new ELSP; prevent overwriting */
2290 			WRITE_ONCE(execlists->active, execlists->pending);
2291 
2292 			if (!inject_preempt_hang(execlists))
2293 				ring_set_paused(engine, 0);
2294 
2295 			/* cancel old inflight, prepare for switch */
2296 			trace_ports(execlists, "preempted", old);
2297 			while (*old)
2298 				execlists_schedule_out(*old++);
2299 
2300 			/* switch pending to inflight */
2301 			GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2302 			WRITE_ONCE(execlists->active,
2303 				   memcpy(execlists->inflight,
2304 					  execlists->pending,
2305 					  execlists_num_ports(execlists) *
2306 					  sizeof(*execlists->pending)));
2307 
2308 			WRITE_ONCE(execlists->pending[0], NULL);
2309 		} else {
2310 			GEM_BUG_ON(!*execlists->active);
2311 
2312 			/* port0 completed, advanced to port1 */
2313 			trace_ports(execlists, "completed", execlists->active);
2314 
2315 			/*
2316 			 * We rely on the hardware being strongly
2317 			 * ordered, that the breadcrumb write is
2318 			 * coherent (visible from the CPU) before the
2319 			 * user interrupt and CSB is processed.
2320 			 */
2321 			GEM_BUG_ON(!i915_request_completed(*execlists->active) &&
2322 				   !reset_in_progress(execlists));
2323 			execlists_schedule_out(*execlists->active++);
2324 
2325 			GEM_BUG_ON(execlists->active - execlists->inflight >
2326 				   execlists_num_ports(execlists));
2327 		}
2328 	} while (head != tail);
2329 
2330 	execlists->csb_head = head;
2331 	set_timeslice(engine);
2332 
2333 	/*
2334 	 * Gen11 has proven to fail wrt global observation point between
2335 	 * entry and tail update, failing on the ordering and thus
2336 	 * we see an old entry in the context status buffer.
2337 	 *
2338 	 * Forcibly evict the entries before the next gpu csb update, to
2339 	 * increase the odds that we get fresh entries even on misbehaving
2340 	 * hardware. The cost of doing so comes out mostly in the wash, as
2341 	 * the hardware, working or not, will need to invalidate the
2342 	 * cachelines before writing anyway.
2343 	 */
2344 	invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2345 }
2346 
2347 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2348 {
2349 	lockdep_assert_held(&engine->active.lock);
2350 	if (!engine->execlists.pending[0]) {
2351 		rcu_read_lock(); /* protect peeking at execlists->active */
2352 		execlists_dequeue(engine);
2353 		rcu_read_unlock();
2354 	}
2355 }
2356 
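/*
 * Move the request, and every ready waiter of it on this engine, onto
 * engine->active.hold so that none of them can be resubmitted to HW
 * while we inspect them (e.g. for error capture). The dependency chain
 * is walked iteratively via a local list to avoid unbounded recursion.
 */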
2357 static void __execlists_hold(struct i915_request *rq)
2358 {
2359 	LIST_HEAD(list);
2360 
2361 	do {
2362 		struct i915_dependency *p;
2363 
2364 		if (i915_request_is_active(rq))
2365 			__i915_request_unsubmit(rq);
2366 
2367 		RQ_TRACE(rq, "on hold\n");
2368 		clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2369 		list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2370 		i915_request_set_hold(rq);
2371 
2372 		list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
2373 			struct i915_request *w =
2374 				container_of(p->waiter, typeof(*w), sched);
2375 
2376 			/* Leave semaphores spinning on the other engines */
2377 			if (w->engine != rq->engine)
2378 				continue;
2379 
2380 			if (!i915_request_is_ready(w))
2381 				continue;
2382 
2383 			if (i915_request_completed(w))
2384 				continue;
2385 
2386 			if (i915_request_on_hold(w))
2387 				continue;
2388 
2389 			list_move_tail(&w->sched.link, &list);
2390 		}
2391 
2392 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2393 	} while (rq);
2394 }
2395 
2396 static bool execlists_hold(struct intel_engine_cs *engine,
2397 			   struct i915_request *rq)
2398 {
2399 	spin_lock_irq(&engine->active.lock);
2400 
2401 	if (i915_request_completed(rq)) { /* too late! */
2402 		rq = NULL;
2403 		goto unlock;
2404 	}
2405 
2406 	if (rq->engine != engine) { /* preempted virtual engine */
2407 		struct virtual_engine *ve = to_virtual_engine(rq->engine);
2408 
2409 		/*
2410 		 * intel_context_inflight() is only protected by virtue
2411 		 * of process_csb() being called only by the tasklet (or
2412 		 * directly from inside reset while the tasklet is suspended).
2413 		 * Assert that neither of those are allowed to run while we
2414 		 * poke at the request queues.
2415 		 */
2416 		GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2417 
2418 		/*
2419 		 * An unsubmitted request along a virtual engine will
2420 		 * remain on the active (this) engine until we are able
2421 		 * to process the context switch away (and so mark the
2422 		 * context as no longer in flight). That cannot have happened
2423 		 * yet, otherwise we would not be hanging!
2424 		 */
2425 		spin_lock(&ve->base.active.lock);
2426 		GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2427 		GEM_BUG_ON(ve->request != rq);
2428 		ve->request = NULL;
2429 		spin_unlock(&ve->base.active.lock);
2430 		i915_request_put(rq);
2431 
2432 		rq->engine = engine;
2433 	}
2434 
2435 	/*
2436 	 * Transfer this request onto the hold queue to prevent it
2437 	 * being resubmitted to HW (and potentially completed) before we have
2438 	 * released it. Since we may have already submitted following
2439 	 * requests, we need to remove those as well.
2440 	 */
2441 	GEM_BUG_ON(i915_request_on_hold(rq));
2442 	GEM_BUG_ON(rq->engine != engine);
2443 	__execlists_hold(rq);
2444 
2445 unlock:
2446 	spin_unlock_irq(&engine->active.lock);
2447 	return rq;
2448 }
2449 
2450 static bool hold_request(const struct i915_request *rq)
2451 {
2452 	struct i915_dependency *p;
2453 
2454 	/*
2455 	 * If one of our ancestors is on hold, we must also be on hold,
2456 	 * otherwise we will bypass it and execute before it.
2457 	 */
2458 	list_for_each_entry(p, &rq->sched.signalers_list, signal_link) {
2459 		const struct i915_request *s =
2460 			container_of(p->signaler, typeof(*s), sched);
2461 
2462 		if (s->engine != rq->engine)
2463 			continue;
2464 
2465 		if (i915_request_on_hold(s))
2466 			return true;
2467 	}
2468 
2469 	return false;
2470 }
2471 
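/*
 * Return the request to the priority queue for resubmission, then
 * release any of its waiters on this engine that have no other parent
 * still on hold.
 */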
2472 static void __execlists_unhold(struct i915_request *rq)
2473 {
2474 	LIST_HEAD(list);
2475 
2476 	do {
2477 		struct i915_dependency *p;
2478 
2479 		GEM_BUG_ON(!i915_request_on_hold(rq));
2480 		GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2481 
2482 		i915_request_clear_hold(rq);
2483 		list_move_tail(&rq->sched.link,
2484 			       i915_sched_lookup_priolist(rq->engine,
2485 							  rq_prio(rq)));
2486 		set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2487 		RQ_TRACE(rq, "hold release\n");
2488 
2489 		/* Also release any children on this engine that are ready */
2490 		list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
2491 			struct i915_request *w =
2492 				container_of(p->waiter, typeof(*w), sched);
2493 
2494 			if (w->engine != rq->engine)
2495 				continue;
2496 
2497 			if (!i915_request_on_hold(w))
2498 				continue;
2499 
2500 			/* Check that no other parents are also on hold */
2501 			if (hold_request(w))
2502 				continue;
2503 
2504 			list_move_tail(&w->sched.link, &list);
2505 		}
2506 
2507 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2508 	} while (rq);
2509 }
2510 
2511 static void execlists_unhold(struct intel_engine_cs *engine,
2512 			     struct i915_request *rq)
2513 {
2514 	spin_lock_irq(&engine->active.lock);
2515 
2516 	/*
2517 	 * Move this request back to the priority queue, and all of its
2518 	 * children and grandchildren that were suspended along with it.
2519 	 */
2520 	__execlists_unhold(rq);
2521 
2522 	if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
2523 		engine->execlists.queue_priority_hint = rq_prio(rq);
2524 		tasklet_hi_schedule(&engine->execlists.tasklet);
2525 	}
2526 
2527 	spin_unlock_irq(&engine->active.lock);
2528 }
2529 
2530 struct execlists_capture {
2531 	struct work_struct work;
2532 	struct i915_request *rq;
2533 	struct i915_gpu_coredump *error;
2534 };
2535 
2536 static void execlists_capture_work(struct work_struct *work)
2537 {
2538 	struct execlists_capture *cap = container_of(work, typeof(*cap), work);
2539 	const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
2540 	struct intel_engine_cs *engine = cap->rq->engine;
2541 	struct intel_gt_coredump *gt = cap->error->gt;
2542 	struct intel_engine_capture_vma *vma;
2543 
2544 	/* Compress all the objects attached to the request, slow! */
2545 	vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
2546 	if (vma) {
2547 		struct i915_vma_compress *compress =
2548 			i915_vma_capture_prepare(gt);
2549 
2550 		intel_engine_coredump_add_vma(gt->engine, vma, compress);
2551 		i915_vma_capture_finish(gt, compress);
2552 	}
2553 
2554 	gt->simulated = gt->engine->simulated;
2555 	cap->error->simulated = gt->simulated;
2556 
2557 	/* Publish the error state, and announce it to the world */
2558 	i915_error_state_store(cap->error);
2559 	i915_gpu_coredump_put(cap->error);
2560 
2561 	/* Return this request and all that depend upon it for signaling */
2562 	execlists_unhold(engine, cap->rq);
2563 	i915_request_put(cap->rq);
2564 
2565 	kfree(cap);
2566 }
2567 
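/*
 * Allocate the coredump buffers for the hung engine. We are called from
 * the submission tasklet (softirq) while delaying the forced preemption
 * event, so only atomic allocations can be used here; the slow
 * compression of the captured objects is deferred to
 * execlists_capture_work().
 */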
2568 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
2569 {
2570 	const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
2571 	struct execlists_capture *cap;
2572 
2573 	cap = kmalloc(sizeof(*cap), gfp);
2574 	if (!cap)
2575 		return NULL;
2576 
2577 	cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
2578 	if (!cap->error)
2579 		goto err_cap;
2580 
2581 	cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
2582 	if (!cap->error->gt)
2583 		goto err_gpu;
2584 
2585 	cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
2586 	if (!cap->error->gt->engine)
2587 		goto err_gt;
2588 
2589 	return cap;
2590 
2591 err_gt:
2592 	kfree(cap->error->gt);
2593 err_gpu:
2594 	kfree(cap->error);
2595 err_cap:
2596 	kfree(cap);
2597 	return NULL;
2598 }
2599 
2600 static bool execlists_capture(struct intel_engine_cs *engine)
2601 {
2602 	struct execlists_capture *cap;
2603 
2604 	if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
2605 		return true;
2606 
2607 	/*
2608 	 * We need to _quickly_ capture the engine state before we reset.
2609 	 * We are inside an atomic section (softirq) here and we are delaying
2610 	 * the forced preemption event.
2611 	 */
2612 	cap = capture_regs(engine);
2613 	if (!cap)
2614 		return true;
2615 
2616 	cap->rq = execlists_active(&engine->execlists);
2617 	GEM_BUG_ON(!cap->rq);
2618 
2619 	rcu_read_lock();
2620 	cap->rq = active_request(cap->rq->context->timeline, cap->rq);
2621 	cap->rq = i915_request_get_rcu(cap->rq);
2622 	rcu_read_unlock();
2623 	if (!cap->rq)
2624 		goto err_free;
2625 
2626 	/*
2627 	 * Remove the request from the execlists queue, and take ownership
2628 	 * of the request. We pass it to our worker who will _slowly_ compress
2629 	 * all the pages the _user_ requested for debugging their batch, after
2630 	 * which we return it to the queue for signaling.
2631 	 *
2632 	 * By removing them from the execlists queue, we also remove the
2633 	 * requests from being processed by __unwind_incomplete_requests()
2634 	 * during the intel_engine_reset(), and so they will *not* be replayed
2635 	 * afterwards.
2636 	 *
2637 	 * Note that because we have not yet reset the engine at this point,
2638 	 * it is possible that the request we have identified as being
2639 	 * guilty did in fact complete and we will then hit an arbitration
2640 	 * point allowing the outstanding preemption to succeed. The likelihood
2641 	 * of that is very low (as capturing of the engine registers should be
2642 	 * fast enough to run inside an irq-off atomic section!), so we will
2643 	 * simply hold that request accountable for being non-preemptible
2644 	 * long enough to force the reset.
2645 	 */
2646 	if (!execlists_hold(engine, cap->rq))
2647 		goto err_rq;
2648 
2649 	INIT_WORK(&cap->work, execlists_capture_work);
2650 	schedule_work(&cap->work);
2651 	return true;
2652 
2653 err_rq:
2654 	i915_request_put(cap->rq);
2655 err_free:
2656 	i915_gpu_coredump_put(cap->error);
2657 	kfree(cap);
2658 	return false;
2659 }
2660 
2661 static noinline void preempt_reset(struct intel_engine_cs *engine)
2662 {
2663 	const unsigned int bit = I915_RESET_ENGINE + engine->id;
2664 	unsigned long *lock = &engine->gt->reset.flags;
2665 
2666 	if (i915_modparams.reset < 3)
2667 		return;
2668 
2669 	if (test_and_set_bit(bit, lock))
2670 		return;
2671 
2672 	/* Mark this tasklet as disabled to avoid waiting for it to complete */
2673 	tasklet_disable_nosync(&engine->execlists.tasklet);
2674 
2675 	ENGINE_TRACE(engine, "preempt timeout %lu+%ums\n",
2676 		     READ_ONCE(engine->props.preempt_timeout_ms),
2677 		     jiffies_to_msecs(jiffies - engine->execlists.preempt.expires));
2678 
2679 	ring_set_paused(engine, 1); /* Freeze the current request in place */
2680 	if (execlists_capture(engine))
2681 		intel_engine_reset(engine, "preemption time out");
2682 	else
2683 		ring_set_paused(engine, 0);
2684 
2685 	tasklet_enable(&engine->execlists.tasklet);
2686 	clear_and_wake_up_bit(bit, lock);
2687 }
2688 
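/*
 * Returns true when the forced-preemption timer has expired while our
 * last ELSP submission is still awaiting its CSB acknowledgement, i.e.
 * the currently executing context has failed to yield in time.
 */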
2689 static bool preempt_timeout(const struct intel_engine_cs *const engine)
2690 {
2691 	const struct timer_list *t = &engine->execlists.preempt;
2692 
2693 	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
2694 		return false;
2695 
2696 	if (!timer_expired(t))
2697 		return false;
2698 
2699 	return READ_ONCE(engine->execlists.pending[0]);
2700 }
2701 
2702 /*
2703  * Check the unread Context Status Buffers and manage the submission of new
2704  * contexts to the ELSP accordingly.
2705  */
2706 static void execlists_submission_tasklet(unsigned long data)
2707 {
2708 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
2709 	bool timeout = preempt_timeout(engine);
2710 
2711 	process_csb(engine);
2712 	if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
2713 		unsigned long flags;
2714 
2715 		spin_lock_irqsave(&engine->active.lock, flags);
2716 		__execlists_submission_tasklet(engine);
2717 		spin_unlock_irqrestore(&engine->active.lock, flags);
2718 
2719 		/* Recheck after serialising with direct-submission */
2720 		if (timeout && preempt_timeout(engine))
2721 			preempt_reset(engine);
2722 	}
2723 }
2724 
2725 static void __execlists_kick(struct intel_engine_execlists *execlists)
2726 {
2727 	/* Kick the tasklet for some interrupt coalescing and reset handling */
2728 	tasklet_hi_schedule(&execlists->tasklet);
2729 }
2730 
2731 #define execlists_kick(t, member) \
2732 	__execlists_kick(container_of(t, struct intel_engine_execlists, member))
2733 
2734 static void execlists_timeslice(struct timer_list *timer)
2735 {
2736 	execlists_kick(timer, timer);
2737 }
2738 
2739 static void execlists_preempt(struct timer_list *timer)
2740 {
2741 	execlists_kick(timer, preempt);
2742 }
2743 
2744 static void queue_request(struct intel_engine_cs *engine,
2745 			  struct i915_request *rq)
2746 {
2747 	GEM_BUG_ON(!list_empty(&rq->sched.link));
2748 	list_add_tail(&rq->sched.link,
2749 		      i915_sched_lookup_priolist(engine, rq_prio(rq)));
2750 	set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2751 }
2752 
2753 static void __submit_queue_imm(struct intel_engine_cs *engine)
2754 {
2755 	struct intel_engine_execlists * const execlists = &engine->execlists;
2756 
2757 	if (reset_in_progress(execlists))
2758 		return; /* defer until we restart the engine following reset */
2759 
2760 	if (execlists->tasklet.func == execlists_submission_tasklet)
2761 		__execlists_submission_tasklet(engine);
2762 	else
2763 		tasklet_hi_schedule(&execlists->tasklet);
2764 }
2765 
2766 static void submit_queue(struct intel_engine_cs *engine,
2767 			 const struct i915_request *rq)
2768 {
2769 	struct intel_engine_execlists *execlists = &engine->execlists;
2770 
2771 	if (rq_prio(rq) <= execlists->queue_priority_hint)
2772 		return;
2773 
2774 	execlists->queue_priority_hint = rq_prio(rq);
2775 	__submit_queue_imm(engine);
2776 }
2777 
2778 static bool ancestor_on_hold(const struct intel_engine_cs *engine,
2779 			     const struct i915_request *rq)
2780 {
2781 	GEM_BUG_ON(i915_request_on_hold(rq));
2782 	return !list_empty(&engine->active.hold) && hold_request(rq);
2783 }
2784 
2785 static void execlists_submit_request(struct i915_request *request)
2786 {
2787 	struct intel_engine_cs *engine = request->engine;
2788 	unsigned long flags;
2789 
2790 	/* Will be called from irq-context when using foreign fences. */
2791 	spin_lock_irqsave(&engine->active.lock, flags);
2792 
2793 	if (unlikely(ancestor_on_hold(engine, request))) {
2794 		list_add_tail(&request->sched.link, &engine->active.hold);
2795 		i915_request_set_hold(request);
2796 	} else {
2797 		queue_request(engine, request);
2798 
2799 		GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
2800 		GEM_BUG_ON(list_empty(&request->sched.link));
2801 
2802 		submit_queue(engine, request);
2803 	}
2804 
2805 	spin_unlock_irqrestore(&engine->active.lock, flags);
2806 }
2807 
2808 static void __execlists_context_fini(struct intel_context *ce)
2809 {
2810 	intel_ring_put(ce->ring);
2811 	i915_vma_put(ce->state);
2812 }
2813 
2814 static void execlists_context_destroy(struct kref *kref)
2815 {
2816 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
2817 
2818 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
2819 	GEM_BUG_ON(intel_context_is_pinned(ce));
2820 
2821 	if (ce->state)
2822 		__execlists_context_fini(ce);
2823 
2824 	intel_context_fini(ce);
2825 	intel_context_free(ce);
2826 }
2827 
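/*
 * With CONFIG_DRM_I915_DEBUG_GEM enabled, poison the page following the
 * context image so that we can later detect anything writing beyond the
 * expected context size (see check_redzone()).
 */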
2828 static void
2829 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
2830 {
2831 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2832 		return;
2833 
2834 	vaddr += engine->context_size;
2835 
2836 	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
2837 }
2838 
2839 static void
2840 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
2841 {
2842 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2843 		return;
2844 
2845 	vaddr += engine->context_size;
2846 
2847 	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
2848 		dev_err_once(engine->i915->drm.dev,
2849 			     "%s context redzone overwritten!\n",
2850 			     engine->name);
2851 }
2852 
2853 static void execlists_context_unpin(struct intel_context *ce)
2854 {
2855 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE,
2856 		      ce->engine);
2857 
2858 	i915_gem_object_unpin_map(ce->state->obj);
2859 }
2860 
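/*
 * Copy our CPU-side ring bookkeeping (RING_START/HEAD/TAIL and, for the
 * render class, the RPCS and OA register state) into the context image
 * so that the values take effect on the next context restore.
 */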
2861 static void
2862 __execlists_update_reg_state(const struct intel_context *ce,
2863 			     const struct intel_engine_cs *engine)
2864 {
2865 	struct intel_ring *ring = ce->ring;
2866 	u32 *regs = ce->lrc_reg_state;
2867 
2868 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head));
2869 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
2870 
2871 	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
2872 	regs[CTX_RING_HEAD] = ring->head;
2873 	regs[CTX_RING_TAIL] = ring->tail;
2874 
2875 	/* RPCS */
2876 	if (engine->class == RENDER_CLASS) {
2877 		regs[CTX_R_PWR_CLK_STATE] =
2878 			intel_sseu_make_rpcs(engine->i915, &ce->sseu);
2879 
2880 		i915_oa_init_reg_state(ce, engine);
2881 	}
2882 }
2883 
2884 static int
2885 __execlists_context_pin(struct intel_context *ce,
2886 			struct intel_engine_cs *engine)
2887 {
2888 	void *vaddr;
2889 
2890 	GEM_BUG_ON(!ce->state);
2891 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
2892 
2893 	vaddr = i915_gem_object_pin_map(ce->state->obj,
2894 					i915_coherent_map_type(engine->i915) |
2895 					I915_MAP_OVERRIDE);
2896 	if (IS_ERR(vaddr))
2897 		return PTR_ERR(vaddr);
2898 
2899 	ce->lrc_desc = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
2900 	ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
2901 	__execlists_update_reg_state(ce, engine);
2902 
2903 	return 0;
2904 }
2905 
2906 static int execlists_context_pin(struct intel_context *ce)
2907 {
2908 	return __execlists_context_pin(ce, ce->engine);
2909 }
2910 
2911 static int execlists_context_alloc(struct intel_context *ce)
2912 {
2913 	return __execlists_context_alloc(ce, ce->engine);
2914 }
2915 
2916 static void execlists_context_reset(struct intel_context *ce)
2917 {
2918 	CE_TRACE(ce, "reset\n");
2919 	GEM_BUG_ON(!intel_context_is_pinned(ce));
2920 
2921 	/*
2922 	 * Because we emit WA_TAIL_DWORDS there may be a disparity
2923 	 * between our bookkeeping in ce->ring->head and ce->ring->tail and
2924 	 * that stored in context. As we only write new commands from
2925 	 * ce->ring->tail onwards, everything before that is junk. If the GPU
2926 	 * starts reading from the RING_HEAD stored in the context, it may
2927 	 * try to execute that junk and die.
2928 	 *
2929 	 * The contexts that are still pinned on resume belong to the
2930 	 * kernel, and are local to each engine. All other contexts will
2931 	 * have their head/tail sanitized upon pinning before use, so they
2932 	 * will never see garbage.
2933 	 *
2934 	 * So to avoid that we reset the context images upon resume. For
2935 	 * simplicity, we just zero everything out.
2936 	 */
2937 	intel_ring_reset(ce->ring, ce->ring->emit);
2938 
2939 	/* Scrub away the garbage */
2940 	execlists_init_reg_state(ce->lrc_reg_state,
2941 				 ce, ce->engine, ce->ring, true);
2942 	__execlists_update_reg_state(ce, ce->engine);
2943 
2944 	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
2945 }
2946 
2947 static const struct intel_context_ops execlists_context_ops = {
2948 	.alloc = execlists_context_alloc,
2949 
2950 	.pin = execlists_context_pin,
2951 	.unpin = execlists_context_unpin,
2952 
2953 	.enter = intel_context_enter_engine,
2954 	.exit = intel_context_exit_engine,
2955 
2956 	.reset = execlists_context_reset,
2957 	.destroy = execlists_context_destroy,
2958 };
2959 
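/*
 * Emit the initial breadcrumb at the head of each request: an
 * arbitration point at which we may be preempted before the payload
 * begins, followed by a write of fence.seqno-1 to the timeline HWSP so
 * that i915_request_started() reports true once the payload is reached.
 */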
2960 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
2961 {
2962 	u32 *cs;
2963 
2964 	GEM_BUG_ON(!i915_request_timeline(rq)->has_initial_breadcrumb);
2965 
2966 	cs = intel_ring_begin(rq, 6);
2967 	if (IS_ERR(cs))
2968 		return PTR_ERR(cs);
2969 
2970 	/*
2971 	 * Check if we have been preempted before we even get started.
2972 	 *
2973 	 * After this point i915_request_started() reports true, even if
2974 	 * we get preempted and so are no longer running.
2975 	 */
2976 	*cs++ = MI_ARB_CHECK;
2977 	*cs++ = MI_NOOP;
2978 
2979 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
2980 	*cs++ = i915_request_timeline(rq)->hwsp_offset;
2981 	*cs++ = 0;
2982 	*cs++ = rq->fence.seqno - 1;
2983 
2984 	intel_ring_advance(rq, cs);
2985 
2986 	/* Record the updated position of the request's payload */
2987 	rq->infix = intel_ring_offset(rq, cs);
2988 
2989 	return 0;
2990 }
2991 
2992 static int execlists_request_alloc(struct i915_request *request)
2993 {
2994 	int ret;
2995 
2996 	GEM_BUG_ON(!intel_context_is_pinned(request->context));
2997 
2998 	/*
2999 	 * Flush enough space to reduce the likelihood of waiting after
3000 	 * we start building the request - in which case we will just
3001 	 * have to repeat work.
3002 	 */
3003 	request->reserved_space += EXECLISTS_REQUEST_SIZE;
3004 
3005 	/*
3006 	 * Note that after this point, we have committed to using
3007 	 * this request as it is being used to both track the
3008 	 * state of engine initialisation and liveness of the
3009 	 * golden renderstate above. Think twice before you try
3010 	 * to cancel/unwind this request now.
3011 	 */
3012 
3013 	/* Unconditionally invalidate GPU caches and TLBs. */
3014 	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3015 	if (ret)
3016 		return ret;
3017 
3018 	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3019 	return 0;
3020 }
3021 
3022 /*
3023  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
3024  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
3025  * but there is a slight complication as this is applied in WA batch where the
3026  * values are only initialized once so we cannot take register value at the
3027  * beginning and reuse it further; hence we save its value to memory, upload a
3028  * constant value with bit21 set and then we restore it back with the saved value.
3029  * To simplify the WA, a constant value is formed by using the default value
3030  * of this register. This shouldn't be a problem because we are only modifying
3031  * it for a short period and this batch is non-preemptible. We can of course
3032  * use additional instructions that read the actual value of the register
3033  * at that time and set our bit of interest but it makes the WA complicated.
3034  *
3035  * This WA is also required for Gen9 so extracting as a function avoids
3036  * code duplication.
3037  */
3038 static u32 *
3039 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3040 {
3041 	/* NB no one else is allowed to scribble over scratch + 256! */
3042 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3043 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3044 	*batch++ = intel_gt_scratch_offset(engine->gt,
3045 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3046 	*batch++ = 0;
3047 
3048 	*batch++ = MI_LOAD_REGISTER_IMM(1);
3049 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3050 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3051 
3052 	batch = gen8_emit_pipe_control(batch,
3053 				       PIPE_CONTROL_CS_STALL |
3054 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
3055 				       0);
3056 
3057 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3058 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3059 	*batch++ = intel_gt_scratch_offset(engine->gt,
3060 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3061 	*batch++ = 0;
3062 
3063 	return batch;
3064 }
3065 
3066 /*
3067  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
3068  * initialized at the beginning and shared across all contexts but this field
3069  * helps us to have multiple batches at different offsets and select them based
3070  * on a criteria. At the moment this batch always starts at the beginning of the page
3071  * and at this point we don't have multiple wa_ctx batch buffers.
3072  *
3073  * The number of WA applied is not known at the beginning; we use this field
3074  * to return the number of DWORDS written.
3075  *
3076  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
3077  * so it adds NOOPs as padding to make it cacheline aligned.
3078  * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
3079  * make a complete batch buffer.
3080  */
3081 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3082 {
3083 	/* WaDisableCtxRestoreArbitration:bdw,chv */
3084 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3085 
3086 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
3087 	if (IS_BROADWELL(engine->i915))
3088 		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3089 
3090 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
3091 	/* Actual scratch location is at 128 bytes offset */
3092 	batch = gen8_emit_pipe_control(batch,
3093 				       PIPE_CONTROL_FLUSH_L3 |
3094 				       PIPE_CONTROL_STORE_DATA_INDEX |
3095 				       PIPE_CONTROL_CS_STALL |
3096 				       PIPE_CONTROL_QW_WRITE,
3097 				       LRC_PPHWSP_SCRATCH_ADDR);
3098 
3099 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3100 
3101 	/* Pad to end of cacheline */
3102 	while ((unsigned long)batch % CACHELINE_BYTES)
3103 		*batch++ = MI_NOOP;
3104 
3105 	/*
3106 	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
3107 	 * execution depends on the length specified in terms of cache lines
3108 	 * in the register CTX_RCS_INDIRECT_CTX
3109 	 */
3110 
3111 	return batch;
3112 }
3113 
3114 struct lri {
3115 	i915_reg_t reg;
3116 	u32 value;
3117 };
3118 
3119 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
3120 {
3121 	GEM_BUG_ON(!count || count > 63);
3122 
3123 	*batch++ = MI_LOAD_REGISTER_IMM(count);
3124 	do {
3125 		*batch++ = i915_mmio_reg_offset(lri->reg);
3126 		*batch++ = lri->value;
3127 	} while (lri++, --count);
3128 	*batch++ = MI_NOOP;
3129 
3130 	return batch;
3131 }
3132 
3133 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3134 {
3135 	static const struct lri lri[] = {
3136 		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
3137 		{
3138 			COMMON_SLICE_CHICKEN2,
3139 			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
3140 				       0),
3141 		},
3142 
3143 		/* BSpec: 11391 */
3144 		{
3145 			FF_SLICE_CHICKEN,
3146 			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
3147 				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
3148 		},
3149 
3150 		/* BSpec: 11299 */
3151 		{
3152 			_3D_CHICKEN3,
3153 			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
3154 				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
3155 		}
3156 	};
3157 
3158 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3159 
3160 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
3161 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3162 
3163 	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
3164 	batch = gen8_emit_pipe_control(batch,
3165 				       PIPE_CONTROL_FLUSH_L3 |
3166 				       PIPE_CONTROL_STORE_DATA_INDEX |
3167 				       PIPE_CONTROL_CS_STALL |
3168 				       PIPE_CONTROL_QW_WRITE,
3169 				       LRC_PPHWSP_SCRATCH_ADDR);
3170 
3171 	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
3172 
3173 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
3174 	if (HAS_POOLED_EU(engine->i915)) {
3175 		/*
3176 		 * EU pool configuration is set up along with the golden context
3177 		 * during context initialization. This value depends on the
3178 		 * device type (2x6 or 3x6) and needs to be updated based
3179 		 * on which subslice is disabled, especially for 2x6
3180 		 * devices. However, it is safe to load the default
3181 		 * configuration of a 3x6 device instead of masking off the
3182 		 * corresponding bits, because the HW ignores bits of a disabled
3183 		 * subslice and drops down to appropriate config. Please
3184 		 * see render_state_setup() in i915_gem_render_state.c for
3185 		 * possible configurations, to avoid duplication they are
3186 		 * not shown here again.
3187 		 */
3188 		*batch++ = GEN9_MEDIA_POOL_STATE;
3189 		*batch++ = GEN9_MEDIA_POOL_ENABLE;
3190 		*batch++ = 0x00777000;
3191 		*batch++ = 0;
3192 		*batch++ = 0;
3193 		*batch++ = 0;
3194 	}
3195 
3196 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3197 
3198 	/* Pad to end of cacheline */
3199 	while ((unsigned long)batch % CACHELINE_BYTES)
3200 		*batch++ = MI_NOOP;
3201 
3202 	return batch;
3203 }
3204 
3205 static u32 *
3206 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3207 {
3208 	int i;
3209 
3210 	/*
3211 	 * WaPipeControlBefore3DStateSamplePattern: cnl
3212 	 *
3213 	 * Ensure the engine is idle prior to programming a
3214 	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
3215 	 */
3216 	batch = gen8_emit_pipe_control(batch,
3217 				       PIPE_CONTROL_CS_STALL,
3218 				       0);
3219 	/*
3220 	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
3221 	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
3222 	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
3223 	 * confusing. Since gen8_emit_pipe_control() already advances the
3224 	 * batch by 6 dwords, we advance the other 10 here, completing a
3225 	 * cacheline. It's not clear if the workaround requires this padding
3226 	 * before other commands, or if it's just the regular padding we would
3227 	 * already have for the workaround bb, so leave it here for now.
3228 	 */
3229 	for (i = 0; i < 10; i++)
3230 		*batch++ = MI_NOOP;
3231 
3232 	/* Pad to end of cacheline */
3233 	while ((unsigned long)batch % CACHELINE_BYTES)
3234 		*batch++ = MI_NOOP;
3235 
3236 	return batch;
3237 }
3238 
3239 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
3240 
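/*
 * Allocate and pin a single page in the GGTT to hold the per-engine
 * workaround batch buffers (indirect_ctx and per_ctx).
 */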
3241 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
3242 {
3243 	struct drm_i915_gem_object *obj;
3244 	struct i915_vma *vma;
3245 	int err;
3246 
3247 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
3248 	if (IS_ERR(obj))
3249 		return PTR_ERR(obj);
3250 
3251 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
3252 	if (IS_ERR(vma)) {
3253 		err = PTR_ERR(vma);
3254 		goto err;
3255 	}
3256 
3257 	err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
3258 	if (err)
3259 		goto err;
3260 
3261 	engine->wa_ctx.vma = vma;
3262 	return 0;
3263 
3264 err:
3265 	i915_gem_object_put(obj);
3266 	return err;
3267 }
3268 
3269 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
3270 {
3271 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
3272 }
3273 
3274 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
3275 
3276 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
3277 {
3278 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3279 	struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
3280 					    &wa_ctx->per_ctx };
3281 	wa_bb_func_t wa_bb_fn[2];
3282 	struct page *page;
3283 	void *batch, *batch_ptr;
3284 	unsigned int i;
3285 	int ret;
3286 
3287 	if (engine->class != RENDER_CLASS)
3288 		return 0;
3289 
3290 	switch (INTEL_GEN(engine->i915)) {
3291 	case 12:
3292 	case 11:
3293 		return 0;
3294 	case 10:
3295 		wa_bb_fn[0] = gen10_init_indirectctx_bb;
3296 		wa_bb_fn[1] = NULL;
3297 		break;
3298 	case 9:
3299 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
3300 		wa_bb_fn[1] = NULL;
3301 		break;
3302 	case 8:
3303 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
3304 		wa_bb_fn[1] = NULL;
3305 		break;
3306 	default:
3307 		MISSING_CASE(INTEL_GEN(engine->i915));
3308 		return 0;
3309 	}
3310 
3311 	ret = lrc_setup_wa_ctx(engine);
3312 	if (ret) {
3313 		DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
3314 		return ret;
3315 	}
3316 
3317 	page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
3318 	batch = batch_ptr = kmap_atomic(page);
3319 
3320 	/*
3321 	 * Emit the two workaround batch buffers, recording the offset from the
3322 	 * start of the workaround batch buffer object for each and their
3323 	 * respective sizes.
3324 	 */
3325 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
3326 		wa_bb[i]->offset = batch_ptr - batch;
3327 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
3328 						  CACHELINE_BYTES))) {
3329 			ret = -EINVAL;
3330 			break;
3331 		}
3332 		if (wa_bb_fn[i])
3333 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
3334 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
3335 	}
3336 
3337 	BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
3338 
3339 	kunmap_atomic(batch);
3340 	if (ret)
3341 		lrc_destroy_wa_ctx(engine);
3342 
3343 	return ret;
3344 }
3345 
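/*
 * Switch the engine into execlists submission mode: select the
 * appropriate RING_MODE for this gen, clear STOP_RING and point
 * RING_HWS_PGA at our status page before submissions resume.
 */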
3346 static void enable_execlists(struct intel_engine_cs *engine)
3347 {
3348 	u32 mode;
3349 
3350 	assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
3351 
3352 	intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
3353 
3354 	if (INTEL_GEN(engine->i915) >= 11)
3355 		mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
3356 	else
3357 		mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
3358 	ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
3359 
3360 	ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
3361 
3362 	ENGINE_WRITE_FW(engine,
3363 			RING_HWS_PGA,
3364 			i915_ggtt_offset(engine->status_page.vma));
3365 	ENGINE_POSTING_READ(engine, RING_HWS_PGA);
3366 
3367 	engine->context_tag = 0;
3368 }
3369 
3370 static bool unexpected_starting_state(struct intel_engine_cs *engine)
3371 {
3372 	bool unexpected = false;
3373 
3374 	if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
3375 		DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n");
3376 		unexpected = true;
3377 	}
3378 
3379 	return unexpected;
3380 }
3381 
3382 static int execlists_resume(struct intel_engine_cs *engine)
3383 {
3384 	intel_engine_apply_workarounds(engine);
3385 	intel_engine_apply_whitelist(engine);
3386 
3387 	intel_mocs_init_engine(engine);
3388 
3389 	intel_engine_reset_breadcrumbs(engine);
3390 
3391 	if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
3392 		struct drm_printer p = drm_debug_printer(__func__);
3393 
3394 		intel_engine_dump(engine, &p, NULL);
3395 	}
3396 
3397 	enable_execlists(engine);
3398 
3399 	return 0;
3400 }
3401 
3402 static void execlists_reset_prepare(struct intel_engine_cs *engine)
3403 {
3404 	struct intel_engine_execlists * const execlists = &engine->execlists;
3405 	unsigned long flags;
3406 
3407 	ENGINE_TRACE(engine, "depth<-%d\n",
3408 		     atomic_read(&execlists->tasklet.count));
3409 
3410 	/*
3411 	 * Prevent request submission to the hardware until we have
3412 	 * completed the reset in i915_gem_reset_finish(). If a request
3413 	 * is completed by one engine, it may then queue a request
3414 	 * to a second via its execlists->tasklet *just* as we are
3415 	 * calling engine->resume() and also writing the ELSP.
3416 	 * Turning off the execlists->tasklet until the reset is over
3417 	 * prevents the race.
3418 	 */
3419 	__tasklet_disable_sync_once(&execlists->tasklet);
3420 	GEM_BUG_ON(!reset_in_progress(execlists));
3421 
3422 	/* And flush any current direct submission. */
3423 	spin_lock_irqsave(&engine->active.lock, flags);
3424 	spin_unlock_irqrestore(&engine->active.lock, flags);
3425 
3426 	/*
3427 	 * We stop the engines, otherwise we might get a failed reset and a
3428 	 * dead gpu (on elk). Also, even a gpu as modern as kbl can suffer
3429 	 * from a system hang if a batchbuffer is progressing when
3430 	 * the reset is issued, regardless of the READY_TO_RESET ack.
3431 	 * Thus we assume it is best to stop the engines on all gens
3432 	 * where we have a gpu reset.
3433 	 *
3434 	 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
3435 	 *
3436 	 * FIXME: Wa for more modern gens needs to be validated
3437 	 */
3438 	intel_engine_stop_cs(engine);
3439 }
3440 
3441 static void reset_csb_pointers(struct intel_engine_cs *engine)
3442 {
3443 	struct intel_engine_execlists * const execlists = &engine->execlists;
3444 	const unsigned int reset_value = execlists->csb_size - 1;
3445 
3446 	ring_set_paused(engine, 0);
3447 
3448 	/*
3449 	 * After a reset, the HW starts writing into CSB entry [0]. We
3450 	 * therefore have to set our HEAD pointer back one entry so that
3451 	 * the *first* entry we check is entry 0. To complicate this further,
3452 	 * as we don't wait for the first interrupt after reset, we have to
3453 	 * fake the HW write to point back to the last entry so that our
3454 	 * inline comparison of our cached head position against the last HW
3455 	 * write works even before the first interrupt.
3456 	 */
3457 	execlists->csb_head = reset_value;
3458 	WRITE_ONCE(*execlists->csb_write, reset_value);
3459 	wmb(); /* Make sure this is visible to HW (paranoia?) */
3460 
3461 	/*
3462 	 * Sometimes Icelake forgets to reset its pointers on a GPU reset.
3463 	 * Bludgeon them with a mmio update to be sure.
3464 	 */
3465 	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
3466 		     reset_value << 8 | reset_value);
3467 	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
3468 
3469 	invalidate_csb_entries(&execlists->csb_status[0],
3470 			       &execlists->csb_status[reset_value]);
3471 }
3472 
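/*
 * Clear STOP_RING in the context image's copy of RING_MI_MODE. This is a
 * masked register: the high 16 bits select which bits the write updates, so
 * setting STOP_RING << 16 while clearing the low bit makes the next context
 * restore explicitly unset STOP_RING rather than leave it untouched.
 */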
3473 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
3474 {
3475 	int x;
3476 
3477 	x = lrc_ring_mi_mode(engine);
3478 	if (x != -1) {
3479 		regs[x + 1] &= ~STOP_RING;
3480 		regs[x + 1] |= STOP_RING << 16;
3481 	}
3482 }
3483 
3484 static void __execlists_reset_reg_state(const struct intel_context *ce,
3485 					const struct intel_engine_cs *engine)
3486 {
3487 	u32 *regs = ce->lrc_reg_state;
3488 
3489 	__reset_stop_ring(regs, engine);
3490 }
3491 
3492 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
3493 {
3494 	struct intel_engine_execlists * const execlists = &engine->execlists;
3495 	struct intel_context *ce;
3496 	struct i915_request *rq;
3497 
3498 	mb(); /* paranoia: read the CSB pointers from after the reset */
3499 	clflush(execlists->csb_write);
3500 	mb();
3501 
3502 	process_csb(engine); /* drain preemption events */
3503 
3504 	/* Following the reset, we need to reload the CSB read/write pointers */
3505 	reset_csb_pointers(engine);
3506 
3507 	/*
3508 	 * Save the currently executing context; even if we completed
3509 	 * its request, it was still running at the time of the
3510 	 * reset and will have been clobbered.
3511 	 */
3512 	rq = execlists_active(execlists);
3513 	if (!rq)
3514 		goto unwind;
3515 
3516 	/* We still have requests in-flight; the engine should be active */
3517 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
3518 
3519 	ce = rq->context;
3520 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3521 
3522 	if (i915_request_completed(rq)) {
3523 		/* Idle context; tidy up the ring so we can restart afresh */
3524 		ce->ring->head = intel_ring_wrap(ce->ring, rq->tail);
3525 		goto out_replay;
3526 	}
3527 
3528 	/* Context has requests still in-flight; it should not be idle! */
3529 	GEM_BUG_ON(i915_active_is_idle(&ce->active));
3530 	rq = active_request(ce->timeline, rq);
3531 	ce->ring->head = intel_ring_wrap(ce->ring, rq->head);
3532 	GEM_BUG_ON(ce->ring->head == ce->ring->tail);
3533 
3534 	/*
3535 	 * If this request hasn't started yet, e.g. it is waiting on a
3536 	 * semaphore, we need to avoid skipping the request or else we
3537 	 * break the signaling chain. However, if the context is corrupt
3538 	 * the request will not restart and we will be stuck with a wedged
3539 	 * device. It is quite often the case that if we issue a reset
3540 	 * while the GPU is loading the context image, the context
3541 	 * image becomes corrupt.
3542 	 *
3543 	 * Otherwise, if we have not started yet, the request should replay
3544 	 * perfectly and we do not need to flag the result as being erroneous.
3545 	 */
3546 	if (!i915_request_started(rq))
3547 		goto out_replay;
3548 
3549 	/*
3550 	 * If the request was innocent, we leave the request in the ELSP
3551 	 * and will try to replay it on restarting. The context image may
3552 	 * have been corrupted by the reset, in which case we may have
3553 	 * to service a new GPU hang, but more likely we can continue on
3554 	 * without impact.
3555 	 *
3556 	 * If the request was guilty, we presume the context is corrupt
3557 	 * and have to at least restore the RING register in the context
3558 	 * image back to the expected values to skip over the guilty request.
3559 	 */
3560 	__i915_request_reset(rq, stalled);
3561 	if (!stalled)
3562 		goto out_replay;
3563 
3564 	/*
3565 	 * We want a simple context + ring to execute the breadcrumb update.
3566 	 * We cannot rely on the context being intact across the GPU hang,
3567 	 * so clear it and rebuild just what we need for the breadcrumb.
3568 	 * All pending requests for this context will be zapped, and any
3569 	 * future request will be after userspace has had the opportunity
3570 	 * to recreate its own state.
3571 	 */
3572 	GEM_BUG_ON(!intel_context_is_pinned(ce));
3573 	restore_default_state(ce, engine);
3574 
3575 out_replay:
3576 	ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
3577 		     ce->ring->head, ce->ring->tail);
3578 	intel_ring_update_space(ce->ring);
3579 	__execlists_reset_reg_state(ce, engine);
3580 	__execlists_update_reg_state(ce, engine);
3581 	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
3582 
3583 unwind:
3584 	/* Push back any incomplete requests for replay after the reset. */
3585 	cancel_port_requests(execlists);
3586 	__unwind_incomplete_requests(engine);
3587 }
3588 
3589 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
3590 {
3591 	unsigned long flags;
3592 
3593 	ENGINE_TRACE(engine, "\n");
3594 
3595 	spin_lock_irqsave(&engine->active.lock, flags);
3596 
3597 	__execlists_reset(engine, stalled);
3598 
3599 	spin_unlock_irqrestore(&engine->active.lock, flags);
3600 }
3601 
3602 static void nop_submission_tasklet(unsigned long data)
3603 {
3604 	/* The driver is wedged; don't process any more events. */
3605 }
3606 
3607 static void execlists_reset_cancel(struct intel_engine_cs *engine)
3608 {
3609 	struct intel_engine_execlists * const execlists = &engine->execlists;
3610 	struct i915_request *rq, *rn;
3611 	struct rb_node *rb;
3612 	unsigned long flags;
3613 
3614 	ENGINE_TRACE(engine, "\n");
3615 
3616 	/*
3617 	 * Before we call engine->cancel_requests(), we should have exclusive
3618 	 * access to the submission state. This is arranged for us by the
3619 	 * caller disabling the interrupt generation, the tasklet and other
3620 	 * threads that may then access the same state, giving us a free hand
3621 	 * to reset state. However, we still need to let lockdep be aware that
3622 	 * we know this state may be accessed in hardirq context, so we
3623 	 * disable the irq around this manipulation and we want to keep
3624 	 * the spinlock focused on its duties and not accidentally conflate
3625 	 * coverage to the submission's irq state. (Similarly, although we
3626 	 * shouldn't need to disable irq around the manipulation of the
3627 	 * submission's irq state, we also wish to remind ourselves that
3628 	 * it is irq state.)
3629 	 */
3630 	spin_lock_irqsave(&engine->active.lock, flags);
3631 
3632 	__execlists_reset(engine, true);
3633 
3634 	/* Mark all executing requests as skipped. */
3635 	list_for_each_entry(rq, &engine->active.requests, sched.link)
3636 		mark_eio(rq);
3637 
3638 	/* Flush the queued requests to the timeline list (for retiring). */
3639 	while ((rb = rb_first_cached(&execlists->queue))) {
3640 		struct i915_priolist *p = to_priolist(rb);
3641 		int i;
3642 
3643 		priolist_for_each_request_consume(rq, rn, p, i) {
3644 			mark_eio(rq);
3645 			__i915_request_submit(rq);
3646 		}
3647 
3648 		rb_erase_cached(&p->node, &execlists->queue);
3649 		i915_priolist_free(p);
3650 	}
3651 
3652 	/* On-hold requests will be flushed to timeline upon their release */
3653 	list_for_each_entry(rq, &engine->active.hold, sched.link)
3654 		mark_eio(rq);
3655 
3656 	/* Cancel all attached virtual engines */
3657 	while ((rb = rb_first_cached(&execlists->virtual))) {
3658 		struct virtual_engine *ve =
3659 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
3660 
3661 		rb_erase_cached(rb, &execlists->virtual);
3662 		RB_CLEAR_NODE(rb);
3663 
3664 		spin_lock(&ve->base.active.lock);
3665 		rq = fetch_and_zero(&ve->request);
3666 		if (rq) {
3667 			mark_eio(rq);
3668 
3669 			rq->engine = engine;
3670 			__i915_request_submit(rq);
3671 			i915_request_put(rq);
3672 
3673 			ve->base.execlists.queue_priority_hint = INT_MIN;
3674 		}
3675 		spin_unlock(&ve->base.active.lock);
3676 	}
3677 
3678 	/* Remaining _unready_ requests will be nop'ed when submitted */
3679 
3680 	execlists->queue_priority_hint = INT_MIN;
3681 	execlists->queue = RB_ROOT_CACHED;
3682 
3683 	GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
3684 	execlists->tasklet.func = nop_submission_tasklet;
3685 
3686 	spin_unlock_irqrestore(&engine->active.lock, flags);
3687 }
3688 
3689 static void execlists_reset_finish(struct intel_engine_cs *engine)
3690 {
3691 	struct intel_engine_execlists * const execlists = &engine->execlists;
3692 
3693 	/*
3694 	 * After a GPU reset, we may have requests to replay. Do so now while
3695 	 * we still have the forcewake to be sure that the GPU is not allowed
3696 	 * to sleep before we restart and reload a context.
3697 	 */
3698 	GEM_BUG_ON(!reset_in_progress(execlists));
3699 	if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
3700 		execlists->tasklet.func(execlists->tasklet.data);
3701 
3702 	if (__tasklet_enable(&execlists->tasklet))
3703 		/* And kick in case we missed a new request submission. */
3704 		tasklet_hi_schedule(&execlists->tasklet);
3705 	ENGINE_TRACE(engine, "depth->%d\n",
3706 		     atomic_read(&execlists->tasklet.count));
3707 }
3708 
3709 static int gen8_emit_bb_start_noarb(struct i915_request *rq,
3710 				    u64 offset, u32 len,
3711 				    const unsigned int flags)
3712 {
3713 	u32 *cs;
3714 
3715 	cs = intel_ring_begin(rq, 4);
3716 	if (IS_ERR(cs))
3717 		return PTR_ERR(cs);
3718 
3719 	/*
3720 	 * WaDisableCtxRestoreArbitration:bdw,chv
3721 	 *
3722 	 * We don't need to perform MI_ARB_ENABLE as often as we do (in
3723 	 * particular all the gens that do not need the w/a at all!), if we
3724 	 * took care to make sure that on every switch into this context
3725 	 * (both ordinary and for preemption) arbitration was enabled,
3726 	 * we would be fine.  However, for gen8 there is another w/a that
3727 	 * requires us to not preempt inside GPGPU execution, so we keep
3728 	 * arbitration disabled for gen8 batches. Arbitration will be
3729 	 * re-enabled before we close the request
3730 	 * (engine->emit_fini_breadcrumb).
3731 	 */
3732 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3733 
3734 	/* FIXME(BDW+): Address space and security selectors. */
3735 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
3736 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3737 	*cs++ = lower_32_bits(offset);
3738 	*cs++ = upper_32_bits(offset);
3739 
3740 	intel_ring_advance(rq, cs);
3741 
3742 	return 0;
3743 }
3744 
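/*
 * Preemptible variant of the above: arbitration is enabled only for the
 * duration of the batch, so the request may be preempted at arbitration
 * points inside it, and is disabled again on return until the fini
 * breadcrumb re-enables it.
 */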
3745 static int gen8_emit_bb_start(struct i915_request *rq,
3746 			      u64 offset, u32 len,
3747 			      const unsigned int flags)
3748 {
3749 	u32 *cs;
3750 
3751 	cs = intel_ring_begin(rq, 6);
3752 	if (IS_ERR(cs))
3753 		return PTR_ERR(cs);
3754 
3755 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3756 
3757 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
3758 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3759 	*cs++ = lower_32_bits(offset);
3760 	*cs++ = upper_32_bits(offset);
3761 
3762 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3763 	*cs++ = MI_NOOP;
3764 
3765 	intel_ring_advance(rq, cs);
3766 
3767 	return 0;
3768 }
3769 
3770 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
3771 {
3772 	ENGINE_WRITE(engine, RING_IMR,
3773 		     ~(engine->irq_enable_mask | engine->irq_keep_mask));
3774 	ENGINE_POSTING_READ(engine, RING_IMR);
3775 }
3776 
3777 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
3778 {
3779 	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
3780 }
3781 
3782 static int gen8_emit_flush(struct i915_request *request, u32 mode)
3783 {
3784 	u32 cmd, *cs;
3785 
3786 	cs = intel_ring_begin(request, 4);
3787 	if (IS_ERR(cs))
3788 		return PTR_ERR(cs);
3789 
3790 	cmd = MI_FLUSH_DW + 1;
3791 
3792 	/* We always require a command barrier so that subsequent
3793 	 * commands, such as breadcrumb interrupts, are strictly ordered
3794 	 * wrt the contents of the write cache being flushed to memory
3795 	 * (and thus being coherent from the CPU).
3796 	 */
3797 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
3798 
3799 	if (mode & EMIT_INVALIDATE) {
3800 		cmd |= MI_INVALIDATE_TLB;
3801 		if (request->engine->class == VIDEO_DECODE_CLASS)
3802 			cmd |= MI_INVALIDATE_BSD;
3803 	}
3804 
3805 	*cs++ = cmd;
3806 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
3807 	*cs++ = 0; /* upper addr */
3808 	*cs++ = 0; /* value */
3809 	intel_ring_advance(request, cs);
3810 
3811 	return 0;
3812 }
3813 
3814 static int gen8_emit_flush_render(struct i915_request *request,
3815 				  u32 mode)
3816 {
3817 	bool vf_flush_wa = false, dc_flush_wa = false;
3818 	u32 *cs, flags = 0;
3819 	int len;
3820 
3821 	flags |= PIPE_CONTROL_CS_STALL;
3822 
3823 	if (mode & EMIT_FLUSH) {
3824 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3825 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3826 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3827 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
3828 	}
3829 
3830 	if (mode & EMIT_INVALIDATE) {
3831 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
3832 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3833 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3834 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3835 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3836 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3837 		flags |= PIPE_CONTROL_QW_WRITE;
3838 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3839 
3840 		/*
3841 		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
3842 		 * pipe control.
3843 		 */
3844 		if (IS_GEN(request->i915, 9))
3845 			vf_flush_wa = true;
3846 
3847 		/* WaForGAMHang:kbl */
3848 		if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
3849 			dc_flush_wa = true;
3850 	}
3851 
3852 	len = 6;
3853 
3854 	if (vf_flush_wa)
3855 		len += 6;
3856 
3857 	if (dc_flush_wa)
3858 		len += 12;
3859 
3860 	cs = intel_ring_begin(request, len);
3861 	if (IS_ERR(cs))
3862 		return PTR_ERR(cs);
3863 
3864 	if (vf_flush_wa)
3865 		cs = gen8_emit_pipe_control(cs, 0, 0);
3866 
3867 	if (dc_flush_wa)
3868 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
3869 					    0);
3870 
3871 	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3872 
3873 	if (dc_flush_wa)
3874 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
3875 
3876 	intel_ring_advance(request, cs);
3877 
3878 	return 0;
3879 }
3880 
3881 static int gen11_emit_flush_render(struct i915_request *request,
3882 				   u32 mode)
3883 {
3884 	if (mode & EMIT_FLUSH) {
3885 		u32 *cs;
3886 		u32 flags = 0;
3887 
3888 		flags |= PIPE_CONTROL_CS_STALL;
3889 
3890 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
3891 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3892 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3893 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3894 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
3895 		flags |= PIPE_CONTROL_QW_WRITE;
3896 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3897 
3898 		cs = intel_ring_begin(request, 6);
3899 		if (IS_ERR(cs))
3900 			return PTR_ERR(cs);
3901 
3902 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3903 		intel_ring_advance(request, cs);
3904 	}
3905 
3906 	if (mode & EMIT_INVALIDATE) {
3907 		u32 *cs;
3908 		u32 flags = 0;
3909 
3910 		flags |= PIPE_CONTROL_CS_STALL;
3911 
3912 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
3913 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
3914 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3915 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3916 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3917 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3918 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3919 		flags |= PIPE_CONTROL_QW_WRITE;
3920 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3921 
3922 		cs = intel_ring_begin(request, 6);
3923 		if (IS_ERR(cs))
3924 			return PTR_ERR(cs);
3925 
3926 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3927 		intel_ring_advance(request, cs);
3928 	}
3929 
3930 	return 0;
3931 }
3932 
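/*
 * Build the MI_ARB_CHECK dword used on gen12 to toggle the command streamer
 * pre-parser: bit 8 appears to validate the pre-fetch-disable update while
 * the low bit carries the requested state (1 = disable).
 */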
3933 static u32 preparser_disable(bool state)
3934 {
3935 	return MI_ARB_CHECK | 1 << 8 | state;
3936 }
3937 
3938 static int gen12_emit_flush_render(struct i915_request *request,
3939 				   u32 mode)
3940 {
3941 	if (mode & EMIT_FLUSH) {
3942 		u32 flags = 0;
3943 		u32 *cs;
3944 
3945 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
3946 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3947 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3948 		/* Wa_1409600907:tgl */
3949 		flags |= PIPE_CONTROL_DEPTH_STALL;
3950 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3951 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
3952 		flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
3953 
3954 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3955 		flags |= PIPE_CONTROL_QW_WRITE;
3956 
3957 		flags |= PIPE_CONTROL_CS_STALL;
3958 
3959 		cs = intel_ring_begin(request, 6);
3960 		if (IS_ERR(cs))
3961 			return PTR_ERR(cs);
3962 
3963 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3964 		intel_ring_advance(request, cs);
3965 	}
3966 
3967 	if (mode & EMIT_INVALIDATE) {
3968 		u32 flags = 0;
3969 		u32 *cs;
3970 
3971 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
3972 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
3973 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3974 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3975 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3976 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3977 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3978 		flags |= PIPE_CONTROL_L3_RO_CACHE_INVALIDATE;
3979 
3980 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3981 		flags |= PIPE_CONTROL_QW_WRITE;
3982 
3983 		flags |= PIPE_CONTROL_CS_STALL;
3984 
3985 		cs = intel_ring_begin(request, 8);
3986 		if (IS_ERR(cs))
3987 			return PTR_ERR(cs);
3988 
3989 		/*
3990 		 * Prevent the pre-parser from skipping past the TLB
3991 		 * invalidate and loading a stale page for the batch
3992 		 * buffer / request payload.
3993 		 */
3994 		*cs++ = preparser_disable(true);
3995 
3996 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3997 
3998 		*cs++ = preparser_disable(false);
3999 		intel_ring_advance(request, cs);
4000 
4001 		/*
4002 		 * Wa_1604544889:tgl
4003 		 */
4004 		if (IS_TGL_REVID(request->i915, TGL_REVID_A0, TGL_REVID_A0)) {
4005 			flags = 0;
4006 			flags |= PIPE_CONTROL_CS_STALL;
4007 			flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
4008 
4009 			flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4010 			flags |= PIPE_CONTROL_QW_WRITE;
4011 
4012 			cs = intel_ring_begin(request, 6);
4013 			if (IS_ERR(cs))
4014 				return PTR_ERR(cs);
4015 
4016 			cs = gen8_emit_pipe_control(cs, flags,
4017 						    LRC_PPHWSP_SCRATCH_ADDR);
4018 			intel_ring_advance(request, cs);
4019 		}
4020 	}
4021 
4022 	return 0;
4023 }
4024 
4025 /*
4026  * Reserve space for 2 NOOPs at the end of each request to be
4027  * used as a workaround for not being allowed to do lite
4028  * restore with HEAD==TAIL (WaIdleLiteRestore).
4029  */
4030 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4031 {
4032 	/* Ensure there's always at least one preemption point per-request. */
4033 	*cs++ = MI_ARB_CHECK;
4034 	*cs++ = MI_NOOP;
4035 	request->wa_tail = intel_ring_offset(request, cs);
4036 
4037 	return cs;
4038 }
4039 
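/*
 * Spin on the engine's preempt semaphore in the HWSP (armed via
 * ring_set_paused()) until it reads zero again. Emitted after the final
 * breadcrumb with arbitration re-enabled, it gives the hardware a window in
 * which a pending preemption can take effect at the request boundary.
 */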
4040 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
4041 {
4042 	*cs++ = MI_SEMAPHORE_WAIT |
4043 		MI_SEMAPHORE_GLOBAL_GTT |
4044 		MI_SEMAPHORE_POLL |
4045 		MI_SEMAPHORE_SAD_EQ_SDD;
4046 	*cs++ = 0;
4047 	*cs++ = intel_hws_preempt_address(request->engine);
4048 	*cs++ = 0;
4049 
4050 	return cs;
4051 }
4052 
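/*
 * Common tail for the fini breadcrumbs: raise the user interrupt, re-enable
 * arbitration, optionally add the preempt busywait, then close with the
 * WaIdleLiteRestore tail.
 */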
4053 static __always_inline u32 *
4054 gen8_emit_fini_breadcrumb_footer(struct i915_request *request,
4055 				 u32 *cs)
4056 {
4057 	*cs++ = MI_USER_INTERRUPT;
4058 
4059 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4060 	if (intel_engine_has_semaphores(request->engine))
4061 		cs = emit_preempt_busywait(request, cs);
4062 
4063 	request->tail = intel_ring_offset(request, cs);
4064 	assert_ring_tail_valid(request->ring, request->tail);
4065 
4066 	return gen8_emit_wa_tail(request, cs);
4067 }
4068 
4069 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
4070 {
4071 	cs = gen8_emit_ggtt_write(cs,
4072 				  request->fence.seqno,
4073 				  i915_request_active_timeline(request)->hwsp_offset,
4074 				  0);
4075 
4076 	return gen8_emit_fini_breadcrumb_footer(request, cs);
4077 }
4078 
4079 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4080 {
4081 	cs = gen8_emit_pipe_control(cs,
4082 				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4083 				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4084 				    PIPE_CONTROL_DC_FLUSH_ENABLE,
4085 				    0);
4086 
4087 	/* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
4088 	cs = gen8_emit_ggtt_write_rcs(cs,
4089 				      request->fence.seqno,
4090 				      i915_request_active_timeline(request)->hwsp_offset,
4091 				      PIPE_CONTROL_FLUSH_ENABLE |
4092 				      PIPE_CONTROL_CS_STALL);
4093 
4094 	return gen8_emit_fini_breadcrumb_footer(request, cs);
4095 }
4096 
4097 static u32 *
4098 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4099 {
4100 	cs = gen8_emit_ggtt_write_rcs(cs,
4101 				      request->fence.seqno,
4102 				      i915_request_active_timeline(request)->hwsp_offset,
4103 				      PIPE_CONTROL_CS_STALL |
4104 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
4105 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4106 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4107 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
4108 				      PIPE_CONTROL_FLUSH_ENABLE);
4109 
4110 	return gen8_emit_fini_breadcrumb_footer(request, cs);
4111 }
4112 
4113 /*
4114  * Note that the CS instruction pre-parser will not stall on the breadcrumb
4115  * flush and will continue pre-fetching the instructions after it before the
4116  * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
4117  * BB_START/END instructions, so, even though we might pre-fetch the preamble
4118  * of the next request before the memory has been flushed, we're guaranteed that
4119  * we won't access the batch itself too early.
4120  * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
4121  * so, if the current request is modifying an instruction in the next request on
4122  * the same intel_context, we might pre-fetch and then execute the pre-update
4123  * instruction. To avoid this, the users of self-modifying code should either
4124  * disable the parser around the code emitting the memory writes, via a new flag
4125  * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
4126  * the in-kernel use-cases we've opted to use a separate context, see
4127  * reloc_gpu() as an example.
4128  * All the above applies only to the instructions themselves. Non-inline data
4129  * used by the instructions is not pre-fetched.
4130  */
4131 
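/*
 * As emit_preempt_busywait(), but using the longer gen12
 * MI_SEMAPHORE_WAIT_TOKEN encoding, padded out with a MI_NOOP.
 */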
4132 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
4133 {
4134 	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
4135 		MI_SEMAPHORE_GLOBAL_GTT |
4136 		MI_SEMAPHORE_POLL |
4137 		MI_SEMAPHORE_SAD_EQ_SDD;
4138 	*cs++ = 0;
4139 	*cs++ = intel_hws_preempt_address(request->engine);
4140 	*cs++ = 0;
4141 	*cs++ = 0;
4142 	*cs++ = MI_NOOP;
4143 
4144 	return cs;
4145 }
4146 
4147 static __always_inline u32 *
4148 gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs)
4149 {
4150 	*cs++ = MI_USER_INTERRUPT;
4151 
4152 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4153 	if (intel_engine_has_semaphores(request->engine))
4154 		cs = gen12_emit_preempt_busywait(request, cs);
4155 
4156 	request->tail = intel_ring_offset(request, cs);
4157 	assert_ring_tail_valid(request->ring, request->tail);
4158 
4159 	return gen8_emit_wa_tail(request, cs);
4160 }
4161 
4162 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
4163 {
4164 	cs = gen8_emit_ggtt_write(cs,
4165 				  request->fence.seqno,
4166 				  i915_request_active_timeline(request)->hwsp_offset,
4167 				  0);
4168 
4169 	return gen12_emit_fini_breadcrumb_footer(request, cs);
4170 }
4171 
4172 static u32 *
4173 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4174 {
4175 	cs = gen8_emit_ggtt_write_rcs(cs,
4176 				      request->fence.seqno,
4177 				      i915_request_active_timeline(request)->hwsp_offset,
4178 				      PIPE_CONTROL_CS_STALL |
4179 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
4180 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4181 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4182 				      /* Wa_1409600907:tgl */
4183 				      PIPE_CONTROL_DEPTH_STALL |
4184 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
4185 				      PIPE_CONTROL_FLUSH_ENABLE |
4186 				      PIPE_CONTROL_HDC_PIPELINE_FLUSH);
4187 
4188 	return gen12_emit_fini_breadcrumb_footer(request, cs);
4189 }
4190 
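/* The engine is parked (idle): stop the timeslice and preemption timers. */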
4191 static void execlists_park(struct intel_engine_cs *engine)
4192 {
4193 	cancel_timer(&engine->execlists.timer);
4194 	cancel_timer(&engine->execlists.preempt);
4195 }
4196 
4197 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
4198 {
4199 	engine->submit_request = execlists_submit_request;
4200 	engine->schedule = i915_schedule;
4201 	engine->execlists.tasklet.func = execlists_submission_tasklet;
4202 
4203 	engine->reset.prepare = execlists_reset_prepare;
4204 	engine->reset.rewind = execlists_reset_rewind;
4205 	engine->reset.cancel = execlists_reset_cancel;
4206 	engine->reset.finish = execlists_reset_finish;
4207 
4208 	engine->park = execlists_park;
4209 	engine->unpark = NULL;
4210 
4211 	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
4212 	if (!intel_vgpu_active(engine->i915)) {
4213 		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
4214 		if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
4215 			engine->flags |= I915_ENGINE_HAS_PREEMPTION;
4216 	}
4217 
4218 	if (INTEL_GEN(engine->i915) >= 12)
4219 		engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
4220 
4221 	if (intel_engine_has_preemption(engine))
4222 		engine->emit_bb_start = gen8_emit_bb_start;
4223 	else
4224 		engine->emit_bb_start = gen8_emit_bb_start_noarb;
4225 }
4226 
4227 static void execlists_shutdown(struct intel_engine_cs *engine)
4228 {
4229 	/* Synchronise with residual timers and any softirq they raise */
4230 	del_timer_sync(&engine->execlists.timer);
4231 	del_timer_sync(&engine->execlists.preempt);
4232 	tasklet_kill(&engine->execlists.tasklet);
4233 }
4234 
4235 static void execlists_release(struct intel_engine_cs *engine)
4236 {
4237 	execlists_shutdown(engine);
4238 
4239 	intel_engine_cleanup_common(engine);
4240 	lrc_destroy_wa_ctx(engine);
4241 }
4242 
4243 static void
4244 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
4245 {
4246 	/* Default vfuncs which can be overridden by each engine. */
4247 
4248 	engine->resume = execlists_resume;
4249 
4250 	engine->cops = &execlists_context_ops;
4251 	engine->request_alloc = execlists_request_alloc;
4252 
4253 	engine->emit_flush = gen8_emit_flush;
4254 	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
4255 	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
4256 	if (INTEL_GEN(engine->i915) >= 12)
4257 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
4258 
4259 	engine->set_default_submission = intel_execlists_set_default_submission;
4260 
4261 	if (INTEL_GEN(engine->i915) < 11) {
4262 		engine->irq_enable = gen8_logical_ring_enable_irq;
4263 		engine->irq_disable = gen8_logical_ring_disable_irq;
4264 	} else {
4265 		/*
4266 		 * TODO: On Gen11 interrupt masks need to be clear
4267 		 * to allow C6 entry. Keep interrupts enabled
4268 		 * and take the hit of generating extra interrupts
4269 		 * until a more refined solution exists.
4270 		 */
4271 	}
4272 }
4273 
4274 static inline void
4275 logical_ring_default_irqs(struct intel_engine_cs *engine)
4276 {
4277 	unsigned int shift = 0;
4278 
4279 	if (INTEL_GEN(engine->i915) < 11) {
4280 		const u8 irq_shifts[] = {
4281 			[RCS0]  = GEN8_RCS_IRQ_SHIFT,
4282 			[BCS0]  = GEN8_BCS_IRQ_SHIFT,
4283 			[VCS0]  = GEN8_VCS0_IRQ_SHIFT,
4284 			[VCS1]  = GEN8_VCS1_IRQ_SHIFT,
4285 			[VECS0] = GEN8_VECS_IRQ_SHIFT,
4286 		};
4287 
4288 		shift = irq_shifts[engine->id];
4289 	}
4290 
4291 	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
4292 	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
4293 }
4294 
4295 static void rcs_submission_override(struct intel_engine_cs *engine)
4296 {
4297 	switch (INTEL_GEN(engine->i915)) {
4298 	case 12:
4299 		engine->emit_flush = gen12_emit_flush_render;
4300 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
4301 		break;
4302 	case 11:
4303 		engine->emit_flush = gen11_emit_flush_render;
4304 		engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
4305 		break;
4306 	default:
4307 		engine->emit_flush = gen8_emit_flush_render;
4308 		engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
4309 		break;
4310 	}
4311 }
4312 
4313 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
4314 {
4315 	struct intel_engine_execlists * const execlists = &engine->execlists;
4316 	struct drm_i915_private *i915 = engine->i915;
4317 	struct intel_uncore *uncore = engine->uncore;
4318 	u32 base = engine->mmio_base;
4319 
4320 	tasklet_init(&engine->execlists.tasklet,
4321 		     execlists_submission_tasklet, (unsigned long)engine);
4322 	timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
4323 	timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
4324 
4325 	logical_ring_default_vfuncs(engine);
4326 	logical_ring_default_irqs(engine);
4327 
4328 	if (engine->class == RENDER_CLASS)
4329 		rcs_submission_override(engine);
4330 
4331 	if (intel_init_workaround_bb(engine))
4332 		/*
4333 		 * We continue even if we fail to initialize the WA batch
4334 		 * because we only expect rare glitches but nothing
4335 		 * critical that would prevent us from using the GPU.
4336 		 */
4337 		DRM_ERROR("WA batch buffer initialization failed\n");
4338 
4339 	if (HAS_LOGICAL_RING_ELSQ(i915)) {
4340 		execlists->submit_reg = uncore->regs +
4341 			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
4342 		execlists->ctrl_reg = uncore->regs +
4343 			i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
4344 	} else {
4345 		execlists->submit_reg = uncore->regs +
4346 			i915_mmio_reg_offset(RING_ELSP(base));
4347 	}
4348 
4349 	execlists->csb_status =
4350 		&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
4351 
4352 	execlists->csb_write =
4353 		&engine->status_page.addr[intel_hws_csb_write_index(i915)];
4354 
4355 	if (INTEL_GEN(i915) < 11)
4356 		execlists->csb_size = GEN8_CSB_ENTRIES;
4357 	else
4358 		execlists->csb_size = GEN11_CSB_ENTRIES;
4359 
4360 	reset_csb_pointers(engine);
4361 
4362 	/* Finally, take ownership and responsibility for cleanup! */
4363 	engine->release = execlists_release;
4364 
4365 	return 0;
4366 }
4367 
4368 static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine)
4369 {
4370 	u32 indirect_ctx_offset;
4371 
4372 	switch (INTEL_GEN(engine->i915)) {
4373 	default:
4374 		MISSING_CASE(INTEL_GEN(engine->i915));
4375 		/* fall through */
4376 	case 12:
4377 		indirect_ctx_offset =
4378 			GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4379 		break;
4380 	case 11:
4381 		indirect_ctx_offset =
4382 			GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4383 		break;
4384 	case 10:
4385 		indirect_ctx_offset =
4386 			GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4387 		break;
4388 	case 9:
4389 		indirect_ctx_offset =
4390 			GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4391 		break;
4392 	case 8:
4393 		indirect_ctx_offset =
4394 			GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4395 		break;
4396 	}
4397 
4398 	return indirect_ctx_offset;
4399 }
4400 
4401 
4402 static void init_common_reg_state(u32 * const regs,
4403 				  const struct intel_engine_cs *engine,
4404 				  const struct intel_ring *ring,
4405 				  bool inhibit)
4406 {
4407 	u32 ctl;
4408 
4409 	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
4410 	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
4411 	if (inhibit)
4412 		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
4413 	if (INTEL_GEN(engine->i915) < 11)
4414 		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
4415 					   CTX_CTRL_RS_CTX_ENABLE);
4416 	regs[CTX_CONTEXT_CONTROL] = ctl;
4417 
4418 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
4419 }
4420 
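/*
 * Point the context image at the workaround batch buffers: bit 0 marks the
 * per-context pointer as valid, while the indirect-context entry pairs the
 * buffer address with its size in cachelines and the offset at which the
 * hardware executes it during context restore.
 */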
4421 static void init_wa_bb_reg_state(u32 * const regs,
4422 				 const struct intel_engine_cs *engine,
4423 				 u32 pos_bb_per_ctx)
4424 {
4425 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
4426 
4427 	if (wa_ctx->per_ctx.size) {
4428 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
4429 
4430 		regs[pos_bb_per_ctx] =
4431 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
4432 	}
4433 
4434 	if (wa_ctx->indirect_ctx.size) {
4435 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
4436 
4437 		regs[pos_bb_per_ctx + 2] =
4438 			(ggtt_offset + wa_ctx->indirect_ctx.offset) |
4439 			(wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
4440 
4441 		regs[pos_bb_per_ctx + 4] =
4442 			intel_lr_indirect_ctx_offset(engine) << 6;
4443 	}
4444 }
4445 
4446 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
4447 {
4448 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
4449 		/* 64b PPGTT (48bit canonical)
4450 		 * PDP0_DESCRIPTOR contains the base address to PML4 and
4451 		 * other PDP Descriptors are ignored.
4452 		 */
4453 		ASSIGN_CTX_PML4(ppgtt, regs);
4454 	} else {
4455 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
4456 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
4457 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
4458 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
4459 	}
4460 }
4461 
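/*
 * Contexts using the global GTT borrow the aliasing ppgtt for their PPGTT
 * register state; otherwise use the context's own full ppgtt.
 */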
4462 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
4463 {
4464 	if (i915_is_ggtt(vm))
4465 		return i915_vm_to_ggtt(vm)->alias;
4466 	else
4467 		return i915_vm_to_ppgtt(vm);
4468 }
4469 
4470 static void execlists_init_reg_state(u32 *regs,
4471 				     const struct intel_context *ce,
4472 				     const struct intel_engine_cs *engine,
4473 				     const struct intel_ring *ring,
4474 				     bool inhibit)
4475 {
4476 	/*
4477 	 * A context is actually a big batch buffer with several
4478 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
4479 	 * values we are setting here are only for the first context restore:
4480 	 * on a subsequent save, the GPU will recreate this batchbuffer with new
4481 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
4482 	 * we are not initializing here).
4483 	 *
4484 	 * Must keep consistent with virtual_update_register_offsets().
4485 	 */
4486 	set_offsets(regs, reg_offsets(engine), engine, inhibit);
4487 
4488 	init_common_reg_state(regs, engine, ring, inhibit);
4489 	init_ppgtt_reg_state(regs, vm_alias(ce->vm));
4490 
4491 	init_wa_bb_reg_state(regs, engine,
4492 			     INTEL_GEN(engine->i915) >= 12 ?
4493 			     GEN12_CTX_BB_PER_CTX_PTR :
4494 			     CTX_BB_PER_CTX_PTR);
4495 
4496 	__reset_stop_ring(regs, engine);
4497 }
4498 
4499 static int
4500 populate_lr_context(struct intel_context *ce,
4501 		    struct drm_i915_gem_object *ctx_obj,
4502 		    struct intel_engine_cs *engine,
4503 		    struct intel_ring *ring)
4504 {
4505 	bool inhibit = true;
4506 	void *vaddr;
4507 	int ret;
4508 
4509 	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
4510 	if (IS_ERR(vaddr)) {
4511 		ret = PTR_ERR(vaddr);
4512 		DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
4513 		return ret;
4514 	}
4515 
4516 	set_redzone(vaddr, engine);
4517 
4518 	if (engine->default_state) {
4519 		void *defaults;
4520 
4521 		defaults = i915_gem_object_pin_map(engine->default_state,
4522 						   I915_MAP_WB);
4523 		if (IS_ERR(defaults)) {
4524 			ret = PTR_ERR(defaults);
4525 			goto err_unpin_ctx;
4526 		}
4527 
4528 		memcpy(vaddr, defaults, engine->context_size);
4529 		i915_gem_object_unpin_map(engine->default_state);
4530 		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
4531 		inhibit = false;
4532 	}
4533 
4534 	/* The second page of the context object contains some fields which must
4535 	 * be set up prior to the first execution. */
4536 	execlists_init_reg_state(vaddr + LRC_STATE_PN * PAGE_SIZE,
4537 				 ce, engine, ring, inhibit);
4538 
4539 	ret = 0;
4540 err_unpin_ctx:
4541 	__i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
4542 	i915_gem_object_unpin_map(ctx_obj);
4543 	return ret;
4544 }
4545 
4546 static int __execlists_context_alloc(struct intel_context *ce,
4547 				     struct intel_engine_cs *engine)
4548 {
4549 	struct drm_i915_gem_object *ctx_obj;
4550 	struct intel_ring *ring;
4551 	struct i915_vma *vma;
4552 	u32 context_size;
4553 	int ret;
4554 
4555 	GEM_BUG_ON(ce->state);
4556 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
4557 
4558 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4559 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
4560 
4561 	ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
4562 	if (IS_ERR(ctx_obj))
4563 		return PTR_ERR(ctx_obj);
4564 
4565 	vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
4566 	if (IS_ERR(vma)) {
4567 		ret = PTR_ERR(vma);
4568 		goto error_deref_obj;
4569 	}
4570 
4571 	if (!ce->timeline) {
4572 		struct intel_timeline *tl;
4573 
4574 		tl = intel_timeline_create(engine->gt, NULL);
4575 		if (IS_ERR(tl)) {
4576 			ret = PTR_ERR(tl);
4577 			goto error_deref_obj;
4578 		}
4579 
4580 		ce->timeline = tl;
4581 	}
4582 
4583 	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
4584 	if (IS_ERR(ring)) {
4585 		ret = PTR_ERR(ring);
4586 		goto error_deref_obj;
4587 	}
4588 
4589 	ret = populate_lr_context(ce, ctx_obj, engine, ring);
4590 	if (ret) {
4591 		DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
4592 		goto error_ring_free;
4593 	}
4594 
4595 	ce->ring = ring;
4596 	ce->state = vma;
4597 
4598 	return 0;
4599 
4600 error_ring_free:
4601 	intel_ring_put(ring);
4602 error_deref_obj:
4603 	i915_gem_object_put(ctx_obj);
4604 	return ret;
4605 }
4606 
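/*
 * Requests submitted to a virtual engine are parked on its default priolist
 * until one of the physical siblings claims them for execution.
 */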
4607 static struct list_head *virtual_queue(struct virtual_engine *ve)
4608 {
4609 	return &ve->base.execlists.default_priolist.requests[0];
4610 }
4611 
4612 static void virtual_context_destroy(struct kref *kref)
4613 {
4614 	struct virtual_engine *ve =
4615 		container_of(kref, typeof(*ve), context.ref);
4616 	unsigned int n;
4617 
4618 	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
4619 	GEM_BUG_ON(ve->request);
4620 	GEM_BUG_ON(ve->context.inflight);
4621 
4622 	for (n = 0; n < ve->num_siblings; n++) {
4623 		struct intel_engine_cs *sibling = ve->siblings[n];
4624 		struct rb_node *node = &ve->nodes[sibling->id].rb;
4625 		unsigned long flags;
4626 
4627 		if (RB_EMPTY_NODE(node))
4628 			continue;
4629 
4630 		spin_lock_irqsave(&sibling->active.lock, flags);
4631 
4632 		/* Detachment is lazily performed in the execlists tasklet */
4633 		if (!RB_EMPTY_NODE(node))
4634 			rb_erase_cached(node, &sibling->execlists.virtual);
4635 
4636 		spin_unlock_irqrestore(&sibling->active.lock, flags);
4637 	}
4638 	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
4639 
4640 	if (ve->context.state)
4641 		__execlists_context_fini(&ve->context);
4642 	intel_context_fini(&ve->context);
4643 
4644 	kfree(ve->bonds);
4645 	kfree(ve);
4646 }
4647 
4648 static void virtual_engine_initial_hint(struct virtual_engine *ve)
4649 {
4650 	int swp;
4651 
4652 	/*
4653 	 * Pick a random sibling on starting to help spread the load around.
4654 	 *
4655 	 * New contexts are typically created with exactly the same order
4656 	 * of siblings, and often started in batches. Due to the way we iterate
4657 	 * the array of sibling when submitting requests, sibling[0] is
4658 	 * the array of siblings when submitting requests, sibling[0] is
4659 	 * randomised across the system, we also help spread the load by the
4660 	 * first engine we inspect being different each time.
4661 	 *
4662 	 * NB This does not force us to execute on this engine; it will just
4663 	 * typically be the first we inspect for submission.
4664 	 */
4665 	swp = prandom_u32_max(ve->num_siblings);
4666 	if (!swp)
4667 		return;
4668 
4669 	swap(ve->siblings[swp], ve->siblings[0]);
4670 	if (!intel_engine_has_relative_mmio(ve->siblings[0]))
4671 		virtual_update_register_offsets(ve->context.lrc_reg_state,
4672 						ve->siblings[0]);
4673 }
4674 
4675 static int virtual_context_alloc(struct intel_context *ce)
4676 {
4677 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4678 
4679 	return __execlists_context_alloc(ce, ve->siblings[0]);
4680 }
4681 
4682 static int virtual_context_pin(struct intel_context *ce)
4683 {
4684 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4685 	int err;
4686 
4687 	/* Note: we must use a real engine class for setting up reg state */
4688 	err = __execlists_context_pin(ce, ve->siblings[0]);
4689 	if (err)
4690 		return err;
4691 
4692 	virtual_engine_initial_hint(ve);
4693 	return 0;
4694 }
4695 
4696 static void virtual_context_enter(struct intel_context *ce)
4697 {
4698 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4699 	unsigned int n;
4700 
4701 	for (n = 0; n < ve->num_siblings; n++)
4702 		intel_engine_pm_get(ve->siblings[n]);
4703 
4704 	intel_timeline_enter(ce->timeline);
4705 }
4706 
4707 static void virtual_context_exit(struct intel_context *ce)
4708 {
4709 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4710 	unsigned int n;
4711 
4712 	intel_timeline_exit(ce->timeline);
4713 
4714 	for (n = 0; n < ve->num_siblings; n++)
4715 		intel_engine_pm_put(ve->siblings[n]);
4716 }
4717 
4718 static const struct intel_context_ops virtual_context_ops = {
4719 	.alloc = virtual_context_alloc,
4720 
4721 	.pin = virtual_context_pin,
4722 	.unpin = execlists_context_unpin,
4723 
4724 	.enter = virtual_context_enter,
4725 	.exit = virtual_context_exit,
4726 
4727 	.destroy = virtual_context_destroy,
4728 };
4729 
4730 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
4731 {
4732 	struct i915_request *rq;
4733 	intel_engine_mask_t mask;
4734 
4735 	rq = READ_ONCE(ve->request);
4736 	if (!rq)
4737 		return 0;
4738 
4739 	/* The rq is ready for submission; rq->execution_mask is now stable. */
4740 	mask = rq->execution_mask;
4741 	if (unlikely(!mask)) {
4742 		/* Invalid selection, submit to a random engine in error */
4743 		i915_request_skip(rq, -ENODEV);
4744 		mask = ve->siblings[0]->mask;
4745 	}
4746 
4747 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
4748 		     rq->fence.context, rq->fence.seqno,
4749 		     mask, ve->base.execlists.queue_priority_hint);
4750 
4751 	return mask;
4752 }
4753 
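/*
 * Offer the pending virtual request to every permitted sibling: insert (or
 * reuse) this engine's node in each sibling's tree of virtual engines,
 * ordered by priority, and kick the sibling's tasklet whenever that raises
 * its queue priority hint.
 */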
4754 static void virtual_submission_tasklet(unsigned long data)
4755 {
4756 	struct virtual_engine * const ve = (struct virtual_engine *)data;
4757 	const int prio = ve->base.execlists.queue_priority_hint;
4758 	intel_engine_mask_t mask;
4759 	unsigned int n;
4760 
4761 	rcu_read_lock();
4762 	mask = virtual_submission_mask(ve);
4763 	rcu_read_unlock();
4764 	if (unlikely(!mask))
4765 		return;
4766 
4767 	local_irq_disable();
4768 	for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
4769 		struct intel_engine_cs *sibling = ve->siblings[n];
4770 		struct ve_node * const node = &ve->nodes[sibling->id];
4771 		struct rb_node **parent, *rb;
4772 		bool first;
4773 
4774 		if (unlikely(!(mask & sibling->mask))) {
4775 			if (!RB_EMPTY_NODE(&node->rb)) {
4776 				spin_lock(&sibling->active.lock);
4777 				rb_erase_cached(&node->rb,
4778 						&sibling->execlists.virtual);
4779 				RB_CLEAR_NODE(&node->rb);
4780 				spin_unlock(&sibling->active.lock);
4781 			}
4782 			continue;
4783 		}
4784 
4785 		spin_lock(&sibling->active.lock);
4786 
4787 		if (!RB_EMPTY_NODE(&node->rb)) {
4788 			/*
4789 			 * Cheat and avoid rebalancing the tree if we can
4790 			 * reuse this node in situ.
4791 			 */
4792 			first = rb_first_cached(&sibling->execlists.virtual) ==
4793 				&node->rb;
4794 			if (prio == node->prio || (prio > node->prio && first))
4795 				goto submit_engine;
4796 
4797 			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
4798 		}
4799 
4800 		rb = NULL;
4801 		first = true;
4802 		parent = &sibling->execlists.virtual.rb_root.rb_node;
4803 		while (*parent) {
4804 			struct ve_node *other;
4805 
4806 			rb = *parent;
4807 			other = rb_entry(rb, typeof(*other), rb);
4808 			if (prio > other->prio) {
4809 				parent = &rb->rb_left;
4810 			} else {
4811 				parent = &rb->rb_right;
4812 				first = false;
4813 			}
4814 		}
4815 
4816 		rb_link_node(&node->rb, rb, parent);
4817 		rb_insert_color_cached(&node->rb,
4818 				       &sibling->execlists.virtual,
4819 				       first);
4820 
4821 submit_engine:
4822 		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
4823 		node->prio = prio;
4824 		if (first && prio > sibling->execlists.queue_priority_hint) {
4825 			sibling->execlists.queue_priority_hint = prio;
4826 			tasklet_hi_schedule(&sibling->execlists.tasklet);
4827 		}
4828 
4829 		spin_unlock(&sibling->active.lock);
4830 	}
4831 	local_irq_enable();
4832 }
4833 
4834 static void virtual_submit_request(struct i915_request *rq)
4835 {
4836 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
4837 	struct i915_request *old;
4838 	unsigned long flags;
4839 
4840 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
4841 		     rq->fence.context,
4842 		     rq->fence.seqno);
4843 
4844 	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
4845 
4846 	spin_lock_irqsave(&ve->base.active.lock, flags);
4847 
4848 	old = ve->request;
4849 	if (old) { /* background completion event from preempt-to-busy */
4850 		GEM_BUG_ON(!i915_request_completed(old));
4851 		__i915_request_submit(old);
4852 		i915_request_put(old);
4853 	}
4854 
4855 	if (i915_request_completed(rq)) {
4856 		__i915_request_submit(rq);
4857 
4858 		ve->base.execlists.queue_priority_hint = INT_MIN;
4859 		ve->request = NULL;
4860 	} else {
4861 		ve->base.execlists.queue_priority_hint = rq_prio(rq);
4862 		ve->request = i915_request_get(rq);
4863 
4864 		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
4865 		list_move_tail(&rq->sched.link, virtual_queue(ve));
4866 
4867 		tasklet_schedule(&ve->base.execlists.tasklet);
4868 	}
4869 
4870 	spin_unlock_irqrestore(&ve->base.active.lock, flags);
4871 }
4872 
4873 static struct ve_bond *
4874 virtual_find_bond(struct virtual_engine *ve,
4875 		  const struct intel_engine_cs *master)
4876 {
4877 	int i;
4878 
4879 	for (i = 0; i < ve->num_bonds; i++) {
4880 		if (ve->bonds[i].master == master)
4881 			return &ve->bonds[i];
4882 	}
4883 
4884 	return NULL;
4885 }
4886 
4887 static void
4888 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
4889 {
4890 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
4891 	intel_engine_mask_t allowed, exec;
4892 	struct ve_bond *bond;
4893 
4894 	allowed = ~to_request(signal)->engine->mask;
4895 
4896 	bond = virtual_find_bond(ve, to_request(signal)->engine);
4897 	if (bond)
4898 		allowed &= bond->sibling_mask;
4899 
4900 	/* Restrict the bonded request to run on only the available engines */
4901 	exec = READ_ONCE(rq->execution_mask);
4902 	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
4903 		;
4904 
4905 	/* Prevent the master from being re-run on the bonded engines */
4906 	to_request(signal)->execution_mask &= ~allowed;
4907 }
4908 
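/*
 * Create a virtual engine that load-balances a context across @siblings.
 * With no siblings this is rejected with -EINVAL; with exactly one it
 * degenerates into a plain context on that engine. A rough usage sketch
 * (the engine handles are illustrative, not taken from a real caller):
 *
 *	struct intel_engine_cs *siblings[] = { vcs0, vcs1 };
 *	struct intel_context *ce;
 *
 *	ce = intel_execlists_create_virtual(siblings, ARRAY_SIZE(siblings));
 *	if (IS_ERR(ce))
 *		return PTR_ERR(ce);
 */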
4909 struct intel_context *
4910 intel_execlists_create_virtual(struct intel_engine_cs **siblings,
4911 			       unsigned int count)
4912 {
4913 	struct virtual_engine *ve;
4914 	unsigned int n;
4915 	int err;
4916 
4917 	if (count == 0)
4918 		return ERR_PTR(-EINVAL);
4919 
4920 	if (count == 1)
4921 		return intel_context_create(siblings[0]);
4922 
4923 	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
4924 	if (!ve)
4925 		return ERR_PTR(-ENOMEM);
4926 
4927 	ve->base.i915 = siblings[0]->i915;
4928 	ve->base.gt = siblings[0]->gt;
4929 	ve->base.uncore = siblings[0]->uncore;
4930 	ve->base.id = -1;
4931 
4932 	ve->base.class = OTHER_CLASS;
4933 	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
4934 	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
4935 	ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
4936 
4937 	/*
4938 	 * The decision on whether to submit a request using semaphores
4939 	 * depends on the saturated state of the engine. We only compute
4940 	 * this during HW submission of the request, and we need for this
4941 	 * state to be globally applied to all requests being submitted
4942 	 * to this engine. Virtual engines encompass more than one physical
4943 	 * engine and so we cannot accurately tell in advance if one of those
4944 	 * engines is already saturated and so cannot afford to use a semaphore
4945 	 * and be pessimized in priority for doing so -- if we are the only
4946 	 * context using semaphores after all other clients have stopped, we
4947 	 * will be starved on the saturated system. Such a global switch for
4948 	 * semaphores is less than ideal, but alas is the current compromise.
4949 	 */
4950 	ve->base.saturated = ALL_ENGINES;
4951 
4952 	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
4953 
4954 	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
4955 	intel_engine_init_breadcrumbs(&ve->base);
4956 	intel_engine_init_execlists(&ve->base);
4957 
4958 	ve->base.cops = &virtual_context_ops;
4959 	ve->base.request_alloc = execlists_request_alloc;
4960 
4961 	ve->base.schedule = i915_schedule;
4962 	ve->base.submit_request = virtual_submit_request;
4963 	ve->base.bond_execute = virtual_bond_execute;
4964 
4965 	INIT_LIST_HEAD(virtual_queue(ve));
4966 	ve->base.execlists.queue_priority_hint = INT_MIN;
4967 	tasklet_init(&ve->base.execlists.tasklet,
4968 		     virtual_submission_tasklet,
4969 		     (unsigned long)ve);
4970 
4971 	intel_context_init(&ve->context, &ve->base);
4972 
4973 	for (n = 0; n < count; n++) {
4974 		struct intel_engine_cs *sibling = siblings[n];
4975 
4976 		GEM_BUG_ON(!is_power_of_2(sibling->mask));
4977 		if (sibling->mask & ve->base.mask) {
4978 			DRM_DEBUG("duplicate %s entry in load balancer\n",
4979 				  sibling->name);
4980 			err = -EINVAL;
4981 			goto err_put;
4982 		}
4983 
4984 		/*
4985 		 * The virtual engine implementation is tightly coupled to
4986 		 * the execlists backend -- we push requests directly
4987 		 * into a tree inside each physical engine. We could support
4988 		 * layering if we handle cloning of the requests and
4989 		 * submitting a copy into each backend.
4990 		 */
4991 		if (sibling->execlists.tasklet.func !=
4992 		    execlists_submission_tasklet) {
4993 			err = -ENODEV;
4994 			goto err_put;
4995 		}
4996 
4997 		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
4998 		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
4999 
5000 		ve->siblings[ve->num_siblings++] = sibling;
5001 		ve->base.mask |= sibling->mask;
5002 
5003 		/*
5004 		 * All physical engines must be compatible for their emission
5005 		 * functions (as we build the instructions during request
5006 		 * construction and do not alter them before submission
5007 		 * on the physical engine). We use the engine class as a guide
5008 		 * here, although that could be refined.
5009 		 */
5010 		if (ve->base.class != OTHER_CLASS) {
5011 			if (ve->base.class != sibling->class) {
5012 				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
5013 					  sibling->class, ve->base.class);
5014 				err = -EINVAL;
5015 				goto err_put;
5016 			}
5017 			continue;
5018 		}
5019 
5020 		ve->base.class = sibling->class;
5021 		ve->base.uabi_class = sibling->uabi_class;
5022 		snprintf(ve->base.name, sizeof(ve->base.name),
5023 			 "v%dx%d", ve->base.class, count);
5024 		ve->base.context_size = sibling->context_size;
5025 
5026 		ve->base.emit_bb_start = sibling->emit_bb_start;
5027 		ve->base.emit_flush = sibling->emit_flush;
5028 		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
5029 		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
5030 		ve->base.emit_fini_breadcrumb_dw =
5031 			sibling->emit_fini_breadcrumb_dw;
5032 
5033 		ve->base.flags = sibling->flags;
5034 	}
5035 
5036 	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
5037 
5038 	return &ve->context;
5039 
5040 err_put:
5041 	intel_context_put(&ve->context);
5042 	return ERR_PTR(err);
5043 }
5044 
5045 struct intel_context *
5046 intel_execlists_clone_virtual(struct intel_engine_cs *src)
5047 {
5048 	struct virtual_engine *se = to_virtual_engine(src);
5049 	struct intel_context *dst;
5050 
5051 	dst = intel_execlists_create_virtual(se->siblings,
5052 					     se->num_siblings);
5053 	if (IS_ERR(dst))
5054 		return dst;
5055 
5056 	if (se->num_bonds) {
5057 		struct virtual_engine *de = to_virtual_engine(dst->engine);
5058 
5059 		de->bonds = kmemdup(se->bonds,
5060 				    sizeof(*se->bonds) * se->num_bonds,
5061 				    GFP_KERNEL);
5062 		if (!de->bonds) {
5063 			intel_context_put(dst);
5064 			return ERR_PTR(-ENOMEM);
5065 		}
5066 
5067 		de->num_bonds = se->num_bonds;
5068 	}
5069 
5070 	return dst;
5071 }
5072 
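/*
 * Record a bond on the virtual engine: when a bonded submission is signalled
 * by @master, the request is restricted to @sibling (plus any siblings
 * previously attached for the same master). Illustrative only, assuming the
 * ve, rcs0 and vcs1 handles exist:
 *
 *	err = intel_virtual_engine_attach_bond(ve, rcs0, vcs1);
 *	if (err)
 *		return err;
 */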
5073 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
5074 				     const struct intel_engine_cs *master,
5075 				     const struct intel_engine_cs *sibling)
5076 {
5077 	struct virtual_engine *ve = to_virtual_engine(engine);
5078 	struct ve_bond *bond;
5079 	int n;
5080 
5081 	/* Sanity check the sibling is part of the virtual engine */
5082 	for (n = 0; n < ve->num_siblings; n++)
5083 		if (sibling == ve->siblings[n])
5084 			break;
5085 	if (n == ve->num_siblings)
5086 		return -EINVAL;
5087 
5088 	bond = virtual_find_bond(ve, master);
5089 	if (bond) {
5090 		bond->sibling_mask |= sibling->mask;
5091 		return 0;
5092 	}
5093 
5094 	bond = krealloc(ve->bonds,
5095 			sizeof(*bond) * (ve->num_bonds + 1),
5096 			GFP_KERNEL);
5097 	if (!bond)
5098 		return -ENOMEM;
5099 
5100 	bond[ve->num_bonds].master = master;
5101 	bond[ve->num_bonds].sibling_mask = sibling->mask;
5102 
5103 	ve->bonds = bond;
5104 	ve->num_bonds++;
5105 
5106 	return 0;
5107 }
5108 
5109 struct intel_engine_cs *
5110 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
5111 				 unsigned int sibling)
5112 {
5113 	struct virtual_engine *ve = to_virtual_engine(engine);
5114 
5115 	if (sibling >= ve->num_siblings)
5116 		return NULL;
5117 
5118 	return ve->siblings[sibling];
5119 }
5120 
5121 void intel_execlists_show_requests(struct intel_engine_cs *engine,
5122 				   struct drm_printer *m,
5123 				   void (*show_request)(struct drm_printer *m,
5124 							struct i915_request *rq,
5125 							const char *prefix),
5126 				   unsigned int max)
5127 {
5128 	const struct intel_engine_execlists *execlists = &engine->execlists;
5129 	struct i915_request *rq, *last;
5130 	unsigned long flags;
5131 	unsigned int count;
5132 	struct rb_node *rb;
5133 
5134 	spin_lock_irqsave(&engine->active.lock, flags);
5135 
5136 	last = NULL;
5137 	count = 0;
5138 	list_for_each_entry(rq, &engine->active.requests, sched.link) {
5139 		if (count++ < max - 1)
5140 			show_request(m, rq, "\t\tE ");
5141 		else
5142 			last = rq;
5143 	}
5144 	if (last) {
5145 		if (count > max) {
5146 			drm_printf(m,
5147 				   "\t\t...skipping %d executing requests...\n",
5148 				   count - max);
5149 		}
5150 		show_request(m, last, "\t\tE ");
5151 	}
5152 
5153 	last = NULL;
5154 	count = 0;
5155 	if (execlists->queue_priority_hint != INT_MIN)
5156 		drm_printf(m, "\t\tQueue priority hint: %d\n",
5157 			   execlists->queue_priority_hint);
5158 	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
5159 		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
5160 		int i;
5161 
5162 		priolist_for_each_request(rq, p, i) {
5163 			if (count++ < max - 1)
5164 				show_request(m, rq, "\t\tQ ");
5165 			else
5166 				last = rq;
5167 		}
5168 	}
5169 	if (last) {
5170 		if (count > max) {
5171 			drm_printf(m,
5172 				   "\t\t...skipping %d queued requests...\n",
5173 				   count - max);
5174 		}
5175 		show_request(m, last, "\t\tQ ");
5176 	}
5177 
5178 	last = NULL;
5179 	count = 0;
5180 	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
5181 		struct virtual_engine *ve =
5182 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
5183 		struct i915_request *rq = READ_ONCE(ve->request);
5184 
5185 		if (rq) {
5186 			if (count++ < max - 1)
5187 				show_request(m, rq, "\t\tV ");
5188 			else
5189 				last = rq;
5190 		}
5191 	}
5192 	if (last) {
5193 		if (count > max) {
5194 			drm_printf(m,
5195 				   "\t\t...skipping %d virtual requests...\n",
5196 				   count - max);
5197 		}
5198 		show_request(m, last, "\t\tV ");
5199 	}
5200 
5201 	spin_unlock_irqrestore(&engine->active.lock, flags);
5202 }
5203 
5204 void intel_lr_context_reset(struct intel_engine_cs *engine,
5205 			    struct intel_context *ce,
5206 			    u32 head,
5207 			    bool scrub)
5208 {
5209 	GEM_BUG_ON(!intel_context_is_pinned(ce));
5210 
5211 	/*
5212 	 * We want a simple context + ring to execute the breadcrumb update.
5213 	 * We cannot rely on the context being intact across the GPU hang,
5214 	 * so clear it and rebuild just what we need for the breadcrumb.
5215 	 * All pending requests for this context will be zapped, and any
5216 	 * future request will be after userspace has had the opportunity
5217 	 * to recreate its own state.
5218 	 */
5219 	if (scrub)
5220 		restore_default_state(ce, engine);
5221 
5222 	/* Rerun the request; its payload has been neutered (if guilty). */
5223 	ce->ring->head = head;
5224 	intel_ring_update_space(ce->ring);
5225 
5226 	__execlists_update_reg_state(ce, engine);
5227 }
5228 
5229 bool
5230 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
5231 {
5232 	return engine->set_default_submission ==
5233 	       intel_execlists_set_default_submission;
5234 }
5235 
5236 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
5237 #include "selftest_lrc.c"
5238 #endif
5239