xref: /linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision 1d1997db870f4058676439ef7014390ba9e24eb2)
1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Ben Widawsky <ben@bwidawsk.net>
25  *    Michel Thierry <michel.thierry@intel.com>
26  *    Thomas Daniel <thomas.daniel@intel.com>
27  *    Oscar Mateo <oscar.mateo@intel.com>
28  *
29  */
30 
31 /**
32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
33  *
34  * Motivation:
35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36  * These expanded contexts enable a number of new abilities, especially
37  * "Execlists" (also implemented in this file).
38  *
39  * One of the main differences with the legacy HW contexts is that logical
40  * ring contexts incorporate many more things to the context's state, like
41  * PDPs or ringbuffer control registers:
42  *
43  * The reason why PDPs are included in the context is straightforward: as
44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45  * contained there mean you don't need to do a ppgtt->switch_mm yourself,
46  * instead, the GPU will do it for you on the context switch.
47  *
48  * But, what about the ringbuffer control registers (head, tail, etc.)?
49  * Shouldn't we just need a set of those per engine command streamer? This is
50  * where the name "Logical Rings" starts to make sense: by virtualizing the
51  * rings, the engine cs shifts to a new "ring buffer" with every context
52  * switch. When you want to submit a workload to the GPU you: A) choose your
53  * context, B) find its appropriate virtualized ring, C) write commands to it
54  * and then, finally, D) tell the GPU to switch to that context.
55  *
56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57  * to a context is via a context execution list, ergo "Execlists".
58  *
59  * LRC implementation:
60  * Regarding the creation of contexts, we have:
61  *
62  * - One global default context.
63  * - One local default context for each opened fd.
64  * - One local extra context for each context create ioctl call.
65  *
66  * Now that ringbuffers belong per-context (and not per-engine, like before)
67  * and that contexts are uniquely tied to a given engine (and not reusable,
68  * like before) we need:
69  *
70  * - One ringbuffer per-engine inside each context.
71  * - One backing object per-engine inside each context.
72  *
73  * The global default context starts its life with these new objects fully
74  * allocated and populated. The local default context for each opened fd is
75  * more complex, because we don't know at creation time which engine is going
76  * to use them. To handle this, we have implemented a deferred creation of LR
77  * contexts:
78  *
79  * The local context starts its life as a hollow or blank holder, that only
80  * gets populated for a given engine once we receive an execbuffer. If later
81  * on we receive another execbuffer ioctl for the same context but a different
82  * engine, we allocate/populate a new ringbuffer and context backing object and
83  * so on.
84  *
85  * Finally, regarding local contexts created using the ioctl call: as they are
86  * only allowed with the render ring, we can allocate & populate them right
87  * away (no need to defer anything, at least for now).
88  *
89  * Execlists implementation:
90  * Execlists are the new method by which, on gen8+ hardware, workloads are
91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92  * This method works as follows:
93  *
94  * When a request is committed, its commands (the BB start and any leading or
95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96  * for the appropriate context. The tail pointer in the hardware context is not
97  * updated at this time, but instead, kept by the driver in the ringbuffer
98  * structure. A structure representing this request is added to a request queue
99  * for the appropriate engine: this structure contains a copy of the context's
100  * tail after the request was written to the ring buffer and a pointer to the
101  * context itself.
102  *
103  * If the engine's request queue was empty before the request was added, the
104  * queue is processed immediately. Otherwise the queue will be processed during
105  * a context switch interrupt. In any case, elements on the queue will get sent
106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107  * globally unique 20-bit submission ID.
108  *
109  * When execution of a request completes, the GPU updates the context status
110  * buffer with a context complete event and generates a context switch interrupt.
111  * During the interrupt handling, the driver examines the events in the buffer:
112  * for each context complete event, if the announced ID matches that on the head
113  * of the request queue, then that request is retired and removed from the queue.
114  *
115  * After processing, if any requests were retired and the queue is not empty
116  * then a new execution list can be submitted. The two requests at the front of
117  * the queue are next to be submitted but since a context may not occur twice in
118  * an execution list, if subsequent requests have the same ID as the first then
119  * the two requests must be combined. This is done simply by discarding requests
120  * at the head of the queue until either only one request is left (in which case
121  * we use a NULL second context) or the first two requests have unique IDs.
122  *
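 * As a rough sketch of the pairing rule described above (the helpers named
 * here are purely illustrative, not functions in this driver):
 *
 *	elsp[0] = pop_head(queue);
 *	while (!queue_empty(queue) && same_context(peek(queue), elsp[0]))
 *		elsp[0] = pop_head(queue); // fold into one RING_TAIL update
 *	elsp[1] = queue_empty(queue) ? NULL : peek(queue);
 *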
123  * By always executing the first two requests in the queue the driver ensures
124  * that the GPU is kept as busy as possible. In the case where a single context
125  * completes but a second context is still executing, the request for this second
126  * context will be at the head of the queue when we remove the first one. This
127  * request will then be resubmitted along with a new request for a different context,
128  * which will cause the hardware to continue executing the second request and queue
129  * the new request (the GPU detects the condition of a context getting preempted
130  * with the same context and optimizes the context switch flow by not doing
131  * preemption, but just sampling the new tail pointer).
132  *
133  */
134 #include <linux/interrupt.h>
135 
136 #include "gem/i915_gem_context.h"
137 
138 #include "i915_drv.h"
139 #include "i915_perf.h"
140 #include "i915_trace.h"
141 #include "i915_vgpu.h"
142 #include "intel_engine_pm.h"
143 #include "intel_gt.h"
144 #include "intel_gt_pm.h"
145 #include "intel_gt_requests.h"
146 #include "intel_lrc_reg.h"
147 #include "intel_mocs.h"
148 #include "intel_reset.h"
149 #include "intel_ring.h"
150 #include "intel_workarounds.h"
151 
152 #define RING_EXECLIST_QFULL		(1 << 0x2)
153 #define RING_EXECLIST1_VALID		(1 << 0x3)
154 #define RING_EXECLIST0_VALID		(1 << 0x4)
155 #define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
156 #define RING_EXECLIST1_ACTIVE		(1 << 0x11)
157 #define RING_EXECLIST0_ACTIVE		(1 << 0x12)
158 
159 #define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
160 #define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
161 #define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
162 #define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
163 #define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
164 #define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)
165 
166 #define GEN8_CTX_STATUS_COMPLETED_MASK \
167 	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
168 
169 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
170 
171 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
172 #define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
173 #define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
174 #define GEN12_IDLE_CTX_ID		0x7FF
175 #define GEN12_CSB_CTX_VALID(csb_dw) \
176 	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
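
/*
 * Worked example (illustrative values only): a lower CSB dword of 0x1 has
 * GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE set; an upper CSB dword whose
 * bits 25:15 are all set decodes to a SW context ID of GEN12_IDLE_CTX_ID
 * (0x7ff), so GEN12_CSB_CTX_VALID() is false and that half of the event
 * names no context.
 */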
177 
178 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
179 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
180 #define WA_TAIL_DWORDS 2
181 #define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
182 
183 struct virtual_engine {
184 	struct intel_engine_cs base;
185 	struct intel_context context;
186 
187 	/*
188 	 * We allow only a single request through the virtual engine at a time
189 	 * (each request in the timeline waits for the completion fence of
190 	 * the previous before being submitted). By restricting ourselves to
191 	 * only submitting a single request, each request is placed on to a
192 	 * physical engine to maximise load spreading (by virtue of the late greedy
193 	 * scheduling -- each real engine takes the next available request
194 	 * upon idling).
195 	 */
196 	struct i915_request *request;
197 
198 	/*
199 	 * We keep a rbtree of available virtual engines inside each physical
200 	 * engine, sorted by priority. Here we preallocate the nodes we need
201 	 * for the virtual engine, indexed by physical_engine->id.
202 	 */
203 	struct ve_node {
204 		struct rb_node rb;
205 		int prio;
206 	} nodes[I915_NUM_ENGINES];
207 
208 	/*
209 	 * Keep track of bonded pairs -- restrictions upon our selection
210 	 * of physical engines any particular request may be submitted to.
211 	 * If we receive a submit-fence from a master engine, we will only
212 	 * use one of the physical engines in sibling_mask.
213 	 */
214 	struct ve_bond {
215 		const struct intel_engine_cs *master;
216 		intel_engine_mask_t sibling_mask;
217 	} *bonds;
218 	unsigned int num_bonds;
219 
220 	/* And finally, which physical engines this virtual engine maps onto. */
221 	unsigned int num_siblings;
222 	struct intel_engine_cs *siblings[0];
223 };
224 
225 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
226 {
227 	GEM_BUG_ON(!intel_engine_is_virtual(engine));
228 	return container_of(engine, struct virtual_engine, base);
229 }
230 
231 static int __execlists_context_alloc(struct intel_context *ce,
232 				     struct intel_engine_cs *engine);
233 
234 static void execlists_init_reg_state(u32 *reg_state,
235 				     const struct intel_context *ce,
236 				     const struct intel_engine_cs *engine,
237 				     const struct intel_ring *ring,
238 				     bool close);
239 static void
240 __execlists_update_reg_state(const struct intel_context *ce,
241 			     const struct intel_engine_cs *engine);
242 
243 static void mark_eio(struct i915_request *rq)
244 {
245 	if (i915_request_completed(rq))
246 		return;
247 
248 	GEM_BUG_ON(i915_request_signaled(rq));
249 
250 	dma_fence_set_error(&rq->fence, -EIO);
251 	i915_request_mark_complete(rq);
252 }
253 
254 static struct i915_request *
255 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
256 {
257 	struct i915_request *active = rq;
258 
259 	rcu_read_lock();
260 	list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
261 		if (i915_request_completed(rq))
262 			break;
263 
264 		active = rq;
265 	}
266 	rcu_read_unlock();
267 
268 	return active;
269 }
270 
271 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
272 {
273 	return (i915_ggtt_offset(engine->status_page.vma) +
274 		I915_GEM_HWS_PREEMPT_ADDR);
275 }
276 
277 static inline void
278 ring_set_paused(const struct intel_engine_cs *engine, int state)
279 {
280 	/*
281 	 * We inspect HWS_PREEMPT with a semaphore inside
282 	 * engine->emit_fini_breadcrumb. If the dword is true,
283 	 * the ring is paused as the semaphore will busywait
284 	 * until the dword is false.
285 	 */
286 	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
287 	if (state)
288 		wmb();
289 }
290 
291 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
292 {
293 	return rb_entry(rb, struct i915_priolist, node);
294 }
295 
296 static inline int rq_prio(const struct i915_request *rq)
297 {
298 	return rq->sched.attr.priority;
299 }
300 
301 static int effective_prio(const struct i915_request *rq)
302 {
303 	int prio = rq_prio(rq);
304 
305 	/*
306 	 * If this request is special and must not be interrupted at any
307 	 * cost, so be it. Note we are only checking the most recent request
308 	 * in the context and so may be masking an earlier vip request. It
309 	 * is hoped that under the conditions where nopreempt is used, this
310 	 * will not matter (i.e. all requests to that context will be
311 	 * nopreempt for as long as desired).
312 	 */
313 	if (i915_request_has_nopreempt(rq))
314 		prio = I915_PRIORITY_UNPREEMPTABLE;
315 
316 	/*
317 	 * On unwinding the active request, we give it a priority bump
318 	 * if it has completed waiting on any semaphore. If we know that
319 	 * the request has already started, we can prevent an unwanted
320 	 * preempt-to-idle cycle by taking that into account now.
321 	 */
322 	if (__i915_request_has_started(rq))
323 		prio |= I915_PRIORITY_NOSEMAPHORE;
324 
325 	/* Restrict mere WAIT boosts from triggering preemption */
326 	BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
327 	return prio | __NO_PREEMPTION;
328 }
329 
330 static int queue_prio(const struct intel_engine_execlists *execlists)
331 {
332 	struct i915_priolist *p;
333 	struct rb_node *rb;
334 
335 	rb = rb_first_cached(&execlists->queue);
336 	if (!rb)
337 		return INT_MIN;
338 
339 	/*
340 	 * As the priolist[] are inverted, with the highest priority in [0],
341 	 * we have to flip the index value back into a priority.
342 	 */
343 	p = to_priolist(rb);
344 	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
345 }
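
/*
 * Worked example of the arithmetic above, assuming p->priority holds the
 * user priority level P (as set up by the scheduler's priolist lookup):
 * ffs() is 1-based, so if the lowest set bit of p->used is index b, the
 * result is ((P + 1) << I915_USER_PRIORITY_SHIFT) - (b + 1). Since the
 * sub-level indices are inverted, that is the effective priority of the
 * highest-priority non-empty sublist.
 */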
346 
347 static inline bool need_preempt(const struct intel_engine_cs *engine,
348 				const struct i915_request *rq,
349 				struct rb_node *rb)
350 {
351 	int last_prio;
352 
353 	if (!intel_engine_has_semaphores(engine))
354 		return false;
355 
356 	/*
357 	 * Check if the current priority hint merits a preemption attempt.
358 	 *
359 	 * We record the highest value priority we saw during rescheduling
360 	 * prior to this dequeue, therefore we know that if it is strictly
361 	 * less than the current tail of ELSP[0], we do not need to force
362 	 * a preempt-to-idle cycle.
363 	 *
364 	 * However, the priority hint is a mere hint that we may need to
365 	 * preempt. If that hint is stale or we may be trying to preempt
366 	 * ourselves, ignore the request.
367 	 *
368 	 * More naturally we would write
369 	 *      prio >= max(0, last);
370 	 * except that we wish to prevent triggering preemption at the same
371 	 * priority level: the task that is running should remain running
372 	 * to preserve FIFO ordering of dependencies.
373 	 */
374 	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
375 	if (engine->execlists.queue_priority_hint <= last_prio)
376 		return false;
377 
378 	/*
379 	 * Check against the first request in ELSP[1], it will, thanks to the
380 	 * power of PI, be the highest priority of that context.
381 	 */
382 	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
383 	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
384 		return true;
385 
386 	if (rb) {
387 		struct virtual_engine *ve =
388 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
389 		bool preempt = false;
390 
391 		if (engine == ve->siblings[0]) { /* only preempt one sibling */
392 			struct i915_request *next;
393 
394 			rcu_read_lock();
395 			next = READ_ONCE(ve->request);
396 			if (next)
397 				preempt = rq_prio(next) > last_prio;
398 			rcu_read_unlock();
399 		}
400 
401 		if (preempt)
402 			return preempt;
403 	}
404 
405 	/*
406 	 * If the inflight context did not trigger the preemption, then maybe
407 	 * it was the set of queued requests? Pick the highest priority in
408 	 * the queue (the first active priolist) and see if it deserves to be
409 	 * running instead of ELSP[0].
410 	 *
411 	 * The highest priority request in the queue cannot be either
412 	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
413 	 * context, its priority would not exceed ELSP[0] aka last_prio.
414 	 */
415 	return queue_prio(&engine->execlists) > last_prio;
416 }
417 
418 __maybe_unused static inline bool
419 assert_priority_queue(const struct i915_request *prev,
420 		      const struct i915_request *next)
421 {
422 	/*
423 	 * Without preemption, the prev may refer to the still active element
424 	 * which we refuse to let go.
425 	 *
426 	 * Even with preemption, there are times when we think it is better not
427 	 * to preempt and leave an ostensibly lower priority request in flight.
428 	 */
429 	if (i915_request_is_active(prev))
430 		return true;
431 
432 	return rq_prio(prev) >= rq_prio(next);
433 }
434 
435 /*
436  * The context descriptor encodes various attributes of a context,
437  * including its GTT address and some flags. Because it's fairly
438  * expensive to calculate, we'll just do it once and cache the result,
439  * which remains valid until the context is unpinned.
440  *
441  * This is what a descriptor looks like, from LSB to MSB::
442  *
443  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
444  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
445  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
446  *      bits 53-54:    mbz, reserved for use by hardware
447  *      bits 55-63:    group ID, currently unused and set to 0
448  *
449  * Starting from Gen11, the upper dword of the descriptor has a new format:
450  *
451  *      bits 32-36:    reserved
452  *      bits 37-47:    SW context ID
453  *      bits 48-53:    engine instance
454  *      bit 54:        mbz, reserved for use by hardware
455  *      bits 55-60:    SW counter
456  *      bits 61-63:    engine class
457  *
458  * engine info, SW context ID and SW counter need to form a unique number
459  * (Context ID) per lrc.
460  */
461 static u64
462 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
463 {
464 	u64 desc;
465 
466 	desc = INTEL_LEGACY_32B_CONTEXT;
467 	if (i915_vm_is_4lvl(ce->vm))
468 		desc = INTEL_LEGACY_64B_CONTEXT;
469 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
470 
471 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
472 	if (IS_GEN(engine->i915, 8))
473 		desc |= GEN8_CTX_L3LLC_COHERENT;
474 
475 	desc |= i915_ggtt_offset(ce->state); /* bits 12-31 */
476 	/*
477 	 * The following 32bits are copied into the OA reports (dword 2).
478 	 * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
479 	 * anything below.
480 	 */
481 	if (INTEL_GEN(engine->i915) >= 11) {
482 		desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
483 								/* bits 48-53 */
484 
485 		desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
486 								/* bits 61-63 */
487 	}
488 
489 	return desc;
490 }
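
/*
 * Illustrative only: for a gen11+ engine of class C and instance I whose
 * context state is pinned at GGTT offset G (page aligned), the descriptor
 * built above is roughly
 *
 *	(C << GEN11_ENGINE_CLASS_SHIFT) | (I << GEN11_ENGINE_INSTANCE_SHIFT) |
 *	G | GEN8_CTX_PRIVILEGE | GEN8_CTX_VALID |
 *	(addressing mode << GEN8_CTX_ADDRESSING_MODE_SHIFT)
 *
 * The SW context ID (bits 37-47) is left clear here and filled in later,
 * at submission time, by __execlists_schedule_in().
 */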
491 
492 static u32 *set_offsets(u32 *regs,
493 			const u8 *data,
494 			const struct intel_engine_cs *engine)
495 #define NOP(x) (BIT(7) | (x))
496 #define LRI(count, flags) ((flags) << 6 | (count))
497 #define POSTED BIT(0)
498 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
499 #define REG16(x) \
500 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
501 	(((x) >> 2) & 0x7f)
502 #define END() 0
503 {
504 	const u32 base = engine->mmio_base;
505 
506 	while (*data) {
507 		u8 count, flags;
508 
509 		if (*data & BIT(7)) { /* skip */
510 			regs += *data++ & ~BIT(7);
511 			continue;
512 		}
513 
514 		count = *data & 0x3f;
515 		flags = *data >> 6;
516 		data++;
517 
518 		*regs = MI_LOAD_REGISTER_IMM(count);
519 		if (flags & POSTED)
520 			*regs |= MI_LRI_FORCE_POSTED;
521 		if (INTEL_GEN(engine->i915) >= 11)
522 			*regs |= MI_LRI_CS_MMIO;
523 		regs++;
524 
525 		GEM_BUG_ON(!count);
526 		do {
527 			u32 offset = 0;
528 			u8 v;
529 
530 			do {
531 				v = *data++;
532 				offset <<= 7;
533 				offset |= v & ~BIT(7);
534 			} while (v & BIT(7));
535 
536 			*regs = base + (offset << 2);
537 			regs += 2;
538 		} while (--count);
539 	}
540 
541 	return regs;
542 }
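
/*
 * A short decode of the tables below, as set_offsets() consumes them
 * (byte values shown are what the macros expand to): gen8_xcs_offsets[]
 * starts with
 *
 *	NOP(1)        -> 0x81        skip one dword of the image
 *	LRI(11, 0)    -> 0x0b        emit MI_LOAD_REGISTER_IMM(11)
 *	REG16(0x244)  -> 0x81, 0x11  offset 0x244 in two 7bit chunks
 *	REG(0x034)    -> 0x0d        offset 0x034 >> 2
 *
 * so the loop writes mmio_base + 0x244, mmio_base + 0x034, ... into every
 * other dword after the LRI header, leaving the value slots untouched for
 * the default register state.
 */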
543 
544 static const u8 gen8_xcs_offsets[] = {
545 	NOP(1),
546 	LRI(11, 0),
547 	REG16(0x244),
548 	REG(0x034),
549 	REG(0x030),
550 	REG(0x038),
551 	REG(0x03c),
552 	REG(0x168),
553 	REG(0x140),
554 	REG(0x110),
555 	REG(0x11c),
556 	REG(0x114),
557 	REG(0x118),
558 
559 	NOP(9),
560 	LRI(9, 0),
561 	REG16(0x3a8),
562 	REG16(0x28c),
563 	REG16(0x288),
564 	REG16(0x284),
565 	REG16(0x280),
566 	REG16(0x27c),
567 	REG16(0x278),
568 	REG16(0x274),
569 	REG16(0x270),
570 
571 	NOP(13),
572 	LRI(2, 0),
573 	REG16(0x200),
574 	REG(0x028),
575 
576 	END(),
577 };
578 
579 static const u8 gen9_xcs_offsets[] = {
580 	NOP(1),
581 	LRI(14, POSTED),
582 	REG16(0x244),
583 	REG(0x034),
584 	REG(0x030),
585 	REG(0x038),
586 	REG(0x03c),
587 	REG(0x168),
588 	REG(0x140),
589 	REG(0x110),
590 	REG(0x11c),
591 	REG(0x114),
592 	REG(0x118),
593 	REG(0x1c0),
594 	REG(0x1c4),
595 	REG(0x1c8),
596 
597 	NOP(3),
598 	LRI(9, POSTED),
599 	REG16(0x3a8),
600 	REG16(0x28c),
601 	REG16(0x288),
602 	REG16(0x284),
603 	REG16(0x280),
604 	REG16(0x27c),
605 	REG16(0x278),
606 	REG16(0x274),
607 	REG16(0x270),
608 
609 	NOP(13),
610 	LRI(1, POSTED),
611 	REG16(0x200),
612 
613 	NOP(13),
614 	LRI(44, POSTED),
615 	REG(0x028),
616 	REG(0x09c),
617 	REG(0x0c0),
618 	REG(0x178),
619 	REG(0x17c),
620 	REG16(0x358),
621 	REG(0x170),
622 	REG(0x150),
623 	REG(0x154),
624 	REG(0x158),
625 	REG16(0x41c),
626 	REG16(0x600),
627 	REG16(0x604),
628 	REG16(0x608),
629 	REG16(0x60c),
630 	REG16(0x610),
631 	REG16(0x614),
632 	REG16(0x618),
633 	REG16(0x61c),
634 	REG16(0x620),
635 	REG16(0x624),
636 	REG16(0x628),
637 	REG16(0x62c),
638 	REG16(0x630),
639 	REG16(0x634),
640 	REG16(0x638),
641 	REG16(0x63c),
642 	REG16(0x640),
643 	REG16(0x644),
644 	REG16(0x648),
645 	REG16(0x64c),
646 	REG16(0x650),
647 	REG16(0x654),
648 	REG16(0x658),
649 	REG16(0x65c),
650 	REG16(0x660),
651 	REG16(0x664),
652 	REG16(0x668),
653 	REG16(0x66c),
654 	REG16(0x670),
655 	REG16(0x674),
656 	REG16(0x678),
657 	REG16(0x67c),
658 	REG(0x068),
659 
660 	END(),
661 };
662 
663 static const u8 gen12_xcs_offsets[] = {
664 	NOP(1),
665 	LRI(13, POSTED),
666 	REG16(0x244),
667 	REG(0x034),
668 	REG(0x030),
669 	REG(0x038),
670 	REG(0x03c),
671 	REG(0x168),
672 	REG(0x140),
673 	REG(0x110),
674 	REG(0x1c0),
675 	REG(0x1c4),
676 	REG(0x1c8),
677 	REG(0x180),
678 	REG16(0x2b4),
679 
680 	NOP(5),
681 	LRI(9, POSTED),
682 	REG16(0x3a8),
683 	REG16(0x28c),
684 	REG16(0x288),
685 	REG16(0x284),
686 	REG16(0x280),
687 	REG16(0x27c),
688 	REG16(0x278),
689 	REG16(0x274),
690 	REG16(0x270),
691 
692 	END(),
693 };
694 
695 static const u8 gen8_rcs_offsets[] = {
696 	NOP(1),
697 	LRI(14, POSTED),
698 	REG16(0x244),
699 	REG(0x034),
700 	REG(0x030),
701 	REG(0x038),
702 	REG(0x03c),
703 	REG(0x168),
704 	REG(0x140),
705 	REG(0x110),
706 	REG(0x11c),
707 	REG(0x114),
708 	REG(0x118),
709 	REG(0x1c0),
710 	REG(0x1c4),
711 	REG(0x1c8),
712 
713 	NOP(3),
714 	LRI(9, POSTED),
715 	REG16(0x3a8),
716 	REG16(0x28c),
717 	REG16(0x288),
718 	REG16(0x284),
719 	REG16(0x280),
720 	REG16(0x27c),
721 	REG16(0x278),
722 	REG16(0x274),
723 	REG16(0x270),
724 
725 	NOP(13),
726 	LRI(1, 0),
727 	REG(0x0c8),
728 
729 	END(),
730 };
731 
732 static const u8 gen11_rcs_offsets[] = {
733 	NOP(1),
734 	LRI(15, POSTED),
735 	REG16(0x244),
736 	REG(0x034),
737 	REG(0x030),
738 	REG(0x038),
739 	REG(0x03c),
740 	REG(0x168),
741 	REG(0x140),
742 	REG(0x110),
743 	REG(0x11c),
744 	REG(0x114),
745 	REG(0x118),
746 	REG(0x1c0),
747 	REG(0x1c4),
748 	REG(0x1c8),
749 	REG(0x180),
750 
751 	NOP(1),
752 	LRI(9, POSTED),
753 	REG16(0x3a8),
754 	REG16(0x28c),
755 	REG16(0x288),
756 	REG16(0x284),
757 	REG16(0x280),
758 	REG16(0x27c),
759 	REG16(0x278),
760 	REG16(0x274),
761 	REG16(0x270),
762 
763 	LRI(1, POSTED),
764 	REG(0x1b0),
765 
766 	NOP(10),
767 	LRI(1, 0),
768 	REG(0x0c8),
769 
770 	END(),
771 };
772 
773 static const u8 gen12_rcs_offsets[] = {
774 	NOP(1),
775 	LRI(13, POSTED),
776 	REG16(0x244),
777 	REG(0x034),
778 	REG(0x030),
779 	REG(0x038),
780 	REG(0x03c),
781 	REG(0x168),
782 	REG(0x140),
783 	REG(0x110),
784 	REG(0x1c0),
785 	REG(0x1c4),
786 	REG(0x1c8),
787 	REG(0x180),
788 	REG16(0x2b4),
789 
790 	NOP(5),
791 	LRI(9, POSTED),
792 	REG16(0x3a8),
793 	REG16(0x28c),
794 	REG16(0x288),
795 	REG16(0x284),
796 	REG16(0x280),
797 	REG16(0x27c),
798 	REG16(0x278),
799 	REG16(0x274),
800 	REG16(0x270),
801 
802 	LRI(3, POSTED),
803 	REG(0x1b0),
804 	REG16(0x5a8),
805 	REG16(0x5ac),
806 
807 	NOP(6),
808 	LRI(1, 0),
809 	REG(0x0c8),
810 
811 	END(),
812 };
813 
814 #undef END
815 #undef REG16
816 #undef REG
817 #undef LRI
818 #undef NOP
819 
820 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
821 {
822 	/*
823 	 * The gen12+ lists only have the registers we program in the basic
824 	 * default state. We rely on the context image using relative
825 	 * addressing to automatically fix up the register state between the
826 	 * physical engines for the virtual engine.
827 	 */
828 	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
829 		   !intel_engine_has_relative_mmio(engine));
830 
831 	if (engine->class == RENDER_CLASS) {
832 		if (INTEL_GEN(engine->i915) >= 12)
833 			return gen12_rcs_offsets;
834 		else if (INTEL_GEN(engine->i915) >= 11)
835 			return gen11_rcs_offsets;
836 		else
837 			return gen8_rcs_offsets;
838 	} else {
839 		if (INTEL_GEN(engine->i915) >= 12)
840 			return gen12_xcs_offsets;
841 		else if (INTEL_GEN(engine->i915) >= 9)
842 			return gen9_xcs_offsets;
843 		else
844 			return gen8_xcs_offsets;
845 	}
846 }
847 
848 static void unwind_wa_tail(struct i915_request *rq)
849 {
850 	rq->tail = intel_ring_wrap(rq->ring, rq->wa_tail - WA_TAIL_BYTES);
851 	assert_ring_tail_valid(rq->ring, rq->tail);
852 }
853 
854 static struct i915_request *
855 __unwind_incomplete_requests(struct intel_engine_cs *engine)
856 {
857 	struct i915_request *rq, *rn, *active = NULL;
858 	struct list_head *uninitialized_var(pl);
859 	int prio = I915_PRIORITY_INVALID;
860 
861 	lockdep_assert_held(&engine->active.lock);
862 
863 	list_for_each_entry_safe_reverse(rq, rn,
864 					 &engine->active.requests,
865 					 sched.link) {
866 
867 		if (i915_request_completed(rq))
868 			continue; /* XXX */
869 
870 		__i915_request_unsubmit(rq);
871 		unwind_wa_tail(rq);
872 
873 		/*
874 		 * Push the request back into the queue for later resubmission.
875 		 * If this request is not native to this physical engine (i.e.
876 		 * it came from a virtual source), push it back onto the virtual
877 		 * engine so that it can be moved across onto another physical
878 		 * engine as load dictates.
879 		 */
880 		if (likely(rq->execution_mask == engine->mask)) {
881 			GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
882 			if (rq_prio(rq) != prio) {
883 				prio = rq_prio(rq);
884 				pl = i915_sched_lookup_priolist(engine, prio);
885 			}
886 			GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
887 
888 			list_move(&rq->sched.link, pl);
889 			active = rq;
890 		} else {
891 			struct intel_engine_cs *owner = rq->hw_context->engine;
892 
893 			/*
894 			 * Decouple the virtual breadcrumb before moving it
895 			 * back to the virtual engine -- we don't want the
896 			 * request to complete in the background and try
897 			 * and cancel the breadcrumb on the virtual engine
898 			 * (instead of the old engine where it is linked)!
899 			 */
900 			if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
901 				     &rq->fence.flags)) {
902 				spin_lock_nested(&rq->lock,
903 						 SINGLE_DEPTH_NESTING);
904 				i915_request_cancel_breadcrumb(rq);
905 				spin_unlock(&rq->lock);
906 			}
907 			rq->engine = owner;
908 			owner->submit_request(rq);
909 			active = NULL;
910 		}
911 	}
912 
913 	return active;
914 }
915 
916 struct i915_request *
917 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
918 {
919 	struct intel_engine_cs *engine =
920 		container_of(execlists, typeof(*engine), execlists);
921 
922 	return __unwind_incomplete_requests(engine);
923 }
924 
925 static inline void
926 execlists_context_status_change(struct i915_request *rq, unsigned long status)
927 {
928 	/*
929 	 * Only used when GVT-g is enabled now. When GVT-g is disabled,
930 	 * the compiler should eliminate this function as dead-code.
931 	 */
932 	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
933 		return;
934 
935 	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
936 				   status, rq);
937 }
938 
939 static void intel_engine_context_in(struct intel_engine_cs *engine)
940 {
941 	unsigned long flags;
942 
943 	if (READ_ONCE(engine->stats.enabled) == 0)
944 		return;
945 
946 	write_seqlock_irqsave(&engine->stats.lock, flags);
947 
948 	if (engine->stats.enabled > 0) {
949 		if (engine->stats.active++ == 0)
950 			engine->stats.start = ktime_get();
951 		GEM_BUG_ON(engine->stats.active == 0);
952 	}
953 
954 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
955 }
956 
957 static void intel_engine_context_out(struct intel_engine_cs *engine)
958 {
959 	unsigned long flags;
960 
961 	if (READ_ONCE(engine->stats.enabled) == 0)
962 		return;
963 
964 	write_seqlock_irqsave(&engine->stats.lock, flags);
965 
966 	if (engine->stats.enabled > 0) {
967 		ktime_t last;
968 
969 		if (engine->stats.active && --engine->stats.active == 0) {
970 			/*
971 			 * Decrement the active context count and, in case the
972 			 * GPU is now idle, add the elapsed time to the running total.
973 			 */
974 			last = ktime_sub(ktime_get(), engine->stats.start);
975 
976 			engine->stats.total = ktime_add(engine->stats.total,
977 							last);
978 		} else if (engine->stats.active == 0) {
979 			/*
980 			 * After turning on engine stats, context out might be
981 			 * the first event in which case we account from the
982 			 * time stats gathering was turned on.
983 			 */
984 			last = ktime_sub(ktime_get(), engine->stats.enabled_at);
985 
986 			engine->stats.total = ktime_add(engine->stats.total,
987 							last);
988 		}
989 	}
990 
991 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
992 }
993 
994 static void restore_default_state(struct intel_context *ce,
995 				  struct intel_engine_cs *engine)
996 {
997 	u32 *regs = ce->lrc_reg_state;
998 
999 	if (engine->pinned_default_state)
1000 		memcpy(regs, /* skip restoring the vanilla PPHWSP */
1001 		       engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
1002 		       engine->context_size - PAGE_SIZE);
1003 
1004 	execlists_init_reg_state(regs, ce, engine, ce->ring, false);
1005 }
1006 
1007 static void reset_active(struct i915_request *rq,
1008 			 struct intel_engine_cs *engine)
1009 {
1010 	struct intel_context * const ce = rq->hw_context;
1011 	u32 head;
1012 
1013 	/*
1014 	 * The executing context has been cancelled. We want to prevent
1015 	 * further execution along this context and propagate the error on
1016 	 * to anything depending on its results.
1017 	 *
1018 	 * In __i915_request_submit(), we apply the -EIO and remove the
1019 	 * requests' payloads for any banned requests. But first, we must
1020 	 * rewind the context back to the start of the incomplete request so
1021 	 * that we do not jump back into the middle of the batch.
1022 	 *
1023 	 * We preserve the breadcrumbs and semaphores of the incomplete
1024 	 * requests so that inter-timeline dependencies (i.e. other timelines)
1025 	 * remain correctly ordered. And we defer to __i915_request_submit()
1026 	 * so that all asynchronous waits are correctly handled.
1027 	 */
1028 	GEM_TRACE("%s(%s): { rq=%llx:%lld }\n",
1029 		  __func__, engine->name, rq->fence.context, rq->fence.seqno);
1030 
1031 	/* On resubmission of the active request, payload will be scrubbed */
1032 	if (i915_request_completed(rq))
1033 		head = rq->tail;
1034 	else
1035 		head = active_request(ce->timeline, rq)->head;
1036 	ce->ring->head = intel_ring_wrap(ce->ring, head);
1037 	intel_ring_update_space(ce->ring);
1038 
1039 	/* Scrub the context image to prevent replaying the previous batch */
1040 	restore_default_state(ce, engine);
1041 	__execlists_update_reg_state(ce, engine);
1042 
1043 	/* We've switched away, so this should be a no-op, but intent matters */
1044 	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
1045 }
1046 
1047 static inline struct intel_engine_cs *
1048 __execlists_schedule_in(struct i915_request *rq)
1049 {
1050 	struct intel_engine_cs * const engine = rq->engine;
1051 	struct intel_context * const ce = rq->hw_context;
1052 
1053 	intel_context_get(ce);
1054 
1055 	if (unlikely(i915_gem_context_is_banned(ce->gem_context)))
1056 		reset_active(rq, engine);
1057 
1058 	if (ce->tag) {
1059 		/* Use a fixed tag for OA and friends */
1060 		ce->lrc_desc |= (u64)ce->tag << 32;
1061 	} else {
1062 		/* We don't need a strict matching tag, just different values */
1063 		ce->lrc_desc &= ~GENMASK_ULL(47, 37);
1064 		ce->lrc_desc |=
1065 			(u64)(engine->context_tag++ % NUM_CONTEXT_TAG) <<
1066 			GEN11_SW_CTX_ID_SHIFT;
1067 		BUILD_BUG_ON(NUM_CONTEXT_TAG > GEN12_MAX_CONTEXT_HW_ID);
1068 	}
1069 
1070 	intel_gt_pm_get(engine->gt);
1071 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1072 	intel_engine_context_in(engine);
1073 
1074 	return engine;
1075 }
1076 
1077 static inline struct i915_request *
1078 execlists_schedule_in(struct i915_request *rq, int idx)
1079 {
1080 	struct intel_context * const ce = rq->hw_context;
1081 	struct intel_engine_cs *old;
1082 
1083 	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1084 	trace_i915_request_in(rq, idx);
1085 
1086 	old = READ_ONCE(ce->inflight);
1087 	do {
1088 		if (!old) {
1089 			WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1090 			break;
1091 		}
1092 	} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1093 
1094 	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1095 	return i915_request_get(rq);
1096 }
1097 
1098 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1099 {
1100 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1101 	struct i915_request *next = READ_ONCE(ve->request);
1102 
1103 	if (next && next->execution_mask & ~rq->execution_mask)
1104 		tasklet_schedule(&ve->base.execlists.tasklet);
1105 }
1106 
1107 static inline void
1108 __execlists_schedule_out(struct i915_request *rq,
1109 			 struct intel_engine_cs * const engine)
1110 {
1111 	struct intel_context * const ce = rq->hw_context;
1112 
1113 	/*
1114 	 * NB process_csb() is not under the engine->active.lock and hence
1115 	 * schedule_out can race with schedule_in meaning that we should
1116 	 * refrain from doing non-trivial work here.
1117 	 */
1118 
1119 	/*
1120 	 * If we have just completed this context, the engine may now be
1121 	 * idle and we want to re-enter powersaving.
1122 	 */
1123 	if (list_is_last(&rq->link, &ce->timeline->requests) &&
1124 	    i915_request_completed(rq))
1125 		intel_engine_add_retire(engine, ce->timeline);
1126 
1127 	intel_engine_context_out(engine);
1128 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1129 	intel_gt_pm_put_async(engine->gt);
1130 
1131 	/*
1132 	 * If this is part of a virtual engine, its next request may
1133 	 * have been blocked waiting for access to the active context.
1134 	 * We have to kick all the siblings again in case we need to
1135 	 * switch (e.g. the next request is not runnable on this
1136 	 * engine). Hopefully, we will already have submitted the next
1137 	 * request before the tasklet runs and do not need to rebuild
1138 	 * each virtual tree and kick everyone again.
1139 	 */
1140 	if (ce->engine != engine)
1141 		kick_siblings(rq, ce);
1142 
1143 	intel_context_put(ce);
1144 }
1145 
1146 static inline void
1147 execlists_schedule_out(struct i915_request *rq)
1148 {
1149 	struct intel_context * const ce = rq->hw_context;
1150 	struct intel_engine_cs *cur, *old;
1151 
1152 	trace_i915_request_out(rq);
1153 
1154 	old = READ_ONCE(ce->inflight);
1155 	do
1156 		cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1157 	while (!try_cmpxchg(&ce->inflight, &old, cur));
1158 	if (!cur)
1159 		__execlists_schedule_out(rq, old);
1160 
1161 	i915_request_put(rq);
1162 }
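
/*
 * Note on the ce->inflight juggling above: the engine pointer is packed
 * with a small occupancy count in its low (alignment) bits. The first
 * execlists_schedule_in() stores the bare engine pointer, each further
 * submission of the same context bumps the count via ptr_inc(), and
 * execlists_schedule_out() drops it via ptr_dec(); only the final drop
 * (ptr_unmask_bits() == 0) performs the real work in
 * __execlists_schedule_out().
 */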
1163 
1164 static u64 execlists_update_context(const struct i915_request *rq)
1165 {
1166 	struct intel_context *ce = rq->hw_context;
1167 	u64 desc;
1168 
1169 	ce->lrc_reg_state[CTX_RING_TAIL] =
1170 		intel_ring_set_tail(rq->ring, rq->tail);
1171 
1172 	/*
1173 	 * Make sure the context image is complete before we submit it to HW.
1174 	 *
1175 	 * Ostensibly, writes (including the WCB) should be flushed prior to
1176 	 * an uncached write such as our mmio register access, the empirical
1177 	 * evidence (esp. on Braswell) suggests that the WC write into memory
1178 	 * may not be visible to the HW prior to the completion of the UC
1179 	 * register write and that we may begin execution from the context
1180 	 * before its image is complete leading to invalid PD chasing.
1181 	 *
1182 	 * Furthermore, Braswell, at least, wants a full mb to be sure that
1183 	 * the writes are coherent in memory (visible to the GPU) prior to
1184 	 * execution, and not just visible to other CPUs (as is the result of
1185 	 * wmb).
1186 	 */
1187 	mb();
1188 
1189 	desc = ce->lrc_desc;
1190 	ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE;
1191 
1192 	/* Wa_1607138340:tgl */
1193 	if (IS_TGL_REVID(rq->i915, TGL_REVID_A0, TGL_REVID_A0))
1194 		desc |= CTX_DESC_FORCE_RESTORE;
1195 
1196 	return desc;
1197 }
1198 
1199 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1200 {
1201 	if (execlists->ctrl_reg) {
1202 		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1203 		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1204 	} else {
1205 		writel(upper_32_bits(desc), execlists->submit_reg);
1206 		writel(lower_32_bits(desc), execlists->submit_reg);
1207 	}
1208 }
1209 
1210 static __maybe_unused void
1211 trace_ports(const struct intel_engine_execlists *execlists,
1212 	    const char *msg,
1213 	    struct i915_request * const *ports)
1214 {
1215 	const struct intel_engine_cs *engine =
1216 		container_of(execlists, typeof(*engine), execlists);
1217 
1218 	if (!ports[0])
1219 		return;
1220 
1221 	GEM_TRACE("%s: %s { %llx:%lld%s, %llx:%lld }\n",
1222 		  engine->name, msg,
1223 		  ports[0]->fence.context,
1224 		  ports[0]->fence.seqno,
1225 		  i915_request_completed(ports[0]) ? "!" :
1226 		  i915_request_started(ports[0]) ? "*" :
1227 		  "",
1228 		  ports[1] ? ports[1]->fence.context : 0,
1229 		  ports[1] ? ports[1]->fence.seqno : 0);
1230 }
1231 
1232 static __maybe_unused bool
1233 assert_pending_valid(const struct intel_engine_execlists *execlists,
1234 		     const char *msg)
1235 {
1236 	struct i915_request * const *port, *rq;
1237 	struct intel_context *ce = NULL;
1238 
1239 	trace_ports(execlists, msg, execlists->pending);
1240 
1241 	if (!execlists->pending[0]) {
1242 		GEM_TRACE_ERR("Nothing pending for promotion!\n");
1243 		return false;
1244 	}
1245 
1246 	if (execlists->pending[execlists_num_ports(execlists)]) {
1247 		GEM_TRACE_ERR("Excess pending[%d] for promotion!\n",
1248 			      execlists_num_ports(execlists));
1249 		return false;
1250 	}
1251 
1252 	for (port = execlists->pending; (rq = *port); port++) {
1253 		if (ce == rq->hw_context) {
1254 			GEM_TRACE_ERR("Duplicate context in pending[%zd]\n",
1255 				      port - execlists->pending);
1256 			return false;
1257 		}
1258 
1259 		ce = rq->hw_context;
1260 		if (i915_request_completed(rq))
1261 			continue;
1262 
1263 		if (i915_active_is_idle(&ce->active)) {
1264 			GEM_TRACE_ERR("Inactive context in pending[%zd]\n",
1265 				      port - execlists->pending);
1266 			return false;
1267 		}
1268 
1269 		if (!i915_vma_is_pinned(ce->state)) {
1270 			GEM_TRACE_ERR("Unpinned context in pending[%zd]\n",
1271 				      port - execlists->pending);
1272 			return false;
1273 		}
1274 
1275 		if (!i915_vma_is_pinned(ce->ring->vma)) {
1276 			GEM_TRACE_ERR("Unpinned ringbuffer in pending[%zd]\n",
1277 				      port - execlists->pending);
1278 			return false;
1279 		}
1280 	}
1281 
1282 	return ce;
1283 }
1284 
1285 static void execlists_submit_ports(struct intel_engine_cs *engine)
1286 {
1287 	struct intel_engine_execlists *execlists = &engine->execlists;
1288 	unsigned int n;
1289 
1290 	GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1291 
1292 	/*
1293 	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
1294 	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
1295 	 * not be relinquished until the device is idle (see
1296 	 * i915_gem_idle_work_handler()). As a precaution, we make sure
1297 	 * that all ELSP are drained i.e. we have processed the CSB,
1298 	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
1299 	 */
1300 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1301 
1302 	/*
1303 	 * ELSQ note: the submit queue is not cleared after being submitted
1304 	 * to the HW so we need to make sure we always clean it up. This is
1305 	 * currently ensured by the fact that we always write the same number
1306 	 * of elsq entries, keep this in mind before changing the loop below.
1307 	 */
1308 	for (n = execlists_num_ports(execlists); n--; ) {
1309 		struct i915_request *rq = execlists->pending[n];
1310 
1311 		write_desc(execlists,
1312 			   rq ? execlists_update_context(rq) : 0,
1313 			   n);
1314 	}
1315 
1316 	/* we need to manually load the submit queue */
1317 	if (execlists->ctrl_reg)
1318 		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1319 }
1320 
1321 static bool ctx_single_port_submission(const struct intel_context *ce)
1322 {
1323 	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1324 		i915_gem_context_force_single_submission(ce->gem_context));
1325 }
1326 
1327 static bool can_merge_ctx(const struct intel_context *prev,
1328 			  const struct intel_context *next)
1329 {
1330 	if (prev != next)
1331 		return false;
1332 
1333 	if (ctx_single_port_submission(prev))
1334 		return false;
1335 
1336 	return true;
1337 }
1338 
1339 static bool can_merge_rq(const struct i915_request *prev,
1340 			 const struct i915_request *next)
1341 {
1342 	GEM_BUG_ON(prev == next);
1343 	GEM_BUG_ON(!assert_priority_queue(prev, next));
1344 
1345 	/*
1346 	 * We do not submit known completed requests. Therefore if the next
1347 	 * request is already completed, we can pretend to merge it in
1348 	 * with the previous context (and we will skip updating the ELSP
1349 	 * and tracking). Thus hopefully keeping the ELSP full with active
1350 	 * contexts, despite the best efforts of preempt-to-busy to confuse
1351 	 * us.
1352 	 */
1353 	if (i915_request_completed(next))
1354 		return true;
1355 
1356 	if (unlikely((prev->flags ^ next->flags) &
1357 		     (I915_REQUEST_NOPREEMPT | I915_REQUEST_SENTINEL)))
1358 		return false;
1359 
1360 	if (!can_merge_ctx(prev->hw_context, next->hw_context))
1361 		return false;
1362 
1363 	return true;
1364 }
1365 
1366 static void virtual_update_register_offsets(u32 *regs,
1367 					    struct intel_engine_cs *engine)
1368 {
1369 	set_offsets(regs, reg_offsets(engine), engine);
1370 }
1371 
1372 static bool virtual_matches(const struct virtual_engine *ve,
1373 			    const struct i915_request *rq,
1374 			    const struct intel_engine_cs *engine)
1375 {
1376 	const struct intel_engine_cs *inflight;
1377 
1378 	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1379 		return false;
1380 
1381 	/*
1382 	 * We track when the HW has completed saving the context image
1383 	 * (i.e. when we have seen the final CS event switching out of
1384 	 * the context) and must not overwrite the context image before
1385 	 * then. This restricts us to only using the active engine
1386 	 * while the previous virtualized request is inflight (so
1387 	 * we reuse the register offsets). This is a very small
1388 	 * hysteresis on the greedy selection algorithm.
1389 	 */
1390 	inflight = intel_context_inflight(&ve->context);
1391 	if (inflight && inflight != engine)
1392 		return false;
1393 
1394 	return true;
1395 }
1396 
1397 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
1398 				     struct intel_engine_cs *engine)
1399 {
1400 	struct intel_engine_cs *old = ve->siblings[0];
1401 
1402 	/* All unattached (rq->engine == old) must already be completed */
1403 
1404 	spin_lock(&old->breadcrumbs.irq_lock);
1405 	if (!list_empty(&ve->context.signal_link)) {
1406 		list_move_tail(&ve->context.signal_link,
1407 			       &engine->breadcrumbs.signalers);
1408 		intel_engine_queue_breadcrumbs(engine);
1409 	}
1410 	spin_unlock(&old->breadcrumbs.irq_lock);
1411 }
1412 
1413 static struct i915_request *
1414 last_active(const struct intel_engine_execlists *execlists)
1415 {
1416 	struct i915_request * const *last = READ_ONCE(execlists->active);
1417 
1418 	while (*last && i915_request_completed(*last))
1419 		last++;
1420 
1421 	return *last;
1422 }
1423 
1424 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1425 {
1426 	LIST_HEAD(list);
1427 
1428 	/*
1429 	 * We want to move the interrupted request to the back of
1430 	 * the round-robin list (i.e. its priority level), but
1431 	 * in doing so, we must then move all requests that were in
1432 	 * flight and were waiting for the interrupted request so
1433 	 * that they run after it again.
1434 	 */
1435 	do {
1436 		struct i915_dependency *p;
1437 
1438 		GEM_BUG_ON(i915_request_is_active(rq));
1439 		list_move_tail(&rq->sched.link, pl);
1440 
1441 		list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
1442 			struct i915_request *w =
1443 				container_of(p->waiter, typeof(*w), sched);
1444 
1445 			/* Leave semaphores spinning on the other engines */
1446 			if (w->engine != rq->engine)
1447 				continue;
1448 
1449 			/* No waiter should start before its signaler */
1450 			GEM_BUG_ON(i915_request_started(w) &&
1451 				   !i915_request_completed(rq));
1452 
1453 			GEM_BUG_ON(i915_request_is_active(w));
1454 			if (list_empty(&w->sched.link))
1455 				continue; /* Not yet submitted; unready */
1456 
1457 			if (rq_prio(w) < rq_prio(rq))
1458 				continue;
1459 
1460 			GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1461 			list_move_tail(&w->sched.link, &list);
1462 		}
1463 
1464 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1465 	} while (rq);
1466 }
1467 
1468 static void defer_active(struct intel_engine_cs *engine)
1469 {
1470 	struct i915_request *rq;
1471 
1472 	rq = __unwind_incomplete_requests(engine);
1473 	if (!rq)
1474 		return;
1475 
1476 	defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1477 }
1478 
1479 static bool
1480 need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
1481 {
1482 	int hint;
1483 
1484 	if (!intel_engine_has_timeslices(engine))
1485 		return false;
1486 
1487 	if (list_is_last(&rq->sched.link, &engine->active.requests))
1488 		return false;
1489 
1490 	hint = max(rq_prio(list_next_entry(rq, sched.link)),
1491 		   engine->execlists.queue_priority_hint);
1492 
1493 	return hint >= effective_prio(rq);
1494 }
1495 
1496 static int
1497 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1498 {
1499 	if (list_is_last(&rq->sched.link, &engine->active.requests))
1500 		return INT_MIN;
1501 
1502 	return rq_prio(list_next_entry(rq, sched.link));
1503 }
1504 
1505 static inline unsigned long
1506 timeslice(const struct intel_engine_cs *engine)
1507 {
1508 	return READ_ONCE(engine->props.timeslice_duration_ms);
1509 }
1510 
1511 static unsigned long
1512 active_timeslice(const struct intel_engine_cs *engine)
1513 {
1514 	const struct i915_request *rq = *engine->execlists.active;
1515 
1516 	if (i915_request_completed(rq))
1517 		return 0;
1518 
1519 	if (engine->execlists.switch_priority_hint < effective_prio(rq))
1520 		return 0;
1521 
1522 	return timeslice(engine);
1523 }
1524 
1525 static void set_timeslice(struct intel_engine_cs *engine)
1526 {
1527 	if (!intel_engine_has_timeslices(engine))
1528 		return;
1529 
1530 	set_timer_ms(&engine->execlists.timer, active_timeslice(engine));
1531 }
1532 
1533 static void record_preemption(struct intel_engine_execlists *execlists)
1534 {
1535 	(void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
1536 }
1537 
1538 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine)
1539 {
1540 	struct i915_request *rq;
1541 
1542 	rq = last_active(&engine->execlists);
1543 	if (!rq)
1544 		return 0;
1545 
1546 	/* Force a fast reset for terminated contexts (ignoring sysfs!) */
1547 	if (unlikely(i915_gem_context_is_banned(rq->gem_context)))
1548 		return 1;
1549 
1550 	return READ_ONCE(engine->props.preempt_timeout_ms);
1551 }
1552 
1553 static void set_preempt_timeout(struct intel_engine_cs *engine)
1554 {
1555 	if (!intel_engine_has_preempt_reset(engine))
1556 		return;
1557 
1558 	set_timer_ms(&engine->execlists.preempt,
1559 		     active_preempt_timeout(engine));
1560 }
1561 
1562 static void execlists_dequeue(struct intel_engine_cs *engine)
1563 {
1564 	struct intel_engine_execlists * const execlists = &engine->execlists;
1565 	struct i915_request **port = execlists->pending;
1566 	struct i915_request ** const last_port = port + execlists->port_mask;
1567 	struct i915_request *last;
1568 	struct rb_node *rb;
1569 	bool submit = false;
1570 
1571 	/*
1572 	 * Hardware submission is through 2 ports. Conceptually each port
1573 	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
1574 	 * static for a context, and unique to each, so we only execute
1575 	 * requests belonging to a single context from each ring. RING_HEAD
1576 	 * is maintained by the CS in the context image; it marks the place
1577 	 * where it got up to last time, and through RING_TAIL we tell the CS
1578 	 * where we want to execute up to this time.
1579 	 *
1580 	 * In this list the requests are in order of execution. Consecutive
1581 	 * requests from the same context are adjacent in the ringbuffer. We
1582 	 * can combine these requests into a single RING_TAIL update:
1583 	 *
1584 	 *              RING_HEAD...req1...req2
1585 	 *                                    ^- RING_TAIL
1586 	 * since to execute req2 the CS must first execute req1.
1587 	 *
1588 	 * Our goal then is to point each port to the end of a consecutive
1589 	 * sequence of requests as being the most optimal (fewest wake ups
1590 	 * and context switches) submission.
1591 	 */
1592 
1593 	for (rb = rb_first_cached(&execlists->virtual); rb; ) {
1594 		struct virtual_engine *ve =
1595 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1596 		struct i915_request *rq = READ_ONCE(ve->request);
1597 
1598 		if (!rq) { /* lazily cleanup after another engine handled rq */
1599 			rb_erase_cached(rb, &execlists->virtual);
1600 			RB_CLEAR_NODE(rb);
1601 			rb = rb_first_cached(&execlists->virtual);
1602 			continue;
1603 		}
1604 
1605 		if (!virtual_matches(ve, rq, engine)) {
1606 			rb = rb_next(rb);
1607 			continue;
1608 		}
1609 
1610 		break;
1611 	}
1612 
1613 	/*
1614 	 * If the queue is higher priority than the last
1615 	 * request in the currently active context, submit afresh.
1616 	 * We will resubmit again afterwards in case we need to split
1617 	 * the active context to interject the preemption request,
1618 	 * i.e. we will retrigger preemption following the ack in case
1619 	 * of trouble.
1620 	 */
1621 	last = last_active(execlists);
1622 	if (last) {
1623 		if (need_preempt(engine, last, rb)) {
1624 			GEM_TRACE("%s: preempting last=%llx:%lld, prio=%d, hint=%d\n",
1625 				  engine->name,
1626 				  last->fence.context,
1627 				  last->fence.seqno,
1628 				  last->sched.attr.priority,
1629 				  execlists->queue_priority_hint);
1630 			record_preemption(execlists);
1631 
1632 			/*
1633 			 * Don't let the RING_HEAD advance past the breadcrumb
1634 			 * as we unwind (and until we resubmit) so that we do
1635 			 * not accidentally tell it to go backwards.
1636 			 */
1637 			ring_set_paused(engine, 1);
1638 
1639 			/*
1640 			 * Note that we have not stopped the GPU at this point,
1641 			 * so we are unwinding the incomplete requests as they
1642 			 * remain inflight and so by the time we do complete
1643 			 * the preemption, some of the unwound requests may
1644 			 * complete!
1645 			 */
1646 			__unwind_incomplete_requests(engine);
1647 
1648 			/*
1649 			 * If we need to return to the preempted context, we
1650 			 * need to skip the lite-restore and force it to
1651 			 * reload the RING_TAIL. Otherwise, the HW has a
1652 			 * tendency to ignore us rewinding the TAIL to the
1653 			 * end of an earlier request.
1654 			 */
1655 			last->hw_context->lrc_desc |= CTX_DESC_FORCE_RESTORE;
1656 			last = NULL;
1657 		} else if (need_timeslice(engine, last) &&
1658 			   timer_expired(&engine->execlists.timer)) {
1659 			GEM_TRACE("%s: expired last=%llx:%lld, prio=%d, hint=%d\n",
1660 				  engine->name,
1661 				  last->fence.context,
1662 				  last->fence.seqno,
1663 				  last->sched.attr.priority,
1664 				  execlists->queue_priority_hint);
1665 
1666 			ring_set_paused(engine, 1);
1667 			defer_active(engine);
1668 
1669 			/*
1670 			 * Unlike for preemption, if we rewind and continue
1671 			 * executing the same context as previously active,
1672 			 * the order of execution will remain the same and
1673 			 * the tail will only advance. We do not need to
1674 			 * force a full context restore, as a lite-restore
1675 			 * is sufficient to resample the monotonic TAIL.
1676 			 *
1677 			 * If we switch to any other context, similarly we
1678 	 * will not rewind the TAIL of the current context, and
1679 			 * normal save/restore will preserve state and allow
1680 			 * us to later continue executing the same request.
1681 			 */
1682 			last = NULL;
1683 		} else {
1684 			/*
1685 			 * Otherwise if we already have a request pending
1686 			 * for execution after the current one, we can
1687 			 * just wait until the next CS event before
1688 			 * queuing more. In either case we will force a
1689 			 * lite-restore preemption event, but if we wait
1690 			 * we hopefully coalesce several updates into a single
1691 			 * submission.
1692 			 */
1693 			if (!list_is_last(&last->sched.link,
1694 					  &engine->active.requests)) {
1695 				/*
1696 				 * Even if ELSP[1] is occupied and not worthy
1697 				 * of timeslices, our queue might be.
1698 				 */
1699 				if (!execlists->timer.expires &&
1700 				    need_timeslice(engine, last))
1701 					set_timer_ms(&execlists->timer,
1702 						     timeslice(engine));
1703 
1704 				return;
1705 			}
1706 
1707 			/*
1708 			 * WaIdleLiteRestore:bdw,skl
1709 			 * Apply the wa NOOPs to prevent
1710 			 * ring:HEAD == rq:TAIL as we resubmit the
1711 			 * request. See gen8_emit_fini_breadcrumb() for
1712 			 * where we prepare the padding after the
1713 			 * end of the request.
1714 			 */
1715 			last->tail = last->wa_tail;
1716 		}
1717 	}
1718 
1719 	while (rb) { /* XXX virtual is always taking precedence */
1720 		struct virtual_engine *ve =
1721 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1722 		struct i915_request *rq;
1723 
1724 		spin_lock(&ve->base.active.lock);
1725 
1726 		rq = ve->request;
1727 		if (unlikely(!rq)) { /* lost the race to a sibling */
1728 			spin_unlock(&ve->base.active.lock);
1729 			rb_erase_cached(rb, &execlists->virtual);
1730 			RB_CLEAR_NODE(rb);
1731 			rb = rb_first_cached(&execlists->virtual);
1732 			continue;
1733 		}
1734 
1735 		GEM_BUG_ON(rq != ve->request);
1736 		GEM_BUG_ON(rq->engine != &ve->base);
1737 		GEM_BUG_ON(rq->hw_context != &ve->context);
1738 
1739 		if (rq_prio(rq) >= queue_prio(execlists)) {
1740 			if (!virtual_matches(ve, rq, engine)) {
1741 				spin_unlock(&ve->base.active.lock);
1742 				rb = rb_next(rb);
1743 				continue;
1744 			}
1745 
1746 			if (last && !can_merge_rq(last, rq)) {
1747 				spin_unlock(&ve->base.active.lock);
1748 				return; /* leave this for another */
1749 			}
1750 
1751 			GEM_TRACE("%s: virtual rq=%llx:%lld%s, new engine? %s\n",
1752 				  engine->name,
1753 				  rq->fence.context,
1754 				  rq->fence.seqno,
1755 				  i915_request_completed(rq) ? "!" :
1756 				  i915_request_started(rq) ? "*" :
1757 				  "",
1758 				  yesno(engine != ve->siblings[0]));
1759 
1760 			ve->request = NULL;
1761 			ve->base.execlists.queue_priority_hint = INT_MIN;
1762 			rb_erase_cached(rb, &execlists->virtual);
1763 			RB_CLEAR_NODE(rb);
1764 
1765 			GEM_BUG_ON(!(rq->execution_mask & engine->mask));
1766 			rq->engine = engine;
1767 
1768 			if (engine != ve->siblings[0]) {
1769 				u32 *regs = ve->context.lrc_reg_state;
1770 				unsigned int n;
1771 
1772 				GEM_BUG_ON(READ_ONCE(ve->context.inflight));
1773 
1774 				if (!intel_engine_has_relative_mmio(engine))
1775 					virtual_update_register_offsets(regs,
1776 									engine);
1777 
1778 				if (!list_empty(&ve->context.signals))
1779 					virtual_xfer_breadcrumbs(ve, engine);
1780 
1781 				/*
1782 				 * Move the bound engine to the top of the list
1783 				 * for future execution. We then kick this
1784 				 * tasklet first before checking others, so that
1785 				 * we preferentially reuse this set of bound
1786 				 * registers.
1787 				 */
1788 				for (n = 1; n < ve->num_siblings; n++) {
1789 					if (ve->siblings[n] == engine) {
1790 						swap(ve->siblings[n],
1791 						     ve->siblings[0]);
1792 						break;
1793 					}
1794 				}
1795 
1796 				GEM_BUG_ON(ve->siblings[0] != engine);
1797 			}
1798 
1799 			if (__i915_request_submit(rq)) {
1800 				submit = true;
1801 				last = rq;
1802 			}
1803 			i915_request_put(rq);
1804 
1805 			/*
1806 			 * Hmm, we have a bunch of virtual engine requests,
1807 			 * but the first one was already completed (thanks
1808 			 * preempt-to-busy!). Keep looking at the virtual engine
1809 			 * queue until we have no more relevant requests (i.e.
1810 			 * the normal submit queue has higher priority).
1811 			 */
1812 			if (!submit) {
1813 				spin_unlock(&ve->base.active.lock);
1814 				rb = rb_first_cached(&execlists->virtual);
1815 				continue;
1816 			}
1817 		}
1818 
1819 		spin_unlock(&ve->base.active.lock);
1820 		break;
1821 	}
1822 
1823 	while ((rb = rb_first_cached(&execlists->queue))) {
1824 		struct i915_priolist *p = to_priolist(rb);
1825 		struct i915_request *rq, *rn;
1826 		int i;
1827 
1828 		priolist_for_each_request_consume(rq, rn, p, i) {
1829 			bool merge = true;
1830 
1831 			/*
1832 			 * Can we combine this request with the current port?
1833 			 * It has to be the same context/ringbuffer and not
1834 			 * have any exceptions (e.g. GVT saying never to
1835 			 * combine contexts).
1836 			 *
1837 			 * If we can combine the requests, we can execute both
1838 			 * by updating the RING_TAIL to point to the end of the
1839 			 * second request, and so we never need to tell the
1840 			 * hardware about the first.
1841 			 */
1842 			if (last && !can_merge_rq(last, rq)) {
1843 				/*
1844 				 * If we are on the second port and cannot
1845 				 * combine this request with the last, then we
1846 				 * are done.
1847 				 */
1848 				if (port == last_port)
1849 					goto done;
1850 
1851 				/*
1852 				 * We must not populate both ELSP[] with the
1853 				 * same LRCA, i.e. we must submit 2 different
1854 				 * contexts if we submit 2 ELSP.
1855 				 */
1856 				if (last->hw_context == rq->hw_context)
1857 					goto done;
1858 
1859 				if (i915_request_has_sentinel(last))
1860 					goto done;
1861 
1862 				/*
1863 				 * If GVT overrides us we only ever submit
1864 				 * port[0], leaving port[1] empty. Note that we
1865 				 * also have to be careful that we don't queue
1866 				 * the same context (even though a different
1867 				 * request) to the second port.
1868 				 */
1869 				if (ctx_single_port_submission(last->hw_context) ||
1870 				    ctx_single_port_submission(rq->hw_context))
1871 					goto done;
1872 
1873 				merge = false;
1874 			}
1875 
1876 			if (__i915_request_submit(rq)) {
1877 				if (!merge) {
1878 					*port = execlists_schedule_in(last, port - execlists->pending);
1879 					port++;
1880 					last = NULL;
1881 				}
1882 
1883 				GEM_BUG_ON(last &&
1884 					   !can_merge_ctx(last->hw_context,
1885 							  rq->hw_context));
1886 
1887 				submit = true;
1888 				last = rq;
1889 			}
1890 		}
1891 
1892 		rb_erase_cached(&p->node, &execlists->queue);
1893 		i915_priolist_free(p);
1894 	}
1895 
1896 done:
1897 	/*
1898 	 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
1899 	 *
1900 	 * We choose the priority hint such that if we add a request of greater
1901 	 * priority than this, we kick the submission tasklet to decide on
1902 	 * the right order of submitting the requests to hardware. We must
1903 	 * also be prepared to reorder requests as they are in-flight on the
1904 	 * HW. We derive the priority hint then as the first "hole" in
1905 	 * the HW submission ports and if there are no available slots,
1906 	 * the priority of the lowest executing request, i.e. last.
1907 	 *
1908 	 * When we do receive a higher priority request ready to run from the
1909 	 * user, see queue_request(), the priority hint is bumped to that
1910 	 * request triggering preemption on the next dequeue (or subsequent
1911 	 * interrupt for secondary ports).
1912 	 */
1913 	execlists->queue_priority_hint = queue_prio(execlists);
1914 	GEM_TRACE("%s: queue_priority_hint:%d, submit:%s\n",
1915 		  engine->name, execlists->queue_priority_hint,
1916 		  yesno(submit));
1917 
1918 	if (submit) {
1919 		*port = execlists_schedule_in(last, port - execlists->pending);
1920 		execlists->switch_priority_hint =
1921 			switch_prio(engine, *execlists->pending);
1922 
1923 		/*
1924 		 * Skip if we ended up with exactly the same set of requests,
1925 		 * e.g. trying to timeslice a pair of ordered contexts
1926 		 */
1927 		if (!memcmp(execlists->active, execlists->pending,
1928 			    (port - execlists->pending + 1) * sizeof(*port))) {
1929 			do
1930 				execlists_schedule_out(fetch_and_zero(port));
1931 			while (port-- != execlists->pending);
1932 
1933 			goto skip_submit;
1934 		}
1935 
1936 		memset(port + 1, 0, (last_port - port) * sizeof(*port));
1937 		execlists_submit_ports(engine);
1938 
1939 		set_preempt_timeout(engine);
1940 	} else {
1941 skip_submit:
1942 		ring_set_paused(engine, 0);
1943 	}
1944 }
1945 
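/*
 * Schedule out every request currently tracked in the pending[] and
 * inflight[] ports, then leave execlists->active pointing at the
 * cleared inflight[] array.
 */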
1946 static void
1947 cancel_port_requests(struct intel_engine_execlists * const execlists)
1948 {
1949 	struct i915_request * const *port;
1950 
1951 	for (port = execlists->pending; *port; port++)
1952 		execlists_schedule_out(*port);
1953 	memset(execlists->pending, 0, sizeof(execlists->pending));
1954 
1955 	/* Mark the end of active before we overwrite *active */
1956 	for (port = xchg(&execlists->active, execlists->pending); *port; port++)
1957 		execlists_schedule_out(*port);
1958 	WRITE_ONCE(execlists->active,
1959 		   memset(execlists->inflight, 0, sizeof(execlists->inflight)));
1960 }
1961 
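/*
 * Evict the cachelines covering the CSB so that our next read observes
 * the values freshly written by the GPU rather than stale cached data.
 */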
1962 static inline void
1963 invalidate_csb_entries(const u32 *first, const u32 *last)
1964 {
1965 	clflush((void *)first);
1966 	clflush((void *)last);
1967 }
1968 
1969 static inline bool
1970 reset_in_progress(const struct intel_engine_execlists *execlists)
1971 {
1972 	return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1973 }
1974 
1975 /*
1976  * Starting with Gen12, the status has a new format:
1977  *
1978  *     bit  0:     switched to new queue
1979  *     bit  1:     reserved
1980  *     bit  2:     semaphore wait mode (poll or signal), only valid when
1981  *                 switch detail is set to "wait on semaphore"
1982  *     bits 3-5:   engine class
1983  *     bits 6-11:  engine instance
1984  *     bits 12-14: reserved
1985  *     bits 15-25: sw context id of the lrc the GT switched to
1986  *     bits 26-31: sw counter of the lrc the GT switched to
1987  *     bits 32-35: context switch detail
1988  *                  - 0: ctx complete
1989  *                  - 1: wait on sync flip
1990  *                  - 2: wait on vblank
1991  *                  - 3: wait on scanline
1992  *                  - 4: wait on semaphore
1993  *                  - 5: context preempted (not on SEMAPHORE_WAIT or
1994  *                       WAIT_FOR_EVENT)
1995  *     bit  36:    reserved
1996  *     bits 37-43: wait detail (for switch detail 1 to 4)
1997  *     bits 44-46: reserved
1998  *     bits 47-57: sw context id of the lrc the GT switched away from
1999  *     bits 58-63: sw counter of the lrc the GT switched away from
2000  */
2001 static inline bool
2002 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2003 {
2004 	u32 lower_dw = csb[0];
2005 	u32 upper_dw = csb[1];
2006 	bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
2007 	bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
2008 	bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2009 
2010 	/*
2011 	 * The context switch detail is not guaranteed to be 5 when a preemption
2012 	 * occurs, so we can't just check for that. The check below works for
2013 	 * all the cases we care about, including preemptions of WAIT
2014 	 * instructions and lite-restore. Preempt-to-idle via the CTRL register
2015 	 * would require some extra handling, but we don't support that.
2016 	 */
2017 	if (!ctx_away_valid || new_queue) {
2018 		GEM_BUG_ON(!ctx_to_valid);
2019 		return true;
2020 	}
2021 
2022 	/*
2023 	 * switch detail = 5 is covered by the case above and we do not expect a
2024 	 * context switch on an unsuccessful wait instruction since we always
2025 	 * use polling mode.
2026 	 */
2027 	GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
2028 	return false;
2029 }
2030 
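/*
 * Before Gen12, a CSB event marks a promotion of our pending[]
 * submission if the HW reports either an idle->active transition or a
 * preemption of the previous context.
 */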
2031 static inline bool
2032 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2033 {
2034 	return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2035 }
2036 
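/*
 * Drain the context-status buffer: for every event the HW has written
 * since our last read, either promote the pending[] ELSP submission to
 * inflight[], or schedule out the head of inflight[] as each context
 * completes.
 */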
2037 static void process_csb(struct intel_engine_cs *engine)
2038 {
2039 	struct intel_engine_execlists * const execlists = &engine->execlists;
2040 	const u32 * const buf = execlists->csb_status;
2041 	const u8 num_entries = execlists->csb_size;
2042 	u8 head, tail;
2043 
2044 	/*
2045 	 * As we modify our execlists state tracking we require exclusive
2046 	 * access. Either we are inside the tasklet, or the tasklet is disabled
2047 	 * and we assume that is only inside the reset paths and so serialised.
2048 	 */
2049 	GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2050 		   !reset_in_progress(execlists));
2051 	GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2052 
2053 	/*
2054 	 * Note that csb_write, csb_status may be either in HWSP or mmio.
2055 	 * When reading from the csb_write mmio register, we have to be
2056 	 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2057 	 * the low 4 bits. As it happens we know the next 4 bits are always
2058 	 * zero and so we can simply mask off the low u8 of the register
2059 	 * and treat it identically to reading from the HWSP (without having
2060 	 * to use explicit shifting and masking, and probably bifurcating
2061 	 * the code to handle the legacy mmio read).
2062 	 */
2063 	head = execlists->csb_head;
2064 	tail = READ_ONCE(*execlists->csb_write);
2065 	GEM_TRACE("%s cs-irq head=%d, tail=%d\n", engine->name, head, tail);
2066 	if (unlikely(head == tail))
2067 		return;
2068 
2069 	/*
2070 	 * Hopefully paired with a wmb() in HW!
2071 	 *
2072 	 * We must complete the read of the write pointer before any reads
2073 	 * from the CSB, so that we do not see stale values. Without an rmb
2074 	 * (lfence) the HW may speculatively perform the CSB[] reads *before*
2075 	 * we perform the READ_ONCE(*csb_write).
2076 	 */
2077 	rmb();
2078 
2079 	do {
2080 		bool promote;
2081 
2082 		if (++head == num_entries)
2083 			head = 0;
2084 
2085 		/*
2086 		 * We are flying near dragons again.
2087 		 *
2088 		 * We hold a reference to the request in execlist_port[]
2089 		 * but no more than that. We are operating in softirq
2090 		 * context and so cannot hold any mutex or sleep. That
2091 		 * prevents us stopping the requests we are processing
2092 		 * in port[] from being retired simultaneously (the
2093 		 * breadcrumb will be complete before we see the
2094 		 * context-switch). As we only hold the reference to the
2095 		 * request, any pointer chasing underneath the request
2096 		 * is subject to a potential use-after-free. Thus we
2097 		 * store all of the bookkeeping within port[] as
2098 		 * required, and avoid using unguarded pointers beneath
2099 		 * request itself. The same applies to the atomic
2100 		 * status notifier.
2101 		 */
2102 
2103 		GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x\n",
2104 			  engine->name, head,
2105 			  buf[2 * head + 0], buf[2 * head + 1]);
2106 
2107 		if (INTEL_GEN(engine->i915) >= 12)
2108 			promote = gen12_csb_parse(execlists, buf + 2 * head);
2109 		else
2110 			promote = gen8_csb_parse(execlists, buf + 2 * head);
2111 		if (promote) {
2112 			struct i915_request * const *old = execlists->active;
2113 
2114 			/* Point active to the new ELSP; prevent overwriting */
2115 			WRITE_ONCE(execlists->active, execlists->pending);
2116 			set_timeslice(engine);
2117 
2118 			if (!inject_preempt_hang(execlists))
2119 				ring_set_paused(engine, 0);
2120 
2121 			/* cancel old inflight, prepare for switch */
2122 			trace_ports(execlists, "preempted", old);
2123 			while (*old)
2124 				execlists_schedule_out(*old++);
2125 
2126 			/* switch pending to inflight */
2127 			GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2128 			WRITE_ONCE(execlists->active,
2129 				   memcpy(execlists->inflight,
2130 					  execlists->pending,
2131 					  execlists_num_ports(execlists) *
2132 					  sizeof(*execlists->pending)));
2133 
2134 			WRITE_ONCE(execlists->pending[0], NULL);
2135 		} else {
2136 			GEM_BUG_ON(!*execlists->active);
2137 
2138 			/* port0 completed, advanced to port1 */
2139 			trace_ports(execlists, "completed", execlists->active);
2140 
2141 			/*
2142 			 * We rely on the hardware being strongly
2143 			 * ordered, that the breadcrumb write is
2144 			 * coherent (visible from the CPU) before the
2145 			 * user interrupt and CSB is processed.
2146 			 */
2147 			GEM_BUG_ON(!i915_request_completed(*execlists->active) &&
2148 				   !reset_in_progress(execlists));
2149 			execlists_schedule_out(*execlists->active++);
2150 
2151 			GEM_BUG_ON(execlists->active - execlists->inflight >
2152 				   execlists_num_ports(execlists));
2153 		}
2154 	} while (head != tail);
2155 
2156 	execlists->csb_head = head;
2157 
2158 	/*
2159 	 * Gen11 has proven to fail, wrt the global observation point,
2160 	 * on the ordering between the entry write and the tail update,
2161 	 * and thus we can see a stale entry in the context status buffer.
2162 	 *
2163 	 * Forcibly evict the entries before the next gpu csb update,
2164 	 * to increase the odds that we read fresh entries even on
2165 	 * non-working hardware. The cost of doing so comes out mostly in
2166 	 * the wash, as hardware, working or not, will need to do the
2167 	 * invalidation beforehand.
2168 	 */
2169 	invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2170 }
2171 
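/*
 * Only dequeue and write a fresh ELSP once the previous submission has
 * been acknowledged by the HW (i.e. process_csb has cleared pending[0]).
 */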
2172 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2173 {
2174 	lockdep_assert_held(&engine->active.lock);
2175 	if (!engine->execlists.pending[0]) {
2176 		rcu_read_lock(); /* protect peeking at execlists->active */
2177 		execlists_dequeue(engine);
2178 		rcu_read_unlock();
2179 	}
2180 }
2181 
2182 static noinline void preempt_reset(struct intel_engine_cs *engine)
2183 {
2184 	const unsigned int bit = I915_RESET_ENGINE + engine->id;
2185 	unsigned long *lock = &engine->gt->reset.flags;
2186 
2187 	if (i915_modparams.reset < 3)
2188 		return;
2189 
2190 	if (test_and_set_bit(bit, lock))
2191 		return;
2192 
2193 	/* Mark this tasklet as disabled to avoid waiting for it to complete */
2194 	tasklet_disable_nosync(&engine->execlists.tasklet);
2195 
2196 	GEM_TRACE("%s: preempt timeout %lu+%ums\n",
2197 		  engine->name,
2198 		  READ_ONCE(engine->props.preempt_timeout_ms),
2199 		  jiffies_to_msecs(jiffies - engine->execlists.preempt.expires));
2200 	intel_engine_reset(engine, "preemption time out");
2201 
2202 	tasklet_enable(&engine->execlists.tasklet);
2203 	clear_and_wake_up_bit(bit, lock);
2204 }
2205 
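/*
 * A preemption timeout only matters if the preempt timer has expired
 * while an ELSP submission is still waiting to be acknowledged by the HW.
 */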
2206 static bool preempt_timeout(const struct intel_engine_cs *const engine)
2207 {
2208 	const struct timer_list *t = &engine->execlists.preempt;
2209 
2210 	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
2211 		return false;
2212 
2213 	if (!timer_expired(t))
2214 		return false;
2215 
2216 	return READ_ONCE(engine->execlists.pending[0]);
2217 }
2218 
2219 /*
2220  * Check the unread Context Status Buffers and manage the submission of new
2221  * contexts to the ELSP accordingly.
2222  */
2223 static void execlists_submission_tasklet(unsigned long data)
2224 {
2225 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
2226 	bool timeout = preempt_timeout(engine);
2227 
2228 	process_csb(engine);
2229 	if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
2230 		unsigned long flags;
2231 
2232 		spin_lock_irqsave(&engine->active.lock, flags);
2233 		__execlists_submission_tasklet(engine);
2234 		spin_unlock_irqrestore(&engine->active.lock, flags);
2235 
2236 		/* Recheck after serialising with direct-submission */
2237 		if (timeout && preempt_timeout(engine))
2238 			preempt_reset(engine);
2239 	}
2240 }
2241 
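/*
 * Both the timeslice and preemption timers simply kick the submission
 * tasklet, which then inspects the expired timer to decide what to do.
 */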
2242 static void __execlists_kick(struct intel_engine_execlists *execlists)
2243 {
2244 	/* Kick the tasklet for some interrupt coalescing and reset handling */
2245 	tasklet_hi_schedule(&execlists->tasklet);
2246 }
2247 
2248 #define execlists_kick(t, member) \
2249 	__execlists_kick(container_of(t, struct intel_engine_execlists, member))
2250 
2251 static void execlists_timeslice(struct timer_list *timer)
2252 {
2253 	execlists_kick(timer, timer);
2254 }
2255 
2256 static void execlists_preempt(struct timer_list *timer)
2257 {
2258 	execlists_kick(timer, preempt);
2259 }
2260 
2261 static void queue_request(struct intel_engine_cs *engine,
2262 			  struct i915_sched_node *node,
2263 			  int prio)
2264 {
2265 	GEM_BUG_ON(!list_empty(&node->link));
2266 	list_add_tail(&node->link, i915_sched_lookup_priolist(engine, prio));
2267 }
2268 
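/*
 * If the engine is still using the regular execlists submission
 * tasklet, dequeue immediately from the caller's context; otherwise
 * leave it to the owning tasklet. During a reset, submission is
 * deferred until the engine is restarted.
 */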
2269 static void __submit_queue_imm(struct intel_engine_cs *engine)
2270 {
2271 	struct intel_engine_execlists * const execlists = &engine->execlists;
2272 
2273 	if (reset_in_progress(execlists))
2274 		return; /* defer until we restart the engine following reset */
2275 
2276 	if (execlists->tasklet.func == execlists_submission_tasklet)
2277 		__execlists_submission_tasklet(engine);
2278 	else
2279 		tasklet_hi_schedule(&execlists->tasklet);
2280 }
2281 
2282 static void submit_queue(struct intel_engine_cs *engine,
2283 			 const struct i915_request *rq)
2284 {
2285 	struct intel_engine_execlists *execlists = &engine->execlists;
2286 
2287 	if (rq_prio(rq) <= execlists->queue_priority_hint)
2288 		return;
2289 
2290 	execlists->queue_priority_hint = rq_prio(rq);
2291 	__submit_queue_imm(engine);
2292 }
2293 
2294 static void execlists_submit_request(struct i915_request *request)
2295 {
2296 	struct intel_engine_cs *engine = request->engine;
2297 	unsigned long flags;
2298 
2299 	/* Will be called from irq-context when using foreign fences. */
2300 	spin_lock_irqsave(&engine->active.lock, flags);
2301 
2302 	queue_request(engine, &request->sched, rq_prio(request));
2303 
2304 	GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
2305 	GEM_BUG_ON(list_empty(&request->sched.link));
2306 
2307 	submit_queue(engine, request);
2308 
2309 	spin_unlock_irqrestore(&engine->active.lock, flags);
2310 }
2311 
2312 static void __execlists_context_fini(struct intel_context *ce)
2313 {
2314 	intel_ring_put(ce->ring);
2315 	i915_vma_put(ce->state);
2316 }
2317 
2318 static void execlists_context_destroy(struct kref *kref)
2319 {
2320 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
2321 
2322 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
2323 	GEM_BUG_ON(intel_context_is_pinned(ce));
2324 
2325 	if (ce->state)
2326 		__execlists_context_fini(ce);
2327 
2328 	intel_context_fini(ce);
2329 	intel_context_free(ce);
2330 }
2331 
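/*
 * Under CONFIG_DRM_I915_DEBUG_GEM we poison the page following the
 * context image and verify it when the context is unpinned, to catch
 * anything writing past the end of the context state.
 */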
2332 static void
2333 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
2334 {
2335 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2336 		return;
2337 
2338 	vaddr += engine->context_size;
2339 
2340 	memset(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE);
2341 }
2342 
2343 static void
2344 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
2345 {
2346 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2347 		return;
2348 
2349 	vaddr += engine->context_size;
2350 
2351 	if (memchr_inv(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE))
2352 		dev_err_once(engine->i915->drm.dev,
2353 			     "%s context redzone overwritten!\n",
2354 			     engine->name);
2355 }
2356 
2357 static void execlists_context_unpin(struct intel_context *ce)
2358 {
2359 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE,
2360 		      ce->engine);
2361 
2362 	i915_gem_object_unpin_map(ce->state->obj);
2363 	intel_ring_reset(ce->ring, ce->ring->tail);
2364 }
2365 
2366 static void
2367 __execlists_update_reg_state(const struct intel_context *ce,
2368 			     const struct intel_engine_cs *engine)
2369 {
2370 	struct intel_ring *ring = ce->ring;
2371 	u32 *regs = ce->lrc_reg_state;
2372 
2373 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head));
2374 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
2375 
2376 	regs[CTX_RING_BUFFER_START] = i915_ggtt_offset(ring->vma);
2377 	regs[CTX_RING_HEAD] = ring->head;
2378 	regs[CTX_RING_TAIL] = ring->tail;
2379 
2380 	/* RPCS */
2381 	if (engine->class == RENDER_CLASS) {
2382 		regs[CTX_R_PWR_CLK_STATE] =
2383 			intel_sseu_make_rpcs(engine->i915, &ce->sseu);
2384 
2385 		i915_oa_init_reg_state(ce, engine);
2386 	}
2387 }
2388 
2389 static int
2390 __execlists_context_pin(struct intel_context *ce,
2391 			struct intel_engine_cs *engine)
2392 {
2393 	void *vaddr;
2394 	int ret;
2395 
2396 	GEM_BUG_ON(!ce->state);
2397 
2398 	ret = intel_context_active_acquire(ce);
2399 	if (ret)
2400 		goto err;
2401 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
2402 
2403 	vaddr = i915_gem_object_pin_map(ce->state->obj,
2404 					i915_coherent_map_type(engine->i915) |
2405 					I915_MAP_OVERRIDE);
2406 	if (IS_ERR(vaddr)) {
2407 		ret = PTR_ERR(vaddr);
2408 		goto unpin_active;
2409 	}
2410 
2411 	ce->lrc_desc = lrc_descriptor(ce, engine);
2412 	ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
2413 	__execlists_update_reg_state(ce, engine);
2414 
2415 	return 0;
2416 
2417 unpin_active:
2418 	intel_context_active_release(ce);
2419 err:
2420 	return ret;
2421 }
2422 
2423 static int execlists_context_pin(struct intel_context *ce)
2424 {
2425 	return __execlists_context_pin(ce, ce->engine);
2426 }
2427 
2428 static int execlists_context_alloc(struct intel_context *ce)
2429 {
2430 	return __execlists_context_alloc(ce, ce->engine);
2431 }
2432 
2433 static void execlists_context_reset(struct intel_context *ce)
2434 {
2435 	/*
2436 	 * Because we emit WA_TAIL_DWORDS there may be a disparity
2437 	 * between our bookkeeping in ce->ring->head and ce->ring->tail and
2438 	 * that stored in context. As we only write new commands from
2439 	 * ce->ring->tail onwards, everything before that is junk. If the GPU
2440 	 * starts reading from its RING_HEAD from the context, it may try to
2441 	 * execute that junk and die.
2442 	 *
2443 	 * The contexts that are still pinned on resume belong to the
2444 	 * kernel, and are local to each engine. All other contexts will
2445 	 * have their head/tail sanitized upon pinning before use, so they
2446 	 * will never see garbage.
2447 	 *
2448 	 * So to avoid that we reset the context images upon resume. For
2449 	 * simplicity, we just zero everything out.
2450 	 */
2451 	intel_ring_reset(ce->ring, 0);
2452 	__execlists_update_reg_state(ce, ce->engine);
2453 }
2454 
2455 static const struct intel_context_ops execlists_context_ops = {
2456 	.alloc = execlists_context_alloc,
2457 
2458 	.pin = execlists_context_pin,
2459 	.unpin = execlists_context_unpin,
2460 
2461 	.enter = intel_context_enter_engine,
2462 	.exit = intel_context_exit_engine,
2463 
2464 	.reset = execlists_context_reset,
2465 	.destroy = execlists_context_destroy,
2466 };
2467 
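/*
 * Emit the initial breadcrumb at the head of the request: a preemption
 * point (MI_ARB_CHECK) followed by a write of seqno - 1 to the timeline
 * hwsp, after which i915_request_started() reports true.
 */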
2468 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
2469 {
2470 	u32 *cs;
2471 
2472 	GEM_BUG_ON(!i915_request_timeline(rq)->has_initial_breadcrumb);
2473 
2474 	cs = intel_ring_begin(rq, 6);
2475 	if (IS_ERR(cs))
2476 		return PTR_ERR(cs);
2477 
2478 	/*
2479 	 * Check if we have been preempted before we even get started.
2480 	 *
2481 	 * After this point i915_request_started() reports true, even if
2482 	 * we get preempted and so are no longer running.
2483 	 */
2484 	*cs++ = MI_ARB_CHECK;
2485 	*cs++ = MI_NOOP;
2486 
2487 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
2488 	*cs++ = i915_request_timeline(rq)->hwsp_offset;
2489 	*cs++ = 0;
2490 	*cs++ = rq->fence.seqno - 1;
2491 
2492 	intel_ring_advance(rq, cs);
2493 
2494 	/* Record the updated position of the request's payload */
2495 	rq->infix = intel_ring_offset(rq, cs);
2496 
2497 	return 0;
2498 }
2499 
2500 static int execlists_request_alloc(struct i915_request *request)
2501 {
2502 	int ret;
2503 
2504 	GEM_BUG_ON(!intel_context_is_pinned(request->hw_context));
2505 
2506 	/*
2507 	 * Flush enough space to reduce the likelihood of waiting after
2508 	 * we start building the request - in which case we will just
2509 	 * have to repeat work.
2510 	 */
2511 	request->reserved_space += EXECLISTS_REQUEST_SIZE;
2512 
2513 	/*
2514 	 * Note that after this point, we have committed to using
2515 	 * this request as it is being used to both track the
2516 	 * state of engine initialisation and liveness of the
2517 	 * golden renderstate above. Think twice before you try
2518 	 * to cancel/unwind this request now.
2519 	 */
2520 
2521 	/* Unconditionally invalidate GPU caches and TLBs. */
2522 	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
2523 	if (ret)
2524 		return ret;
2525 
2526 	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
2527 	return 0;
2528 }
2529 
2530 /*
2531  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
2532  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
2533  * but there is a slight complication as this is applied in WA batch where the
2534  * values are only initialized once so we cannot take register value at the
2535  * beginning and reuse it further; hence we save its value to memory, upload a
2536  * constant value with bit21 set and then we restore it back with the saved value.
2537  * To simplify the WA, a constant value is formed by using the default value
2538  * of this register. This shouldn't be a problem because we are only modifying
2539  * it for a short period and this batch is non-preemptible. We can of course
2540  * use additional instructions that read the actual value of the register
2541  * at that time and set our bit of interest but it makes the WA complicated.
2542  *
2543  * This WA is also required for Gen9 so extracting as a function avoids
2544  * code duplication.
2545  */
2546 static u32 *
2547 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
2548 {
2549 	/* NB no one else is allowed to scribble over scratch + 256! */
2550 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
2551 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
2552 	*batch++ = intel_gt_scratch_offset(engine->gt,
2553 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
2554 	*batch++ = 0;
2555 
2556 	*batch++ = MI_LOAD_REGISTER_IMM(1);
2557 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
2558 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
2559 
2560 	batch = gen8_emit_pipe_control(batch,
2561 				       PIPE_CONTROL_CS_STALL |
2562 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
2563 				       0);
2564 
2565 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
2566 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
2567 	*batch++ = intel_gt_scratch_offset(engine->gt,
2568 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
2569 	*batch++ = 0;
2570 
2571 	return batch;
2572 }
2573 
2574 /*
2575  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
2576  * initialized at the beginning and shared across all contexts but this field
2577  * helps us to have multiple batches at different offsets and select them based
2578  * on a criterion. At the moment this batch always starts at the beginning of the page
2579  * and at this point we don't have multiple wa_ctx batch buffers.
2580  *
2581  * The number of WAs applied is not known at the beginning; we use this field
2582  * to return the number of DWORDS written.
2583  *
2584  * Note that this batch does not contain MI_BATCH_BUFFER_END,
2585  * so it adds NOOPs as padding to make it cacheline aligned.
2586  * MI_BATCH_BUFFER_END will be added to the per-ctx batch and the two together
2587  * make a complete batch buffer.
2588  */
2589 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
2590 {
2591 	/* WaDisableCtxRestoreArbitration:bdw,chv */
2592 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2593 
2594 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
2595 	if (IS_BROADWELL(engine->i915))
2596 		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
2597 
2598 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
2599 	/* Actual scratch location is at 128 bytes offset */
2600 	batch = gen8_emit_pipe_control(batch,
2601 				       PIPE_CONTROL_FLUSH_L3 |
2602 				       PIPE_CONTROL_STORE_DATA_INDEX |
2603 				       PIPE_CONTROL_CS_STALL |
2604 				       PIPE_CONTROL_QW_WRITE,
2605 				       LRC_PPHWSP_SCRATCH_ADDR);
2606 
2607 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2608 
2609 	/* Pad to end of cacheline */
2610 	while ((unsigned long)batch % CACHELINE_BYTES)
2611 		*batch++ = MI_NOOP;
2612 
2613 	/*
2614 	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
2615 	 * execution depends on the length specified in terms of cache lines
2616 	 * in the register CTX_RCS_INDIRECT_CTX
2617 	 */
2618 
2619 	return batch;
2620 }
2621 
2622 struct lri {
2623 	i915_reg_t reg;
2624 	u32 value;
2625 };
2626 
2627 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
2628 {
2629 	GEM_BUG_ON(!count || count > 63);
2630 
2631 	*batch++ = MI_LOAD_REGISTER_IMM(count);
2632 	do {
2633 		*batch++ = i915_mmio_reg_offset(lri->reg);
2634 		*batch++ = lri->value;
2635 	} while (lri++, --count);
2636 	*batch++ = MI_NOOP;
2637 
2638 	return batch;
2639 }
2640 
2641 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
2642 {
2643 	static const struct lri lri[] = {
2644 		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
2645 		{
2646 			COMMON_SLICE_CHICKEN2,
2647 			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
2648 				       0),
2649 		},
2650 
2651 		/* BSpec: 11391 */
2652 		{
2653 			FF_SLICE_CHICKEN,
2654 			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
2655 				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
2656 		},
2657 
2658 		/* BSpec: 11299 */
2659 		{
2660 			_3D_CHICKEN3,
2661 			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
2662 				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
2663 		}
2664 	};
2665 
2666 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2667 
2668 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
2669 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
2670 
2671 	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
2672 
2673 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
2674 	if (HAS_POOLED_EU(engine->i915)) {
2675 		/*
2676 		 * EU pool configuration is setup along with golden context
2677 		 * during context initialization. This value depends on
2678 		 * device type (2x6 or 3x6) and needs to be updated based
2679 		 * on which subslice is disabled especially for 2x6
2680 		 * devices, however it is safe to load default
2681 		 * configuration of 3x6 device instead of masking off
2682 		 * corresponding bits because HW ignores bits of a disabled
2683 		 * subslice and drops down to appropriate config. Please
2684 		 * see render_state_setup() in i915_gem_render_state.c for
2685 		 * possible configurations, to avoid duplication they are
2686 		 * not shown here again.
2687 		 */
2688 		*batch++ = GEN9_MEDIA_POOL_STATE;
2689 		*batch++ = GEN9_MEDIA_POOL_ENABLE;
2690 		*batch++ = 0x00777000;
2691 		*batch++ = 0;
2692 		*batch++ = 0;
2693 		*batch++ = 0;
2694 	}
2695 
2696 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2697 
2698 	/* Pad to end of cacheline */
2699 	while ((unsigned long)batch % CACHELINE_BYTES)
2700 		*batch++ = MI_NOOP;
2701 
2702 	return batch;
2703 }
2704 
2705 static u32 *
2706 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
2707 {
2708 	int i;
2709 
2710 	/*
2711 	 * WaPipeControlBefore3DStateSamplePattern: cnl
2712 	 *
2713 	 * Ensure the engine is idle prior to programming a
2714 	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
2715 	 */
2716 	batch = gen8_emit_pipe_control(batch,
2717 				       PIPE_CONTROL_CS_STALL,
2718 				       0);
2719 	/*
2720 	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
2721 	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
2722 	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
2723 	 * confusing. Since gen8_emit_pipe_control() already advances the
2724 	 * batch by 6 dwords, we advance the other 10 here, completing a
2725 	 * cacheline. It's not clear if the workaround requires this padding
2726 	 * before other commands, or if it's just the regular padding we would
2727 	 * already have for the workaround bb, so leave it here for now.
2728 	 */
2729 	for (i = 0; i < 10; i++)
2730 		*batch++ = MI_NOOP;
2731 
2732 	/* Pad to end of cacheline */
2733 	while ((unsigned long)batch % CACHELINE_BYTES)
2734 		*batch++ = MI_NOOP;
2735 
2736 	return batch;
2737 }
2738 
2739 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
2740 
2741 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
2742 {
2743 	struct drm_i915_gem_object *obj;
2744 	struct i915_vma *vma;
2745 	int err;
2746 
2747 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
2748 	if (IS_ERR(obj))
2749 		return PTR_ERR(obj);
2750 
2751 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
2752 	if (IS_ERR(vma)) {
2753 		err = PTR_ERR(vma);
2754 		goto err;
2755 	}
2756 
2757 	err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
2758 	if (err)
2759 		goto err;
2760 
2761 	engine->wa_ctx.vma = vma;
2762 	return 0;
2763 
2764 err:
2765 	i915_gem_object_put(obj);
2766 	return err;
2767 }
2768 
2769 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
2770 {
2771 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
2772 }
2773 
2774 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
2775 
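/*
 * Construct the per-engine workaround batch buffers (indirect_ctx and
 * per_ctx) inside a single page, recording the offset and size of each
 * within the backing object.
 */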
2776 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
2777 {
2778 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
2779 	struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
2780 					    &wa_ctx->per_ctx };
2781 	wa_bb_func_t wa_bb_fn[2];
2782 	struct page *page;
2783 	void *batch, *batch_ptr;
2784 	unsigned int i;
2785 	int ret;
2786 
2787 	if (engine->class != RENDER_CLASS)
2788 		return 0;
2789 
2790 	switch (INTEL_GEN(engine->i915)) {
2791 	case 12:
2792 	case 11:
2793 		return 0;
2794 	case 10:
2795 		wa_bb_fn[0] = gen10_init_indirectctx_bb;
2796 		wa_bb_fn[1] = NULL;
2797 		break;
2798 	case 9:
2799 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
2800 		wa_bb_fn[1] = NULL;
2801 		break;
2802 	case 8:
2803 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
2804 		wa_bb_fn[1] = NULL;
2805 		break;
2806 	default:
2807 		MISSING_CASE(INTEL_GEN(engine->i915));
2808 		return 0;
2809 	}
2810 
2811 	ret = lrc_setup_wa_ctx(engine);
2812 	if (ret) {
2813 		DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
2814 		return ret;
2815 	}
2816 
2817 	page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
2818 	batch = batch_ptr = kmap_atomic(page);
2819 
2820 	/*
2821 	 * Emit the two workaround batch buffers, recording the offset from the
2822 	 * start of the workaround batch buffer object for each and their
2823 	 * respective sizes.
2824 	 */
2825 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
2826 		wa_bb[i]->offset = batch_ptr - batch;
2827 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
2828 						  CACHELINE_BYTES))) {
2829 			ret = -EINVAL;
2830 			break;
2831 		}
2832 		if (wa_bb_fn[i])
2833 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
2834 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
2835 	}
2836 
2837 	BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
2838 
2839 	kunmap_atomic(batch);
2840 	if (ret)
2841 		lrc_destroy_wa_ctx(engine);
2842 
2843 	return ret;
2844 }
2845 
2846 static void enable_execlists(struct intel_engine_cs *engine)
2847 {
2848 	u32 mode;
2849 
2850 	assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
2851 
2852 	intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
2853 
2854 	if (INTEL_GEN(engine->i915) >= 11)
2855 		mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
2856 	else
2857 		mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
2858 	ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
2859 
2860 	ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
2861 
2862 	ENGINE_WRITE_FW(engine,
2863 			RING_HWS_PGA,
2864 			i915_ggtt_offset(engine->status_page.vma));
2865 	ENGINE_POSTING_READ(engine, RING_HWS_PGA);
2866 }
2867 
2868 static bool unexpected_starting_state(struct intel_engine_cs *engine)
2869 {
2870 	bool unexpected = false;
2871 
2872 	if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
2873 		DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n");
2874 		unexpected = true;
2875 	}
2876 
2877 	return unexpected;
2878 }
2879 
2880 static int execlists_resume(struct intel_engine_cs *engine)
2881 {
2882 	intel_engine_apply_workarounds(engine);
2883 	intel_engine_apply_whitelist(engine);
2884 
2885 	intel_mocs_init_engine(engine);
2886 
2887 	intel_engine_reset_breadcrumbs(engine);
2888 
2889 	if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
2890 		struct drm_printer p = drm_debug_printer(__func__);
2891 
2892 		intel_engine_dump(engine, &p, NULL);
2893 	}
2894 
2895 	enable_execlists(engine);
2896 
2897 	return 0;
2898 }
2899 
2900 static void execlists_reset_prepare(struct intel_engine_cs *engine)
2901 {
2902 	struct intel_engine_execlists * const execlists = &engine->execlists;
2903 	unsigned long flags;
2904 
2905 	GEM_TRACE("%s: depth<-%d\n", engine->name,
2906 		  atomic_read(&execlists->tasklet.count));
2907 
2908 	/*
2909 	 * Prevent request submission to the hardware until we have
2910 	 * completed the reset in i915_gem_reset_finish(). If a request
2911 	 * is completed by one engine, it may then queue a request
2912 	 * to a second via its execlists->tasklet *just* as we are
2913 	 * calling engine->resume() and also writing the ELSP.
2914 	 * Turning off the execlists->tasklet until the reset is over
2915 	 * prevents the race.
2916 	 */
2917 	__tasklet_disable_sync_once(&execlists->tasklet);
2918 	GEM_BUG_ON(!reset_in_progress(execlists));
2919 
2920 	/* And flush any current direct submission. */
2921 	spin_lock_irqsave(&engine->active.lock, flags);
2922 	spin_unlock_irqrestore(&engine->active.lock, flags);
2923 
2924 	/*
2925 	 * We stop the engines; otherwise we might get a failed reset and a
2926 	 * dead gpu (on elk). Also, a modern gpu such as kbl can suffer
2927 	 * from a system hang if a batchbuffer is progressing when
2928 	 * the reset is issued, regardless of the READY_TO_RESET ack.
2929 	 * Thus assume it is best to stop engines on all gens
2930 	 * where we have a gpu reset.
2931 	 *
2932 	 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
2933 	 *
2934 	 * FIXME: Wa for more modern gens needs to be validated
2935 	 */
2936 	intel_engine_stop_cs(engine);
2937 }
2938 
2939 static void reset_csb_pointers(struct intel_engine_cs *engine)
2940 {
2941 	struct intel_engine_execlists * const execlists = &engine->execlists;
2942 	const unsigned int reset_value = execlists->csb_size - 1;
2943 
2944 	ring_set_paused(engine, 0);
2945 
2946 	/*
2947 	 * After a reset, the HW starts writing into CSB entry [0]. We
2948 	 * therefore have to set our HEAD pointer back one entry so that
2949 	 * the *first* entry we check is entry 0. To complicate this further,
2950 	 * as we don't wait for the first interrupt after reset, we have to
2951 	 * fake the HW write to point back to the last entry so that our
2952 	 * inline comparison of our cached head position against the last HW
2953 	 * write works even before the first interrupt.
2954 	 */
2955 	execlists->csb_head = reset_value;
2956 	WRITE_ONCE(*execlists->csb_write, reset_value);
2957 	wmb(); /* Make sure this is visible to HW (paranoia?) */
2958 
2959 	invalidate_csb_entries(&execlists->csb_status[0],
2960 			       &execlists->csb_status[reset_value]);
2961 }
2962 
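/*
 * Dword index of the RING_MI_MODE entry within the lrc register state
 * (the value is stored at index + 1), or -1 if we do not know where it
 * lives for this engine.
 */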
2963 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
2964 {
2965 	if (INTEL_GEN(engine->i915) >= 12)
2966 		return 0x60;
2967 	else if (INTEL_GEN(engine->i915) >= 9)
2968 		return 0x54;
2969 	else if (engine->class == RENDER_CLASS)
2970 		return 0x58;
2971 	else
2972 		return -1;
2973 }
2974 
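/*
 * Use a masked write in the saved RING_MI_MODE to clear STOP_RING, so
 * the engine is not left halted when the context is restored after the
 * reset.
 */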
2975 static void __execlists_reset_reg_state(const struct intel_context *ce,
2976 					const struct intel_engine_cs *engine)
2977 {
2978 	u32 *regs = ce->lrc_reg_state;
2979 	int x;
2980 
2981 	x = lrc_ring_mi_mode(engine);
2982 	if (x != -1) {
2983 		regs[x + 1] &= ~STOP_RING;
2984 		regs[x + 1] |= STOP_RING << 16;
2985 	}
2986 }
2987 
2988 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
2989 {
2990 	struct intel_engine_execlists * const execlists = &engine->execlists;
2991 	struct intel_context *ce;
2992 	struct i915_request *rq;
2993 
2994 	mb(); /* paranoia: read the CSB pointers from after the reset */
2995 	clflush(execlists->csb_write);
2996 	mb();
2997 
2998 	process_csb(engine); /* drain preemption events */
2999 
3000 	/* Following the reset, we need to reload the CSB read/write pointers */
3001 	reset_csb_pointers(engine);
3002 
3003 	/*
3004 	 * Save the currently executing context, even if we completed
3005 	 * its request, it was still running at the time of the
3006 	 * reset and will have been clobbered.
3007 	 */
3008 	rq = execlists_active(execlists);
3009 	if (!rq)
3010 		goto unwind;
3011 
3012 	/* We still have requests in-flight; the engine should be active */
3013 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
3014 
3015 	ce = rq->hw_context;
3016 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3017 
3018 	if (i915_request_completed(rq)) {
3019 		/* Idle context; tidy up the ring so we can restart afresh */
3020 		ce->ring->head = intel_ring_wrap(ce->ring, rq->tail);
3021 		goto out_replay;
3022 	}
3023 
3024 	/* Context has requests still in-flight; it should not be idle! */
3025 	GEM_BUG_ON(i915_active_is_idle(&ce->active));
3026 	rq = active_request(ce->timeline, rq);
3027 	ce->ring->head = intel_ring_wrap(ce->ring, rq->head);
3028 	GEM_BUG_ON(ce->ring->head == ce->ring->tail);
3029 
3030 	/*
3031 	 * If this request hasn't started yet, e.g. it is waiting on a
3032 	 * semaphore, we need to avoid skipping the request or else we
3033 	 * break the signaling chain. However, if the context is corrupt
3034 	 * the request will not restart and we will be stuck with a wedged
3035 	 * device. It is quite often the case that if we issue a reset
3036 	 * while the GPU is loading the context image, the context
3037 	 * image becomes corrupt.
3038 	 *
3039 	 * Otherwise, if we have not started yet, the request should replay
3040 	 * perfectly and we do not need to flag the result as being erroneous.
3041 	 */
3042 	if (!i915_request_started(rq))
3043 		goto out_replay;
3044 
3045 	/*
3046 	 * If the request was innocent, we leave the request in the ELSP
3047 	 * and will try to replay it on restarting. The context image may
3048 	 * have been corrupted by the reset, in which case we may have
3049 	 * to service a new GPU hang, but more likely we can continue on
3050 	 * without impact.
3051 	 *
3052 	 * If the request was guilty, we presume the context is corrupt
3053 	 * and have to at least restore the RING register in the context
3054 	 * image back to the expected values to skip over the guilty request.
3055 	 */
3056 	__i915_request_reset(rq, stalled);
3057 	if (!stalled)
3058 		goto out_replay;
3059 
3060 	/*
3061 	 * We want a simple context + ring to execute the breadcrumb update.
3062 	 * We cannot rely on the context being intact across the GPU hang,
3063 	 * so clear it and rebuild just what we need for the breadcrumb.
3064 	 * All pending requests for this context will be zapped, and any
3065 	 * future request will be after userspace has had the opportunity
3066 	 * to recreate its own state.
3067 	 */
3068 	GEM_BUG_ON(!intel_context_is_pinned(ce));
3069 	restore_default_state(ce, engine);
3070 
3071 out_replay:
3072 	GEM_TRACE("%s replay {head:%04x, tail:%04x}\n",
3073 		  engine->name, ce->ring->head, ce->ring->tail);
3074 	intel_ring_update_space(ce->ring);
3075 	__execlists_reset_reg_state(ce, engine);
3076 	__execlists_update_reg_state(ce, engine);
3077 	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
3078 
3079 unwind:
3080 	/* Push back any incomplete requests for replay after the reset. */
3081 	cancel_port_requests(execlists);
3082 	__unwind_incomplete_requests(engine);
3083 }
3084 
3085 static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
3086 {
3087 	unsigned long flags;
3088 
3089 	GEM_TRACE("%s\n", engine->name);
3090 
3091 	spin_lock_irqsave(&engine->active.lock, flags);
3092 
3093 	__execlists_reset(engine, stalled);
3094 
3095 	spin_unlock_irqrestore(&engine->active.lock, flags);
3096 }
3097 
3098 static void nop_submission_tasklet(unsigned long data)
3099 {
3100 	/* The driver is wedged; don't process any more events. */
3101 }
3102 
3103 static void execlists_cancel_requests(struct intel_engine_cs *engine)
3104 {
3105 	struct intel_engine_execlists * const execlists = &engine->execlists;
3106 	struct i915_request *rq, *rn;
3107 	struct rb_node *rb;
3108 	unsigned long flags;
3109 
3110 	GEM_TRACE("%s\n", engine->name);
3111 
3112 	/*
3113 	 * Before we call engine->cancel_requests(), we should have exclusive
3114 	 * access to the submission state. This is arranged for us by the
3115 	 * caller disabling the interrupt generation, the tasklet and other
3116 	 * threads that may then access the same state, giving us a free hand
3117 	 * to reset state. However, we still need to let lockdep be aware that
3118 	 * we know this state may be accessed in hardirq context, so we
3119 	 * disable the irq around this manipulation. We also want to keep
3120 	 * the spinlock focused on its duties and not accidentally conflate
3121 	 * its coverage with the submission's irq state. (Similarly, although we
3122 	 * shouldn't need to disable irq around the manipulation of the
3123 	 * submission's irq state, we also wish to remind ourselves that
3124 	 * it is irq state.)
3125 	 */
3126 	spin_lock_irqsave(&engine->active.lock, flags);
3127 
3128 	__execlists_reset(engine, true);
3129 
3130 	/* Mark all executing requests as skipped. */
3131 	list_for_each_entry(rq, &engine->active.requests, sched.link)
3132 		mark_eio(rq);
3133 
3134 	/* Flush the queued requests to the timeline list (for retiring). */
3135 	while ((rb = rb_first_cached(&execlists->queue))) {
3136 		struct i915_priolist *p = to_priolist(rb);
3137 		int i;
3138 
3139 		priolist_for_each_request_consume(rq, rn, p, i) {
3140 			mark_eio(rq);
3141 			__i915_request_submit(rq);
3142 		}
3143 
3144 		rb_erase_cached(&p->node, &execlists->queue);
3145 		i915_priolist_free(p);
3146 	}
3147 
3148 	/* Cancel all attached virtual engines */
3149 	while ((rb = rb_first_cached(&execlists->virtual))) {
3150 		struct virtual_engine *ve =
3151 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
3152 
3153 		rb_erase_cached(rb, &execlists->virtual);
3154 		RB_CLEAR_NODE(rb);
3155 
3156 		spin_lock(&ve->base.active.lock);
3157 		rq = fetch_and_zero(&ve->request);
3158 		if (rq) {
3159 			mark_eio(rq);
3160 
3161 			rq->engine = engine;
3162 			__i915_request_submit(rq);
3163 			i915_request_put(rq);
3164 
3165 			ve->base.execlists.queue_priority_hint = INT_MIN;
3166 		}
3167 		spin_unlock(&ve->base.active.lock);
3168 	}
3169 
3170 	/* Remaining _unready_ requests will be nop'ed when submitted */
3171 
3172 	execlists->queue_priority_hint = INT_MIN;
3173 	execlists->queue = RB_ROOT_CACHED;
3174 
3175 	GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
3176 	execlists->tasklet.func = nop_submission_tasklet;
3177 
3178 	spin_unlock_irqrestore(&engine->active.lock, flags);
3179 }
3180 
3181 static void execlists_reset_finish(struct intel_engine_cs *engine)
3182 {
3183 	struct intel_engine_execlists * const execlists = &engine->execlists;
3184 
3185 	/*
3186 	 * After a GPU reset, we may have requests to replay. Do so now while
3187 	 * we still have the forcewake to be sure that the GPU is not allowed
3188 	 * to sleep before we restart and reload a context.
3189 	 */
3190 	GEM_BUG_ON(!reset_in_progress(execlists));
3191 	if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
3192 		execlists->tasklet.func(execlists->tasklet.data);
3193 
3194 	if (__tasklet_enable(&execlists->tasklet))
3195 		/* And kick in case we missed a new request submission. */
3196 		tasklet_hi_schedule(&execlists->tasklet);
3197 	GEM_TRACE("%s: depth->%d\n", engine->name,
3198 		  atomic_read(&execlists->tasklet.count));
3199 }
3200 
3201 static int gen8_emit_bb_start(struct i915_request *rq,
3202 			      u64 offset, u32 len,
3203 			      const unsigned int flags)
3204 {
3205 	u32 *cs;
3206 
3207 	cs = intel_ring_begin(rq, 4);
3208 	if (IS_ERR(cs))
3209 		return PTR_ERR(cs);
3210 
3211 	/*
3212 	 * WaDisableCtxRestoreArbitration:bdw,chv
3213 	 *
3214 	 * We would not need to perform MI_ARB_ENABLE as often as we do (in
3215 	 * particular on all the gens that do not need the w/a at all!) if we
3216 	 * took care to make sure that on every switch into this context
3217 	 * (both ordinary and for preemption) arbitration was enabled.
3218 	 * However, for gen8 there is another w/a that
3219 	 * requires us to not preempt inside GPGPU execution, so we keep
3220 	 * arbitration disabled for gen8 batches. Arbitration will be
3221 	 * re-enabled before we close the request
3222 	 * (engine->emit_fini_breadcrumb).
3223 	 */
3224 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3225 
3226 	/* FIXME(BDW+): Address space and security selectors. */
3227 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
3228 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3229 	*cs++ = lower_32_bits(offset);
3230 	*cs++ = upper_32_bits(offset);
3231 
3232 	intel_ring_advance(rq, cs);
3233 
3234 	return 0;
3235 }
3236 
3237 static int gen9_emit_bb_start(struct i915_request *rq,
3238 			      u64 offset, u32 len,
3239 			      const unsigned int flags)
3240 {
3241 	u32 *cs;
3242 
3243 	cs = intel_ring_begin(rq, 6);
3244 	if (IS_ERR(cs))
3245 		return PTR_ERR(cs);
3246 
3247 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3248 
3249 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
3250 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3251 	*cs++ = lower_32_bits(offset);
3252 	*cs++ = upper_32_bits(offset);
3253 
3254 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3255 	*cs++ = MI_NOOP;
3256 
3257 	intel_ring_advance(rq, cs);
3258 
3259 	return 0;
3260 }
3261 
3262 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
3263 {
3264 	ENGINE_WRITE(engine, RING_IMR,
3265 		     ~(engine->irq_enable_mask | engine->irq_keep_mask));
3266 	ENGINE_POSTING_READ(engine, RING_IMR);
3267 }
3268 
3269 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
3270 {
3271 	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
3272 }
3273 
3274 static int gen8_emit_flush(struct i915_request *request, u32 mode)
3275 {
3276 	u32 cmd, *cs;
3277 
3278 	cs = intel_ring_begin(request, 4);
3279 	if (IS_ERR(cs))
3280 		return PTR_ERR(cs);
3281 
3282 	cmd = MI_FLUSH_DW + 1;
3283 
3284 	/* We always require a command barrier so that subsequent
3285 	 * commands, such as breadcrumb interrupts, are strictly ordered
3286 	 * wrt the contents of the write cache being flushed to memory
3287 	 * (and thus being coherent from the CPU).
3288 	 */
3289 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
3290 
3291 	if (mode & EMIT_INVALIDATE) {
3292 		cmd |= MI_INVALIDATE_TLB;
3293 		if (request->engine->class == VIDEO_DECODE_CLASS)
3294 			cmd |= MI_INVALIDATE_BSD;
3295 	}
3296 
3297 	*cs++ = cmd;
3298 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
3299 	*cs++ = 0; /* upper addr */
3300 	*cs++ = 0; /* value */
3301 	intel_ring_advance(request, cs);
3302 
3303 	return 0;
3304 }
3305 
3306 static int gen8_emit_flush_render(struct i915_request *request,
3307 				  u32 mode)
3308 {
3309 	bool vf_flush_wa = false, dc_flush_wa = false;
3310 	u32 *cs, flags = 0;
3311 	int len;
3312 
3313 	flags |= PIPE_CONTROL_CS_STALL;
3314 
3315 	if (mode & EMIT_FLUSH) {
3316 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3317 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3318 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3319 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
3320 	}
3321 
3322 	if (mode & EMIT_INVALIDATE) {
3323 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
3324 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3325 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3326 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3327 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3328 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3329 		flags |= PIPE_CONTROL_QW_WRITE;
3330 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3331 
3332 		/*
3333 		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
3334 		 * pipe control.
3335 		 */
3336 		if (IS_GEN(request->i915, 9))
3337 			vf_flush_wa = true;
3338 
3339 		/* WaForGAMHang:kbl */
3340 		if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
3341 			dc_flush_wa = true;
3342 	}
3343 
3344 	len = 6;
3345 
3346 	if (vf_flush_wa)
3347 		len += 6;
3348 
3349 	if (dc_flush_wa)
3350 		len += 12;
3351 
3352 	cs = intel_ring_begin(request, len);
3353 	if (IS_ERR(cs))
3354 		return PTR_ERR(cs);
3355 
3356 	if (vf_flush_wa)
3357 		cs = gen8_emit_pipe_control(cs, 0, 0);
3358 
3359 	if (dc_flush_wa)
3360 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
3361 					    0);
3362 
3363 	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3364 
3365 	if (dc_flush_wa)
3366 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
3367 
3368 	intel_ring_advance(request, cs);
3369 
3370 	return 0;
3371 }
3372 
3373 static int gen11_emit_flush_render(struct i915_request *request,
3374 				   u32 mode)
3375 {
3376 	if (mode & EMIT_FLUSH) {
3377 		u32 *cs;
3378 		u32 flags = 0;
3379 
3380 		flags |= PIPE_CONTROL_CS_STALL;
3381 
3382 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
3383 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3384 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3385 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3386 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
3387 		flags |= PIPE_CONTROL_QW_WRITE;
3388 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3389 
3390 		cs = intel_ring_begin(request, 6);
3391 		if (IS_ERR(cs))
3392 			return PTR_ERR(cs);
3393 
3394 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3395 		intel_ring_advance(request, cs);
3396 	}
3397 
3398 	if (mode & EMIT_INVALIDATE) {
3399 		u32 *cs;
3400 		u32 flags = 0;
3401 
3402 		flags |= PIPE_CONTROL_CS_STALL;
3403 
3404 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
3405 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
3406 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3407 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3408 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3409 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3410 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3411 		flags |= PIPE_CONTROL_QW_WRITE;
3412 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3413 
3414 		cs = intel_ring_begin(request, 6);
3415 		if (IS_ERR(cs))
3416 			return PTR_ERR(cs);
3417 
3418 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3419 		intel_ring_advance(request, cs);
3420 	}
3421 
3422 	return 0;
3423 }
3424 
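/*
 * From gen12 onwards MI_ARB_CHECK doubles as the pre-parser control referred
 * to in the pre-fetch note further down: bit 8 selects the pre-fetch-disable
 * operation and bit 0 carries the requested state.
 */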
3425 static u32 preparser_disable(bool state)
3426 {
3427 	return MI_ARB_CHECK | 1 << 8 | state;
3428 }
3429 
3430 static int gen12_emit_flush_render(struct i915_request *request,
3431 				   u32 mode)
3432 {
3433 	if (mode & EMIT_FLUSH) {
3434 		u32 flags = 0;
3435 		u32 *cs;
3436 
3437 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
3438 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3439 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3440 		/* Wa_1409600907:tgl */
3441 		flags |= PIPE_CONTROL_DEPTH_STALL;
3442 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3443 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
3444 		flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
3445 
3446 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3447 		flags |= PIPE_CONTROL_QW_WRITE;
3448 
3449 		flags |= PIPE_CONTROL_CS_STALL;
3450 
3451 		cs = intel_ring_begin(request, 6);
3452 		if (IS_ERR(cs))
3453 			return PTR_ERR(cs);
3454 
3455 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3456 		intel_ring_advance(request, cs);
3457 	}
3458 
3459 	if (mode & EMIT_INVALIDATE) {
3460 		u32 flags = 0;
3461 		u32 *cs;
3462 
3463 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
3464 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
3465 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3466 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3467 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3468 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3469 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3470 		flags |= PIPE_CONTROL_L3_RO_CACHE_INVALIDATE;
3471 
3472 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3473 		flags |= PIPE_CONTROL_QW_WRITE;
3474 
3475 		flags |= PIPE_CONTROL_CS_STALL;
3476 
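		/*
		 * 8 dwords: 6 for the PIPE_CONTROL itself plus one
		 * MI_ARB_CHECK on either side to toggle the pre-parser
		 * off and back on around the invalidation.
		 */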
3477 		cs = intel_ring_begin(request, 8);
3478 		if (IS_ERR(cs))
3479 			return PTR_ERR(cs);
3480 
3481 		/*
3482 		 * Prevent the pre-parser from skipping past the TLB
3483 		 * invalidate and loading a stale page for the batch
3484 		 * buffer / request payload.
3485 		 */
3486 		*cs++ = preparser_disable(true);
3487 
3488 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3489 
3490 		*cs++ = preparser_disable(false);
3491 		intel_ring_advance(request, cs);
3492 
3493 		/*
3494 		 * Wa_1604544889:tgl
3495 		 */
3496 		if (IS_TGL_REVID(request->i915, TGL_REVID_A0, TGL_REVID_A0)) {
3497 			flags = 0;
3498 			flags |= PIPE_CONTROL_CS_STALL;
3499 			flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
3500 
3501 			flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3502 			flags |= PIPE_CONTROL_QW_WRITE;
3503 
3504 			cs = intel_ring_begin(request, 6);
3505 			if (IS_ERR(cs))
3506 				return PTR_ERR(cs);
3507 
3508 			cs = gen8_emit_pipe_control(cs, flags,
3509 						    LRC_PPHWSP_SCRATCH_ADDR);
3510 			intel_ring_advance(request, cs);
3511 		}
3512 	}
3513 
3514 	return 0;
3515 }
3516 
3517 /*
3518  * Reserve space for 2 NOOPs at the end of each request to be
3519  * used as a workaround for not being allowed to do lite
3520  * restore with HEAD==TAIL (WaIdleLiteRestore).
3521  */
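/*
 * The reserved dwords give a later resubmission of the same context a fresh
 * RING_TAIL value to advance to (request->wa_tail), so that HEAD != TAIL at
 * the point the lite restore is issued.
 */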
3522 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
3523 {
3524 	/* Ensure there's always at least one preemption point per-request. */
3525 	*cs++ = MI_ARB_CHECK;
3526 	*cs++ = MI_NOOP;
3527 	request->wa_tail = intel_ring_offset(request, cs);
3528 
3529 	return cs;
3530 }
3531 
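/*
 * The busywait below polls the per-engine preempt slot in the HWSP (see
 * intel_hws_preempt_address()) and only proceeds once it reads zero. During
 * preempt-to-busy that slot is written with a non-zero value (see
 * ring_set_paused() earlier in this file), so a request that has already
 * passed its breadcrumb spins here instead of racing past the preemption
 * point.
 */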
3532 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
3533 {
3534 	*cs++ = MI_SEMAPHORE_WAIT |
3535 		MI_SEMAPHORE_GLOBAL_GTT |
3536 		MI_SEMAPHORE_POLL |
3537 		MI_SEMAPHORE_SAD_EQ_SDD;
3538 	*cs++ = 0;
3539 	*cs++ = intel_hws_preempt_address(request->engine);
3540 	*cs++ = 0;
3541 
3542 	return cs;
3543 }
3544 
3545 static __always_inline u32*
3546 gen8_emit_fini_breadcrumb_footer(struct i915_request *request,
3547 				 u32 *cs)
3548 {
3549 	*cs++ = MI_USER_INTERRUPT;
3550 
3551 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3552 	if (intel_engine_has_semaphores(request->engine))
3553 		cs = emit_preempt_busywait(request, cs);
3554 
3555 	request->tail = intel_ring_offset(request, cs);
3556 	assert_ring_tail_valid(request->ring, request->tail);
3557 
3558 	return gen8_emit_wa_tail(request, cs);
3559 }
3560 
3561 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
3562 {
3563 	cs = gen8_emit_ggtt_write(cs,
3564 				  request->fence.seqno,
3565 				  i915_request_active_timeline(request)->hwsp_offset,
3566 				  0);
3567 
3568 	return gen8_emit_fini_breadcrumb_footer(request, cs);
3569 }
3570 
3571 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
3572 {
3573 	cs = gen8_emit_pipe_control(cs,
3574 				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
3575 				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
3576 				    PIPE_CONTROL_DC_FLUSH_ENABLE,
3577 				    0);
3578 
3579 	/* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
3580 	cs = gen8_emit_ggtt_write_rcs(cs,
3581 				      request->fence.seqno,
3582 				      i915_request_active_timeline(request)->hwsp_offset,
3583 				      PIPE_CONTROL_FLUSH_ENABLE |
3584 				      PIPE_CONTROL_CS_STALL);
3585 
3586 	return gen8_emit_fini_breadcrumb_footer(request, cs);
3587 }
3588 
3589 static u32 *
3590 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
3591 {
3592 	cs = gen8_emit_ggtt_write_rcs(cs,
3593 				      request->fence.seqno,
3594 				      i915_request_active_timeline(request)->hwsp_offset,
3595 				      PIPE_CONTROL_CS_STALL |
3596 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
3597 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
3598 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
3599 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
3600 				      PIPE_CONTROL_FLUSH_ENABLE);
3601 
3602 	return gen8_emit_fini_breadcrumb_footer(request, cs);
3603 }
3604 
3605 /*
3606  * Note that the CS instruction pre-parser will not stall on the breadcrumb
3607  * flush and will continue pre-fetching the instructions after it before the
3608  * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
3609  * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
3610  * of the next request before the memory has been flushed, we're guaranteed that
3611  * we won't access the batch itself too early.
3612  * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
3613  * so, if the current request is modifying an instruction in the next request on
3614  * the same intel_context, we might pre-fetch and then execute the pre-update
3615  * instruction. To avoid this, the users of self-modifying code should either
3616  * disable the parser around the code emitting the memory writes, via a new flag
3617  * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
3618  * the in-kernel use-cases we've opted to use a separate context, see
3619  * reloc_gpu() as an example.
3620  * All the above applies only to the instructions themselves. Non-inline data
3621  * used by the instructions is not pre-fetched.
3622  */
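/*
 * A minimal sketch of the first option above (disabling the pre-parser around
 * the emitted writes), kept under #if 0 as it is illustrative only and not
 * used by the driver; the helper name and its parameters are hypothetical.
 */
#if 0
static int emit_smc_write_example(struct i915_request *rq, u32 addr, u32 value)
{
	u32 *cs;

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = preparser_disable(true);	/* stop pre-fetch crossing this point */

	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
	*cs++ = addr;				/* lower 32 bits of the GGTT address */
	*cs++ = 0;				/* upper 32 bits */
	*cs++ = value;

	*cs++ = preparser_disable(false);	/* resume pre-fetching */

	intel_ring_advance(rq, cs);
	return 0;
}
#endif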
3623 
3624 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
3625 {
3626 	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
3627 		MI_SEMAPHORE_GLOBAL_GTT |
3628 		MI_SEMAPHORE_POLL |
3629 		MI_SEMAPHORE_SAD_EQ_SDD;
3630 	*cs++ = 0;
3631 	*cs++ = intel_hws_preempt_address(request->engine);
3632 	*cs++ = 0;
3633 	*cs++ = 0;
3634 	*cs++ = MI_NOOP;
3635 
3636 	return cs;
3637 }
3638 
3639 static __always_inline u32*
3640 gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs)
3641 {
3642 	*cs++ = MI_USER_INTERRUPT;
3643 
3644 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3645 	if (intel_engine_has_semaphores(request->engine))
3646 		cs = gen12_emit_preempt_busywait(request, cs);
3647 
3648 	request->tail = intel_ring_offset(request, cs);
3649 	assert_ring_tail_valid(request->ring, request->tail);
3650 
3651 	return gen8_emit_wa_tail(request, cs);
3652 }
3653 
3654 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
3655 {
3656 	cs = gen8_emit_ggtt_write(cs,
3657 				  request->fence.seqno,
3658 				  i915_request_active_timeline(request)->hwsp_offset,
3659 				  0);
3660 
3661 	return gen12_emit_fini_breadcrumb_footer(request, cs);
3662 }
3663 
3664 static u32 *
3665 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
3666 {
3667 	cs = gen8_emit_ggtt_write_rcs(cs,
3668 				      request->fence.seqno,
3669 				      i915_request_active_timeline(request)->hwsp_offset,
3670 				      PIPE_CONTROL_CS_STALL |
3671 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
3672 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
3673 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
3674 				      /* Wa_1409600907:tgl */
3675 				      PIPE_CONTROL_DEPTH_STALL |
3676 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
3677 				      PIPE_CONTROL_FLUSH_ENABLE |
3678 				      PIPE_CONTROL_HDC_PIPELINE_FLUSH);
3679 
3680 	return gen12_emit_fini_breadcrumb_footer(request, cs);
3681 }
3682 
3683 static void execlists_park(struct intel_engine_cs *engine)
3684 {
3685 	cancel_timer(&engine->execlists.timer);
3686 	cancel_timer(&engine->execlists.preempt);
3687 }
3688 
3689 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
3690 {
3691 	engine->submit_request = execlists_submit_request;
3692 	engine->cancel_requests = execlists_cancel_requests;
3693 	engine->schedule = i915_schedule;
3694 	engine->execlists.tasklet.func = execlists_submission_tasklet;
3695 
3696 	engine->reset.prepare = execlists_reset_prepare;
3697 	engine->reset.reset = execlists_reset;
3698 	engine->reset.finish = execlists_reset_finish;
3699 
3700 	engine->park = execlists_park;
3701 	engine->unpark = NULL;
3702 
3703 	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
3704 	if (!intel_vgpu_active(engine->i915)) {
3705 		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
3706 		if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
3707 			engine->flags |= I915_ENGINE_HAS_PREEMPTION;
3708 	}
3709 
3710 	if (INTEL_GEN(engine->i915) >= 12)
3711 		engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
3712 }
3713 
3714 static void execlists_destroy(struct intel_engine_cs *engine)
3715 {
3716 	intel_engine_cleanup_common(engine);
3717 	lrc_destroy_wa_ctx(engine);
3718 	kfree(engine);
3719 }
3720 
3721 static void
3722 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
3723 {
3724 	/* Default vfuncs which can be overridden by each engine. */
3725 
3726 	engine->destroy = execlists_destroy;
3727 	engine->resume = execlists_resume;
3728 
3729 	engine->reset.prepare = execlists_reset_prepare;
3730 	engine->reset.reset = execlists_reset;
3731 	engine->reset.finish = execlists_reset_finish;
3732 
3733 	engine->cops = &execlists_context_ops;
3734 	engine->request_alloc = execlists_request_alloc;
3735 
3736 	engine->emit_flush = gen8_emit_flush;
3737 	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
3738 	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
3739 	if (INTEL_GEN(engine->i915) >= 12)
3740 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
3741 
3742 	engine->set_default_submission = intel_execlists_set_default_submission;
3743 
3744 	if (INTEL_GEN(engine->i915) < 11) {
3745 		engine->irq_enable = gen8_logical_ring_enable_irq;
3746 		engine->irq_disable = gen8_logical_ring_disable_irq;
3747 	} else {
3748 		/*
3749 		 * TODO: On Gen11 the interrupt masks need to be cleared
3750 		 * to allow C6 entry. Keep interrupts enabled at all times
3751 		 * and take the hit of generating extra interrupts
3752 		 * until a more refined solution exists.
3753 		 */
3754 	}
3755 	if (IS_GEN(engine->i915, 8))
3756 		engine->emit_bb_start = gen8_emit_bb_start;
3757 	else
3758 		engine->emit_bb_start = gen9_emit_bb_start;
3759 }
3760 
3761 static inline void
3762 logical_ring_default_irqs(struct intel_engine_cs *engine)
3763 {
3764 	unsigned int shift = 0;
3765 
3766 	if (INTEL_GEN(engine->i915) < 11) {
3767 		const u8 irq_shifts[] = {
3768 			[RCS0]  = GEN8_RCS_IRQ_SHIFT,
3769 			[BCS0]  = GEN8_BCS_IRQ_SHIFT,
3770 			[VCS0]  = GEN8_VCS0_IRQ_SHIFT,
3771 			[VCS1]  = GEN8_VCS1_IRQ_SHIFT,
3772 			[VECS0] = GEN8_VECS_IRQ_SHIFT,
3773 		};
3774 
3775 		shift = irq_shifts[engine->id];
3776 	}
3777 
3778 	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
3779 	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
3780 }
3781 
3782 static void rcs_submission_override(struct intel_engine_cs *engine)
3783 {
3784 	switch (INTEL_GEN(engine->i915)) {
3785 	case 12:
3786 		engine->emit_flush = gen12_emit_flush_render;
3787 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
3788 		break;
3789 	case 11:
3790 		engine->emit_flush = gen11_emit_flush_render;
3791 		engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
3792 		break;
3793 	default:
3794 		engine->emit_flush = gen8_emit_flush_render;
3795 		engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
3796 		break;
3797 	}
3798 }
3799 
3800 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
3801 {
3802 	tasklet_init(&engine->execlists.tasklet,
3803 		     execlists_submission_tasklet, (unsigned long)engine);
3804 	timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
3805 	timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
3806 
3807 	logical_ring_default_vfuncs(engine);
3808 	logical_ring_default_irqs(engine);
3809 
3810 	if (engine->class == RENDER_CLASS)
3811 		rcs_submission_override(engine);
3812 
3813 	return 0;
3814 }
3815 
3816 int intel_execlists_submission_init(struct intel_engine_cs *engine)
3817 {
3818 	struct intel_engine_execlists * const execlists = &engine->execlists;
3819 	struct drm_i915_private *i915 = engine->i915;
3820 	struct intel_uncore *uncore = engine->uncore;
3821 	u32 base = engine->mmio_base;
3822 	int ret;
3823 
3824 	ret = intel_engine_init_common(engine);
3825 	if (ret)
3826 		return ret;
3827 
3828 	if (intel_init_workaround_bb(engine))
3829 		/*
3830 		 * We continue even if we fail to initialize the WA batch,
3831 		 * because we only expect rare glitches and nothing critical
3832 		 * enough to prevent us from using the GPU.
3833 		 */
3834 		DRM_ERROR("WA batch buffer initialization failed\n");
3835 
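	/*
	 * Pick the submission interface: platforms with the ExecList
	 * Submission Queue write pending contexts into the SQ contents
	 * register and kick them via the control register, while older
	 * platforms write the context descriptors straight into the ELSP
	 * port.
	 */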
3836 	if (HAS_LOGICAL_RING_ELSQ(i915)) {
3837 		execlists->submit_reg = uncore->regs +
3838 			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
3839 		execlists->ctrl_reg = uncore->regs +
3840 			i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
3841 	} else {
3842 		execlists->submit_reg = uncore->regs +
3843 			i915_mmio_reg_offset(RING_ELSP(base));
3844 	}
3845 
3846 	execlists->csb_status =
3847 		&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
3848 
3849 	execlists->csb_write =
3850 		&engine->status_page.addr[intel_hws_csb_write_index(i915)];
3851 
3852 	if (INTEL_GEN(i915) < 11)
3853 		execlists->csb_size = GEN8_CSB_ENTRIES;
3854 	else
3855 		execlists->csb_size = GEN11_CSB_ENTRIES;
3856 
3857 	reset_csb_pointers(engine);
3858 
3859 	return 0;
3860 }
3861 
3862 static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine)
3863 {
3864 	u32 indirect_ctx_offset;
3865 
3866 	switch (INTEL_GEN(engine->i915)) {
3867 	default:
3868 		MISSING_CASE(INTEL_GEN(engine->i915));
3869 		/* fall through */
3870 	case 12:
3871 		indirect_ctx_offset =
3872 			GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3873 		break;
3874 	case 11:
3875 		indirect_ctx_offset =
3876 			GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3877 		break;
3878 	case 10:
3879 		indirect_ctx_offset =
3880 			GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3881 		break;
3882 	case 9:
3883 		indirect_ctx_offset =
3884 			GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3885 		break;
3886 	case 8:
3887 		indirect_ctx_offset =
3888 			GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3889 		break;
3890 	}
3891 
3892 	return indirect_ctx_offset;
3893 }
3894 
3895 
3896 static void init_common_reg_state(u32 * const regs,
3897 				  const struct intel_engine_cs *engine,
3898 				  const struct intel_ring *ring)
3899 {
3900 	regs[CTX_CONTEXT_CONTROL] =
3901 		_MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) |
3902 		_MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
3903 	if (INTEL_GEN(engine->i915) < 11)
3904 		regs[CTX_CONTEXT_CONTROL] |=
3905 			_MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
3906 					    CTX_CTRL_RS_CTX_ENABLE);
3907 
3908 	regs[CTX_RING_BUFFER_CONTROL] = RING_CTL_SIZE(ring->size) | RING_VALID;
3909 	regs[CTX_BB_STATE] = RING_BB_PPGTT;
3910 }
3911 
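/*
 * Point the context image at the per-context and indirect workaround batch
 * buffers. Bit 0 of the per-context pointer acts as its enable bit, the low
 * bits of the indirect-context pointer carry the batch size in cachelines,
 * and the indirect-context offset register expects its value shifted up by
 * 6 bits (see intel_lr_indirect_ctx_offset() above).
 */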
3912 static void init_wa_bb_reg_state(u32 * const regs,
3913 				 const struct intel_engine_cs *engine,
3914 				 u32 pos_bb_per_ctx)
3915 {
3916 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
3917 
3918 	if (wa_ctx->per_ctx.size) {
3919 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
3920 
3921 		regs[pos_bb_per_ctx] =
3922 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
3923 	}
3924 
3925 	if (wa_ctx->indirect_ctx.size) {
3926 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
3927 
3928 		regs[pos_bb_per_ctx + 2] =
3929 			(ggtt_offset + wa_ctx->indirect_ctx.offset) |
3930 			(wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
3931 
3932 		regs[pos_bb_per_ctx + 4] =
3933 			intel_lr_indirect_ctx_offset(engine) << 6;
3934 	}
3935 }
3936 
3937 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
3938 {
3939 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
3940 		/* 64b PPGTT (48bit canonical)
3941 		 * PDP0_DESCRIPTOR contains the base address to PML4 and
3942 		 * other PDP Descriptors are ignored.
3943 		 */
3944 		ASSIGN_CTX_PML4(ppgtt, regs);
3945 	} else {
3946 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
3947 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
3948 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
3949 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
3950 	}
3951 }
3952 
3953 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
3954 {
3955 	if (i915_is_ggtt(vm))
3956 		return i915_vm_to_ggtt(vm)->alias;
3957 	else
3958 		return i915_vm_to_ppgtt(vm);
3959 }
3960 
3961 static void execlists_init_reg_state(u32 *regs,
3962 				     const struct intel_context *ce,
3963 				     const struct intel_engine_cs *engine,
3964 				     const struct intel_ring *ring,
3965 				     bool close)
3966 {
3967 	/*
3968 	 * A context is actually a big batch buffer with several
3969 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
3970 	 * values we are setting here are only for the first context restore:
3971 	 * on a subsequent save, the GPU will recreate this batchbuffer with new
3972 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
3973 	 * we are not initializing here).
3974 	 *
3975 	 * Must keep consistent with virtual_update_register_offsets().
3976 	 */
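	/*
	 * For illustration, each group written by set_offsets() has the shape
	 * below (the actual registers, counts and flags come from
	 * reg_offsets()):
	 *
	 *   regs[n + 0] = MI_LOAD_REGISTER_IMM(count) | flags;
	 *   regs[n + 1] = i915_mmio_reg_offset(RING_START(engine->mmio_base));
	 *   regs[n + 2] = value reloaded on the next context restore;
	 *   ...
	 */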
3977 	u32 *bbe = set_offsets(regs, reg_offsets(engine), engine);
3978 
3979 	if (close) { /* Close the batch; used mainly by live_lrc_layout() */
3980 		*bbe = MI_BATCH_BUFFER_END;
3981 		if (INTEL_GEN(engine->i915) >= 10)
3982 			*bbe |= BIT(0);
3983 	}
3984 
3985 	init_common_reg_state(regs, engine, ring);
3986 	init_ppgtt_reg_state(regs, vm_alias(ce->vm));
3987 
3988 	init_wa_bb_reg_state(regs, engine,
3989 			     INTEL_GEN(engine->i915) >= 12 ?
3990 			     GEN12_CTX_BB_PER_CTX_PTR :
3991 			     CTX_BB_PER_CTX_PTR);
3992 }
3993 
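/*
 * populate_lr_context() seeds the context image: preferably by copying the
 * engine's golden default state, otherwise by marking the first restore as
 * inhibited so the HW does not try to load the mostly-blank image.
 */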
3994 static int
3995 populate_lr_context(struct intel_context *ce,
3996 		    struct drm_i915_gem_object *ctx_obj,
3997 		    struct intel_engine_cs *engine,
3998 		    struct intel_ring *ring)
3999 {
4000 	bool inhibit = true;
4001 	void *vaddr;
4002 	u32 *regs;
4003 	int ret;
4004 
4005 	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
4006 	if (IS_ERR(vaddr)) {
4007 		ret = PTR_ERR(vaddr);
4008 		DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
4009 		return ret;
4010 	}
4011 
4012 	set_redzone(vaddr, engine);
4013 
4014 	if (engine->default_state) {
4015 		void *defaults;
4016 
4017 		defaults = i915_gem_object_pin_map(engine->default_state,
4018 						   I915_MAP_WB);
4019 		if (IS_ERR(defaults)) {
4020 			ret = PTR_ERR(defaults);
4021 			goto err_unpin_ctx;
4022 		}
4023 
4024 		memcpy(vaddr, defaults, engine->context_size);
4025 		i915_gem_object_unpin_map(engine->default_state);
4026 		inhibit = false;
4027 	}
4028 
4029 	/* The second page of the context object contains some fields which must
4030 	 * be set up prior to the first execution. */
4031 	regs = vaddr + LRC_STATE_PN * PAGE_SIZE;
4032 	execlists_init_reg_state(regs, ce, engine, ring, inhibit);
4033 	if (inhibit)
4034 		regs[CTX_CONTEXT_CONTROL] |=
4035 			_MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
4036 
4037 	ret = 0;
4038 err_unpin_ctx:
4039 	__i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
4040 	i915_gem_object_unpin_map(ctx_obj);
4041 	return ret;
4042 }
4043 
4044 static int __execlists_context_alloc(struct intel_context *ce,
4045 				     struct intel_engine_cs *engine)
4046 {
4047 	struct drm_i915_gem_object *ctx_obj;
4048 	struct intel_ring *ring;
4049 	struct i915_vma *vma;
4050 	u32 context_size;
4051 	int ret;
4052 
4053 	GEM_BUG_ON(ce->state);
4054 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
4055 
4056 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4057 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
4058 
4059 	ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
4060 	if (IS_ERR(ctx_obj))
4061 		return PTR_ERR(ctx_obj);
4062 
4063 	vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
4064 	if (IS_ERR(vma)) {
4065 		ret = PTR_ERR(vma);
4066 		goto error_deref_obj;
4067 	}
4068 
4069 	if (!ce->timeline) {
4070 		struct intel_timeline *tl;
4071 
4072 		tl = intel_timeline_create(engine->gt, NULL);
4073 		if (IS_ERR(tl)) {
4074 			ret = PTR_ERR(tl);
4075 			goto error_deref_obj;
4076 		}
4077 
4078 		ce->timeline = tl;
4079 	}
4080 
4081 	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
4082 	if (IS_ERR(ring)) {
4083 		ret = PTR_ERR(ring);
4084 		goto error_deref_obj;
4085 	}
4086 
4087 	ret = populate_lr_context(ce, ctx_obj, engine, ring);
4088 	if (ret) {
4089 		DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
4090 		goto error_ring_free;
4091 	}
4092 
4093 	ce->ring = ring;
4094 	ce->state = vma;
4095 
4096 	return 0;
4097 
4098 error_ring_free:
4099 	intel_ring_put(ring);
4100 error_deref_obj:
4101 	i915_gem_object_put(ctx_obj);
4102 	return ret;
4103 }
4104 
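/*
 * A virtual engine carries at most one ready request at a time (ve->request);
 * the otherwise unused default_priolist of its own execlists state doubles as
 * the list head that request sits on while waiting for a sibling.
 */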
4105 static struct list_head *virtual_queue(struct virtual_engine *ve)
4106 {
4107 	return &ve->base.execlists.default_priolist.requests[0];
4108 }
4109 
4110 static void virtual_context_destroy(struct kref *kref)
4111 {
4112 	struct virtual_engine *ve =
4113 		container_of(kref, typeof(*ve), context.ref);
4114 	unsigned int n;
4115 
4116 	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
4117 	GEM_BUG_ON(ve->request);
4118 	GEM_BUG_ON(ve->context.inflight);
4119 
4120 	for (n = 0; n < ve->num_siblings; n++) {
4121 		struct intel_engine_cs *sibling = ve->siblings[n];
4122 		struct rb_node *node = &ve->nodes[sibling->id].rb;
4123 
4124 		if (RB_EMPTY_NODE(node))
4125 			continue;
4126 
4127 		spin_lock_irq(&sibling->active.lock);
4128 
4129 		/* Detachment is lazily performed in the execlists tasklet */
4130 		if (!RB_EMPTY_NODE(node))
4131 			rb_erase_cached(node, &sibling->execlists.virtual);
4132 
4133 		spin_unlock_irq(&sibling->active.lock);
4134 	}
4135 	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
4136 
4137 	if (ve->context.state)
4138 		__execlists_context_fini(&ve->context);
4139 	intel_context_fini(&ve->context);
4140 
4141 	kfree(ve->bonds);
4142 	kfree(ve);
4143 }
4144 
4145 static void virtual_engine_initial_hint(struct virtual_engine *ve)
4146 {
4147 	int swp;
4148 
4149 	/*
4150 	 * Pick a random sibling when starting, to help spread the load around.
4151 	 *
4152 	 * New contexts are typically created with exactly the same order
4153 	 * of siblings, and often started in batches. Due to the way we iterate
4154 	 * the array of siblings when submitting requests, sibling[0] is
4155 	 * prioritised for dequeuing. If we make sure that sibling[0] is fairly
4156 	 * randomised across the system, we also help spread the load by the
4157 	 * first engine we inspect being different each time.
4158 	 *
4159 	 * NB: this does not force us to execute on this engine; it will just
4160 	 * typically be the first engine we inspect for submission.
4161 	 */
4162 	swp = prandom_u32_max(ve->num_siblings);
4163 	if (!swp)
4164 		return;
4165 
4166 	swap(ve->siblings[swp], ve->siblings[0]);
4167 	if (!intel_engine_has_relative_mmio(ve->siblings[0]))
4168 		virtual_update_register_offsets(ve->context.lrc_reg_state,
4169 						ve->siblings[0]);
4170 }
4171 
4172 static int virtual_context_pin(struct intel_context *ce)
4173 {
4174 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4175 	int err;
4176 
4177 	/* Note: we must use a real engine class for setting up reg state */
4178 	err = __execlists_context_pin(ce, ve->siblings[0]);
4179 	if (err)
4180 		return err;
4181 
4182 	virtual_engine_initial_hint(ve);
4183 	return 0;
4184 }
4185 
4186 static void virtual_context_enter(struct intel_context *ce)
4187 {
4188 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4189 	unsigned int n;
4190 
4191 	for (n = 0; n < ve->num_siblings; n++)
4192 		intel_engine_pm_get(ve->siblings[n]);
4193 
4194 	intel_timeline_enter(ce->timeline);
4195 }
4196 
4197 static void virtual_context_exit(struct intel_context *ce)
4198 {
4199 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4200 	unsigned int n;
4201 
4202 	intel_timeline_exit(ce->timeline);
4203 
4204 	for (n = 0; n < ve->num_siblings; n++)
4205 		intel_engine_pm_put(ve->siblings[n]);
4206 }
4207 
4208 static const struct intel_context_ops virtual_context_ops = {
4209 	.pin = virtual_context_pin,
4210 	.unpin = execlists_context_unpin,
4211 
4212 	.enter = virtual_context_enter,
4213 	.exit = virtual_context_exit,
4214 
4215 	.destroy = virtual_context_destroy,
4216 };
4217 
4218 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
4219 {
4220 	struct i915_request *rq;
4221 	intel_engine_mask_t mask;
4222 
4223 	rq = READ_ONCE(ve->request);
4224 	if (!rq)
4225 		return 0;
4226 
4227 	/* The rq is ready for submission; rq->execution_mask is now stable. */
4228 	mask = rq->execution_mask;
4229 	if (unlikely(!mask)) {
4230 		/* Invalid selection, submit to a random engine in error */
4231 		i915_request_skip(rq, -ENODEV);
4232 		mask = ve->siblings[0]->mask;
4233 	}
4234 
4235 	GEM_TRACE("%s: rq=%llx:%lld, mask=%x, prio=%d\n",
4236 		  ve->base.name,
4237 		  rq->fence.context, rq->fence.seqno,
4238 		  mask, ve->base.execlists.queue_priority_hint);
4239 
4240 	return mask;
4241 }
4242 
4243 static void virtual_submission_tasklet(unsigned long data)
4244 {
4245 	struct virtual_engine * const ve = (struct virtual_engine *)data;
4246 	const int prio = ve->base.execlists.queue_priority_hint;
4247 	intel_engine_mask_t mask;
4248 	unsigned int n;
4249 
4250 	rcu_read_lock();
4251 	mask = virtual_submission_mask(ve);
4252 	rcu_read_unlock();
4253 	if (unlikely(!mask))
4254 		return;
4255 
4256 	local_irq_disable();
4257 	for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
4258 		struct intel_engine_cs *sibling = ve->siblings[n];
4259 		struct ve_node * const node = &ve->nodes[sibling->id];
4260 		struct rb_node **parent, *rb;
4261 		bool first;
4262 
4263 		if (unlikely(!(mask & sibling->mask))) {
4264 			if (!RB_EMPTY_NODE(&node->rb)) {
4265 				spin_lock(&sibling->active.lock);
4266 				rb_erase_cached(&node->rb,
4267 						&sibling->execlists.virtual);
4268 				RB_CLEAR_NODE(&node->rb);
4269 				spin_unlock(&sibling->active.lock);
4270 			}
4271 			continue;
4272 		}
4273 
4274 		spin_lock(&sibling->active.lock);
4275 
4276 		if (!RB_EMPTY_NODE(&node->rb)) {
4277 			/*
4278 			 * Cheat and avoid rebalancing the tree if we can
4279 			 * reuse this node in situ.
4280 			 */
4281 			first = rb_first_cached(&sibling->execlists.virtual) ==
4282 				&node->rb;
4283 			if (prio == node->prio || (prio > node->prio && first))
4284 				goto submit_engine;
4285 
4286 			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
4287 		}
4288 
4289 		rb = NULL;
4290 		first = true;
4291 		parent = &sibling->execlists.virtual.rb_root.rb_node;
4292 		while (*parent) {
4293 			struct ve_node *other;
4294 
4295 			rb = *parent;
4296 			other = rb_entry(rb, typeof(*other), rb);
4297 			if (prio > other->prio) {
4298 				parent = &rb->rb_left;
4299 			} else {
4300 				parent = &rb->rb_right;
4301 				first = false;
4302 			}
4303 		}
4304 
4305 		rb_link_node(&node->rb, rb, parent);
4306 		rb_insert_color_cached(&node->rb,
4307 				       &sibling->execlists.virtual,
4308 				       first);
4309 
4310 submit_engine:
4311 		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
4312 		node->prio = prio;
4313 		if (first && prio > sibling->execlists.queue_priority_hint) {
4314 			sibling->execlists.queue_priority_hint = prio;
4315 			tasklet_hi_schedule(&sibling->execlists.tasklet);
4316 		}
4317 
4318 		spin_unlock(&sibling->active.lock);
4319 	}
4320 	local_irq_enable();
4321 }
4322 
4323 static void virtual_submit_request(struct i915_request *rq)
4324 {
4325 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
4326 	struct i915_request *old;
4327 	unsigned long flags;
4328 
4329 	GEM_TRACE("%s: rq=%llx:%lld\n",
4330 		  ve->base.name,
4331 		  rq->fence.context,
4332 		  rq->fence.seqno);
4333 
4334 	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
4335 
4336 	spin_lock_irqsave(&ve->base.active.lock, flags);
4337 
4338 	old = ve->request;
4339 	if (old) { /* background completion event from preempt-to-busy */
4340 		GEM_BUG_ON(!i915_request_completed(old));
4341 		__i915_request_submit(old);
4342 		i915_request_put(old);
4343 	}
4344 
4345 	if (i915_request_completed(rq)) {
4346 		__i915_request_submit(rq);
4347 
4348 		ve->base.execlists.queue_priority_hint = INT_MIN;
4349 		ve->request = NULL;
4350 	} else {
4351 		ve->base.execlists.queue_priority_hint = rq_prio(rq);
4352 		ve->request = i915_request_get(rq);
4353 
4354 		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
4355 		list_move_tail(&rq->sched.link, virtual_queue(ve));
4356 
4357 		tasklet_schedule(&ve->base.execlists.tasklet);
4358 	}
4359 
4360 	spin_unlock_irqrestore(&ve->base.active.lock, flags);
4361 }
4362 
4363 static struct ve_bond *
4364 virtual_find_bond(struct virtual_engine *ve,
4365 		  const struct intel_engine_cs *master)
4366 {
4367 	int i;
4368 
4369 	for (i = 0; i < ve->num_bonds; i++) {
4370 		if (ve->bonds[i].master == master)
4371 			return &ve->bonds[i];
4372 	}
4373 
4374 	return NULL;
4375 }
4376 
4377 static void
4378 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
4379 {
4380 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
4381 	intel_engine_mask_t allowed, exec;
4382 	struct ve_bond *bond;
4383 
4384 	allowed = ~to_request(signal)->engine->mask;
4385 
4386 	bond = virtual_find_bond(ve, to_request(signal)->engine);
4387 	if (bond)
4388 		allowed &= bond->sibling_mask;
4389 
4390 	/* Restrict the bonded request to run on only the available engines */
4391 	exec = READ_ONCE(rq->execution_mask);
4392 	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
4393 		;
4394 
4395 	/* Prevent the master from being re-run on the bonded engines */
4396 	to_request(signal)->execution_mask &= ~allowed;
4397 }
4398 
4399 struct intel_context *
4400 intel_execlists_create_virtual(struct i915_gem_context *ctx,
4401 			       struct intel_engine_cs **siblings,
4402 			       unsigned int count)
4403 {
4404 	struct virtual_engine *ve;
4405 	unsigned int n;
4406 	int err;
4407 
4408 	if (count == 0)
4409 		return ERR_PTR(-EINVAL);
4410 
4411 	if (count == 1)
4412 		return intel_context_create(ctx, siblings[0]);
4413 
4414 	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
4415 	if (!ve)
4416 		return ERR_PTR(-ENOMEM);
4417 
4418 	ve->base.i915 = ctx->i915;
4419 	ve->base.gt = siblings[0]->gt;
4420 	ve->base.uncore = siblings[0]->uncore;
4421 	ve->base.id = -1;
4422 	ve->base.class = OTHER_CLASS;
4423 	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
4424 	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
4425 
4426 	/*
4427 	 * The decision on whether to submit a request using semaphores
4428 	 * depends on the saturated state of the engine. We only compute
4429 	 * this during HW submission of the request, and we need this
4430 	 * state to be globally applied to all requests being submitted
4431 	 * to this engine. Virtual engines encompass more than one physical
4432 	 * engine and so we cannot accurately tell in advance if one of those
4433 	 * engines is already saturated and so cannot afford to use a semaphore
4434 	 * and be pessimized in priority for doing so -- if we are the only
4435 	 * context using semaphores after all other clients have stopped, we
4436 	 * will be starved on the saturated system. Such a global switch for
4437 	 * semaphores is less than ideal, but alas is the current compromise.
4438 	 */
4439 	ve->base.saturated = ALL_ENGINES;
4440 
4441 	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
4442 
4443 	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
4444 	intel_engine_init_breadcrumbs(&ve->base);
4445 
4446 	intel_engine_init_execlists(&ve->base);
4447 
4448 	ve->base.cops = &virtual_context_ops;
4449 	ve->base.request_alloc = execlists_request_alloc;
4450 
4451 	ve->base.schedule = i915_schedule;
4452 	ve->base.submit_request = virtual_submit_request;
4453 	ve->base.bond_execute = virtual_bond_execute;
4454 
4455 	INIT_LIST_HEAD(virtual_queue(ve));
4456 	ve->base.execlists.queue_priority_hint = INT_MIN;
4457 	tasklet_init(&ve->base.execlists.tasklet,
4458 		     virtual_submission_tasklet,
4459 		     (unsigned long)ve);
4460 
4461 	intel_context_init(&ve->context, ctx, &ve->base);
4462 
4463 	for (n = 0; n < count; n++) {
4464 		struct intel_engine_cs *sibling = siblings[n];
4465 
4466 		GEM_BUG_ON(!is_power_of_2(sibling->mask));
4467 		if (sibling->mask & ve->base.mask) {
4468 			DRM_DEBUG("duplicate %s entry in load balancer\n",
4469 				  sibling->name);
4470 			err = -EINVAL;
4471 			goto err_put;
4472 		}
4473 
4474 		/*
4475 		 * The virtual engine implementation is tightly coupled to
4476 		 * the execlists backend -- we push requests directly
4477 		 * into a tree inside each physical engine. We could support
4478 		 * layering if we handle cloning of the requests and
4479 		 * submitting a copy into each backend.
4480 		 */
4481 		if (sibling->execlists.tasklet.func !=
4482 		    execlists_submission_tasklet) {
4483 			err = -ENODEV;
4484 			goto err_put;
4485 		}
4486 
4487 		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
4488 		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
4489 
4490 		ve->siblings[ve->num_siblings++] = sibling;
4491 		ve->base.mask |= sibling->mask;
4492 
4493 		/*
4494 		 * All physical engines must be compatible for their emission
4495 		 * functions (as we build the instructions during request
4496 		 * construction and do not alter them before submission
4497 		 * on the physical engine). We use the engine class as a guide
4498 		 * here, although that could be refined.
4499 		 */
4500 		if (ve->base.class != OTHER_CLASS) {
4501 			if (ve->base.class != sibling->class) {
4502 				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
4503 					  sibling->class, ve->base.class);
4504 				err = -EINVAL;
4505 				goto err_put;
4506 			}
4507 			continue;
4508 		}
4509 
4510 		ve->base.class = sibling->class;
4511 		ve->base.uabi_class = sibling->uabi_class;
4512 		snprintf(ve->base.name, sizeof(ve->base.name),
4513 			 "v%dx%d", ve->base.class, count);
4514 		ve->base.context_size = sibling->context_size;
4515 
4516 		ve->base.emit_bb_start = sibling->emit_bb_start;
4517 		ve->base.emit_flush = sibling->emit_flush;
4518 		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
4519 		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
4520 		ve->base.emit_fini_breadcrumb_dw =
4521 			sibling->emit_fini_breadcrumb_dw;
4522 
4523 		ve->base.flags = sibling->flags;
4524 	}
4525 
4526 	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
4527 
4528 	err = __execlists_context_alloc(&ve->context, siblings[0]);
4529 	if (err)
4530 		goto err_put;
4531 
4532 	__set_bit(CONTEXT_ALLOC_BIT, &ve->context.flags);
4533 
4534 	return &ve->context;
4535 
4536 err_put:
4537 	intel_context_put(&ve->context);
4538 	return ERR_PTR(err);
4539 }
4540 
4541 struct intel_context *
4542 intel_execlists_clone_virtual(struct i915_gem_context *ctx,
4543 			      struct intel_engine_cs *src)
4544 {
4545 	struct virtual_engine *se = to_virtual_engine(src);
4546 	struct intel_context *dst;
4547 
4548 	dst = intel_execlists_create_virtual(ctx,
4549 					     se->siblings,
4550 					     se->num_siblings);
4551 	if (IS_ERR(dst))
4552 		return dst;
4553 
4554 	if (se->num_bonds) {
4555 		struct virtual_engine *de = to_virtual_engine(dst->engine);
4556 
4557 		de->bonds = kmemdup(se->bonds,
4558 				    sizeof(*se->bonds) * se->num_bonds,
4559 				    GFP_KERNEL);
4560 		if (!de->bonds) {
4561 			intel_context_put(dst);
4562 			return ERR_PTR(-ENOMEM);
4563 		}
4564 
4565 		de->num_bonds = se->num_bonds;
4566 	}
4567 
4568 	return dst;
4569 }
4570 
4571 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
4572 				     const struct intel_engine_cs *master,
4573 				     const struct intel_engine_cs *sibling)
4574 {
4575 	struct virtual_engine *ve = to_virtual_engine(engine);
4576 	struct ve_bond *bond;
4577 	int n;
4578 
4579 	/* Sanity check the sibling is part of the virtual engine */
4580 	for (n = 0; n < ve->num_siblings; n++)
4581 		if (sibling == ve->siblings[n])
4582 			break;
4583 	if (n == ve->num_siblings)
4584 		return -EINVAL;
4585 
4586 	bond = virtual_find_bond(ve, master);
4587 	if (bond) {
4588 		bond->sibling_mask |= sibling->mask;
4589 		return 0;
4590 	}
4591 
4592 	bond = krealloc(ve->bonds,
4593 			sizeof(*bond) * (ve->num_bonds + 1),
4594 			GFP_KERNEL);
4595 	if (!bond)
4596 		return -ENOMEM;
4597 
4598 	bond[ve->num_bonds].master = master;
4599 	bond[ve->num_bonds].sibling_mask = sibling->mask;
4600 
4601 	ve->bonds = bond;
4602 	ve->num_bonds++;
4603 
4604 	return 0;
4605 }
4606 
4607 struct intel_engine_cs *
4608 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
4609 				 unsigned int sibling)
4610 {
4611 	struct virtual_engine *ve = to_virtual_engine(engine);
4612 
4613 	if (sibling >= ve->num_siblings)
4614 		return NULL;
4615 
4616 	return ve->siblings[sibling];
4617 }
4618 
4619 void intel_execlists_show_requests(struct intel_engine_cs *engine,
4620 				   struct drm_printer *m,
4621 				   void (*show_request)(struct drm_printer *m,
4622 							struct i915_request *rq,
4623 							const char *prefix),
4624 				   unsigned int max)
4625 {
4626 	const struct intel_engine_execlists *execlists = &engine->execlists;
4627 	struct i915_request *rq, *last;
4628 	unsigned long flags;
4629 	unsigned int count;
4630 	struct rb_node *rb;
4631 
4632 	spin_lock_irqsave(&engine->active.lock, flags);
4633 
4634 	last = NULL;
4635 	count = 0;
4636 	list_for_each_entry(rq, &engine->active.requests, sched.link) {
4637 		if (count++ < max - 1)
4638 			show_request(m, rq, "\t\tE ");
4639 		else
4640 			last = rq;
4641 	}
4642 	if (last) {
4643 		if (count > max) {
4644 			drm_printf(m,
4645 				   "\t\t...skipping %d executing requests...\n",
4646 				   count - max);
4647 		}
4648 		show_request(m, last, "\t\tE ");
4649 	}
4650 
4651 	last = NULL;
4652 	count = 0;
4653 	if (execlists->queue_priority_hint != INT_MIN)
4654 		drm_printf(m, "\t\tQueue priority hint: %d\n",
4655 			   execlists->queue_priority_hint);
4656 	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
4657 		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
4658 		int i;
4659 
4660 		priolist_for_each_request(rq, p, i) {
4661 			if (count++ < max - 1)
4662 				show_request(m, rq, "\t\tQ ");
4663 			else
4664 				last = rq;
4665 		}
4666 	}
4667 	if (last) {
4668 		if (count > max) {
4669 			drm_printf(m,
4670 				   "\t\t...skipping %d queued requests...\n",
4671 				   count - max);
4672 		}
4673 		show_request(m, last, "\t\tQ ");
4674 	}
4675 
4676 	last = NULL;
4677 	count = 0;
4678 	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
4679 		struct virtual_engine *ve =
4680 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
4681 		struct i915_request *rq = READ_ONCE(ve->request);
4682 
4683 		if (rq) {
4684 			if (count++ < max - 1)
4685 				show_request(m, rq, "\t\tV ");
4686 			else
4687 				last = rq;
4688 		}
4689 	}
4690 	if (last) {
4691 		if (count > max) {
4692 			drm_printf(m,
4693 				   "\t\t...skipping %d virtual requests...\n",
4694 				   count - max);
4695 		}
4696 		show_request(m, last, "\t\tV ");
4697 	}
4698 
4699 	spin_unlock_irqrestore(&engine->active.lock, flags);
4700 }
4701 
4702 void intel_lr_context_reset(struct intel_engine_cs *engine,
4703 			    struct intel_context *ce,
4704 			    u32 head,
4705 			    bool scrub)
4706 {
4707 	GEM_BUG_ON(!intel_context_is_pinned(ce));
4708 
4709 	/*
4710 	 * We want a simple context + ring to execute the breadcrumb update.
4711 	 * We cannot rely on the context being intact across the GPU hang,
4712 	 * so clear it and rebuild just what we need for the breadcrumb.
4713 	 * All pending requests for this context will be zapped, and any
4714 	 * future request will be after userspace has had the opportunity
4715 	 * to recreate its own state.
4716 	 */
4717 	if (scrub)
4718 		restore_default_state(ce, engine);
4719 
4720 	/* Rerun the request; its payload has been neutered (if guilty). */
4721 	ce->ring->head = head;
4722 	intel_ring_update_space(ce->ring);
4723 
4724 	__execlists_update_reg_state(ce, engine);
4725 }
4726 
4727 bool
4728 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
4729 {
4730 	return engine->set_default_submission ==
4731 	       intel_execlists_set_default_submission;
4732 }
4733 
4734 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
4735 #include "selftest_lrc.c"
4736 #endif
4737