// SPDX-License-Identifier: MIT
/*
 * Copyright © 2014 Intel Corporation
 */

#include <drm/drm_print.h>

#include "gem/i915_gem_lmem.h"

#include "gen8_engine_cs.h"
#include "i915_drv.h"
#include "i915_perf.h"
#include "i915_reg.h"
#include "intel_context.h"
#include "intel_engine.h"
#include "intel_engine_regs.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gt_regs.h"
#include "intel_lrc.h"
#include "intel_lrc_reg.h"
#include "intel_ring.h"
#include "shmem_utils.h"

/*
 * The per-platform tables are u8-encoded in @data. Decode @data and set the
 * command headers and register offsets in @regs. The following encoding is
 * used for each byte. There are 2 steps: decoding commands and decoding
 * addresses.
 *
 * Commands:
 * [7]: create NOPs - the number of NOPs is given in the lower bits
 * [6]: when creating an MI_LOAD_REGISTER_IMM command, allows setting
 *      MI_LRI_FORCE_POSTED
 * [5:0]: number of NOPs, or number of registers to set values for in case of
 *        MI_LOAD_REGISTER_IMM
 *
 * Addresses: these are decoded after an MI_LOAD_REGISTER_IMM command by "count"
 * number of registers. They are set by using the REG/REG16 macros: the former
 * is used for offsets smaller than 0x200 while the latter is for values bigger
 * than that. Those macros already set all the bits documented below correctly:
 *
 * [7]: when set, the register offset does not fit in the bits below, so
 *      additional bytes follow carrying the lower bits
 * [6:0]: register offset, without considering the engine base
 *
 * This function only tweaks the commands and register offsets. Values are not
 * filled out.
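 *
 * As a worked example (hypothetical bytes, not taken from the tables below),
 * the entry "LRI(2, POSTED), REG16(0x244), REG(0x034)" encodes to
 * { 0x42, 0x81, 0x11, 0x0d }: 0x42 decodes to MI_LOAD_REGISTER_IMM(2) with
 * MI_LRI_FORCE_POSTED set, 0x81 and 0x11 reassemble to offset 0x244 (BIT(7)
 * in 0x81 pulls in the following byte for the lower bits), and 0x0d decodes
 * to offset 0x034; both offsets are relative to the engine's mmio base.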
 */
static void set_offsets(u32 *regs,
			const u8 *data,
			const struct intel_engine_cs *engine,
			bool close)
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
#define END 0
{
	const u32 base = engine->mmio_base;

	while (*data) {
		u8 count, flags;

		if (*data & BIT(7)) { /* skip */
			count = *data++ & ~BIT(7);
			regs += count;
			continue;
		}

		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		*regs = MI_LOAD_REGISTER_IMM(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		if (GRAPHICS_VER(engine->i915) >= 11)
			*regs |= MI_LRI_LRM_CS_MMIO;
		regs++;

		GEM_BUG_ON(!count);
		do {
			u32 offset = 0;
			u8 v;

			do {
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			regs[0] = base + (offset << 2);
			regs += 2;
		} while (--count);
	}

	if (close) {
		/* Close the batch; used mainly by live_lrc_layout() */
		*regs = MI_BATCH_BUFFER_END;
		if (GRAPHICS_VER(engine->i915) >= 11)
			*regs |= BIT(0);
	}
}

static const u8 gen8_xcs_offsets[] = {
	NOP(1),
	LRI(11, 0),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),

	NOP(9),
	LRI(9, 0),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(2, 0),
	REG16(0x200),
	REG(0x028),

	END
};

static const u8 gen9_xcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, POSTED),
	REG16(0x200),

	NOP(13),
	LRI(44, POSTED),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),

	END
};

static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};

static const u8 dg2_xcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};

static const u8 gen8_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 gen9_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x34),
	REG(0x30),
	REG(0x38),
	REG(0x3c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0xc8),

	NOP(13),
	LRI(44, POSTED),
	REG(0x28),
	REG(0x9c),
	REG(0xc0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x68),

	END
};

static const u8 gen11_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(1, POSTED),
	REG(0x1b0),

	NOP(10),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),
	NOP(3 + 9 + 1),

	LRI(51, POSTED),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),
	REG(0x084),
	NOP(1),

	END
};

static const u8 dg2_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 mtl_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(2),
	LRI(2, POSTED),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};

#undef END
#undef REG16
#undef REG
#undef LRI
#undef NOP

static const u8 *reg_offsets(const struct intel_engine_cs *engine)
{
	/*
	 * The gen12+ lists only have the registers we program in the basic
	 * default state. We rely on the context image using relative
	 * addressing to automatically fix up the register state between the
	 * physical engines backing a virtual engine.
	 */
	GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
		   !intel_engine_has_relative_mmio(engine));

	if (engine->flags & I915_ENGINE_HAS_RCS_REG_STATE) {
		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 70))
			return mtl_rcs_offsets;
		else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
			return dg2_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 12)
			return gen12_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 11)
			return gen11_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 9)
			return gen9_rcs_offsets;
		else
			return gen8_rcs_offsets;
	} else {
		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
			return dg2_xcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 12)
			return gen12_xcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 9)
			return gen9_xcs_offsets;
		else
			return gen8_xcs_offsets;
	}
}

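/*
 * A note on the lrc_ring_*() helpers below: each returns the dword index
 * within the LRC register state of a register's (offset, value) pair as
 * written by MI_LOAD_REGISTER_IMM, i.e. index x holds the register offset
 * and x + 1 holds its value, hence the regs[x + 1] accesses used by the
 * callers in this file.
 */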
static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
		return 0x70;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x60;
	else if (GRAPHICS_VER(engine->i915) >= 9)
		return 0x54;
	else if (engine->class == RENDER_CLASS)
		return 0x58;
	else
		return -1;
}

static int lrc_ring_bb_offset(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
		return 0x80;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x70;
	else if (GRAPHICS_VER(engine->i915) >= 9)
		return 0x64;
	else if (GRAPHICS_VER(engine->i915) >= 8 &&
		 engine->class == RENDER_CLASS)
		return 0xc4;
	else
		return -1;
}

static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
		return 0x84;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x74;
	else if (GRAPHICS_VER(engine->i915) >= 9)
		return 0x68;
	else if (engine->class == RENDER_CLASS)
		return 0xd8;
	else
		return -1;
}

static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x12;
	else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
		return 0x18;
	else
		return -1;
}

static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_wa_bb_per_ctx(engine);
	if (x < 0)
		return x;

	return x + 2;
}

static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_indirect_ptr(engine);
	if (x < 0)
		return x;

	return x + 2;
}
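
/*
 * To illustrate the layout produced by the two helpers above on gen12
 * (prior to the Xe_HP 12.55+ reshuffle): the PER_CTX_BB pair sits at dword
 * 0x12 of the register state, INDIRECT_CTX at 0x14 and INDIRECT_CTX_OFFSET
 * at 0x16.
 */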

static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
		/*
		 * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
		 * simply to match the RCS context image layout.
		 */
		return 0xc6;
	else if (engine->class != RENDER_CLASS)
		return -1;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0xb6;
	else if (GRAPHICS_VER(engine->i915) >= 11)
		return 0xaa;
	else
		return -1;
}

static u32
lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER(engine->i915) >= 12)
		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	else if (GRAPHICS_VER(engine->i915) >= 11)
		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	else if (GRAPHICS_VER(engine->i915) >= 9)
		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	else if (GRAPHICS_VER(engine->i915) >= 8)
		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;

	GEM_BUG_ON(GRAPHICS_VER(engine->i915) < 8);

	return 0;
}

static void
lrc_setup_bb_per_ctx(u32 *regs,
		     const struct intel_engine_cs *engine,
		     u32 ctx_bb_ggtt_addr)
{
	GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
	regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
		ctx_bb_ggtt_addr |
		PER_CTX_BB_FORCE |
		PER_CTX_BB_VALID;
}

static void
lrc_setup_indirect_ctx(u32 *regs,
		       const struct intel_engine_cs *engine,
		       u32 ctx_bb_ggtt_addr,
		       u32 size)
{
	GEM_BUG_ON(!size);
	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
	regs[lrc_ring_indirect_ptr(engine) + 1] =
		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);

	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
	regs[lrc_ring_indirect_offset(engine) + 1] =
		lrc_ring_indirect_offset_default(engine) << 6;
}
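
/*
 * Worked example with illustrative values: a 192-byte indirect-context
 * batch at GGTT address 0x10000 is programmed above as 0x10000 | 3, the
 * low bits carrying the batch length in cachelines (192 / CACHELINE_BYTES,
 * assuming 64-byte cachelines).
 */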

static bool ctx_needs_runalone(const struct intel_context *ce)
{
	struct i915_gem_context *gem_ctx;
	bool ctx_is_protected = false;

	/*
	 * Wa_14019159160 - Case 2.
	 * On some platforms, protected contexts require setting
	 * the LRC run-alone bit or else the encryption/decryption will not happen.
	 * NOTE: Case 2 only applies to the PXP use-case of said workaround.
	 */
	if (GRAPHICS_VER_FULL(ce->engine->i915) >= IP_VER(12, 70) &&
	    (ce->engine->class == COMPUTE_CLASS || ce->engine->class == RENDER_CLASS)) {
		rcu_read_lock();
		gem_ctx = rcu_dereference(ce->gem_context);
		if (gem_ctx)
			ctx_is_protected = gem_ctx->uses_protected_content;
		rcu_read_unlock();
	}

	return ctx_is_protected;
}

static void init_common_regs(u32 * const regs,
			     const struct intel_context *ce,
			     const struct intel_engine_cs *engine,
			     bool inhibit)
{
	u32 ctl;
	int loc;

	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
	if (inhibit)
		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
	if (GRAPHICS_VER(engine->i915) < 11)
		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
					   CTX_CTRL_RS_CTX_ENABLE);
	/* Wa_14019159160 - Case 2. */
	if (ctx_needs_runalone(ce))
		ctl |= _MASKED_BIT_ENABLE(GEN12_CTX_CTRL_RUNALONE_MODE);
	regs[CTX_CONTEXT_CONTROL] = ctl;

	regs[CTX_TIMESTAMP] = ce->stats.runtime.last;

	loc = lrc_ring_bb_offset(engine);
	if (loc != -1)
		regs[loc + 1] = 0;
}

static void init_wa_bb_regs(u32 * const regs,
			    const struct intel_engine_cs *engine)
{
	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;

	if (wa_ctx->per_ctx.size) {
		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);

		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
		/* bit 0 marks the per-context batch buffer address as valid */
		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
	}

	if (wa_ctx->indirect_ctx.size) {
		lrc_setup_indirect_ctx(regs, engine,
				       i915_ggtt_offset(wa_ctx->vma) +
				       wa_ctx->indirect_ctx.offset,
				       wa_ctx->indirect_ctx.size);
	}
}

static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
{
	if (i915_vm_is_4lvl(&ppgtt->vm)) {
		/* 64b PPGTT (48bit canonical)
		 * PDP0_DESCRIPTOR contains the base address to PML4 and
		 * other PDP Descriptors are ignored.
		 */
		ASSIGN_CTX_PML4(ppgtt, regs);
	} else {
		ASSIGN_CTX_PDP(ppgtt, regs, 3);
		ASSIGN_CTX_PDP(ppgtt, regs, 2);
		ASSIGN_CTX_PDP(ppgtt, regs, 1);
		ASSIGN_CTX_PDP(ppgtt, regs, 0);
	}
}

static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
{
	if (i915_is_ggtt(vm))
		return i915_vm_to_ggtt(vm)->alias;
	else
		return i915_vm_to_ppgtt(vm);
}

static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_mi_mode(engine);
	if (x != -1) {
		/* RING_MI_MODE is masked: the high word selects the bits to write */
		regs[x + 1] &= ~STOP_RING;
		regs[x + 1] |= STOP_RING << 16;
	}
}

static void __lrc_init_regs(u32 *regs,
			    const struct intel_context *ce,
			    const struct intel_engine_cs *engine,
			    bool inhibit)
{
	/*
	 * A context is actually a big batch buffer with several
	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
	 * values we are setting here are only for the first context restore:
	 * on a subsequent save, the GPU will recreate this batchbuffer with new
	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
	 * we are not initializing here).
	 *
	 * Must keep consistent with virtual_update_register_offsets().
	 */

	if (inhibit)
		memset(regs, 0, PAGE_SIZE);

	set_offsets(regs, reg_offsets(engine), engine, inhibit);

	init_common_regs(regs, ce, engine, inhibit);
	init_ppgtt_regs(regs, vm_alias(ce->vm));

	init_wa_bb_regs(regs, engine);

	__reset_stop_ring(regs, engine);
}

void lrc_init_regs(const struct intel_context *ce,
		   const struct intel_engine_cs *engine,
		   bool inhibit)
{
	__lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
}

void lrc_reset_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine)
{
	__reset_stop_ring(ce->lrc_reg_state, engine);
}

static void
set_redzone(void *vaddr, const struct intel_engine_cs *engine)
{
	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		return;

	vaddr += engine->context_size;

	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
}

static void
check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
{
	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		return;

	vaddr += engine->context_size;

	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
		drm_err_once(&engine->i915->drm,
			     "%s context redzone overwritten!\n",
			     engine->name);
}

static u32 context_wa_bb_offset(const struct intel_context *ce)
{
	return PAGE_SIZE * ce->wa_bb_page;
}

/*
 * The @per_ctx argument below determines which WA BB section is used:
 * when true, the function returns the location of the PER_CTX_BB; when
 * false, it returns the location of the INDIRECT_CTX.
 */
static u32 *context_wabb(const struct intel_context *ce, bool per_ctx)
{
	void *ptr;

	GEM_BUG_ON(!ce->wa_bb_page);

	ptr = ce->lrc_reg_state;
	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
	ptr += context_wa_bb_offset(ce);
	ptr += per_ctx ? PAGE_SIZE : 0;

	return ptr;
}

void lrc_init_state(struct intel_context *ce,
		    struct intel_engine_cs *engine,
		    void *state)
{
	bool inhibit = true;

	set_redzone(state, engine);

	if (ce->default_state) {
		shmem_read(ce->default_state, 0, state, engine->context_size);
		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
		inhibit = false;
	}

	/* Clear the ppHWSP (inc. per-context counters) */
	memset(state, 0, PAGE_SIZE);

	/* Clear the indirect wa and storage */
	if (ce->wa_bb_page)
		memset(state + context_wa_bb_offset(ce), 0, PAGE_SIZE);

	/*
	 * The second page of the context object contains some registers which
	 * must be set up prior to the first execution.
	 */
	__lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
}

u32 lrc_indirect_bb(const struct intel_context *ce)
{
	return i915_ggtt_offset(ce->state) + context_wa_bb_offset(ce);
}
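
/*
 * For reference, the wa_bb layout appended to gen12+ context images (see
 * __lrc_alloc_state() below): page ce->wa_bb_page holds the INDIRECT_CTX
 * batch, with the predicate-disable WA at DG2_PREDICATE_RESULT_BB inside
 * it, and the page after it holds the PER_CTX_BB batch.
 */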

static u32 *setup_predicate_disable_wa(const struct intel_context *ce, u32 *cs)
{
	/* If predication is active, this will be noop'ed */
	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
	*cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
	*cs++ = 0;
	*cs++ = 0; /* No predication */

	/* predicated end, only terminates if SET_PREDICATE_RESULT:0 is clear */
	*cs++ = MI_BATCH_BUFFER_END | BIT(15);
	*cs++ = MI_SET_PREDICATE | MI_SET_PREDICATE_DISABLE;

	/* Instructions are no longer predicated (disabled), we can proceed */
	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
	*cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
	*cs++ = 0;
	*cs++ = 1; /* enable predication before the next BB */

	*cs++ = MI_BATCH_BUFFER_END;
	GEM_BUG_ON(offset_in_page(cs) > DG2_PREDICATE_RESULT_WA);

	return cs;
}

static struct i915_vma *
__lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	u32 context_size;

	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);

	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		context_size += I915_GTT_PAGE_SIZE; /* for redzone */

	if (GRAPHICS_VER(engine->i915) >= 12) {
		ce->wa_bb_page = context_size / PAGE_SIZE;
		/* INDIRECT_CTX and PER_CTX_BB need separate pages. */
		context_size += PAGE_SIZE * 2;
	}

	if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) {
		ce->parallel.guc.parent_page = context_size / PAGE_SIZE;
		context_size += PARENT_SCRATCH_SIZE;
	}

	obj = i915_gem_object_create_lmem(engine->i915, context_size,
					  I915_BO_ALLOC_PM_VOLATILE);
	if (IS_ERR(obj)) {
		obj = i915_gem_object_create_shmem(engine->i915, context_size);
		if (IS_ERR(obj))
			return ERR_CAST(obj);

		/*
		 * Wa_22016122933: For Media version 13.0, all Media GT shared
		 * memory needs to be mapped as WC on CPU side and UC (PAT
		 * index 2) on GPU side.
		 */
		if (intel_gt_needs_wa_22016122933(engine->gt))
			i915_gem_object_set_cache_coherency(obj, I915_CACHE_NONE);
	}

	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
	if (IS_ERR(vma)) {
		i915_gem_object_put(obj);
		return vma;
	}

	return vma;
}

static struct intel_timeline *
pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);

	return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
}

int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct intel_ring *ring;
	struct i915_vma *vma;
	int err;

	GEM_BUG_ON(ce->state);

	if (!intel_context_has_own_state(ce))
		ce->default_state = engine->default_state;

	vma = __lrc_alloc_state(ce, engine);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	ring = intel_engine_create_ring(engine, ce->ring_size);
	if (IS_ERR(ring)) {
		err = PTR_ERR(ring);
		goto err_vma;
	}

	if (!page_mask_bits(ce->timeline)) {
		struct intel_timeline *tl;

		/*
		 * Use the static global HWSP for the kernel context, and
		 * a dynamically allocated cacheline for everyone else.
		 */
		if (unlikely(ce->timeline))
			tl = pinned_timeline(ce, engine);
		else
			tl = intel_timeline_create(engine->gt);
		if (IS_ERR(tl)) {
			err = PTR_ERR(tl);
			goto err_ring;
		}

		ce->timeline = tl;
	}

	ce->ring = ring;
	ce->state = vma;

	return 0;

err_ring:
	intel_ring_put(ring);
err_vma:
	i915_vma_put(vma);
	return err;
}

void lrc_reset(struct intel_context *ce)
{
	GEM_BUG_ON(!intel_context_is_pinned(ce));

	intel_ring_reset(ce->ring, ce->ring->emit);

	/* Scrub away the garbage */
	lrc_init_regs(ce, ce->engine, true);
	ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
}

int
lrc_pre_pin(struct intel_context *ce,
	    struct intel_engine_cs *engine,
	    struct i915_gem_ww_ctx *ww,
	    void **vaddr)
{
	GEM_BUG_ON(!ce->state);
	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));

	*vaddr = i915_gem_object_pin_map(ce->state->obj,
					 intel_gt_coherent_map_type(ce->engine->gt,
								    ce->state->obj,
								    false) |
					 I915_MAP_OVERRIDE);

	return PTR_ERR_OR_ZERO(*vaddr);
}

int
lrc_pin(struct intel_context *ce,
	struct intel_engine_cs *engine,
	void *vaddr)
{
	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;

	if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
		lrc_init_state(ce, engine, vaddr);

	ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
	return 0;
}

void lrc_unpin(struct intel_context *ce)
{
	if (unlikely(ce->parallel.last_rq)) {
		i915_request_put(ce->parallel.last_rq);
		ce->parallel.last_rq = NULL;
	}
	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
		      ce->engine);
}

void lrc_post_unpin(struct intel_context *ce)
{
	i915_gem_object_unpin_map(ce->state->obj);
}

void lrc_fini(struct intel_context *ce)
{
	if (!ce->state)
		return;

	intel_ring_put(fetch_and_zero(&ce->ring));
	i915_vma_put(fetch_and_zero(&ce->state));
}

void lrc_destroy(struct kref *kref)
{
	struct intel_context *ce = container_of(kref, typeof(*ce), ref);

	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
	GEM_BUG_ON(intel_context_is_pinned(ce));

	lrc_fini(ce);

	intel_context_fini(ce);
	intel_context_free(ce);
}

static u32 *
gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
{
	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		CTX_TIMESTAMP * sizeof(u32);
	*cs++ = 0;

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));

	return cs;
}

static u32 *
gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
{
	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);

	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
	*cs++ = 0;

	return cs;
}

static u32 *
gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
{
	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);

	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
	*cs++ = 0;

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));

	return cs;
}

/*
 * The bspec's tuning guide asks us to program a vertical watermark value of
 * 0x3FF. However, this register is not saved/restored properly by the
 * hardware, so we're required to apply the desired value via an INDIRECT_CTX
 * batch buffer to ensure it takes effect. All other bits in this register
 * should remain at 0 (the hardware default).
 */
static u32 *
dg2_emit_draw_watermark_setting(u32 *cs)
{
	*cs++ = MI_LOAD_REGISTER_IMM(1);
	*cs++ = i915_mmio_reg_offset(DRAW_WATERMARK);
	*cs++ = REG_FIELD_PREP(VERT_WM_VAL, 0x3FF);

	return cs;
}

static u32 *
gen12_invalidate_state_cache(u32 *cs)
{
	*cs++ = MI_LOAD_REGISTER_IMM(1);
	*cs++ = i915_mmio_reg_offset(GEN12_CS_DEBUG_MODE2);
	*cs++ = _MASKED_BIT_ENABLE(INSTRUCTION_STATE_CACHE_INVALIDATE);
	return cs;
}

static u32 *
gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
{
	cs = gen12_emit_timestamp_wa(ce, cs);
	cs = gen12_emit_cmd_buf_wa(ce, cs);
	cs = gen12_emit_restore_scratch(ce, cs);

	/* Wa_16013000631:dg2 */
	if (IS_DG2_G11(ce->engine->i915))
		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0);

	cs = gen12_emit_aux_table_inv(ce->engine, cs);

	/* Wa_18022495364 */
	if (IS_GFX_GT_IP_RANGE(ce->engine->gt, IP_VER(12, 0), IP_VER(12, 10)))
		cs = gen12_invalidate_state_cache(cs);

	/* Wa_16014892111 */
	if (IS_GFX_GT_IP_STEP(ce->engine->gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
	    IS_GFX_GT_IP_STEP(ce->engine->gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
	    IS_DG2(ce->engine->i915))
		cs = dg2_emit_draw_watermark_setting(cs);

	return cs;
}

static u32 *
gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
{
	cs = gen12_emit_timestamp_wa(ce, cs);
	cs = gen12_emit_restore_scratch(ce, cs);

	/* Wa_16013000631:dg2 */
	if (IS_DG2_G11(ce->engine->i915))
		if (ce->engine->class == COMPUTE_CLASS)
			cs = gen8_emit_pipe_control(cs,
						    PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE,
						    0);

	return gen12_emit_aux_table_inv(ce->engine, cs);
}

static u32 *xehp_emit_fastcolor_blt_wabb(const struct intel_context *ce, u32 *cs)
{
	struct intel_gt *gt = ce->engine->gt;
	int mocs = gt->mocs.uc_index << 1;

	/*
	 * Wa_16018031267 / Wa_16018063123 require that SW forces the
	 * main copy engine arbitration into round robin mode. We
	 * additionally need to submit the following WABB blt command
	 * to produce 4 subblits, each subblit generating 0 byte
	 * write requests as WABB:
	 *
	 * XY_FASTCOLOR_BLT
	 *  BG0    -> 5100000E
	 *  BG1    -> 0000003F (Dest pitch)
	 *  BG2    -> 00000000 (X1, Y1) = (0, 0)
	 *  BG3    -> 00040001 (X2, Y2) = (1, 4)
	 *  BG4    -> scratch
	 *  BG5    -> scratch
	 *  BG6-12 -> 00000000
	 *  BG13   -> 20004004 (Surf. Width = 2, Surf. Height = 5)
	 *  BG14   -> 00000010 (Qpitch = 4)
	 *  BG15   -> 00000000
	 */
	*cs++ = XY_FAST_COLOR_BLT_CMD | (16 - 2);
	*cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, mocs) | 0x3f;
	*cs++ = 0;
	*cs++ = 4 << 16 | 1;
	*cs++ = lower_32_bits(i915_vma_offset(ce->vm->rsvd.vma));
	*cs++ = upper_32_bits(i915_vma_offset(ce->vm->rsvd.vma));
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0x20004004;
	*cs++ = 0x10;
	*cs++ = 0;

	return cs;
}

static u32 *
xehp_emit_per_ctx_bb(const struct intel_context *ce, u32 *cs)
{
	/* Wa_16018031267, Wa_16018063123 */
	if (NEEDS_FASTCOLOR_BLT_WABB(ce->engine))
		cs = xehp_emit_fastcolor_blt_wabb(ce, cs);

	return cs;
}

static void
setup_per_ctx_bb(const struct intel_context *ce,
		 const struct intel_engine_cs *engine,
		 u32 *(*emit)(const struct intel_context *, u32 *))
{
	/* Place PER_CTX_BB on next page after INDIRECT_CTX */
	u32 * const start = context_wabb(ce, true);
	u32 *cs;

	cs = emit(ce, start);

	/* PER_CTX_BB must manually terminate */
	*cs++ = MI_BATCH_BUFFER_END;

	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
	lrc_setup_bb_per_ctx(ce->lrc_reg_state, engine,
			     lrc_indirect_bb(ce) + PAGE_SIZE);
}

static void
setup_indirect_ctx_bb(const struct intel_context *ce,
		      const struct intel_engine_cs *engine,
		      u32 *(*emit)(const struct intel_context *, u32 *))
{
	u32 * const start = context_wabb(ce, false);
	u32 *cs;

	cs = emit(ce, start);
	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
	while ((unsigned long)cs % CACHELINE_BYTES)
		*cs++ = MI_NOOP;

	GEM_BUG_ON(cs - start > DG2_PREDICATE_RESULT_BB / sizeof(*start));
	setup_predicate_disable_wa(ce, start + DG2_PREDICATE_RESULT_BB / sizeof(*start));

	lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
			       lrc_indirect_bb(ce),
			       (cs - start) * sizeof(*cs));
}

/*
 * The context descriptor encodes various attributes of a context,
 * including its GTT address and some flags. Because it's fairly
 * expensive to calculate, we'll just do it once and cache the result,
 * which remains valid until the context is unpinned.
 *
 * This is what a descriptor looks like, from LSB to MSB::
 *
 *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
 *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
 *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
 *      bits 53-54:    mbz, reserved for use by hardware
 *      bits 55-63:    group ID, currently unused and set to 0
 *
 * Starting from Gen11, the upper dword of the descriptor has a new format:
 *
 *      bits 32-36:    reserved
 *      bits 37-47:    SW context ID
 *      bits 48-53:    engine instance
 *      bit 54:        mbz, reserved for use by hardware
 *      bits 55-60:    SW counter
 *      bits 61-63:    engine class
 *
 * On Xe_HP, the upper dword of the descriptor has a new format:
 *
 *      bits 32-37:    virtual function number
 *      bit 38:        mbz, reserved for use by hardware
 *      bits 39-54:    SW context ID
 *      bits 55-57:    reserved
 *      bits 58-63:    SW counter
 *
 * engine info, SW context ID and SW counter need to form a unique number
 * (Context ID) per lrc.
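 *
 * As an example with illustrative values, a legacy 64b context whose state
 * object sits at GGTT offset 0x10000 gets a lower dword of
 * 0x10000 | GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE |
 * (INTEL_LEGACY_64B_CONTEXT << GEN8_CTX_ADDRESSING_MODE_SHIFT), exactly as
 * assembled by lrc_descriptor() below.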
 */
static u32 lrc_descriptor(const struct intel_context *ce)
{
	u32 desc;

	desc = INTEL_LEGACY_32B_CONTEXT;
	if (i915_vm_is_4lvl(ce->vm))
		desc = INTEL_LEGACY_64B_CONTEXT;
	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;

	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
	if (GRAPHICS_VER(ce->vm->i915) == 8)
		desc |= GEN8_CTX_L3LLC_COHERENT;

	return i915_ggtt_offset(ce->state) | desc;
}

u32 lrc_update_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine,
		    u32 head)
{
	struct intel_ring *ring = ce->ring;
	u32 *regs = ce->lrc_reg_state;

	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));

	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
	regs[CTX_RING_HEAD] = head;
	regs[CTX_RING_TAIL] = ring->tail;
	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;

	/* RPCS */
	if (engine->class == RENDER_CLASS) {
		regs[CTX_R_PWR_CLK_STATE] =
			intel_sseu_make_rpcs(engine->gt, &ce->sseu);

		i915_oa_init_reg_state(ce, engine);
	}

	if (ce->wa_bb_page) {
		u32 *(*fn)(const struct intel_context *ce, u32 *cs);

		fn = gen12_emit_indirect_ctx_xcs;
		if (ce->engine->class == RENDER_CLASS)
			fn = gen12_emit_indirect_ctx_rcs;

		/* Mutually exclusive with the global indirect bb */
		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
		setup_indirect_ctx_bb(ce, engine, fn);
		setup_per_ctx_bb(ce, engine, xehp_emit_per_ctx_bb);
	}

	return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
}

void lrc_update_offsets(struct intel_context *ce,
			struct intel_engine_cs *engine)
{
	set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
}

void lrc_check_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine,
		    const char *when)
{
	const struct intel_ring *ring = ce->ring;
	u32 *regs = ce->lrc_reg_state;
	bool valid = true;
	int x;

	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
		       engine->name,
		       regs[CTX_RING_START],
		       i915_ggtt_offset(ring->vma));
		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
		valid = false;
	}

	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
		       engine->name,
		       regs[CTX_RING_CTL],
		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
		valid = false;
	}

	x = lrc_ring_mi_mode(engine);
	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
		       engine->name, regs[x + 1]);
		regs[x + 1] &= ~STOP_RING;
		regs[x + 1] |= STOP_RING << 16;
		valid = false;
	}

	WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
}

/*
 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the
 * PIPE_CONTROL instruction. This is required for the flush to happen correctly,
 * but there is a slight complication as this is applied in a WA batch where the
 * values are only initialized once, so we cannot take the register value at the
 * beginning and reuse it further; hence we save its value to memory, upload a
 * constant value with bit21 set and then restore it back with the saved value.
 * To simplify the WA, a constant value is formed by using the default value
 * of this register. This shouldn't be a problem because we are only modifying
 * it for a short period and this batch is non-preemptible. We could of course
 * use additional instructions that read the actual value of the register
 * at that time and set our bit of interest, but that makes the WA complicated.
 *
 * This WA is also required for Gen9, so extracting it as a function avoids
 * code duplication.
 */
static u32 *
gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
{
	/* NB no one else is allowed to scribble over scratch + 256! */
	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(engine->gt,
					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	*batch++ = MI_LOAD_REGISTER_IMM(1);
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;

	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_DC_FLUSH_ENABLE,
				       0);

	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(engine->gt,
					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	return batch;
}

/*
 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
 * initialized at the beginning and shared across all contexts, but this field
 * helps us to have multiple batches at different offsets and select them based
 * on a criteria. At the moment this batch always starts at the beginning of the
 * page and at this point we don't have multiple wa_ctx batch buffers.
 *
 * The number of WAs applied is not known at the beginning; we use this field
 * to return the number of DWORDs written.
 *
 * Note that this batch does not contain MI_BATCH_BUFFER_END,
 * so it adds NOOPs as padding to make it cacheline aligned.
 * MI_BATCH_BUFFER_END will be added to the perctx batch and both of them
 * together make a complete batch buffer.
 */
static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	/* WaDisableCtxRestoreArbitration:bdw,chv */
	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
	if (IS_BROADWELL(engine->i915))
		batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
	/* Actual scratch location is at 128 bytes offset */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	/*
	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
	 * execution depends on the length specified in terms of cache lines
	 * in the register CTX_RCS_INDIRECT_CTX
	 */

	return batch;
}

struct lri {
	i915_reg_t reg;
	u32 value;
};

static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
{
	GEM_BUG_ON(!count || count > 63);

	*batch++ = MI_LOAD_REGISTER_IMM(count);
	do {
		*batch++ = i915_mmio_reg_offset(lri->reg);
		*batch++ = lri->value;
	} while (lri++, --count);
	*batch++ = MI_NOOP;

	return batch;
}

static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	static const struct lri lri[] = {
		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
		{
			COMMON_SLICE_CHICKEN2,
			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
				       0),
		},

		/* BSpec: 11391 */
		{
			FF_SLICE_CHICKEN,
			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
		},

		/* BSpec: 11299 */
		{
			_3D_CHICKEN3,
			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
		}
	};

	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
	batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));

	/* WaMediaPoolStateCmdInWABB:bxt,glk */
	if (HAS_POOLED_EU(engine->i915)) {
		/*
		 * EU pool configuration is set up along with the golden
		 * context during context initialization. This value depends
		 * on device type (2x6 or 3x6) and needs to be updated based
		 * on which subslice is disabled, especially for 2x6 devices.
		 * However, it is safe to load the default configuration of a
		 * 3x6 device instead of masking off the corresponding bits,
		 * because HW ignores the bits of a disabled subslice and
		 * drops down to the appropriate config. Please see
		 * render_state_setup() in i915_gem_render_state.c for
		 * possible configurations; to avoid duplication they are
		 * not shown here again.
		 */
		*batch++ = GEN9_MEDIA_POOL_STATE;
		*batch++ = GEN9_MEDIA_POOL_ENABLE;
		*batch++ = 0x00777000;
		*batch++ = 0;
		*batch++ = 0;
		*batch++ = 0;
	}

	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	return batch;
}

#define CTX_WA_BB_SIZE (PAGE_SIZE)

static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	int err;

	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
	if (IS_ERR(obj))
		return PTR_ERR(obj);

	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
	if (IS_ERR(vma)) {
		err = PTR_ERR(vma);
		goto err;
	}

	engine->wa_ctx.vma = vma;
	return 0;

err:
	i915_gem_object_put(obj);
	return err;
}

void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
{
	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
}

typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);

void lrc_init_wa_ctx(struct intel_engine_cs *engine)
{
	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
	struct i915_wa_ctx_bb *wa_bb[] = {
		&wa_ctx->indirect_ctx, &wa_ctx->per_ctx
	};
	wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
	struct i915_gem_ww_ctx ww;
	void *batch, *batch_ptr;
	unsigned int i;
	int err;

	if (GRAPHICS_VER(engine->i915) >= 11 ||
	    !(engine->flags & I915_ENGINE_HAS_RCS_REG_STATE))
		return;

	if (GRAPHICS_VER(engine->i915) == 9) {
		wa_bb_fn[0] = gen9_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
	} else if (GRAPHICS_VER(engine->i915) == 8) {
		wa_bb_fn[0] = gen8_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
	}

	err = lrc_create_wa_ctx(engine);
	if (err) {
		/*
		 * We continue even if we fail to initialize the WA batch
		 * because we only expect rare glitches and nothing critical
		 * enough to prevent us from using the GPU.
		 */
		drm_err(&engine->i915->drm,
			"Ignoring context switch w/a allocation error:%d\n",
			err);
		return;
	}

	if (!engine->wa_ctx.vma)
		return;

	i915_gem_ww_ctx_init(&ww, true);
retry:
	err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
	if (!err)
		err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
	if (err)
		goto err;

	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
	if (IS_ERR(batch)) {
		err = PTR_ERR(batch);
		goto err_unpin;
	}

	/*
	 * Emit the two workaround batch buffers, recording the offset from the
	 * start of the workaround batch buffer object for each and their
	 * respective sizes.
	 */
	batch_ptr = batch;
	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
		wa_bb[i]->offset = batch_ptr - batch;
		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
						  CACHELINE_BYTES))) {
			err = -EINVAL;
			break;
		}
		if (wa_bb_fn[i])
			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
	}
	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);

	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
	__i915_gem_object_release_map(wa_ctx->vma->obj);

	/* Verify that we can handle failure to setup the wa_ctx */
	if (!err)
		err = i915_inject_probe_error(engine->i915, -ENODEV);

err_unpin:
	if (err)
		i915_vma_unpin(wa_ctx->vma);
err:
	if (err == -EDEADLK) {
		err = i915_gem_ww_ctx_backoff(&ww);
		if (!err)
			goto retry;
	}
	i915_gem_ww_ctx_fini(&ww);

	if (err) {
		i915_vma_put(engine->wa_ctx.vma);

		/* Clear all flags to prevent further use */
		memset(wa_ctx, 0, sizeof(*wa_ctx));
	}
}

static void st_runtime_underflow(struct intel_context_stats *stats, s32 dt)
{
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
	stats->runtime.num_underflow++;
	stats->runtime.max_underflow =
		max_t(u32, stats->runtime.max_underflow, -dt);
#endif
}

static u32 lrc_get_runtime(const struct intel_context *ce)
{
	/*
	 * We can use either ppHWSP[16] which is recorded before the context
	 * switch (and so excludes the cost of context switches) or use the
	 * value from the context image itself, which is saved/restored earlier
	 * and so includes the cost of the save.
	 */
	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
}

void lrc_update_runtime(struct intel_context *ce)
{
	struct intel_context_stats *stats = &ce->stats;
	u32 old;
	s32 dt;

	old = stats->runtime.last;
	stats->runtime.last = lrc_get_runtime(ce);
	dt = stats->runtime.last - old;
	if (!dt)
		return;

	if (unlikely(dt < 0)) {
		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
			 old, stats->runtime.last, dt);
		st_runtime_underflow(stats, dt);
		return;
	}

	ewma_runtime_add(&stats->runtime.avg, dt);
	stats->runtime.total += dt;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_lrc.c"
#endif