xref: /linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision 2b0cfa6e49566c8fa6759734cf821aa6e8271a9e)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2014 Intel Corporation
4  */
5 
6 #include "gem/i915_gem_lmem.h"
7 
8 #include "gen8_engine_cs.h"
9 #include "i915_drv.h"
10 #include "i915_perf.h"
11 #include "i915_reg.h"
12 #include "intel_context.h"
13 #include "intel_engine.h"
14 #include "intel_engine_regs.h"
15 #include "intel_gpu_commands.h"
16 #include "intel_gt.h"
17 #include "intel_gt_regs.h"
18 #include "intel_lrc.h"
19 #include "intel_lrc_reg.h"
20 #include "intel_ring.h"
21 #include "shmem_utils.h"
22 
23 /*
24  * The per-platform tables are u8-encoded in @data. Decode @data and write the
25  * commands and register offsets into @regs. The following encoding is used
26  * for each byte. There are 2 steps: decoding commands and decoding addresses.
27  *
28  * Commands:
29  * [7]: create NOPs - the number of NOPs is set in the lower bits
30  * [6]: When creating an MI_LOAD_REGISTER_IMM command, allows setting
31  *      MI_LRI_FORCE_POSTED
32  * [5:0]: Number of NOPs, or number of registers to set values for in case of
33  *        MI_LOAD_REGISTER_IMM
34  *
35  * Addresses: these are decoded after an MI_LOAD_REGISTER_IMM command, "count"
36  * registers at a time. They are set using the REG/REG16 macros: the former
37  * is used for offsets smaller than 0x200 while the latter is for offsets of
38  * 0x200 and above. The macros already set all the bits documented below:
39  *
40  * [7]: When a register offset needs more than 6 bits, this bit is set and
41  *      additional bytes follow, carrying the lower bits
42  * [6:0]: Register offset, without considering the engine base.
43  *
44  * This function only tweaks the commands and register offsets. Values are not
45  * filled out. A worked decode example is given below, after set_offsets().
46  */
47 static void set_offsets(u32 *regs,
48 			const u8 *data,
49 			const struct intel_engine_cs *engine,
50 			bool close)
51 #define NOP(x) (BIT(7) | (x))
52 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
53 #define POSTED BIT(0)
54 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
55 #define REG16(x) \
56 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
57 	(((x) >> 2) & 0x7f)
58 #define END 0
59 {
60 	const u32 base = engine->mmio_base;
61 
62 	while (*data) {
63 		u8 count, flags;
64 
65 		if (*data & BIT(7)) { /* skip */
66 			count = *data++ & ~BIT(7);
67 			regs += count;
68 			continue;
69 		}
70 
71 		count = *data & 0x3f;
72 		flags = *data >> 6;
73 		data++;
74 
75 		*regs = MI_LOAD_REGISTER_IMM(count);
76 		if (flags & POSTED)
77 			*regs |= MI_LRI_FORCE_POSTED;
78 		if (GRAPHICS_VER(engine->i915) >= 11)
79 			*regs |= MI_LRI_LRM_CS_MMIO;
80 		regs++;
81 
82 		GEM_BUG_ON(!count);
83 		do {
84 			u32 offset = 0;
85 			u8 v;
86 
87 			do {
88 				v = *data++;
89 				offset <<= 7;
90 				offset |= v & ~BIT(7);
91 			} while (v & BIT(7));
92 
93 			regs[0] = base + (offset << 2);
94 			regs += 2;
95 		} while (--count);
96 	}
97 
98 	if (close) {
99 		/* Close the batch; used mainly by live_lrc_layout() */
100 		*regs = MI_BATCH_BUFFER_END;
101 		if (GRAPHICS_VER(engine->i915) >= 11)
102 			*regs |= BIT(0);
103 	}
104 }
105 
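/*
 * Worked decode example (illustrative only, derived from the encoding that
 * set_offsets() above implements, not from any extra hardware documentation).
 * The first few bytes of gen8_xcs_offsets[] below expand as follows:
 *
 *   NOP(1)       -> 0x81       : bit 7 set, skip 1 dword in @regs
 *   LRI(11, 0)   -> 0x0b       : emit MI_LOAD_REGISTER_IMM(11)
 *   REG16(0x244) -> 0x81, 0x11 : bit 7 set on the first byte, so the offset
 *                                accumulates as ((0x01 << 7) | 0x11) << 2 =
 *                                0x244 and base + 0x244 is written to @regs
 *   REG(0x034)   -> 0x0d       : offset 0x0d << 2 = 0x34, base + 0x034 written
 *
 * Each decoded register consumes two dwords in @regs (offset, value); the
 * value dwords are deliberately left untouched by set_offsets().
 */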
106 static const u8 gen8_xcs_offsets[] = {
107 	NOP(1),
108 	LRI(11, 0),
109 	REG16(0x244),
110 	REG(0x034),
111 	REG(0x030),
112 	REG(0x038),
113 	REG(0x03c),
114 	REG(0x168),
115 	REG(0x140),
116 	REG(0x110),
117 	REG(0x11c),
118 	REG(0x114),
119 	REG(0x118),
120 
121 	NOP(9),
122 	LRI(9, 0),
123 	REG16(0x3a8),
124 	REG16(0x28c),
125 	REG16(0x288),
126 	REG16(0x284),
127 	REG16(0x280),
128 	REG16(0x27c),
129 	REG16(0x278),
130 	REG16(0x274),
131 	REG16(0x270),
132 
133 	NOP(13),
134 	LRI(2, 0),
135 	REG16(0x200),
136 	REG(0x028),
137 
138 	END
139 };
140 
141 static const u8 gen9_xcs_offsets[] = {
142 	NOP(1),
143 	LRI(14, POSTED),
144 	REG16(0x244),
145 	REG(0x034),
146 	REG(0x030),
147 	REG(0x038),
148 	REG(0x03c),
149 	REG(0x168),
150 	REG(0x140),
151 	REG(0x110),
152 	REG(0x11c),
153 	REG(0x114),
154 	REG(0x118),
155 	REG(0x1c0),
156 	REG(0x1c4),
157 	REG(0x1c8),
158 
159 	NOP(3),
160 	LRI(9, POSTED),
161 	REG16(0x3a8),
162 	REG16(0x28c),
163 	REG16(0x288),
164 	REG16(0x284),
165 	REG16(0x280),
166 	REG16(0x27c),
167 	REG16(0x278),
168 	REG16(0x274),
169 	REG16(0x270),
170 
171 	NOP(13),
172 	LRI(1, POSTED),
173 	REG16(0x200),
174 
175 	NOP(13),
176 	LRI(44, POSTED),
177 	REG(0x028),
178 	REG(0x09c),
179 	REG(0x0c0),
180 	REG(0x178),
181 	REG(0x17c),
182 	REG16(0x358),
183 	REG(0x170),
184 	REG(0x150),
185 	REG(0x154),
186 	REG(0x158),
187 	REG16(0x41c),
188 	REG16(0x600),
189 	REG16(0x604),
190 	REG16(0x608),
191 	REG16(0x60c),
192 	REG16(0x610),
193 	REG16(0x614),
194 	REG16(0x618),
195 	REG16(0x61c),
196 	REG16(0x620),
197 	REG16(0x624),
198 	REG16(0x628),
199 	REG16(0x62c),
200 	REG16(0x630),
201 	REG16(0x634),
202 	REG16(0x638),
203 	REG16(0x63c),
204 	REG16(0x640),
205 	REG16(0x644),
206 	REG16(0x648),
207 	REG16(0x64c),
208 	REG16(0x650),
209 	REG16(0x654),
210 	REG16(0x658),
211 	REG16(0x65c),
212 	REG16(0x660),
213 	REG16(0x664),
214 	REG16(0x668),
215 	REG16(0x66c),
216 	REG16(0x670),
217 	REG16(0x674),
218 	REG16(0x678),
219 	REG16(0x67c),
220 	REG(0x068),
221 
222 	END
223 };
224 
225 static const u8 gen12_xcs_offsets[] = {
226 	NOP(1),
227 	LRI(13, POSTED),
228 	REG16(0x244),
229 	REG(0x034),
230 	REG(0x030),
231 	REG(0x038),
232 	REG(0x03c),
233 	REG(0x168),
234 	REG(0x140),
235 	REG(0x110),
236 	REG(0x1c0),
237 	REG(0x1c4),
238 	REG(0x1c8),
239 	REG(0x180),
240 	REG16(0x2b4),
241 
242 	NOP(5),
243 	LRI(9, POSTED),
244 	REG16(0x3a8),
245 	REG16(0x28c),
246 	REG16(0x288),
247 	REG16(0x284),
248 	REG16(0x280),
249 	REG16(0x27c),
250 	REG16(0x278),
251 	REG16(0x274),
252 	REG16(0x270),
253 
254 	END
255 };
256 
257 static const u8 dg2_xcs_offsets[] = {
258 	NOP(1),
259 	LRI(15, POSTED),
260 	REG16(0x244),
261 	REG(0x034),
262 	REG(0x030),
263 	REG(0x038),
264 	REG(0x03c),
265 	REG(0x168),
266 	REG(0x140),
267 	REG(0x110),
268 	REG(0x1c0),
269 	REG(0x1c4),
270 	REG(0x1c8),
271 	REG(0x180),
272 	REG16(0x2b4),
273 	REG(0x120),
274 	REG(0x124),
275 
276 	NOP(1),
277 	LRI(9, POSTED),
278 	REG16(0x3a8),
279 	REG16(0x28c),
280 	REG16(0x288),
281 	REG16(0x284),
282 	REG16(0x280),
283 	REG16(0x27c),
284 	REG16(0x278),
285 	REG16(0x274),
286 	REG16(0x270),
287 
288 	END
289 };
290 
291 static const u8 gen8_rcs_offsets[] = {
292 	NOP(1),
293 	LRI(14, POSTED),
294 	REG16(0x244),
295 	REG(0x034),
296 	REG(0x030),
297 	REG(0x038),
298 	REG(0x03c),
299 	REG(0x168),
300 	REG(0x140),
301 	REG(0x110),
302 	REG(0x11c),
303 	REG(0x114),
304 	REG(0x118),
305 	REG(0x1c0),
306 	REG(0x1c4),
307 	REG(0x1c8),
308 
309 	NOP(3),
310 	LRI(9, POSTED),
311 	REG16(0x3a8),
312 	REG16(0x28c),
313 	REG16(0x288),
314 	REG16(0x284),
315 	REG16(0x280),
316 	REG16(0x27c),
317 	REG16(0x278),
318 	REG16(0x274),
319 	REG16(0x270),
320 
321 	NOP(13),
322 	LRI(1, 0),
323 	REG(0x0c8),
324 
325 	END
326 };
327 
328 static const u8 gen9_rcs_offsets[] = {
329 	NOP(1),
330 	LRI(14, POSTED),
331 	REG16(0x244),
332 	REG(0x34),
333 	REG(0x30),
334 	REG(0x38),
335 	REG(0x3c),
336 	REG(0x168),
337 	REG(0x140),
338 	REG(0x110),
339 	REG(0x11c),
340 	REG(0x114),
341 	REG(0x118),
342 	REG(0x1c0),
343 	REG(0x1c4),
344 	REG(0x1c8),
345 
346 	NOP(3),
347 	LRI(9, POSTED),
348 	REG16(0x3a8),
349 	REG16(0x28c),
350 	REG16(0x288),
351 	REG16(0x284),
352 	REG16(0x280),
353 	REG16(0x27c),
354 	REG16(0x278),
355 	REG16(0x274),
356 	REG16(0x270),
357 
358 	NOP(13),
359 	LRI(1, 0),
360 	REG(0xc8),
361 
362 	NOP(13),
363 	LRI(44, POSTED),
364 	REG(0x28),
365 	REG(0x9c),
366 	REG(0xc0),
367 	REG(0x178),
368 	REG(0x17c),
369 	REG16(0x358),
370 	REG(0x170),
371 	REG(0x150),
372 	REG(0x154),
373 	REG(0x158),
374 	REG16(0x41c),
375 	REG16(0x600),
376 	REG16(0x604),
377 	REG16(0x608),
378 	REG16(0x60c),
379 	REG16(0x610),
380 	REG16(0x614),
381 	REG16(0x618),
382 	REG16(0x61c),
383 	REG16(0x620),
384 	REG16(0x624),
385 	REG16(0x628),
386 	REG16(0x62c),
387 	REG16(0x630),
388 	REG16(0x634),
389 	REG16(0x638),
390 	REG16(0x63c),
391 	REG16(0x640),
392 	REG16(0x644),
393 	REG16(0x648),
394 	REG16(0x64c),
395 	REG16(0x650),
396 	REG16(0x654),
397 	REG16(0x658),
398 	REG16(0x65c),
399 	REG16(0x660),
400 	REG16(0x664),
401 	REG16(0x668),
402 	REG16(0x66c),
403 	REG16(0x670),
404 	REG16(0x674),
405 	REG16(0x678),
406 	REG16(0x67c),
407 	REG(0x68),
408 
409 	END
410 };
411 
412 static const u8 gen11_rcs_offsets[] = {
413 	NOP(1),
414 	LRI(15, POSTED),
415 	REG16(0x244),
416 	REG(0x034),
417 	REG(0x030),
418 	REG(0x038),
419 	REG(0x03c),
420 	REG(0x168),
421 	REG(0x140),
422 	REG(0x110),
423 	REG(0x11c),
424 	REG(0x114),
425 	REG(0x118),
426 	REG(0x1c0),
427 	REG(0x1c4),
428 	REG(0x1c8),
429 	REG(0x180),
430 
431 	NOP(1),
432 	LRI(9, POSTED),
433 	REG16(0x3a8),
434 	REG16(0x28c),
435 	REG16(0x288),
436 	REG16(0x284),
437 	REG16(0x280),
438 	REG16(0x27c),
439 	REG16(0x278),
440 	REG16(0x274),
441 	REG16(0x270),
442 
443 	LRI(1, POSTED),
444 	REG(0x1b0),
445 
446 	NOP(10),
447 	LRI(1, 0),
448 	REG(0x0c8),
449 
450 	END
451 };
452 
453 static const u8 gen12_rcs_offsets[] = {
454 	NOP(1),
455 	LRI(13, POSTED),
456 	REG16(0x244),
457 	REG(0x034),
458 	REG(0x030),
459 	REG(0x038),
460 	REG(0x03c),
461 	REG(0x168),
462 	REG(0x140),
463 	REG(0x110),
464 	REG(0x1c0),
465 	REG(0x1c4),
466 	REG(0x1c8),
467 	REG(0x180),
468 	REG16(0x2b4),
469 
470 	NOP(5),
471 	LRI(9, POSTED),
472 	REG16(0x3a8),
473 	REG16(0x28c),
474 	REG16(0x288),
475 	REG16(0x284),
476 	REG16(0x280),
477 	REG16(0x27c),
478 	REG16(0x278),
479 	REG16(0x274),
480 	REG16(0x270),
481 
482 	LRI(3, POSTED),
483 	REG(0x1b0),
484 	REG16(0x5a8),
485 	REG16(0x5ac),
486 
487 	NOP(6),
488 	LRI(1, 0),
489 	REG(0x0c8),
490 	NOP(3 + 9 + 1),
491 
492 	LRI(51, POSTED),
493 	REG16(0x588),
494 	REG16(0x588),
495 	REG16(0x588),
496 	REG16(0x588),
497 	REG16(0x588),
498 	REG16(0x588),
499 	REG(0x028),
500 	REG(0x09c),
501 	REG(0x0c0),
502 	REG(0x178),
503 	REG(0x17c),
504 	REG16(0x358),
505 	REG(0x170),
506 	REG(0x150),
507 	REG(0x154),
508 	REG(0x158),
509 	REG16(0x41c),
510 	REG16(0x600),
511 	REG16(0x604),
512 	REG16(0x608),
513 	REG16(0x60c),
514 	REG16(0x610),
515 	REG16(0x614),
516 	REG16(0x618),
517 	REG16(0x61c),
518 	REG16(0x620),
519 	REG16(0x624),
520 	REG16(0x628),
521 	REG16(0x62c),
522 	REG16(0x630),
523 	REG16(0x634),
524 	REG16(0x638),
525 	REG16(0x63c),
526 	REG16(0x640),
527 	REG16(0x644),
528 	REG16(0x648),
529 	REG16(0x64c),
530 	REG16(0x650),
531 	REG16(0x654),
532 	REG16(0x658),
533 	REG16(0x65c),
534 	REG16(0x660),
535 	REG16(0x664),
536 	REG16(0x668),
537 	REG16(0x66c),
538 	REG16(0x670),
539 	REG16(0x674),
540 	REG16(0x678),
541 	REG16(0x67c),
542 	REG(0x068),
543 	REG(0x084),
544 	NOP(1),
545 
546 	END
547 };
548 
549 static const u8 xehp_rcs_offsets[] = {
550 	NOP(1),
551 	LRI(13, POSTED),
552 	REG16(0x244),
553 	REG(0x034),
554 	REG(0x030),
555 	REG(0x038),
556 	REG(0x03c),
557 	REG(0x168),
558 	REG(0x140),
559 	REG(0x110),
560 	REG(0x1c0),
561 	REG(0x1c4),
562 	REG(0x1c8),
563 	REG(0x180),
564 	REG16(0x2b4),
565 
566 	NOP(5),
567 	LRI(9, POSTED),
568 	REG16(0x3a8),
569 	REG16(0x28c),
570 	REG16(0x288),
571 	REG16(0x284),
572 	REG16(0x280),
573 	REG16(0x27c),
574 	REG16(0x278),
575 	REG16(0x274),
576 	REG16(0x270),
577 
578 	LRI(3, POSTED),
579 	REG(0x1b0),
580 	REG16(0x5a8),
581 	REG16(0x5ac),
582 
583 	NOP(6),
584 	LRI(1, 0),
585 	REG(0x0c8),
586 
587 	END
588 };
589 
590 static const u8 dg2_rcs_offsets[] = {
591 	NOP(1),
592 	LRI(15, POSTED),
593 	REG16(0x244),
594 	REG(0x034),
595 	REG(0x030),
596 	REG(0x038),
597 	REG(0x03c),
598 	REG(0x168),
599 	REG(0x140),
600 	REG(0x110),
601 	REG(0x1c0),
602 	REG(0x1c4),
603 	REG(0x1c8),
604 	REG(0x180),
605 	REG16(0x2b4),
606 	REG(0x120),
607 	REG(0x124),
608 
609 	NOP(1),
610 	LRI(9, POSTED),
611 	REG16(0x3a8),
612 	REG16(0x28c),
613 	REG16(0x288),
614 	REG16(0x284),
615 	REG16(0x280),
616 	REG16(0x27c),
617 	REG16(0x278),
618 	REG16(0x274),
619 	REG16(0x270),
620 
621 	LRI(3, POSTED),
622 	REG(0x1b0),
623 	REG16(0x5a8),
624 	REG16(0x5ac),
625 
626 	NOP(6),
627 	LRI(1, 0),
628 	REG(0x0c8),
629 
630 	END
631 };
632 
633 static const u8 mtl_rcs_offsets[] = {
634 	NOP(1),
635 	LRI(15, POSTED),
636 	REG16(0x244),
637 	REG(0x034),
638 	REG(0x030),
639 	REG(0x038),
640 	REG(0x03c),
641 	REG(0x168),
642 	REG(0x140),
643 	REG(0x110),
644 	REG(0x1c0),
645 	REG(0x1c4),
646 	REG(0x1c8),
647 	REG(0x180),
648 	REG16(0x2b4),
649 	REG(0x120),
650 	REG(0x124),
651 
652 	NOP(1),
653 	LRI(9, POSTED),
654 	REG16(0x3a8),
655 	REG16(0x28c),
656 	REG16(0x288),
657 	REG16(0x284),
658 	REG16(0x280),
659 	REG16(0x27c),
660 	REG16(0x278),
661 	REG16(0x274),
662 	REG16(0x270),
663 
664 	NOP(2),
665 	LRI(2, POSTED),
666 	REG16(0x5a8),
667 	REG16(0x5ac),
668 
669 	NOP(6),
670 	LRI(1, 0),
671 	REG(0x0c8),
672 
673 	END
674 };
675 
676 #undef END
677 #undef REG16
678 #undef REG
679 #undef LRI
680 #undef NOP
681 
682 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
683 {
684 	/*
685 	 * The gen12+ lists only have the registers we program in the basic
686 	 * default state. We rely on the context image using relative
687 	 * addressing to automatically fix up the register state between the
688 	 * physical engines of a virtual engine.
689 	 */
690 	GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
691 		   !intel_engine_has_relative_mmio(engine));
692 
693 	if (engine->flags & I915_ENGINE_HAS_RCS_REG_STATE) {
694 		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 70))
695 			return mtl_rcs_offsets;
696 		else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
697 			return dg2_rcs_offsets;
698 		else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
699 			return xehp_rcs_offsets;
700 		else if (GRAPHICS_VER(engine->i915) >= 12)
701 			return gen12_rcs_offsets;
702 		else if (GRAPHICS_VER(engine->i915) >= 11)
703 			return gen11_rcs_offsets;
704 		else if (GRAPHICS_VER(engine->i915) >= 9)
705 			return gen9_rcs_offsets;
706 		else
707 			return gen8_rcs_offsets;
708 	} else {
709 		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
710 			return dg2_xcs_offsets;
711 		else if (GRAPHICS_VER(engine->i915) >= 12)
712 			return gen12_xcs_offsets;
713 		else if (GRAPHICS_VER(engine->i915) >= 9)
714 			return gen9_xcs_offsets;
715 		else
716 			return gen8_xcs_offsets;
717 	}
718 }
719 
720 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
721 {
722 	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
723 		return 0x70;
724 	else if (GRAPHICS_VER(engine->i915) >= 12)
725 		return 0x60;
726 	else if (GRAPHICS_VER(engine->i915) >= 9)
727 		return 0x54;
728 	else if (engine->class == RENDER_CLASS)
729 		return 0x58;
730 	else
731 		return -1;
732 }
733 
734 static int lrc_ring_bb_offset(const struct intel_engine_cs *engine)
735 {
736 	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
737 		return 0x80;
738 	else if (GRAPHICS_VER(engine->i915) >= 12)
739 		return 0x70;
740 	else if (GRAPHICS_VER(engine->i915) >= 9)
741 		return 0x64;
742 	else if (GRAPHICS_VER(engine->i915) >= 8 &&
743 		 engine->class == RENDER_CLASS)
744 		return 0xc4;
745 	else
746 		return -1;
747 }
748 
749 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
750 {
751 	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
752 		return 0x84;
753 	else if (GRAPHICS_VER(engine->i915) >= 12)
754 		return 0x74;
755 	else if (GRAPHICS_VER(engine->i915) >= 9)
756 		return 0x68;
757 	else if (engine->class == RENDER_CLASS)
758 		return 0xd8;
759 	else
760 		return -1;
761 }
762 
763 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
764 {
765 	if (GRAPHICS_VER(engine->i915) >= 12)
766 		return 0x12;
767 	else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
768 		return 0x18;
769 	else
770 		return -1;
771 }
772 
773 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
774 {
775 	int x;
776 
777 	x = lrc_ring_wa_bb_per_ctx(engine);
778 	if (x < 0)
779 		return x;
780 
781 	return x + 2;
782 }
783 
784 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
785 {
786 	int x;
787 
788 	x = lrc_ring_indirect_ptr(engine);
789 	if (x < 0)
790 		return x;
791 
792 	return x + 2;
793 }
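
/*
 * Worked example of the index arithmetic above (illustrative only): on a
 * GRAPHICS_VER >= 12 engine, lrc_ring_wa_bb_per_ctx() returns 0x12, so
 * lrc_ring_indirect_ptr() resolves to 0x14 and lrc_ring_indirect_offset()
 * to 0x16. In each case the dword at index + 1 in the LRC register state is
 * the one that actually receives the register value (see the setup helpers
 * further below).
 */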
794 
795 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
796 {
797 
798 	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
799 		/*
800 		 * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
801 		 * simply to match the RCS context image layout.
802 		 */
803 		return 0xc6;
804 	else if (engine->class != RENDER_CLASS)
805 		return -1;
806 	else if (GRAPHICS_VER(engine->i915) >= 12)
807 		return 0xb6;
808 	else if (GRAPHICS_VER(engine->i915) >= 11)
809 		return 0xaa;
810 	else
811 		return -1;
812 }
813 
814 static u32
815 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
816 {
817 	if (GRAPHICS_VER(engine->i915) >= 12)
818 		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
819 	else if (GRAPHICS_VER(engine->i915) >= 11)
820 		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
821 	else if (GRAPHICS_VER(engine->i915) >= 9)
822 		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
823 	else if (GRAPHICS_VER(engine->i915) >= 8)
824 		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
825 
826 	GEM_BUG_ON(GRAPHICS_VER(engine->i915) < 8);
827 
828 	return 0;
829 }
830 
831 static void
832 lrc_setup_bb_per_ctx(u32 *regs,
833 		     const struct intel_engine_cs *engine,
834 		     u32 ctx_bb_ggtt_addr)
835 {
836 	GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
837 	regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
838 		ctx_bb_ggtt_addr |
839 		PER_CTX_BB_FORCE |
840 		PER_CTX_BB_VALID;
841 }
842 
843 static void
844 lrc_setup_indirect_ctx(u32 *regs,
845 		       const struct intel_engine_cs *engine,
846 		       u32 ctx_bb_ggtt_addr,
847 		       u32 size)
848 {
849 	GEM_BUG_ON(!size);
850 	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
851 	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
852 	regs[lrc_ring_indirect_ptr(engine) + 1] =
853 		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
854 
855 	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
856 	regs[lrc_ring_indirect_offset(engine) + 1] =
857 		lrc_ring_indirect_offset_default(engine) << 6;
858 }
859 
860 static bool ctx_needs_runalone(const struct intel_context *ce)
861 {
862 	struct i915_gem_context *gem_ctx;
863 	bool ctx_is_protected = false;
864 
865 	/*
866 	 * On MTL and newer platforms, protected contexts require setting
867 	 * the LRC run-alone bit or else the encryption will not happen.
868 	 */
869 	if (GRAPHICS_VER_FULL(ce->engine->i915) >= IP_VER(12, 70) &&
870 	    (ce->engine->class == COMPUTE_CLASS || ce->engine->class == RENDER_CLASS)) {
871 		rcu_read_lock();
872 		gem_ctx = rcu_dereference(ce->gem_context);
873 		if (gem_ctx)
874 			ctx_is_protected = gem_ctx->uses_protected_content;
875 		rcu_read_unlock();
876 	}
877 
878 	return ctx_is_protected;
879 }
880 
881 static void init_common_regs(u32 * const regs,
882 			     const struct intel_context *ce,
883 			     const struct intel_engine_cs *engine,
884 			     bool inhibit)
885 {
886 	u32 ctl;
887 	int loc;
888 
889 	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
890 	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
891 	if (inhibit)
892 		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
893 	if (GRAPHICS_VER(engine->i915) < 11)
894 		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
895 					   CTX_CTRL_RS_CTX_ENABLE);
896 	if (ctx_needs_runalone(ce))
897 		ctl |= _MASKED_BIT_ENABLE(GEN12_CTX_CTRL_RUNALONE_MODE);
898 	regs[CTX_CONTEXT_CONTROL] = ctl;
899 
900 	regs[CTX_TIMESTAMP] = ce->stats.runtime.last;
901 
902 	loc = lrc_ring_bb_offset(engine);
903 	if (loc != -1)
904 		regs[loc + 1] = 0;
905 }
906 
907 static void init_wa_bb_regs(u32 * const regs,
908 			    const struct intel_engine_cs *engine)
909 {
910 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
911 
912 	if (wa_ctx->per_ctx.size) {
913 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
914 
915 		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
916 		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
917 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
918 	}
919 
920 	if (wa_ctx->indirect_ctx.size) {
921 		lrc_setup_indirect_ctx(regs, engine,
922 				       i915_ggtt_offset(wa_ctx->vma) +
923 				       wa_ctx->indirect_ctx.offset,
924 				       wa_ctx->indirect_ctx.size);
925 	}
926 }
927 
928 static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
929 {
930 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
931 		/*
932 		 * 64b PPGTT (48bit canonical): PDP0_DESCRIPTOR contains the base
933 		 * address of the PML4; the other PDP descriptors are ignored.
934 		 */
935 		ASSIGN_CTX_PML4(ppgtt, regs);
936 	} else {
937 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
938 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
939 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
940 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
941 	}
942 }
943 
944 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
945 {
946 	if (i915_is_ggtt(vm))
947 		return i915_vm_to_ggtt(vm)->alias;
948 	else
949 		return i915_vm_to_ppgtt(vm);
950 }
951 
952 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
953 {
954 	int x;
955 
956 	x = lrc_ring_mi_mode(engine);
957 	if (x != -1) {
958 		regs[x + 1] &= ~STOP_RING;
959 		regs[x + 1] |= STOP_RING << 16;
960 	}
961 }
962 
963 static void __lrc_init_regs(u32 *regs,
964 			    const struct intel_context *ce,
965 			    const struct intel_engine_cs *engine,
966 			    bool inhibit)
967 {
968 	/*
969 	 * A context is actually a big batch buffer with several
970 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
971 	 * values we are setting here are only for the first context restore:
972 	 * on a subsequent save, the GPU will recreate this batchbuffer with new
973 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
974 	 * we are not initializing here).
975 	 *
976 	 * Must keep consistent with virtual_update_register_offsets().
977 	 */
978 
979 	if (inhibit)
980 		memset(regs, 0, PAGE_SIZE);
981 
982 	set_offsets(regs, reg_offsets(engine), engine, inhibit);
983 
984 	init_common_regs(regs, ce, engine, inhibit);
985 	init_ppgtt_regs(regs, vm_alias(ce->vm));
986 
987 	init_wa_bb_regs(regs, engine);
988 
989 	__reset_stop_ring(regs, engine);
990 }
991 
992 void lrc_init_regs(const struct intel_context *ce,
993 		   const struct intel_engine_cs *engine,
994 		   bool inhibit)
995 {
996 	__lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
997 }
998 
999 void lrc_reset_regs(const struct intel_context *ce,
1000 		    const struct intel_engine_cs *engine)
1001 {
1002 	__reset_stop_ring(ce->lrc_reg_state, engine);
1003 }
1004 
1005 static void
1006 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
1007 {
1008 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1009 		return;
1010 
1011 	vaddr += engine->context_size;
1012 
1013 	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
1014 }
1015 
1016 static void
1017 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
1018 {
1019 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1020 		return;
1021 
1022 	vaddr += engine->context_size;
1023 
1024 	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
1025 		drm_err_once(&engine->i915->drm,
1026 			     "%s context redzone overwritten!\n",
1027 			     engine->name);
1028 }
1029 
1030 static u32 context_wa_bb_offset(const struct intel_context *ce)
1031 {
1032 	return PAGE_SIZE * ce->wa_bb_page;
1033 }
1034 
1035 /*
1036  * per_ctx below determines which WABB section is used.
1037  * When true, the function returns the location of the
1038  * PER_CTX_BB.  When false, the function returns the
1039  * location of the INDIRECT_CTX.
1040  */
1041 static u32 *context_wabb(const struct intel_context *ce, bool per_ctx)
1042 {
1043 	void *ptr;
1044 
1045 	GEM_BUG_ON(!ce->wa_bb_page);
1046 
1047 	ptr = ce->lrc_reg_state;
1048 	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
1049 	ptr += context_wa_bb_offset(ce);
1050 	ptr += per_ctx ? PAGE_SIZE : 0;
1051 
1052 	return ptr;
1053 }
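
/*
 * Rough sketch of the context object layout when wa_bb_page is in use,
 * pieced together from context_wa_bb_offset()/context_wabb() above and
 * __lrc_alloc_state() below (for orientation only):
 *
 *   page 0                 : ppHWSP
 *   pages 1..              : context image (register state starts at
 *                            LRC_STATE_OFFSET), plus a redzone page when
 *                            CONFIG_DRM_I915_DEBUG_GEM is enabled
 *   page wa_bb_page        : INDIRECT_CTX batch (also holds the DG2
 *                            predicate scratch dwords)
 *   page wa_bb_page + 1    : PER_CTX_BB batch
 *   [+ parent scratch, when a parent context is submitted via GuC]
 */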
1054 
1055 void lrc_init_state(struct intel_context *ce,
1056 		    struct intel_engine_cs *engine,
1057 		    void *state)
1058 {
1059 	bool inhibit = true;
1060 
1061 	set_redzone(state, engine);
1062 
1063 	if (engine->default_state) {
1064 		shmem_read(engine->default_state, 0,
1065 			   state, engine->context_size);
1066 		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
1067 		inhibit = false;
1068 	}
1069 
1070 	/* Clear the ppHWSP (inc. per-context counters) */
1071 	memset(state, 0, PAGE_SIZE);
1072 
1073 	/* Clear the indirect wa and storage */
1074 	if (ce->wa_bb_page)
1075 		memset(state + context_wa_bb_offset(ce), 0, PAGE_SIZE);
1076 
1077 	/*
1078 	 * The second page of the context object contains some registers which
1079 	 * must be set up prior to the first execution.
1080 	 */
1081 	__lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
1082 }
1083 
1084 u32 lrc_indirect_bb(const struct intel_context *ce)
1085 {
1086 	return i915_ggtt_offset(ce->state) + context_wa_bb_offset(ce);
1087 }
1088 
1089 static u32 *setup_predicate_disable_wa(const struct intel_context *ce, u32 *cs)
1090 {
1091 	/* If predication is active, this will be noop'ed */
1092 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
1093 	*cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
1094 	*cs++ = 0;
1095 	*cs++ = 0; /* No predication */
1096 
1097 	/* predicated end, only terminates if SET_PREDICATE_RESULT:0 is clear */
1098 	*cs++ = MI_BATCH_BUFFER_END | BIT(15);
1099 	*cs++ = MI_SET_PREDICATE | MI_SET_PREDICATE_DISABLE;
1100 
1101 	/* Instructions are no longer predicated (disabled), we can proceed */
1102 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
1103 	*cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
1104 	*cs++ = 0;
1105 	*cs++ = 1; /* enable predication before the next BB */
1106 
1107 	*cs++ = MI_BATCH_BUFFER_END;
1108 	GEM_BUG_ON(offset_in_page(cs) > DG2_PREDICATE_RESULT_WA);
1109 
1110 	return cs;
1111 }
1112 
1113 static struct i915_vma *
1114 __lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
1115 {
1116 	struct drm_i915_gem_object *obj;
1117 	struct i915_vma *vma;
1118 	u32 context_size;
1119 
1120 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
1121 
1122 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1123 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
1124 
1125 	if (GRAPHICS_VER(engine->i915) >= 12) {
1126 		ce->wa_bb_page = context_size / PAGE_SIZE;
1127 		/* INDIRECT_CTX and PER_CTX_BB need separate pages. */
1128 		context_size += PAGE_SIZE * 2;
1129 	}
1130 
1131 	if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) {
1132 		ce->parallel.guc.parent_page = context_size / PAGE_SIZE;
1133 		context_size += PARENT_SCRATCH_SIZE;
1134 	}
1135 
1136 	obj = i915_gem_object_create_lmem(engine->i915, context_size,
1137 					  I915_BO_ALLOC_PM_VOLATILE);
1138 	if (IS_ERR(obj)) {
1139 		obj = i915_gem_object_create_shmem(engine->i915, context_size);
1140 		if (IS_ERR(obj))
1141 			return ERR_CAST(obj);
1142 
1143 		/*
1144 		 * Wa_22016122933: For Media version 13.0, all Media GT shared
1145 		 * memory needs to be mapped as WC on CPU side and UC (PAT
1146 		 * index 2) on GPU side.
1147 		 */
1148 		if (intel_gt_needs_wa_22016122933(engine->gt))
1149 			i915_gem_object_set_cache_coherency(obj, I915_CACHE_NONE);
1150 	}
1151 
1152 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1153 	if (IS_ERR(vma)) {
1154 		i915_gem_object_put(obj);
1155 		return vma;
1156 	}
1157 
1158 	return vma;
1159 }
1160 
1161 static struct intel_timeline *
1162 pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
1163 {
1164 	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
1165 
1166 	return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
1167 }
1168 
1169 int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
1170 {
1171 	struct intel_ring *ring;
1172 	struct i915_vma *vma;
1173 	int err;
1174 
1175 	GEM_BUG_ON(ce->state);
1176 
1177 	vma = __lrc_alloc_state(ce, engine);
1178 	if (IS_ERR(vma))
1179 		return PTR_ERR(vma);
1180 
1181 	ring = intel_engine_create_ring(engine, ce->ring_size);
1182 	if (IS_ERR(ring)) {
1183 		err = PTR_ERR(ring);
1184 		goto err_vma;
1185 	}
1186 
1187 	if (!page_mask_bits(ce->timeline)) {
1188 		struct intel_timeline *tl;
1189 
1190 		/*
1191 		 * Use the static global HWSP for the kernel context, and
1192 		 * a dynamically allocated cacheline for everyone else.
1193 		 */
1194 		if (unlikely(ce->timeline))
1195 			tl = pinned_timeline(ce, engine);
1196 		else
1197 			tl = intel_timeline_create(engine->gt);
1198 		if (IS_ERR(tl)) {
1199 			err = PTR_ERR(tl);
1200 			goto err_ring;
1201 		}
1202 
1203 		ce->timeline = tl;
1204 	}
1205 
1206 	ce->ring = ring;
1207 	ce->state = vma;
1208 
1209 	return 0;
1210 
1211 err_ring:
1212 	intel_ring_put(ring);
1213 err_vma:
1214 	i915_vma_put(vma);
1215 	return err;
1216 }
1217 
1218 void lrc_reset(struct intel_context *ce)
1219 {
1220 	GEM_BUG_ON(!intel_context_is_pinned(ce));
1221 
1222 	intel_ring_reset(ce->ring, ce->ring->emit);
1223 
1224 	/* Scrub away the garbage */
1225 	lrc_init_regs(ce, ce->engine, true);
1226 	ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
1227 }
1228 
1229 int
1230 lrc_pre_pin(struct intel_context *ce,
1231 	    struct intel_engine_cs *engine,
1232 	    struct i915_gem_ww_ctx *ww,
1233 	    void **vaddr)
1234 {
1235 	GEM_BUG_ON(!ce->state);
1236 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
1237 
1238 	*vaddr = i915_gem_object_pin_map(ce->state->obj,
1239 					 intel_gt_coherent_map_type(ce->engine->gt,
1240 								    ce->state->obj,
1241 								    false) |
1242 					 I915_MAP_OVERRIDE);
1243 
1244 	return PTR_ERR_OR_ZERO(*vaddr);
1245 }
1246 
1247 int
1248 lrc_pin(struct intel_context *ce,
1249 	struct intel_engine_cs *engine,
1250 	void *vaddr)
1251 {
1252 	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
1253 
1254 	if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
1255 		lrc_init_state(ce, engine, vaddr);
1256 
1257 	ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
1258 	return 0;
1259 }
1260 
1261 void lrc_unpin(struct intel_context *ce)
1262 {
1263 	if (unlikely(ce->parallel.last_rq)) {
1264 		i915_request_put(ce->parallel.last_rq);
1265 		ce->parallel.last_rq = NULL;
1266 	}
1267 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
1268 		      ce->engine);
1269 }
1270 
1271 void lrc_post_unpin(struct intel_context *ce)
1272 {
1273 	i915_gem_object_unpin_map(ce->state->obj);
1274 }
1275 
1276 void lrc_fini(struct intel_context *ce)
1277 {
1278 	if (!ce->state)
1279 		return;
1280 
1281 	intel_ring_put(fetch_and_zero(&ce->ring));
1282 	i915_vma_put(fetch_and_zero(&ce->state));
1283 }
1284 
1285 void lrc_destroy(struct kref *kref)
1286 {
1287 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
1288 
1289 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
1290 	GEM_BUG_ON(intel_context_is_pinned(ce));
1291 
1292 	lrc_fini(ce);
1293 
1294 	intel_context_fini(ce);
1295 	intel_context_free(ce);
1296 }
1297 
1298 static u32 *
1299 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
1300 {
1301 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1302 		MI_SRM_LRM_GLOBAL_GTT |
1303 		MI_LRI_LRM_CS_MMIO;
1304 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1305 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1306 		CTX_TIMESTAMP * sizeof(u32);
1307 	*cs++ = 0;
1308 
1309 	*cs++ = MI_LOAD_REGISTER_REG |
1310 		MI_LRR_SOURCE_CS_MMIO |
1311 		MI_LRI_LRM_CS_MMIO;
1312 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1313 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1314 
1315 	*cs++ = MI_LOAD_REGISTER_REG |
1316 		MI_LRR_SOURCE_CS_MMIO |
1317 		MI_LRI_LRM_CS_MMIO;
1318 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1319 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1320 
1321 	return cs;
1322 }
1323 
1324 static u32 *
1325 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
1326 {
1327 	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
1328 
1329 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1330 		MI_SRM_LRM_GLOBAL_GTT |
1331 		MI_LRI_LRM_CS_MMIO;
1332 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1333 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1334 		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
1335 	*cs++ = 0;
1336 
1337 	return cs;
1338 }
1339 
1340 static u32 *
1341 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
1342 {
1343 	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
1344 
1345 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1346 		MI_SRM_LRM_GLOBAL_GTT |
1347 		MI_LRI_LRM_CS_MMIO;
1348 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1349 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1350 		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
1351 	*cs++ = 0;
1352 
1353 	*cs++ = MI_LOAD_REGISTER_REG |
1354 		MI_LRR_SOURCE_CS_MMIO |
1355 		MI_LRI_LRM_CS_MMIO;
1356 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1357 	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
1358 
1359 	return cs;
1360 }
1361 
1362 /*
1363  * The bspec's tuning guide asks us to program a vertical watermark value of
1364  * 0x3FF.  However this register is not saved/restored properly by the
1365  * hardware, so we're required to apply the desired value via INDIRECT_CTX
1366  * batch buffer to ensure the value takes effect properly.  All other bits
1367  * in this register should remain at 0 (the hardware default).
1368  */
1369 static u32 *
1370 dg2_emit_draw_watermark_setting(u32 *cs)
1371 {
1372 	*cs++ = MI_LOAD_REGISTER_IMM(1);
1373 	*cs++ = i915_mmio_reg_offset(DRAW_WATERMARK);
1374 	*cs++ = REG_FIELD_PREP(VERT_WM_VAL, 0x3FF);
1375 
1376 	return cs;
1377 }
1378 
1379 static u32 *
1380 gen12_invalidate_state_cache(u32 *cs)
1381 {
1382 	*cs++ = MI_LOAD_REGISTER_IMM(1);
1383 	*cs++ = i915_mmio_reg_offset(GEN12_CS_DEBUG_MODE2);
1384 	*cs++ = _MASKED_BIT_ENABLE(INSTRUCTION_STATE_CACHE_INVALIDATE);
1385 	return cs;
1386 }
1387 
1388 static u32 *
1389 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
1390 {
1391 	cs = gen12_emit_timestamp_wa(ce, cs);
1392 	cs = gen12_emit_cmd_buf_wa(ce, cs);
1393 	cs = gen12_emit_restore_scratch(ce, cs);
1394 
1395 	/* Wa_16013000631:dg2 */
1396 	if (IS_DG2_G11(ce->engine->i915))
1397 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0);
1398 
1399 	cs = gen12_emit_aux_table_inv(ce->engine, cs);
1400 
1401 	/* Wa_18022495364 */
1402 	if (IS_GFX_GT_IP_RANGE(ce->engine->gt, IP_VER(12, 0), IP_VER(12, 10)))
1403 		cs = gen12_invalidate_state_cache(cs);
1404 
1405 	/* Wa_16014892111 */
1406 	if (IS_GFX_GT_IP_STEP(ce->engine->gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
1407 	    IS_GFX_GT_IP_STEP(ce->engine->gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
1408 	    IS_DG2(ce->engine->i915))
1409 		cs = dg2_emit_draw_watermark_setting(cs);
1410 
1411 	return cs;
1412 }
1413 
1414 static u32 *
1415 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
1416 {
1417 	cs = gen12_emit_timestamp_wa(ce, cs);
1418 	cs = gen12_emit_restore_scratch(ce, cs);
1419 
1420 	/* Wa_16013000631:dg2 */
1421 	if (IS_DG2_G11(ce->engine->i915))
1422 		if (ce->engine->class == COMPUTE_CLASS)
1423 			cs = gen8_emit_pipe_control(cs,
1424 						    PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE,
1425 						    0);
1426 
1427 	return gen12_emit_aux_table_inv(ce->engine, cs);
1428 }
1429 
1430 static u32 *xehp_emit_fastcolor_blt_wabb(const struct intel_context *ce, u32 *cs)
1431 {
1432 	struct intel_gt *gt = ce->engine->gt;
1433 	int mocs = gt->mocs.uc_index << 1;
1434 
1435 	/*
1436 	 * Wa_16018031267 / Wa_16018063123 requires that SW forces the
1437 	 * main copy engine arbitration into round robin mode.  We
1438 	 * additionally need to submit the following WABB blt command
1439 	 * to produce 4 subblits with each subblit generating 0 byte
1440 	 * write requests as WABB:
1441 	 *
1442 	 * XY_FASTCOLOR_BLT
1443 	 *  BG0    -> 5100000E
1444 	 *  BG1    -> 0000003F (Dest pitch)
1445 	 *  BG2    -> 00000000 (X1, Y1) = (0, 0)
1446 	 *  BG3    -> 00040001 (X2, Y2) = (1, 4)
1447 	 *  BG4    -> scratch
1448 	 *  BG5    -> scratch
1449 	 *  BG6-12 -> 00000000
1450 	 *  BG13   -> 20004004 (Surf. Width = 2, Surf. Height = 5)
1451 	 *  BG14   -> 00000010 (Qpitch = 4)
1452 	 *  BG15   -> 00000000
1453 	 */
1454 	*cs++ = XY_FAST_COLOR_BLT_CMD | (16 - 2);
1455 	*cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, mocs) | 0x3f;
1456 	*cs++ = 0;
1457 	*cs++ = 4 << 16 | 1;
1458 	*cs++ = lower_32_bits(i915_vma_offset(ce->vm->rsvd.vma));
1459 	*cs++ = upper_32_bits(i915_vma_offset(ce->vm->rsvd.vma));
1460 	*cs++ = 0;
1461 	*cs++ = 0;
1462 	*cs++ = 0;
1463 	*cs++ = 0;
1464 	*cs++ = 0;
1465 	*cs++ = 0;
1466 	*cs++ = 0;
1467 	*cs++ = 0x20004004;
1468 	*cs++ = 0x10;
1469 	*cs++ = 0;
1470 
1471 	return cs;
1472 }
1473 
1474 static u32 *
1475 xehp_emit_per_ctx_bb(const struct intel_context *ce, u32 *cs)
1476 {
1477 	/* Wa_16018031267, Wa_16018063123 */
1478 	if (NEEDS_FASTCOLOR_BLT_WABB(ce->engine))
1479 		cs = xehp_emit_fastcolor_blt_wabb(ce, cs);
1480 
1481 	return cs;
1482 }
1483 
1484 static void
1485 setup_per_ctx_bb(const struct intel_context *ce,
1486 		 const struct intel_engine_cs *engine,
1487 		 u32 *(*emit)(const struct intel_context *, u32 *))
1488 {
1489 	/* Place PER_CTX_BB on next page after INDIRECT_CTX */
1490 	u32 * const start = context_wabb(ce, true);
1491 	u32 *cs;
1492 
1493 	cs = emit(ce, start);
1494 
1495 	/* PER_CTX_BB must manually terminate */
1496 	*cs++ = MI_BATCH_BUFFER_END;
1497 
1498 	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1499 	lrc_setup_bb_per_ctx(ce->lrc_reg_state, engine,
1500 			     lrc_indirect_bb(ce) + PAGE_SIZE);
1501 }
1502 
1503 static void
1504 setup_indirect_ctx_bb(const struct intel_context *ce,
1505 		      const struct intel_engine_cs *engine,
1506 		      u32 *(*emit)(const struct intel_context *, u32 *))
1507 {
1508 	u32 * const start = context_wabb(ce, false);
1509 	u32 *cs;
1510 
1511 	cs = emit(ce, start);
1512 	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1513 	while ((unsigned long)cs % CACHELINE_BYTES)
1514 		*cs++ = MI_NOOP;
1515 
1516 	GEM_BUG_ON(cs - start > DG2_PREDICATE_RESULT_BB / sizeof(*start));
1517 	setup_predicate_disable_wa(ce, start + DG2_PREDICATE_RESULT_BB / sizeof(*start));
1518 
1519 	lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
1520 			       lrc_indirect_bb(ce),
1521 			       (cs - start) * sizeof(*cs));
1522 }
1523 
1524 /*
1525  * The context descriptor encodes various attributes of a context,
1526  * including its GTT address and some flags. Because it's fairly
1527  * expensive to calculate, we'll just do it once and cache the result,
1528  * which remains valid until the context is unpinned.
1529  *
1530  * This is what a descriptor looks like, from LSB to MSB::
1531  *
1532  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
1533  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
1534  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
1535  *      bits 53-54:    mbz, reserved for use by hardware
1536  *      bits 55-63:    group ID, currently unused and set to 0
1537  *
1538  * Starting from Gen11, the upper dword of the descriptor has a new format:
1539  *
1540  *      bits 32-36:    reserved
1541  *      bits 37-47:    SW context ID
1542  *      bits 48-53:    engine instance
1543  *      bit 54:        mbz, reserved for use by hardware
1544  *      bits 55-60:    SW counter
1545  *      bits 61-63:    engine class
1546  *
1547  * On Xe_HP, the upper dword of the descriptor has a new format:
1548  *
1549  *      bits 32-37:    virtual function number
1550  *      bit 38:        mbz, reserved for use by hardware
1551  *      bits 39-54:    SW context ID
1552  *      bits 55-57:    reserved
1553  *      bits 58-63:    SW counter
1554  *
1555  * engine info, SW context ID and SW counter need to form a unique number
1556  * (Context ID) per lrc.
1557  */
1558 static u32 lrc_descriptor(const struct intel_context *ce)
1559 {
1560 	u32 desc;
1561 
1562 	desc = INTEL_LEGACY_32B_CONTEXT;
1563 	if (i915_vm_is_4lvl(ce->vm))
1564 		desc = INTEL_LEGACY_64B_CONTEXT;
1565 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
1566 
1567 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
1568 	if (GRAPHICS_VER(ce->vm->i915) == 8)
1569 		desc |= GEN8_CTX_L3LLC_COHERENT;
1570 
1571 	return i915_ggtt_offset(ce->state) | desc;
1572 }
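
/*
 * Note that ce->state is page aligned in the GGTT, so i915_ggtt_offset()
 * only contributes bits 12 and above; OR-ing it with the flag bits built
 * above (bits 0-11 of the descriptor layout documented before this
 * function) cannot clobber the LRCA.
 */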
1573 
1574 u32 lrc_update_regs(const struct intel_context *ce,
1575 		    const struct intel_engine_cs *engine,
1576 		    u32 head)
1577 {
1578 	struct intel_ring *ring = ce->ring;
1579 	u32 *regs = ce->lrc_reg_state;
1580 
1581 	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
1582 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
1583 
1584 	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1585 	regs[CTX_RING_HEAD] = head;
1586 	regs[CTX_RING_TAIL] = ring->tail;
1587 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1588 
1589 	/* RPCS */
1590 	if (engine->class == RENDER_CLASS) {
1591 		regs[CTX_R_PWR_CLK_STATE] =
1592 			intel_sseu_make_rpcs(engine->gt, &ce->sseu);
1593 
1594 		i915_oa_init_reg_state(ce, engine);
1595 	}
1596 
1597 	if (ce->wa_bb_page) {
1598 		u32 *(*fn)(const struct intel_context *ce, u32 *cs);
1599 
1600 		fn = gen12_emit_indirect_ctx_xcs;
1601 		if (ce->engine->class == RENDER_CLASS)
1602 			fn = gen12_emit_indirect_ctx_rcs;
1603 
1604 		/* Mutually exclusive wrt the global indirect bb */
1605 		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
1606 		setup_indirect_ctx_bb(ce, engine, fn);
1607 		setup_per_ctx_bb(ce, engine, xehp_emit_per_ctx_bb);
1608 	}
1609 
1610 	return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
1611 }
1612 
1613 void lrc_update_offsets(struct intel_context *ce,
1614 			struct intel_engine_cs *engine)
1615 {
1616 	set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
1617 }
1618 
1619 void lrc_check_regs(const struct intel_context *ce,
1620 		    const struct intel_engine_cs *engine,
1621 		    const char *when)
1622 {
1623 	const struct intel_ring *ring = ce->ring;
1624 	u32 *regs = ce->lrc_reg_state;
1625 	bool valid = true;
1626 	int x;
1627 
1628 	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1629 		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1630 		       engine->name,
1631 		       regs[CTX_RING_START],
1632 		       i915_ggtt_offset(ring->vma));
1633 		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1634 		valid = false;
1635 	}
1636 
1637 	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1638 	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1639 		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1640 		       engine->name,
1641 		       regs[CTX_RING_CTL],
1642 		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1643 		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1644 		valid = false;
1645 	}
1646 
1647 	x = lrc_ring_mi_mode(engine);
1648 	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1649 		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1650 		       engine->name, regs[x + 1]);
1651 		regs[x + 1] &= ~STOP_RING;
1652 		regs[x + 1] |= STOP_RING << 16;
1653 		valid = false;
1654 	}
1655 
1656 	WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
1657 }
1658 
1659 /*
1660  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the
1661  * PIPE_CONTROL instruction. This is required for the flush to happen correctly,
1662  * but there is a slight complication as this is applied in a WA batch where the
1663  * values are only initialized once, so we cannot read the register value at the
1664  * beginning and reuse it later; hence we save its value to memory, upload a
1665  * constant value with bit 21 set and then restore it back with the saved value.
1666  * To simplify the WA, a constant value is formed by using the default value
1667  * of this register. This shouldn't be a problem because we are only modifying
1668  * it for a short period and this batch is non-preemptible. We could of course
1669  * use additional instructions that read the actual value of the register
1670  * at that time and set our bit of interest, but that makes the WA more complicated.
1671  *
1672  * This WA is also required for Gen9 so extracting as a function avoids
1673  * code duplication.
1674  */
1675 static u32 *
1676 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
1677 {
1678 	/* NB no one else is allowed to scribble over scratch + 256! */
1679 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1680 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1681 	*batch++ = intel_gt_scratch_offset(engine->gt,
1682 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1683 	*batch++ = 0;
1684 
1685 	*batch++ = MI_LOAD_REGISTER_IMM(1);
1686 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1687 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
1688 
1689 	batch = gen8_emit_pipe_control(batch,
1690 				       PIPE_CONTROL_CS_STALL |
1691 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
1692 				       0);
1693 
1694 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1695 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1696 	*batch++ = intel_gt_scratch_offset(engine->gt,
1697 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1698 	*batch++ = 0;
1699 
1700 	return batch;
1701 }
1702 
1703 /*
1704  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
1705  * initialized at the beginning and shared across all contexts, but this field
1706  * helps us to have multiple batches at different offsets and select them based
1707  * on some criteria. At the moment this batch always starts at the beginning of
1708  * the page and at this point we don't have multiple wa_ctx batch buffers.
1709  *
1710  * The number of WAs applied is not known at the beginning; we use this field
1711  * to return the number of DWORDs written.
1712  *
1713  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END,
1714  * so it adds NOOPs as padding to make it cacheline aligned.
1715  * MI_BATCH_BUFFER_END will be added to the per-ctx batch, and both of them
1716  * together make a complete batch buffer.
1717  */
1718 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1719 {
1720 	/* WaDisableCtxRestoreArbitration:bdw,chv */
1721 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1722 
1723 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
1724 	if (IS_BROADWELL(engine->i915))
1725 		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1726 
1727 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
1728 	/* Actual scratch location is at 128 bytes offset */
1729 	batch = gen8_emit_pipe_control(batch,
1730 				       PIPE_CONTROL_FLUSH_L3 |
1731 				       PIPE_CONTROL_STORE_DATA_INDEX |
1732 				       PIPE_CONTROL_CS_STALL |
1733 				       PIPE_CONTROL_QW_WRITE,
1734 				       LRC_PPHWSP_SCRATCH_ADDR);
1735 
1736 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1737 
1738 	/* Pad to end of cacheline */
1739 	while ((unsigned long)batch % CACHELINE_BYTES)
1740 		*batch++ = MI_NOOP;
1741 
1742 	/*
1743 	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
1744 	 * execution depends on the length specified in terms of cache lines
1745 	 * in the register CTX_RCS_INDIRECT_CTX
1746 	 */
1747 
1748 	return batch;
1749 }
1750 
1751 struct lri {
1752 	i915_reg_t reg;
1753 	u32 value;
1754 };
1755 
1756 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
1757 {
1758 	GEM_BUG_ON(!count || count > 63);
1759 
1760 	*batch++ = MI_LOAD_REGISTER_IMM(count);
1761 	do {
1762 		*batch++ = i915_mmio_reg_offset(lri->reg);
1763 		*batch++ = lri->value;
1764 	} while (lri++, --count);
1765 	*batch++ = MI_NOOP;
1766 
1767 	return batch;
1768 }
1769 
1770 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1771 {
1772 	static const struct lri lri[] = {
1773 		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
1774 		{
1775 			COMMON_SLICE_CHICKEN2,
1776 			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
1777 				       0),
1778 		},
1779 
1780 		/* BSpec: 11391 */
1781 		{
1782 			FF_SLICE_CHICKEN,
1783 			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
1784 				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
1785 		},
1786 
1787 		/* BSpec: 11299 */
1788 		{
1789 			_3D_CHICKEN3,
1790 			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
1791 				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
1792 		}
1793 	};
1794 
1795 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1796 
1797 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
1798 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1799 
1800 	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
1801 	batch = gen8_emit_pipe_control(batch,
1802 				       PIPE_CONTROL_FLUSH_L3 |
1803 				       PIPE_CONTROL_STORE_DATA_INDEX |
1804 				       PIPE_CONTROL_CS_STALL |
1805 				       PIPE_CONTROL_QW_WRITE,
1806 				       LRC_PPHWSP_SCRATCH_ADDR);
1807 
1808 	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
1809 
1810 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
1811 	if (HAS_POOLED_EU(engine->i915)) {
1812 		/*
1813 		 * EU pool configuration is set up along with the golden context
1814 		 * during context initialization. This value depends on the
1815 		 * device type (2x6 or 3x6) and needs to be updated based
1816 		 * on which subslice is disabled, especially for 2x6
1817 		 * devices; however, it is safe to load the default
1818 		 * configuration of a 3x6 device instead of masking off the
1819 		 * corresponding bits, because the HW ignores bits of a disabled
1820 		 * subslice and drops down to the appropriate config. Please
1821 		 * see render_state_setup() in i915_gem_render_state.c for the
1822 		 * possible configurations; to avoid duplication they are
1823 		 * not shown here again.
1824 		 */
1825 		*batch++ = GEN9_MEDIA_POOL_STATE;
1826 		*batch++ = GEN9_MEDIA_POOL_ENABLE;
1827 		*batch++ = 0x00777000;
1828 		*batch++ = 0;
1829 		*batch++ = 0;
1830 		*batch++ = 0;
1831 	}
1832 
1833 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1834 
1835 	/* Pad to end of cacheline */
1836 	while ((unsigned long)batch % CACHELINE_BYTES)
1837 		*batch++ = MI_NOOP;
1838 
1839 	return batch;
1840 }
1841 
1842 #define CTX_WA_BB_SIZE (PAGE_SIZE)
1843 
1844 static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
1845 {
1846 	struct drm_i915_gem_object *obj;
1847 	struct i915_vma *vma;
1848 	int err;
1849 
1850 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
1851 	if (IS_ERR(obj))
1852 		return PTR_ERR(obj);
1853 
1854 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1855 	if (IS_ERR(vma)) {
1856 		err = PTR_ERR(vma);
1857 		goto err;
1858 	}
1859 
1860 	engine->wa_ctx.vma = vma;
1861 	return 0;
1862 
1863 err:
1864 	i915_gem_object_put(obj);
1865 	return err;
1866 }
1867 
1868 void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
1869 {
1870 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
1871 }
1872 
1873 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
1874 
1875 void lrc_init_wa_ctx(struct intel_engine_cs *engine)
1876 {
1877 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
1878 	struct i915_wa_ctx_bb *wa_bb[] = {
1879 		&wa_ctx->indirect_ctx, &wa_ctx->per_ctx
1880 	};
1881 	wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
1882 	struct i915_gem_ww_ctx ww;
1883 	void *batch, *batch_ptr;
1884 	unsigned int i;
1885 	int err;
1886 
1887 	if (GRAPHICS_VER(engine->i915) >= 11 ||
1888 	    !(engine->flags & I915_ENGINE_HAS_RCS_REG_STATE))
1889 		return;
1890 
1891 	if (GRAPHICS_VER(engine->i915) == 9) {
1892 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
1893 		wa_bb_fn[1] = NULL;
1894 	} else if (GRAPHICS_VER(engine->i915) == 8) {
1895 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
1896 		wa_bb_fn[1] = NULL;
1897 	}
1898 
1899 	err = lrc_create_wa_ctx(engine);
1900 	if (err) {
1901 		/*
1902 		 * We continue even if we fail to initialize the WA batch
1903 		 * because we only expect rare glitches, nothing critical
1904 		 * enough to prevent us from using the GPU.
1905 		 */
1906 		drm_err(&engine->i915->drm,
1907 			"Ignoring context switch w/a allocation error:%d\n",
1908 			err);
1909 		return;
1910 	}
1911 
1912 	if (!engine->wa_ctx.vma)
1913 		return;
1914 
1915 	i915_gem_ww_ctx_init(&ww, true);
1916 retry:
1917 	err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
1918 	if (!err)
1919 		err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
1920 	if (err)
1921 		goto err;
1922 
1923 	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
1924 	if (IS_ERR(batch)) {
1925 		err = PTR_ERR(batch);
1926 		goto err_unpin;
1927 	}
1928 
1929 	/*
1930 	 * Emit the two workaround batch buffers, recording the offset from the
1931 	 * start of the workaround batch buffer object for each and their
1932 	 * respective sizes.
1933 	 */
1934 	batch_ptr = batch;
1935 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
1936 		wa_bb[i]->offset = batch_ptr - batch;
1937 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
1938 						  CACHELINE_BYTES))) {
1939 			err = -EINVAL;
1940 			break;
1941 		}
1942 		if (wa_bb_fn[i])
1943 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
1944 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
1945 	}
1946 	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);
1947 
1948 	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
1949 	__i915_gem_object_release_map(wa_ctx->vma->obj);
1950 
1951 	/* Verify that we can handle failure to set up the wa_ctx */
1952 	if (!err)
1953 		err = i915_inject_probe_error(engine->i915, -ENODEV);
1954 
1955 err_unpin:
1956 	if (err)
1957 		i915_vma_unpin(wa_ctx->vma);
1958 err:
1959 	if (err == -EDEADLK) {
1960 		err = i915_gem_ww_ctx_backoff(&ww);
1961 		if (!err)
1962 			goto retry;
1963 	}
1964 	i915_gem_ww_ctx_fini(&ww);
1965 
1966 	if (err) {
1967 		i915_vma_put(engine->wa_ctx.vma);
1968 
1969 		/* Clear all flags to prevent further use */
1970 		memset(wa_ctx, 0, sizeof(*wa_ctx));
1971 	}
1972 }
1973 
1974 static void st_runtime_underflow(struct intel_context_stats *stats, s32 dt)
1975 {
1976 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1977 	stats->runtime.num_underflow++;
1978 	stats->runtime.max_underflow =
1979 		max_t(u32, stats->runtime.max_underflow, -dt);
1980 #endif
1981 }
1982 
1983 static u32 lrc_get_runtime(const struct intel_context *ce)
1984 {
1985 	/*
1986 	 * We can use either ppHWSP[16] which is recorded before the context
1987 	 * switch (and so excludes the cost of context switches) or use the
1988 	 * value from the context image itself, which is saved/restored earlier
1989 	 * and so includes the cost of the save.
1990 	 */
1991 	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
1992 }
1993 
1994 void lrc_update_runtime(struct intel_context *ce)
1995 {
1996 	struct intel_context_stats *stats = &ce->stats;
1997 	u32 old;
1998 	s32 dt;
1999 
2000 	old = stats->runtime.last;
2001 	stats->runtime.last = lrc_get_runtime(ce);
2002 	dt = stats->runtime.last - old;
2003 	if (!dt)
2004 		return;
2005 
2006 	if (unlikely(dt < 0)) {
2007 		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
2008 			 old, stats->runtime.last, dt);
2009 		st_runtime_underflow(stats, dt);
2010 		return;
2011 	}
2012 
2013 	ewma_runtime_add(&stats->runtime.avg, dt);
2014 	stats->runtime.total += dt;
2015 }
2016 
2017 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
2018 #include "selftest_lrc.c"
2019 #endif
2020