xref: /linux/drivers/gpu/drm/xe/xe_lrc.c (revision c3fb1fb9e65fa6a108b4d19c61bdcb47fd4fe180)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5 
6 #include "xe_lrc.h"
7 
8 #include <generated/xe_wa_oob.h>
9 
10 #include <linux/ascii85.h>
11 #include <linux/panic.h>
12 
13 #include "instructions/xe_mi_commands.h"
14 #include "instructions/xe_gfxpipe_commands.h"
15 #include "instructions/xe_gfx_state_commands.h"
16 #include "regs/xe_engine_regs.h"
17 #include "regs/xe_gt_regs.h"
18 #include "regs/xe_lrc_layout.h"
19 #include "xe_bb.h"
20 #include "xe_bo.h"
21 #include "xe_configfs.h"
22 #include "xe_device.h"
23 #include "xe_drm_client.h"
24 #include "xe_exec_queue_types.h"
25 #include "xe_gt.h"
26 #include "xe_gt_printk.h"
27 #include "xe_hw_fence.h"
28 #include "xe_map.h"
29 #include "xe_memirq.h"
30 #include "xe_mmio.h"
31 #include "xe_ring_ops.h"
32 #include "xe_sriov.h"
33 #include "xe_trace_lrc.h"
34 #include "xe_vm.h"
35 #include "xe_wa.h"
36 
/*
 * 64-bit flag and field masks (built with BIT_ULL()/GENMASK_ULL()).
 * NOTE(review): presumably these describe the LRC context descriptor; confirm
 * against the users of these macros further down the file.
 */
37 #define LRC_VALID				BIT_ULL(0)
38 #define LRC_PRIVILEGE				BIT_ULL(8)
39 #define LRC_ADDRESSING_MODE			GENMASK_ULL(4, 3)
/* Plain value (not a mask); presumably a value for LRC_ADDRESSING_MODE - confirm */
40 #define LRC_LEGACY_64B_CONTEXT			3
41 
42 #define LRC_ENGINE_CLASS			GENMASK_ULL(63, 61)
43 #define LRC_ENGINE_INSTANCE			GENMASK_ULL(53, 48)
44 
/* Sizes of the fixed 4K regions of lrc->bo (see the layout diagram below) */
45 #define LRC_PPHWSP_SIZE				SZ_4K
46 #define LRC_INDIRECT_CTX_BO_SIZE		SZ_4K
47 #define LRC_INDIRECT_RING_STATE_SIZE		SZ_4K
48 
/* Two-bit priority field and the plain values defined for it */
49 #define LRC_PRIORITY				GENMASK_ULL(10, 9)
50 #define LRC_PRIORITY_LOW			0
51 #define LRC_PRIORITY_NORMAL			1
52 #define LRC_PRIORITY_HIGH			2
53 
54 /*
55  * Layout of the LRC and associated data allocated as
56  * lrc->bo:
57  *
58  *   Region                       Size
59  *  +============================+=================================+ <- __xe_lrc_ring_offset()
60  *  | Ring                       | ring_size, see                  |
61  *  |                            | xe_lrc_init()                   |
62  *  +============================+=================================+ <- __xe_lrc_pphwsp_offset()
63  *  | PPHWSP (includes SW state) | 4K                              |
64  *  +----------------------------+---------------------------------+ <- __xe_lrc_regs_offset()
65  *  | Engine Context Image       | n * 4K, see                     |
66  *  |                            | xe_gt_lrc_size()                |
67  *  +----------------------------+---------------------------------+ <- __xe_lrc_indirect_ring_offset()
68  *  | Indirect Ring State Page   | 0 or 4k, see                    |
69  *  |                            | XE_LRC_FLAG_INDIRECT_RING_STATE |
70  *  +============================+=================================+ <- __xe_lrc_indirect_ctx_offset()
71  *  | Indirect Context Page      | 0 or 4k, see                    |
72  *  |                            | XE_LRC_FLAG_INDIRECT_CTX        |
73  *  +============================+=================================+ <- __xe_lrc_wa_bb_offset()
74  *  | WA BB Per Ctx              | 4k                              |
75  *  +============================+=================================+ <- xe_bo_size(lrc->bo)
76  */
77 
78 static struct xe_device *
79 lrc_to_xe(struct xe_lrc *lrc)
80 {
81 	return gt_to_xe(lrc->fence_ctx.gt);
82 }
83 
84 static bool
85 gt_engine_needs_indirect_ctx(struct xe_gt *gt, enum xe_engine_class class)
86 {
87 	struct xe_device *xe = gt_to_xe(gt);
88 
89 	if (XE_GT_WA(gt, 16010904313) &&
90 	    (class == XE_ENGINE_CLASS_RENDER ||
91 	     class == XE_ENGINE_CLASS_COMPUTE))
92 		return true;
93 
94 	if (xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
95 					       class, NULL))
96 		return true;
97 
98 	if (gt->ring_ops[class]->emit_aux_table_inv)
99 		return true;
100 
101 	return false;
102 }
103 
104 /**
105  * xe_gt_lrc_hang_replay_size() - Hang replay size
106  * @gt: The GT
107  * @class: Hardware engine class
108  *
109  * Determine size of GPU hang replay state for a GT and hardware engine class.
110  *
111  * Return: Size of GPU hang replay size
112  */
113 size_t xe_gt_lrc_hang_replay_size(struct xe_gt *gt, enum xe_engine_class class)
114 {
115 	struct xe_device *xe = gt_to_xe(gt);
116 	size_t size = 0;
117 
118 	/* Engine context image */
119 	switch (class) {
120 	case XE_ENGINE_CLASS_RENDER:
121 		if (GRAPHICS_VERx100(xe) >= 3510)
122 			size += 7 * SZ_4K;
123 		else if (GRAPHICS_VER(xe) >= 20)
124 			size += 3 * SZ_4K;
125 		else
126 			size += 13 * SZ_4K;
127 		break;
128 	case XE_ENGINE_CLASS_COMPUTE:
129 		if (GRAPHICS_VERx100(xe) >= 3510)
130 			size += 5 * SZ_4K;
131 		else if (GRAPHICS_VER(xe) >= 20)
132 			size += 2 * SZ_4K;
133 		else
134 			size += 13 * SZ_4K;
135 		break;
136 	default:
137 		WARN(1, "Unknown engine class: %d", class);
138 		fallthrough;
139 	case XE_ENGINE_CLASS_COPY:
140 	case XE_ENGINE_CLASS_VIDEO_DECODE:
141 	case XE_ENGINE_CLASS_VIDEO_ENHANCE:
142 	case XE_ENGINE_CLASS_OTHER:
143 		size += 1 * SZ_4K;
144 	}
145 
146 	return size;
147 }
148 
149 size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
150 {
151 	size_t size = xe_gt_lrc_hang_replay_size(gt, class);
152 
153 	/* Add indirect ring state page */
154 	if (xe_gt_has_indirect_ring_state(gt))
155 		size += LRC_INDIRECT_RING_STATE_SIZE;
156 
157 	return size + LRC_PPHWSP_SIZE;
158 }
159 
160 /*
161  * The per-platform tables are u8-encoded in @data. Decode @data and set the
162  * addresses' offset and commands in @regs. The following encoding is used
163  * for each byte. There are 2 steps: decoding commands and decoding addresses.
164  *
165  * Commands:
166  * [7]: create NOPs - number of NOPs are set in lower bits
167  * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set
168  *      MI_LRI_FORCE_POSTED
169  * [5:0]: Number of NOPs or registers to set values to in case of
170  *        MI_LOAD_REGISTER_IMM
171  *
172  * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
173  * number of registers. They are set by using the REG/REG16 macros: the former
174  * is used for offsets smaller than 0x200 while the latter is for values bigger
175  * than that. Those macros already set all the bits documented below correctly:
176  *
177  * [7]: When a register offset needs more than 6 bits, use additional bytes, to
178  *      follow, for the lower bits
179  * [6:0]: Register offset, without considering the engine base.
180  *
181  * This function only tweaks the commands and register offsets. Values are not
182  * filled out.
183  */
184 static void set_offsets(u32 *regs,
185 			const u8 *data,
186 			const struct xe_hw_engine *hwe)
187 #define NOP(x) (BIT(7) | (x))
188 #define LRI(count, flags) ((flags) << 6 | (count) | \
189 			   BUILD_BUG_ON_ZERO(count >= BIT(6)))
190 #define POSTED BIT(0)
191 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
192 #define REG16(x) \
193 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
194 	(((x) >> 2) & 0x7f)
195 {
196 	const u32 base = hwe->mmio_base;
197 
198 	while (*data) {
199 		u8 count, flags;
200 
201 		if (*data & BIT(7)) { /* skip */
202 			count = *data++ & ~BIT(7);
203 			regs += count;
204 			continue;
205 		}
206 
207 		count = *data & 0x3f;
208 		flags = *data >> 6;
209 		data++;
210 
211 		*regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
212 		if (flags & POSTED)
213 			*regs |= MI_LRI_FORCE_POSTED;
214 		*regs |= MI_LRI_LRM_CS_MMIO;
215 		regs++;
216 
217 		xe_gt_assert(hwe->gt, count);
218 		do {
219 			u32 offset = 0;
220 			u8 v;
221 
222 			do {
223 				v = *data++;
224 				offset <<= 7;
225 				offset |= v & ~BIT(7);
226 			} while (v & BIT(7));
227 
228 			regs[0] = base + (offset << 2);
229 			regs += 2;
230 		} while (--count);
231 	}
232 
233 	*regs = MI_BATCH_BUFFER_END | BIT(0);
234 }
235 
236 static const u8 gen12_xcs_offsets[] = {
237 	NOP(1),
238 	LRI(13, POSTED),
239 	REG16(0x244),
240 	REG(0x034),
241 	REG(0x030),
242 	REG(0x038),
243 	REG(0x03c),
244 	REG(0x168),
245 	REG(0x140),
246 	REG(0x110),
247 	REG(0x1c0),
248 	REG(0x1c4),
249 	REG(0x1c8),
250 	REG(0x180),
251 	REG16(0x2b4),
252 
253 	NOP(5),
254 	LRI(9, POSTED),
255 	REG16(0x3a8),
256 	REG16(0x28c),
257 	REG16(0x288),
258 	REG16(0x284),
259 	REG16(0x280),
260 	REG16(0x27c),
261 	REG16(0x278),
262 	REG16(0x274),
263 	REG16(0x270),
264 
265 	0
266 };
267 
268 static const u8 dg2_xcs_offsets[] = {
269 	NOP(1),
270 	LRI(15, POSTED),
271 	REG16(0x244),
272 	REG(0x034),
273 	REG(0x030),
274 	REG(0x038),
275 	REG(0x03c),
276 	REG(0x168),
277 	REG(0x140),
278 	REG(0x110),
279 	REG(0x1c0),
280 	REG(0x1c4),
281 	REG(0x1c8),
282 	REG(0x180),
283 	REG16(0x2b4),
284 	REG(0x120),
285 	REG(0x124),
286 
287 	NOP(1),
288 	LRI(9, POSTED),
289 	REG16(0x3a8),
290 	REG16(0x28c),
291 	REG16(0x288),
292 	REG16(0x284),
293 	REG16(0x280),
294 	REG16(0x27c),
295 	REG16(0x278),
296 	REG16(0x274),
297 	REG16(0x270),
298 
299 	0
300 };
301 
302 static const u8 gen12_rcs_offsets[] = {
303 	NOP(1),
304 	LRI(13, POSTED),
305 	REG16(0x244),
306 	REG(0x034),
307 	REG(0x030),
308 	REG(0x038),
309 	REG(0x03c),
310 	REG(0x168),
311 	REG(0x140),
312 	REG(0x110),
313 	REG(0x1c0),
314 	REG(0x1c4),
315 	REG(0x1c8),
316 	REG(0x180),
317 	REG16(0x2b4),
318 
319 	NOP(5),
320 	LRI(9, POSTED),
321 	REG16(0x3a8),
322 	REG16(0x28c),
323 	REG16(0x288),
324 	REG16(0x284),
325 	REG16(0x280),
326 	REG16(0x27c),
327 	REG16(0x278),
328 	REG16(0x274),
329 	REG16(0x270),
330 
331 	LRI(3, POSTED),
332 	REG(0x1b0),
333 	REG16(0x5a8),
334 	REG16(0x5ac),
335 
336 	NOP(6),
337 	LRI(1, 0),
338 	REG(0x0c8),
339 	NOP(3 + 9 + 1),
340 
341 	LRI(51, POSTED),
342 	REG16(0x588),
343 	REG16(0x588),
344 	REG16(0x588),
345 	REG16(0x588),
346 	REG16(0x588),
347 	REG16(0x588),
348 	REG(0x028),
349 	REG(0x09c),
350 	REG(0x0c0),
351 	REG(0x178),
352 	REG(0x17c),
353 	REG16(0x358),
354 	REG(0x170),
355 	REG(0x150),
356 	REG(0x154),
357 	REG(0x158),
358 	REG16(0x41c),
359 	REG16(0x600),
360 	REG16(0x604),
361 	REG16(0x608),
362 	REG16(0x60c),
363 	REG16(0x610),
364 	REG16(0x614),
365 	REG16(0x618),
366 	REG16(0x61c),
367 	REG16(0x620),
368 	REG16(0x624),
369 	REG16(0x628),
370 	REG16(0x62c),
371 	REG16(0x630),
372 	REG16(0x634),
373 	REG16(0x638),
374 	REG16(0x63c),
375 	REG16(0x640),
376 	REG16(0x644),
377 	REG16(0x648),
378 	REG16(0x64c),
379 	REG16(0x650),
380 	REG16(0x654),
381 	REG16(0x658),
382 	REG16(0x65c),
383 	REG16(0x660),
384 	REG16(0x664),
385 	REG16(0x668),
386 	REG16(0x66c),
387 	REG16(0x670),
388 	REG16(0x674),
389 	REG16(0x678),
390 	REG16(0x67c),
391 	REG(0x068),
392 	REG(0x084),
393 	NOP(1),
394 
395 	0
396 };
397 
398 static const u8 xehp_rcs_offsets[] = {
399 	NOP(1),
400 	LRI(13, POSTED),
401 	REG16(0x244),
402 	REG(0x034),
403 	REG(0x030),
404 	REG(0x038),
405 	REG(0x03c),
406 	REG(0x168),
407 	REG(0x140),
408 	REG(0x110),
409 	REG(0x1c0),
410 	REG(0x1c4),
411 	REG(0x1c8),
412 	REG(0x180),
413 	REG16(0x2b4),
414 
415 	NOP(5),
416 	LRI(9, POSTED),
417 	REG16(0x3a8),
418 	REG16(0x28c),
419 	REG16(0x288),
420 	REG16(0x284),
421 	REG16(0x280),
422 	REG16(0x27c),
423 	REG16(0x278),
424 	REG16(0x274),
425 	REG16(0x270),
426 
427 	LRI(3, POSTED),
428 	REG(0x1b0),
429 	REG16(0x5a8),
430 	REG16(0x5ac),
431 
432 	NOP(6),
433 	LRI(1, 0),
434 	REG(0x0c8),
435 
436 	0
437 };
438 
439 static const u8 dg2_rcs_offsets[] = {
440 	NOP(1),
441 	LRI(15, POSTED),
442 	REG16(0x244),
443 	REG(0x034),
444 	REG(0x030),
445 	REG(0x038),
446 	REG(0x03c),
447 	REG(0x168),
448 	REG(0x140),
449 	REG(0x110),
450 	REG(0x1c0),
451 	REG(0x1c4),
452 	REG(0x1c8),
453 	REG(0x180),
454 	REG16(0x2b4),
455 	REG(0x120),
456 	REG(0x124),
457 
458 	NOP(1),
459 	LRI(9, POSTED),
460 	REG16(0x3a8),
461 	REG16(0x28c),
462 	REG16(0x288),
463 	REG16(0x284),
464 	REG16(0x280),
465 	REG16(0x27c),
466 	REG16(0x278),
467 	REG16(0x274),
468 	REG16(0x270),
469 
470 	LRI(3, POSTED),
471 	REG(0x1b0),
472 	REG16(0x5a8),
473 	REG16(0x5ac),
474 
475 	NOP(6),
476 	LRI(1, 0),
477 	REG(0x0c8),
478 
479 	0
480 };
481 
482 static const u8 mtl_rcs_offsets[] = {
483 	NOP(1),
484 	LRI(15, POSTED),
485 	REG16(0x244),
486 	REG(0x034),
487 	REG(0x030),
488 	REG(0x038),
489 	REG(0x03c),
490 	REG(0x168),
491 	REG(0x140),
492 	REG(0x110),
493 	REG(0x1c0),
494 	REG(0x1c4),
495 	REG(0x1c8),
496 	REG(0x180),
497 	REG16(0x2b4),
498 	REG(0x120),
499 	REG(0x124),
500 
501 	NOP(1),
502 	LRI(9, POSTED),
503 	REG16(0x3a8),
504 	REG16(0x28c),
505 	REG16(0x288),
506 	REG16(0x284),
507 	REG16(0x280),
508 	REG16(0x27c),
509 	REG16(0x278),
510 	REG16(0x274),
511 	REG16(0x270),
512 
513 	NOP(2),
514 	LRI(2, POSTED),
515 	REG16(0x5a8),
516 	REG16(0x5ac),
517 
518 	NOP(6),
519 	LRI(1, 0),
520 	REG(0x0c8),
521 
522 	0
523 };
524 
/*
 * Leading part of the encoded register layout shared by the xe2_*_offsets
 * tables below. The bracketed number on each entry is the dword index the
 * entry occupies once expanded by set_offsets(): a NOP(n) takes n dwords, an
 * LRI header one dword and every register two dwords (address + value).
 * The expansion ends at index 0x33, so users continue at 0x34.
 */
525 #define XE2_CTX_COMMON \
526 	NOP(1),                 /* [0x00] */ \
527 	LRI(15, POSTED),        /* [0x01] */ \
528 	REG16(0x244),           /* [0x02] CTXT_SR_CTL */ \
529 	REG(0x034),             /* [0x04] RING_BUFFER_HEAD */ \
530 	REG(0x030),             /* [0x06] RING_BUFFER_TAIL */ \
531 	REG(0x038),             /* [0x08] RING_BUFFER_START */ \
532 	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */ \
533 	REG(0x168),             /* [0x0c] BB_ADDR_UDW */ \
534 	REG(0x140),             /* [0x0e] BB_ADDR */ \
535 	REG(0x110),             /* [0x10] BB_STATE */ \
536 	REG(0x1c0),             /* [0x12] BB_PER_CTX_PTR */ \
537 	REG(0x1c4),             /* [0x14] RCS_INDIRECT_CTX */ \
538 	REG(0x1c8),             /* [0x16] RCS_INDIRECT_CTX_OFFSET */ \
539 	REG(0x180),             /* [0x18] CCID */ \
540 	REG16(0x2b4),           /* [0x1a] SEMAPHORE_TOKEN */ \
541 	REG(0x120),             /* [0x1c] PRT_BB_STATE */ \
542 	REG(0x124),             /* [0x1e] PRT_BB_STATE_UDW */ \
543 	\
544 	NOP(1),                 /* [0x20] */ \
545 	LRI(9, POSTED),         /* [0x21] */ \
546 	REG16(0x3a8),           /* [0x22] CTX_TIMESTAMP */ \
547 	REG16(0x3ac),           /* [0x24] CTX_TIMESTAMP_UDW */ \
548 	REG(0x108),             /* [0x26] INDIRECT_RING_STATE */ \
549 	REG16(0x284),           /* [0x28] dummy reg */ \
550 	REG16(0x280),           /* [0x2a] CS_ACC_CTR_THOLD */ \
551 	REG16(0x27c),           /* [0x2c] CS_CTX_SYS_PASID */ \
552 	REG16(0x278),           /* [0x2e] CS_CTX_ASID */ \
553 	REG16(0x274),           /* [0x30] PTBP_UDW */ \
554 	REG16(0x270)            /* [0x32] PTBP_LDW */
555 
/*
 * Encoded register layout that reg_offsets() returns for the render engine
 * when GRAPHICS_VER >= 20. Bracketed numbers are dword indices after
 * expansion by set_offsets() (NOP(n): n dwords, LRI: 1, each register: 2).
 */
556 static const u8 xe2_rcs_offsets[] = {
557 	XE2_CTX_COMMON,
558 
559 	NOP(2),                 /* [0x34] */
560 	LRI(2, POSTED),         /* [0x36] */
561 	REG16(0x5a8),           /* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
562 	REG16(0x5ac),           /* [0x39] PREEMPTION_STATUS */
563 
564 	NOP(6),                 /* [0x3b] */
565 	LRI(1, 0),              /* [0x41] */
566 	REG(0x0c8),             /* [0x42] R_PWR_CLK_STATE */
567 
568 	0
569 };
570 
/*
 * Encoded register layout that reg_offsets() returns for copy engines when
 * GRAPHICS_VER >= 20. Bracketed numbers are dword indices after expansion by
 * set_offsets().
 */
571 static const u8 xe2_bcs_offsets[] = {
572 	XE2_CTX_COMMON,
573 
574 	NOP(4 + 8 + 1),         /* [0x34] */
575 	LRI(2, POSTED),         /* [0x41] */
576 	REG16(0x200),           /* [0x42] BCS_SWCTRL */
577 	REG16(0x204),           /* [0x44] BLIT_CCTL */
578 
579 	0
580 };
581 
/*
 * Encoded register layout that reg_offsets() returns for engines that are
 * neither render nor copy when GRAPHICS_VER >= 20: only the common part.
 */
582 static const u8 xe2_xcs_offsets[] = {
583 	XE2_CTX_COMMON,
584 
585 	0
586 };
587 
/*
 * Encoded register layout of the indirect ring state page; empty_lrc_data()
 * expands it with set_offsets() when the GT has indirect ring state.
 * Bracketed numbers are dword indices after expansion (NOP(n): n dwords,
 * LRI: 1, each register: 2).
 */
588 static const u8 xe2_indirect_ring_state_offsets[] = {
589 	NOP(1),                 /* [0x00] */
590 	LRI(5, POSTED),         /* [0x01] */
591 	REG(0x034),             /* [0x02] RING_BUFFER_HEAD */
592 	REG(0x030),             /* [0x04] RING_BUFFER_TAIL */
593 	REG(0x038),             /* [0x06] RING_BUFFER_START */
594 	REG(0x048),             /* [0x08] RING_BUFFER_START_UDW */
595 	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */
596 
597 	NOP(5),                 /* [0x0c] */
598 	LRI(9, POSTED),         /* [0x11] */
599 	REG(0x168),             /* [0x12] BB_ADDR_UDW */
600 	REG(0x140),             /* [0x14] BB_ADDR */
601 	REG(0x110),             /* [0x16] BB_STATE */
602 	REG16(0x588),           /* [0x18] BB_STACK_WRITE_PORT */
603 	REG16(0x588),           /* [0x1a] BB_STACK_WRITE_PORT */
604 	REG16(0x588),           /* [0x1c] BB_STACK_WRITE_PORT */
605 	REG16(0x588),           /* [0x1e] BB_STACK_WRITE_PORT */
606 	REG16(0x588),           /* [0x20] BB_STACK_WRITE_PORT */
607 	REG16(0x588),           /* [0x22] BB_STACK_WRITE_PORT */
608 
609 	NOP(12),                 /* [0x24] */
610 
611 	0
612 };
613 
/*
 * The table-encoding helpers are only meant for the offset tables above.
 * Note that POSTED, defined alongside them, is not undefined here.
 */
614 #undef REG16
615 #undef REG
616 #undef LRI
617 #undef NOP
618 
619 static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
620 {
621 	if (class == XE_ENGINE_CLASS_RENDER) {
622 		if (GRAPHICS_VER(xe) >= 20)
623 			return xe2_rcs_offsets;
624 		else if (GRAPHICS_VERx100(xe) >= 1270)
625 			return mtl_rcs_offsets;
626 		else if (GRAPHICS_VERx100(xe) >= 1255)
627 			return dg2_rcs_offsets;
628 		else if (GRAPHICS_VERx100(xe) >= 1250)
629 			return xehp_rcs_offsets;
630 		else
631 			return gen12_rcs_offsets;
632 	} else if (class == XE_ENGINE_CLASS_COPY) {
633 		if (GRAPHICS_VER(xe) >= 20)
634 			return xe2_bcs_offsets;
635 		else
636 			return gen12_xcs_offsets;
637 	} else {
638 		if (GRAPHICS_VER(xe) >= 20)
639 			return xe2_xcs_offsets;
640 		else if (GRAPHICS_VERx100(xe) >= 1255)
641 			return dg2_xcs_offsets;
642 		else
643 			return gen12_xcs_offsets;
644 	}
645 }
646 
647 static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
648 {
649 	regs[CTX_CONTEXT_CONTROL] = REG_MASKED_FIELD_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
650 							    CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
651 
652 	if (xe_gt_has_indirect_ring_state(hwe->gt))
653 		regs[CTX_CONTEXT_CONTROL] |=
654 			REG_MASKED_FIELD_ENABLE(CTX_CTRL_INDIRECT_RING_STATE_ENABLE);
655 }
656 
657 static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe)
658 {
659 	struct xe_memirq *memirq = &gt_to_tile(hwe->gt)->memirq;
660 	struct xe_device *xe = gt_to_xe(hwe->gt);
661 	u8 num_regs;
662 
663 	if (!xe_device_uses_memirq(xe))
664 		return;
665 
666 	regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM |
667 					MI_LRI_LRM_CS_MMIO | MI_LRM_USE_GGTT;
668 	regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr;
669 	regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq);
670 
671 	num_regs = xe_device_has_msix(xe) ? 3 : 2;
672 	regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(num_regs) |
673 				       MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED;
674 	regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr;
675 	regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq, hwe);
676 	regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr;
677 	regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq, hwe);
678 
679 	if (xe_device_has_msix(xe)) {
680 		regs[CTX_CS_INT_VEC_REG] = CS_INT_VEC(0).addr;
681 		/* CTX_CS_INT_VEC_DATA will be set in xe_lrc_init */
682 	}
683 }
684 
685 static inline bool xe_lrc_has_indirect_ring_state(struct xe_lrc *lrc)
686 {
687 	return lrc->flags & XE_LRC_FLAG_INDIRECT_RING_STATE;
688 }
689 
690 static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
691 {
692 	return 0;
693 }
694 
695 u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
696 {
697 	return lrc->ring.size;
698 }
699 
/*
 * Make the magic macros work: DECL_MAP_ADDR_HELPERS() pastes
 * __xe_lrc_<elem>_offset, so alias those names to the public functions.
 */
701 #define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset
702 #define __xe_lrc_regs_offset xe_lrc_regs_offset
703 
/* Byte offsets inside the PPHWSP used by the driver for its own data */
704 #define LRC_CTX_JOB_TIMESTAMP_OFFSET 512
705 #define LRC_ENGINE_ID_PPHWSP_OFFSET 1024
706 #define LRC_PARALLEL_PPHWSP_OFFSET 2048
707 
/* Byte offsets of the seqno and start seqno (8 bytes after the seqno) */
708 #define LRC_SEQNO_OFFSET 0
709 #define LRC_START_SEQNO_OFFSET (LRC_SEQNO_OFFSET + 8)
710 
711 u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
712 {
713 	return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
714 }
715 
716 /**
717  * xe_lrc_reg_size() - Get size of the LRC registers area within queues
718  * @xe: the &xe_device struct instance
719  *
720  * Returns: Size of the LRC registers area for current platform
721  */
722 size_t xe_lrc_reg_size(struct xe_device *xe)
723 {
724 	if (GRAPHICS_VERx100(xe) >= 1250)
725 		return 96 * sizeof(u32);
726 	else
727 		return 80 * sizeof(u32);
728 }
729 
730 size_t xe_lrc_skip_size(struct xe_device *xe)
731 {
732 	return LRC_PPHWSP_SIZE + xe_lrc_reg_size(xe);
733 }
734 
735 static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
736 {
737 	return LRC_SEQNO_OFFSET;
738 }
739 
740 static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
741 {
742 	return LRC_START_SEQNO_OFFSET;
743 }
744 
745 static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc)
746 {
747 	/* This is stored in the driver-defined portion of PPHWSP */
748 	return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET;
749 }
750 
751 static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
752 {
753 	/* The parallel is stored in the driver-defined portion of PPHWSP */
754 	return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
755 }
756 
757 static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc)
758 {
759 	return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET;
760 }
761 
762 static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
763 {
764 	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
765 }
766 
767 static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc)
768 {
769 	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32);
770 }
771 
772 static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
773 {
774 	u32 offset = xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE -
775 		     LRC_INDIRECT_RING_STATE_SIZE;
776 
777 	if (lrc->flags & XE_LRC_FLAG_INDIRECT_CTX)
778 		offset -= LRC_INDIRECT_CTX_BO_SIZE;
779 
780 	return offset;
781 }
782 
783 static inline u32 __xe_lrc_indirect_ctx_offset(struct xe_lrc *lrc)
784 {
785 	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE - LRC_INDIRECT_CTX_BO_SIZE;
786 }
787 
788 static inline u32 __xe_lrc_wa_bb_offset(struct xe_lrc *lrc)
789 {
790 	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE;
791 }
792 
/*
 * DECL_MAP_ADDR_HELPERS(elem, bo_expr) - generate two accessors for one LRC
 * element, both based on __xe_lrc_<elem>_offset(lrc):
 *
 *  - __xe_lrc_<elem>_map(): a copy of the BO's vmap advanced to the element;
 *    asserts that the BO is mapped.
 *  - __xe_lrc_<elem>_ggtt_addr(): the BO's GGTT address plus the element
 *    offset (marked __maybe_unused since not every element needs it).
 *
 * @bo_expr is evaluated with "lrc" in scope and selects the backing BO.
 */
793 #define DECL_MAP_ADDR_HELPERS(elem, bo_expr) \
794 static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \
795 { \
796 	struct xe_bo *bo = (bo_expr); \
797 	struct iosys_map map = bo->vmap; \
798 \
799 	xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map));  \
800 	iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \
801 	return map; \
802 } \
803 static inline u32 __maybe_unused __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \
804 { \
805 	struct xe_bo *bo = (bo_expr); \
806 \
807 	return xe_bo_ggtt_addr(bo) + __xe_lrc_##elem##_offset(lrc); \
808 } \
809 
/* The seqno elements live in lrc->seqno_bo, everything else in lrc->bo */
810 DECL_MAP_ADDR_HELPERS(ring, lrc->bo)
811 DECL_MAP_ADDR_HELPERS(pphwsp, lrc->bo)
812 DECL_MAP_ADDR_HELPERS(seqno, lrc->seqno_bo)
813 DECL_MAP_ADDR_HELPERS(regs, lrc->bo)
814 DECL_MAP_ADDR_HELPERS(start_seqno, lrc->seqno_bo)
815 DECL_MAP_ADDR_HELPERS(ctx_job_timestamp, lrc->bo)
816 DECL_MAP_ADDR_HELPERS(ctx_timestamp, lrc->bo)
817 DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw, lrc->bo)
818 DECL_MAP_ADDR_HELPERS(parallel, lrc->bo)
819 DECL_MAP_ADDR_HELPERS(indirect_ring, lrc->bo)
820 DECL_MAP_ADDR_HELPERS(engine_id, lrc->bo)
821 
822 #undef DECL_MAP_ADDR_HELPERS
823 
824 /**
825  * xe_lrc_ctx_timestamp_ggtt_addr() - Get ctx timestamp GGTT address
826  * @lrc: Pointer to the lrc.
827  *
828  * Returns: ctx timestamp GGTT address
829  */
830 u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc)
831 {
832 	return __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
833 }
834 
835 /**
836  * xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address
837  * @lrc: Pointer to the lrc.
838  *
839  * Returns: ctx timestamp udw GGTT address
840  */
841 u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc)
842 {
843 	return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
844 }
845 
846 /**
847  * xe_lrc_ctx_timestamp() - Read ctx timestamp value
848  * @lrc: Pointer to the lrc.
849  *
850  * Returns: ctx timestamp value
851  */
852 static u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
853 {
854 	struct xe_device *xe = lrc_to_xe(lrc);
855 	struct iosys_map map;
856 	u32 ldw, udw = 0;
857 
858 	map = __xe_lrc_ctx_timestamp_map(lrc);
859 	ldw = xe_map_read32(xe, &map);
860 
861 	if (xe->info.has_64bit_timestamp) {
862 		map = __xe_lrc_ctx_timestamp_udw_map(lrc);
863 		udw = xe_map_read32(xe, &map);
864 	}
865 
866 	return (u64)udw << 32 | ldw;
867 }
868 
869 /**
870  * xe_lrc_ctx_job_timestamp_ggtt_addr() - Get ctx job timestamp GGTT address
871  * @lrc: Pointer to the lrc.
872  *
873  * Returns: ctx timestamp job GGTT address
874  */
875 u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc)
876 {
877 	return __xe_lrc_ctx_job_timestamp_ggtt_addr(lrc);
878 }
879 
880 /**
881  * xe_lrc_ctx_job_timestamp() - Read ctx job timestamp value
882  * @lrc: Pointer to the lrc.
883  *
884  * Returns: ctx timestamp job value
885  */
886 u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc)
887 {
888 	struct xe_device *xe = lrc_to_xe(lrc);
889 	struct iosys_map map;
890 
891 	map = __xe_lrc_ctx_job_timestamp_map(lrc);
892 	return xe_map_read32(xe, &map);
893 }
894 
895 u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
896 {
897 	return __xe_lrc_pphwsp_ggtt_addr(lrc);
898 }
899 
900 u32 xe_lrc_indirect_ring_ggtt_addr(struct xe_lrc *lrc)
901 {
902 	if (!xe_lrc_has_indirect_ring_state(lrc))
903 		return 0;
904 
905 	return __xe_lrc_indirect_ring_ggtt_addr(lrc);
906 }
907 
908 static u32 xe_lrc_read_indirect_ctx_reg(struct xe_lrc *lrc, int reg_nr)
909 {
910 	struct xe_device *xe = lrc_to_xe(lrc);
911 	struct iosys_map map;
912 
913 	map = __xe_lrc_indirect_ring_map(lrc);
914 	iosys_map_incr(&map, reg_nr * sizeof(u32));
915 	return xe_map_read32(xe, &map);
916 }
917 
918 static void xe_lrc_write_indirect_ctx_reg(struct xe_lrc *lrc,
919 					  int reg_nr, u32 val)
920 {
921 	struct xe_device *xe = lrc_to_xe(lrc);
922 	struct iosys_map map;
923 
924 	map = __xe_lrc_indirect_ring_map(lrc);
925 	iosys_map_incr(&map, reg_nr * sizeof(u32));
926 	xe_map_write32(xe, &map, val);
927 }
928 
929 u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
930 {
931 	struct xe_device *xe = lrc_to_xe(lrc);
932 	struct iosys_map map;
933 
934 	map = __xe_lrc_regs_map(lrc);
935 	iosys_map_incr(&map, reg_nr * sizeof(u32));
936 	return xe_map_read32(xe, &map);
937 }
938 
939 void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
940 {
941 	struct xe_device *xe = lrc_to_xe(lrc);
942 	struct iosys_map map;
943 
944 	map = __xe_lrc_regs_map(lrc);
945 	iosys_map_incr(&map, reg_nr * sizeof(u32));
946 	xe_map_write32(xe, &map, val);
947 }
948 
949 static void *empty_lrc_data(struct xe_hw_engine *hwe)
950 {
951 	struct xe_gt *gt = hwe->gt;
952 	void *data;
953 	u32 *regs;
954 
955 	data = kzalloc(xe_gt_lrc_size(gt, hwe->class), GFP_KERNEL);
956 	if (!data)
957 		return NULL;
958 
959 	/* 1st page: Per-Process of HW status Page */
960 	regs = data + LRC_PPHWSP_SIZE;
961 	set_offsets(regs, reg_offsets(gt_to_xe(gt), hwe->class), hwe);
962 	set_context_control(regs, hwe);
963 	set_memory_based_intr(regs, hwe);
964 	if (xe_gt_has_indirect_ring_state(gt)) {
965 		regs = data + xe_gt_lrc_size(gt, hwe->class) -
966 		       LRC_INDIRECT_RING_STATE_SIZE;
967 		set_offsets(regs, xe2_indirect_ring_state_offsets, hwe);
968 	}
969 
970 	return data;
971 }
972 
973 /**
974  * xe_default_lrc_update_memirq_regs_with_address - Re-compute GGTT references in default LRC
975  * of given engine.
976  * @hwe: the &xe_hw_engine struct instance
977  */
978 void xe_default_lrc_update_memirq_regs_with_address(struct xe_hw_engine *hwe)
979 {
980 	struct xe_gt *gt = hwe->gt;
981 	u32 *regs;
982 
983 	if (!gt->default_lrc[hwe->class])
984 		return;
985 
986 	regs = gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE;
987 	set_memory_based_intr(regs, hwe);
988 }
989 
990 /**
991  * xe_lrc_update_memirq_regs_with_address - Re-compute GGTT references in mem interrupt data
992  * for given LRC.
993  * @lrc: the &xe_lrc struct instance
994  * @hwe: the &xe_hw_engine struct instance
995  * @regs: scratch buffer to be used as temporary storage
996  */
997 void xe_lrc_update_memirq_regs_with_address(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
998 					    u32 *regs)
999 {
1000 	struct xe_gt *gt = hwe->gt;
1001 	struct iosys_map map;
1002 	size_t regs_len;
1003 
1004 	if (!xe_device_uses_memirq(gt_to_xe(gt)))
1005 		return;
1006 
1007 	map = __xe_lrc_regs_map(lrc);
1008 	regs_len = xe_lrc_reg_size(gt_to_xe(gt));
1009 	xe_map_memcpy_from(gt_to_xe(gt), regs, &map, 0, regs_len);
1010 	set_memory_based_intr(regs, hwe);
1011 	xe_map_memcpy_to(gt_to_xe(gt), &map, 0, regs, regs_len);
1012 }
1013 
1014 static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
1015 {
1016 	u64 desc = xe_vm_pdp4_descriptor(vm, gt_to_tile(lrc->gt));
1017 
1018 	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
1019 	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
1020 }
1021 
1022 static void xe_lrc_finish(struct xe_lrc *lrc)
1023 {
1024 	xe_hw_fence_ctx_finish(&lrc->fence_ctx);
1025 	xe_bo_unpin_map_no_vm(lrc->bo);
1026 	xe_bo_unpin_map_no_vm(lrc->seqno_bo);
1027 }
1028 
1029 /*
1030  * wa_bb_setup_utilization() - Write commands to wa bb to assist
1031  * in calculating active context run ticks.
1032  *
1033  * Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the
1034  * context, but only gets updated when the context switches out. In order to
1035  * check how long a context has been active before it switches out, two things
1036  * are required:
1037  *
1038  * (1) Determine if the context is running:
1039  * To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in
1040  * the LRC. The value chosen is 1 since 0 is the initial value when the LRC is
1041  * initialized. During a query, we just check for this value to determine if the
1042  * context is active. If the context switched out, it would overwrite this
1043  * location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as
1044  * the last part of context restore, so reusing this LRC location will not
1045  * clobber anything.
1046  *
1047  * (2) Calculate the time that the context has been active for:
1048  * The CTX_TIMESTAMP ticks only when the context is active. If a context is
1049  * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization.
1050  * While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific
1051  * engine instance. Since we do not know which instance the context is running
1052  * on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and
1053  * store it in the PPHSWP.
1054  */
1055 #define CONTEXT_ACTIVE 1ULL
1056 static ssize_t setup_utilization_wa(struct xe_lrc *lrc,
1057 				    struct xe_hw_engine *hwe,
1058 				    u32 *batch,
1059 				    size_t max_len)
1060 {
1061 	u32 *cmd = batch;
1062 
1063 	if (IS_SRIOV_VF(gt_to_xe(lrc->gt)))
1064 		return 0;
1065 
1066 	if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
1067 		return -ENOSPC;
1068 
1069 	*cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET;
1070 	*cmd++ = ENGINE_ID(0).addr;
1071 	*cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc);
1072 	*cmd++ = 0;
1073 
1074 	*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
1075 	*cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
1076 	*cmd++ = 0;
1077 	*cmd++ = lower_32_bits(CONTEXT_ACTIVE);
1078 
1079 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp) {
1080 		*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
1081 		*cmd++ = __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
1082 		*cmd++ = 0;
1083 		*cmd++ = upper_32_bits(CONTEXT_ACTIVE);
1084 	}
1085 
1086 	return cmd - batch;
1087 }
1088 
1089 static ssize_t setup_timestamp_wa(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
1090 				  u32 *batch, size_t max_len)
1091 {
1092 	const u32 ts_addr = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
1093 	u32 *cmd = batch;
1094 
1095 	if (!XE_GT_WA(lrc->gt, 16010904313) ||
1096 	    !(hwe->class == XE_ENGINE_CLASS_RENDER ||
1097 	      hwe->class == XE_ENGINE_CLASS_COMPUTE ||
1098 	      hwe->class == XE_ENGINE_CLASS_COPY ||
1099 	      hwe->class == XE_ENGINE_CLASS_VIDEO_DECODE ||
1100 	      hwe->class == XE_ENGINE_CLASS_VIDEO_ENHANCE))
1101 		return 0;
1102 
1103 	if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
1104 		return -ENOSPC;
1105 
1106 	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
1107 		 MI_LRM_ASYNC;
1108 	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
1109 	*cmd++ = ts_addr;
1110 	*cmd++ = 0;
1111 
1112 	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
1113 		 MI_LRM_ASYNC;
1114 	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
1115 	*cmd++ = ts_addr;
1116 	*cmd++ = 0;
1117 
1118 	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO;
1119 	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
1120 	*cmd++ = ts_addr;
1121 	*cmd++ = 0;
1122 
1123 	return cmd - batch;
1124 }
1125 
1126 static ssize_t setup_configfs_post_ctx_restore_bb(struct xe_lrc *lrc,
1127 						  struct xe_hw_engine *hwe,
1128 						  u32 *batch, size_t max_len)
1129 {
1130 	struct xe_device *xe = gt_to_xe(lrc->gt);
1131 	const u32 *user_batch;
1132 	u32 *cmd = batch;
1133 	u32 count;
1134 
1135 	count = xe_configfs_get_ctx_restore_post_bb(to_pci_dev(xe->drm.dev),
1136 						    hwe->class, &user_batch);
1137 	if (!count)
1138 		return 0;
1139 
1140 	if (count > max_len)
1141 		return -ENOSPC;
1142 
1143 	/*
1144 	 * This should be used only for tests and validation. Taint the kernel
1145 	 * as anything could be submitted directly in context switches
1146 	 */
1147 	add_taint(TAINT_TEST, LOCKDEP_STILL_OK);
1148 
1149 	memcpy(cmd, user_batch, count * sizeof(u32));
1150 	cmd += count;
1151 
1152 	return cmd - batch;
1153 }
1154 
1155 static ssize_t setup_configfs_mid_ctx_restore_bb(struct xe_lrc *lrc,
1156 						 struct xe_hw_engine *hwe,
1157 						 u32 *batch, size_t max_len)
1158 {
1159 	struct xe_device *xe = gt_to_xe(lrc->gt);
1160 	const u32 *user_batch;
1161 	u32 *cmd = batch;
1162 	u32 count;
1163 
1164 	count = xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
1165 						   hwe->class, &user_batch);
1166 	if (!count)
1167 		return 0;
1168 
1169 	if (count > max_len)
1170 		return -ENOSPC;
1171 
1172 	/*
1173 	 * This should be used only for tests and validation. Taint the kernel
1174 	 * as anything could be submitted directly in context switches
1175 	 */
1176 	add_taint(TAINT_TEST, LOCKDEP_STILL_OK);
1177 
1178 	memcpy(cmd, user_batch, count * sizeof(u32));
1179 	cmd += count;
1180 
1181 	return cmd - batch;
1182 }
1183 
1184 static ssize_t setup_invalidate_state_cache_wa(struct xe_lrc *lrc,
1185 					       struct xe_hw_engine *hwe,
1186 					       u32 *batch, size_t max_len)
1187 {
1188 	u32 *cmd = batch;
1189 
1190 	if (!XE_GT_WA(lrc->gt, 18022495364) ||
1191 	    hwe->class != XE_ENGINE_CLASS_RENDER)
1192 		return 0;
1193 
1194 	if (xe_gt_WARN_ON(lrc->gt, max_len < 3))
1195 		return -ENOSPC;
1196 
1197 	*cmd++ = MI_LOAD_REGISTER_IMM | MI_LRI_LRM_CS_MMIO | MI_LRI_NUM_REGS(1);
1198 	*cmd++ = CS_DEBUG_MODE2(0).addr;
1199 	*cmd++ = REG_MASKED_FIELD_ENABLE(INSTRUCTION_STATE_CACHE_INVALIDATE);
1200 
1201 	return cmd - batch;
1202 }
1203 
1204 static ssize_t setup_invalidate_auxccs_wa(struct xe_lrc *lrc,
1205 					  struct xe_hw_engine *hwe,
1206 					  u32 *batch, size_t max_len)
1207 {
1208 	struct xe_gt *gt = lrc->gt;
1209 	u32 *(*emit)(struct xe_gt *gt, u32 *cmd) =
1210 		gt->ring_ops[hwe->class]->emit_aux_table_inv;
1211 
1212 	if (!emit)
1213 		return 0;
1214 
1215 	if (xe_gt_WARN_ON(gt, max_len < 8))
1216 		return -ENOSPC;
1217 
1218 	return emit(gt, batch) - batch;
1219 }
1220 
1221 struct bo_setup {
1222 	ssize_t (*setup)(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
1223 			 u32 *batch, size_t max_size);
1224 };
1225 
1226 struct bo_setup_state {
1227 	/* Input: */
1228 	struct xe_lrc		*lrc;
1229 	struct xe_hw_engine	*hwe;
1230 	size_t			max_size;
1231 	size_t                  reserve_dw;
1232 	unsigned int		offset;
1233 	const struct bo_setup	*funcs;
1234 	unsigned int		num_funcs;
1235 
1236 	/* State: */
1237 	u32			*buffer;
1238 	u32			*ptr;
1239 	unsigned int		written;
1240 };
1241 
1242 static int setup_bo(struct bo_setup_state *state)
1243 {
1244 	ssize_t remain;
1245 
1246 	if (state->lrc->bo->vmap.is_iomem) {
1247 		xe_gt_assert(state->hwe->gt, state->buffer);
1248 		state->ptr = state->buffer;
1249 	} else {
1250 		state->ptr = state->lrc->bo->vmap.vaddr + state->offset;
1251 	}
1252 
1253 	remain = state->max_size / sizeof(u32);
1254 
1255 	for (size_t i = 0; i < state->num_funcs; i++) {
1256 		ssize_t len = state->funcs[i].setup(state->lrc, state->hwe,
1257 						    state->ptr, remain);
1258 
1259 		remain -= len;
1260 
1261 		/*
1262 		 * Caller has asked for at least reserve_dw to remain unused.
1263 		 */
1264 		if (len < 0 ||
1265 		    xe_gt_WARN_ON(state->lrc->gt, remain < state->reserve_dw))
1266 			goto fail;
1267 
1268 		state->ptr += len;
1269 		state->written += len;
1270 	}
1271 
1272 	return 0;
1273 
1274 fail:
1275 	return -ENOSPC;
1276 }
1277 
1278 static void finish_bo(struct bo_setup_state *state)
1279 {
1280 	if (!state->lrc->bo->vmap.is_iomem)
1281 		return;
1282 
1283 	xe_map_memcpy_to(gt_to_xe(state->lrc->gt), &state->lrc->bo->vmap,
1284 			 state->offset, state->buffer,
1285 			 state->written * sizeof(u32));
1286 }
1287 
1288 /**
1289  * xe_lrc_setup_wa_bb_with_scratch - Execute all wa bb setup callbacks.
1290  * @lrc: the &xe_lrc struct instance
1291  * @hwe: the &xe_hw_engine struct instance
1292  * @scratch: preallocated scratch buffer for temporary storage
1293  * Return: 0 on success, negative error code on failure
1294  */
1295 int xe_lrc_setup_wa_bb_with_scratch(struct xe_lrc *lrc, struct xe_hw_engine *hwe, u32 *scratch)
1296 {
1297 	static const struct bo_setup funcs[] = {
1298 		{ .setup = setup_timestamp_wa },
1299 		{ .setup = setup_invalidate_state_cache_wa },
1300 		{ .setup = setup_utilization_wa },
1301 		{ .setup = setup_configfs_post_ctx_restore_bb },
1302 	};
1303 	struct bo_setup_state state = {
1304 		.lrc = lrc,
1305 		.hwe = hwe,
1306 		.max_size = LRC_WA_BB_SIZE,
1307 		.buffer = scratch,
1308 		.reserve_dw = 1,
1309 		.offset = __xe_lrc_wa_bb_offset(lrc),
1310 		.funcs = funcs,
1311 		.num_funcs = ARRAY_SIZE(funcs),
1312 	};
1313 	int ret;
1314 
1315 	ret = setup_bo(&state);
1316 	if (ret)
1317 		return ret;
1318 
1319 	*state.ptr++ = MI_BATCH_BUFFER_END;
1320 	state.written++;
1321 
1322 	finish_bo(&state);
1323 
1324 	xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
1325 			     xe_bo_ggtt_addr(lrc->bo) + state.offset + 1);
1326 
1327 	return 0;
1328 }
1329 
1330 static int setup_wa_bb(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
1331 {
1332 	u32 *buf = NULL;
1333 	int ret;
1334 
1335 	if (lrc->bo->vmap.is_iomem) {
1336 		buf = kmalloc(LRC_WA_BB_SIZE, GFP_KERNEL);
1337 		if (!buf)
1338 			return -ENOMEM;
1339 	}
1340 
1341 	ret = xe_lrc_setup_wa_bb_with_scratch(lrc, hwe, buf);
1342 
1343 	kfree(buf);
1344 
1345 	return ret;
1346 }
1347 
1348 static int
1349 setup_indirect_ctx(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
1350 {
1351 	static const struct bo_setup rcs_funcs[] = {
1352 		{ .setup = setup_timestamp_wa },
1353 		{ .setup = setup_invalidate_auxccs_wa },
1354 		{ .setup = setup_configfs_mid_ctx_restore_bb },
1355 	};
1356 	static const struct bo_setup xcs_funcs[] = {
1357 		{ .setup = setup_invalidate_auxccs_wa },
1358 		{ .setup = setup_configfs_mid_ctx_restore_bb },
1359 	};
1360 	struct bo_setup_state state = {
1361 		.lrc = lrc,
1362 		.hwe = hwe,
1363 		.max_size = (63 * 64) /* max 63 cachelines */,
1364 		.buffer = NULL,
1365 		.offset = __xe_lrc_indirect_ctx_offset(lrc),
1366 	};
1367 	int ret;
1368 
1369 	if (!(lrc->flags & XE_LRC_FLAG_INDIRECT_CTX))
1370 		return 0;
1371 
1372 	if (hwe->class == XE_ENGINE_CLASS_RENDER ||
1373 	    hwe->class == XE_ENGINE_CLASS_COMPUTE) {
1374 		state.funcs = rcs_funcs;
1375 		state.num_funcs = ARRAY_SIZE(rcs_funcs);
1376 	} else {
1377 		state.funcs = xcs_funcs;
1378 		state.num_funcs = ARRAY_SIZE(xcs_funcs);
1379 	}
1380 
1381 	if (xe_gt_WARN_ON(lrc->gt, !state.funcs))
1382 		return 0;
1383 
1384 	if (lrc->bo->vmap.is_iomem) {
1385 		state.buffer = kmalloc(state.max_size, GFP_KERNEL);
1386 		if (!state.buffer)
1387 			return -ENOMEM;
1388 	}
1389 
1390 	ret = setup_bo(&state);
1391 	if (ret) {
1392 		kfree(state.buffer);
1393 		return ret;
1394 	}
1395 
1396 	/*
1397 	 * Align to 64B cacheline so there's no garbage at the end for CS to
1398 	 * execute: size for indirect ctx must be a multiple of 64.
1399 	 */
1400 	while (state.written & 0xf) {
1401 		*state.ptr++ = MI_NOOP;
1402 		state.written++;
1403 	}
1404 
1405 	finish_bo(&state);
1406 	kfree(state.buffer);
1407 
1408 	/*
1409 	 * Enable INDIRECT_CTX leaving INDIRECT_CTX_OFFSET at its default: it
1410 	 * varies per engine class, but the default is good enough
1411 	 */
1412 	xe_lrc_write_ctx_reg(lrc,
1413 			     CTX_CS_INDIRECT_CTX,
1414 			     (xe_bo_ggtt_addr(lrc->bo) + state.offset) |
1415 			     /* Size in CLs. */
1416 			     (state.written * sizeof(u32) / 64));
1417 
1418 	return 0;
1419 }
1420 
1421 static u8 xe_multi_queue_prio_to_lrc(struct xe_lrc *lrc, enum xe_multi_queue_priority priority)
1422 {
1423 	struct xe_device *xe = gt_to_xe(lrc->gt);
1424 
1425 	xe_assert(xe, (priority >= XE_MULTI_QUEUE_PRIORITY_LOW &&
1426 		       priority <= XE_MULTI_QUEUE_PRIORITY_HIGH));
1427 
1428 	/* xe_multi_queue_priority is directly mapped to LRC priority values */
1429 	return priority;
1430 }
1431 
1432 /**
1433  * xe_lrc_set_multi_queue_priority() - Set multi queue priority in LRC
1434  * @lrc: Logical Ring Context
1435  * @priority: Multi queue priority of the exec queue
1436  *
1437  * Convert @priority to LRC multi queue priority and update the @lrc descriptor
1438  */
1439 void xe_lrc_set_multi_queue_priority(struct xe_lrc *lrc, enum xe_multi_queue_priority priority)
1440 {
1441 	lrc->desc &= ~LRC_PRIORITY;
1442 	lrc->desc |= FIELD_PREP(LRC_PRIORITY, xe_multi_queue_prio_to_lrc(lrc, priority));
1443 }
1444 
1445 static int xe_lrc_ctx_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
1446 			   void *replay_state, u16 msix_vec, u32 init_flags)
1447 {
1448 	struct xe_gt *gt = hwe->gt;
1449 	struct xe_tile *tile = gt_to_tile(gt);
1450 	struct xe_device *xe = gt_to_xe(gt);
1451 	struct iosys_map map;
1452 	u32 arb_enable;
1453 	u32 state_cache_perf_fix[3];
1454 	int err;
1455 
1456 	/*
1457 	 * Init Per-Process of HW status Page, LRC / context state to known
1458 	 * values. If there's already a primed default_lrc, just copy it, otherwise
1459 	 * it's the early submission to record the lrc: build a new empty one from
1460 	 * scratch.
1461 	 */
1462 	map = __xe_lrc_pphwsp_map(lrc);
1463 	if (gt->default_lrc[hwe->class] || replay_state) {
1464 		xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE);	/* PPHWSP */
1465 		xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
1466 				 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
1467 				 lrc->size - LRC_PPHWSP_SIZE);
1468 		if (replay_state)
1469 			xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
1470 					 replay_state, lrc->replay_size);
1471 	} else {
1472 		void *init_data = empty_lrc_data(hwe);
1473 
1474 		if (!init_data) {
1475 			return -ENOMEM;
1476 		}
1477 
1478 		xe_map_memcpy_to(xe, &map, 0, init_data, lrc->size);
1479 		kfree(init_data);
1480 	}
1481 
1482 	if (vm)
1483 		xe_lrc_set_ppgtt(lrc, vm);
1484 
1485 	if (xe_device_has_msix(xe)) {
1486 		xe_lrc_write_ctx_reg(lrc, CTX_INT_STATUS_REPORT_PTR,
1487 				     xe_memirq_status_ptr(&tile->memirq, hwe));
1488 		xe_lrc_write_ctx_reg(lrc, CTX_INT_SRC_REPORT_PTR,
1489 				     xe_memirq_source_ptr(&tile->memirq, hwe));
1490 		xe_lrc_write_ctx_reg(lrc, CTX_CS_INT_VEC_DATA, msix_vec << 16 | msix_vec);
1491 	}
1492 
1493 	if (xe_gt_has_indirect_ring_state(gt)) {
1494 		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
1495 				     __xe_lrc_indirect_ring_ggtt_addr(lrc));
1496 
1497 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
1498 					      __xe_lrc_ring_ggtt_addr(lrc));
1499 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START_UDW, 0);
1500 
1501 		/* Match head and tail pointers */
1502 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, lrc->ring.tail);
1503 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, lrc->ring.tail);
1504 
1505 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_CTL,
1506 					      RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
1507 	} else {
1508 		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
1509 
1510 		/* Match head and tail pointers */
1511 		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, lrc->ring.tail);
1512 		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);
1513 
1514 		xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
1515 				     RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
1516 	}
1517 
1518 	if (init_flags & XE_LRC_CREATE_RUNALONE)
1519 		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
1520 				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
1521 				     REG_MASKED_FIELD_ENABLE(CTX_CTRL_RUN_ALONE));
1522 
1523 	if (init_flags & XE_LRC_CREATE_PXP)
1524 		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
1525 				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
1526 				     REG_MASKED_FIELD_ENABLE(CTX_CTRL_PXP_ENABLE));
1527 
1528 	lrc->ctx_timestamp = 0;
1529 	xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0);
1530 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
1531 		xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP_UDW, 0);
1532 
1533 	if (xe->info.has_asid && vm)
1534 		xe_lrc_write_ctx_reg(lrc, CTX_ASID, vm->usm.asid);
1535 
1536 	lrc->desc = LRC_VALID;
1537 	lrc->desc |= FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT);
1538 	/* TODO: Priority */
1539 
1540 	/* While this appears to have something about privileged batches or
1541 	 * some such, it really just means PPGTT mode.
1542 	 */
1543 	if (vm)
1544 		lrc->desc |= LRC_PRIVILEGE;
1545 
1546 	if (GRAPHICS_VERx100(xe) < 1250) {
1547 		lrc->desc |= FIELD_PREP(LRC_ENGINE_INSTANCE, hwe->instance);
1548 		lrc->desc |= FIELD_PREP(LRC_ENGINE_CLASS, hwe->class);
1549 	}
1550 
1551 	arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1552 	xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));
1553 
1554 	if (init_flags & XE_LRC_DISABLE_STATE_CACHE_PERF_FIX) {
1555 		state_cache_perf_fix[0] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1);
1556 		state_cache_perf_fix[1] = COMMON_SLICE_CHICKEN3.addr;
1557 		state_cache_perf_fix[2] = REG_MASKED_FIELD_ENABLE(DISABLE_STATE_CACHE_PERF_FIX);
1558 		xe_lrc_write_ring(lrc, state_cache_perf_fix, sizeof(state_cache_perf_fix));
1559 	}
1560 
1561 	map = __xe_lrc_seqno_map(lrc);
1562 	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
1563 
1564 	map = __xe_lrc_start_seqno_map(lrc);
1565 	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
1566 
1567 	err = setup_wa_bb(lrc, hwe);
1568 	if (err)
1569 		return err;
1570 
1571 	err = setup_indirect_ctx(lrc, hwe);
1572 
1573 	return err;
1574 }
1575 
1576 static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
1577 		       void *replay_state, u32 ring_size, u16 msix_vec, u32 init_flags)
1578 {
1579 	struct xe_gt *gt = hwe->gt;
1580 	const u32 lrc_size = xe_gt_lrc_size(gt, hwe->class);
1581 	u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
1582 	struct xe_tile *tile = gt_to_tile(gt);
1583 	struct xe_device *xe = gt_to_xe(gt);
1584 	struct xe_bo *bo;
1585 	u32 bo_flags;
1586 	int err;
1587 
1588 	kref_init(&lrc->refcount);
1589 	lrc->gt = gt;
1590 	lrc->replay_size = xe_gt_lrc_hang_replay_size(gt, hwe->class);
1591 	lrc->size = lrc_size;
1592 	lrc->flags = 0;
1593 	lrc->ring.size = ring_size;
1594 	lrc->ring.tail = 0;
1595 
1596 	if (gt_engine_needs_indirect_ctx(gt, hwe->class)) {
1597 		lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
1598 		bo_size += LRC_INDIRECT_CTX_BO_SIZE;
1599 	}
1600 
1601 	if (xe_gt_has_indirect_ring_state(gt))
1602 		lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;
1603 
1604 	bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
1605 		   XE_BO_FLAG_GGTT_INVALIDATE;
1606 
1607 	if ((vm && vm->xef) || init_flags & XE_LRC_CREATE_USER_CTX) /* userspace */
1608 		bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE | XE_BO_FLAG_FORCE_USER_VRAM;
1609 
1610 	bo = xe_bo_create_pin_map_novm(xe, tile, bo_size,
1611 				       ttm_bo_type_kernel,
1612 				       bo_flags, false);
1613 	if (IS_ERR(bo))
1614 		return PTR_ERR(bo);
1615 
1616 	lrc->bo = bo;
1617 
1618 	bo = xe_bo_create_pin_map_novm(xe, tile, PAGE_SIZE,
1619 				       ttm_bo_type_kernel,
1620 				       XE_BO_FLAG_GGTT |
1621 				       XE_BO_FLAG_GGTT_INVALIDATE |
1622 				       XE_BO_FLAG_SYSTEM, false);
1623 	if (IS_ERR(bo)) {
1624 		err = PTR_ERR(bo);
1625 		goto err_lrc_finish;
1626 	}
1627 	lrc->seqno_bo = bo;
1628 
1629 	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
1630 			     hwe->fence_irq, hwe->name);
1631 
1632 	err = xe_lrc_ctx_init(lrc, hwe, vm, replay_state, msix_vec, init_flags);
1633 	if (err)
1634 		goto err_lrc_finish;
1635 
1636 	if (vm && vm->xef)
1637 		xe_drm_client_add_bo(vm->xef->client, lrc->bo);
1638 
1639 	return 0;
1640 
1641 err_lrc_finish:
1642 	xe_lrc_finish(lrc);
1643 	return err;
1644 }
1645 
1646 /**
1647  * xe_lrc_create - Create a LRC
1648  * @hwe: Hardware Engine
1649  * @vm: The VM (address space)
1650  * @replay_state: GPU hang replay state
1651  * @ring_size: LRC ring size
1652  * @msix_vec: MSI-X interrupt vector (for platforms that support it)
1653  * @flags: LRC initialization flags
1654  *
1655  * Allocate and initialize the Logical Ring Context (LRC).
1656  *
1657  * Return pointer to created LRC upon success and an error pointer
1658  * upon failure.
1659  */
1660 struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
1661 			     void *replay_state, u32 ring_size, u16 msix_vec, u32 flags)
1662 {
1663 	struct xe_lrc *lrc;
1664 	int err;
1665 
1666 	lrc = kzalloc_obj(*lrc);
1667 	if (!lrc)
1668 		return ERR_PTR(-ENOMEM);
1669 
1670 	err = xe_lrc_init(lrc, hwe, vm, replay_state, ring_size, msix_vec, flags);
1671 	if (err) {
1672 		kfree(lrc);
1673 		return ERR_PTR(err);
1674 	}
1675 
1676 	return lrc;
1677 }
1678 
1679 /**
1680  * xe_lrc_destroy - Destroy the LRC
1681  * @ref: reference to LRC
1682  *
1683  * Called when ref == 0, release resources held by the Logical Ring Context
1684  * (LRC) and free the LRC memory.
1685  */
1686 void xe_lrc_destroy(struct kref *ref)
1687 {
1688 	struct xe_lrc *lrc = container_of(ref, struct xe_lrc, refcount);
1689 
1690 	xe_lrc_finish(lrc);
1691 	kfree(lrc);
1692 }
1693 
1694 /**
1695  * xe_lrc_update_hwctx_regs_with_address - Re-compute GGTT references within given LRC.
1696  * @lrc: the &xe_lrc struct instance
1697  */
1698 void xe_lrc_update_hwctx_regs_with_address(struct xe_lrc *lrc)
1699 {
1700 	if (xe_lrc_has_indirect_ring_state(lrc)) {
1701 		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
1702 				     __xe_lrc_indirect_ring_ggtt_addr(lrc));
1703 
1704 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
1705 					      __xe_lrc_ring_ggtt_addr(lrc));
1706 	} else {
1707 		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
1708 	}
1709 }
1710 
1711 void xe_lrc_set_ring_tail(struct xe_lrc *lrc, u32 tail)
1712 {
1713 	if (xe_lrc_has_indirect_ring_state(lrc))
1714 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, tail);
1715 	else
1716 		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, tail);
1717 }
1718 
1719 u32 xe_lrc_ring_tail(struct xe_lrc *lrc)
1720 {
1721 	if (xe_lrc_has_indirect_ring_state(lrc))
1722 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL) & TAIL_ADDR;
1723 	else
1724 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL) & TAIL_ADDR;
1725 }
1726 
1727 static u32 xe_lrc_ring_start(struct xe_lrc *lrc)
1728 {
1729 	if (xe_lrc_has_indirect_ring_state(lrc))
1730 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START);
1731 	else
1732 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_START);
1733 }
1734 
1735 void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
1736 {
1737 	if (xe_lrc_has_indirect_ring_state(lrc))
1738 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, head);
1739 	else
1740 		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
1741 }
1742 
1743 u32 xe_lrc_ring_head(struct xe_lrc *lrc)
1744 {
1745 	if (xe_lrc_has_indirect_ring_state(lrc))
1746 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD) & HEAD_ADDR;
1747 	else
1748 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
1749 }
1750 
1751 u32 xe_lrc_ring_space(struct xe_lrc *lrc)
1752 {
1753 	const u32 head = xe_lrc_ring_head(lrc);
1754 	const u32 tail = lrc->ring.tail;
1755 	const u32 size = lrc->ring.size;
1756 
1757 	return ((head - tail - 1) & (size - 1)) + 1;
1758 }
1759 
1760 static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
1761 				const void *data, size_t size)
1762 {
1763 	struct xe_device *xe = lrc_to_xe(lrc);
1764 
1765 	iosys_map_incr(&ring, lrc->ring.tail);
1766 	xe_map_memcpy_to(xe, &ring, 0, data, size);
1767 	lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
1768 }
1769 
1770 void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
1771 {
1772 	struct xe_device *xe = lrc_to_xe(lrc);
1773 	struct iosys_map ring;
1774 	u32 rhs;
1775 	size_t aligned_size;
1776 
1777 	xe_assert(xe, IS_ALIGNED(size, 4));
1778 	aligned_size = ALIGN(size, 8);
1779 
1780 	ring = __xe_lrc_ring_map(lrc);
1781 
1782 	xe_assert(xe, lrc->ring.tail < lrc->ring.size);
1783 	rhs = lrc->ring.size - lrc->ring.tail;
1784 	if (size > rhs) {
1785 		__xe_lrc_write_ring(lrc, ring, data, rhs);
1786 		__xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
1787 	} else {
1788 		__xe_lrc_write_ring(lrc, ring, data, size);
1789 	}
1790 
1791 	if (aligned_size > size) {
1792 		u32 noop = MI_NOOP;
1793 
1794 		__xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
1795 	}
1796 }
1797 
1798 u64 xe_lrc_descriptor(struct xe_lrc *lrc)
1799 {
1800 	return lrc->desc | xe_lrc_ggtt_addr(lrc);
1801 }
1802 
1803 u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
1804 {
1805 	return __xe_lrc_seqno_ggtt_addr(lrc);
1806 }
1807 
1808 /**
1809  * xe_lrc_alloc_seqno_fence() - Allocate an lrc seqno fence.
1810  *
1811  * Allocate but don't initialize an lrc seqno fence.
1812  *
1813  * Return: Pointer to the allocated fence or
1814  * negative error pointer on error.
1815  */
1816 struct dma_fence *xe_lrc_alloc_seqno_fence(void)
1817 {
1818 	return xe_hw_fence_alloc();
1819 }
1820 
1821 /**
1822  * xe_lrc_free_seqno_fence() - Free an lrc seqno fence.
1823  * @fence: Pointer to the fence to free.
1824  *
1825  * Frees an lrc seqno fence that hasn't yet been
1826  * initialized.
1827  */
1828 void xe_lrc_free_seqno_fence(struct dma_fence *fence)
1829 {
1830 	xe_hw_fence_free(fence);
1831 }
1832 
1833 /**
1834  * xe_lrc_init_seqno_fence() - Initialize an lrc seqno fence.
1835  * @lrc: Pointer to the lrc.
1836  * @fence: Pointer to the fence to initialize.
1837  *
1838  * Initializes a pre-allocated lrc seqno fence.
1839  * After initialization, the fence is subject to normal
1840  * dma-fence refcounting.
1841  */
1842 void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence)
1843 {
1844 	xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc));
1845 }
1846 
1847 s32 xe_lrc_seqno(struct xe_lrc *lrc)
1848 {
1849 	struct iosys_map map = __xe_lrc_seqno_map(lrc);
1850 
1851 	return xe_map_read32(lrc_to_xe(lrc), &map);
1852 }
1853 
1854 s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
1855 {
1856 	struct iosys_map map = __xe_lrc_start_seqno_map(lrc);
1857 
1858 	return xe_map_read32(lrc_to_xe(lrc), &map);
1859 }
1860 
1861 u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
1862 {
1863 	return __xe_lrc_start_seqno_ggtt_addr(lrc);
1864 }
1865 
1866 u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
1867 {
1868 	return __xe_lrc_parallel_ggtt_addr(lrc);
1869 }
1870 
1871 struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
1872 {
1873 	return __xe_lrc_parallel_map(lrc);
1874 }
1875 
1876 /**
1877  * xe_lrc_engine_id() - Read engine id value
1878  * @lrc: Pointer to the lrc.
1879  *
1880  * Returns: context id value
1881  */
1882 static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
1883 {
1884 	struct xe_device *xe = lrc_to_xe(lrc);
1885 	struct iosys_map map;
1886 
1887 	map = __xe_lrc_engine_id_map(lrc);
1888 	return xe_map_read32(xe, &map);
1889 }
1890 
1891 static int instr_dw(u32 cmd_header)
1892 {
1893 	/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
1894 	if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
1895 	    GFXPIPE_SINGLE_DW_CMD(0, 0))
1896 		return 1;
1897 
1898 	/* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
1899 	if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
1900 		return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;
1901 
1902 	/* Most instructions have the # of dwords (minus 2) in 7:0 */
1903 	return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
1904 }
1905 
1906 static int dump_mi_command(struct drm_printer *p,
1907 			   struct xe_gt *gt,
1908 			   u32 *start,
1909 			   u32 *dw,
1910 			   int remaining_dw)
1911 {
1912 	u32 inst_header = *dw;
1913 	u32 numdw = instr_dw(inst_header);
1914 	u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
1915 	int num_noop;
1916 
1917 	/* First check for commands that don't have/use a '# DW' field */
1918 	switch (inst_header & MI_OPCODE) {
1919 	case MI_NOOP:
1920 		num_noop = 1;
1921 		while (num_noop < remaining_dw &&
1922 		       (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
1923 			num_noop++;
1924 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] MI_NOOP (%d dwords)\n",
1925 			   dw - num_noop - start, inst_header, num_noop);
1926 		return num_noop;
1927 
1928 	case MI_TOPOLOGY_FILTER:
1929 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] MI_TOPOLOGY_FILTER\n",
1930 			   dw - start, inst_header);
1931 		return 1;
1932 
1933 	case MI_BATCH_BUFFER_END:
1934 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] MI_BATCH_BUFFER_END\n",
1935 			   dw - start, inst_header);
1936 		/* Return 'remaining_dw' to consume the rest of the LRC */
1937 		return remaining_dw;
1938 	}
1939 
1940 	/*
1941 	 * Any remaining commands include a # of dwords.  We should make sure
1942 	 * it doesn't exceed the remaining size of the LRC.
1943 	 */
1944 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
1945 		numdw = remaining_dw;
1946 
1947 	switch (inst_header & MI_OPCODE) {
1948 	case MI_LOAD_REGISTER_IMM:
1949 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
1950 			   dw - start, inst_header, (numdw - 1) / 2);
1951 		for (int i = 1; i < numdw; i += 2)
1952 			drm_printf(p, "LRC[%#5tx]  =  - %#6x = %#010x\n",
1953 				   &dw[i] - start, dw[i], dw[i + 1]);
1954 		return numdw;
1955 
1956 	case MI_LOAD_REGISTER_MEM & MI_OPCODE:
1957 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] MI_LOAD_REGISTER_MEM: %s%s\n",
1958 			   dw - start, inst_header,
1959 			   dw[0] & MI_LRI_LRM_CS_MMIO ? "CS_MMIO " : "",
1960 			   dw[0] & MI_LRM_USE_GGTT ? "USE_GGTT " : "");
1961 		if (numdw == 4)
1962 			drm_printf(p, "LRC[%#5tx]  =  - %#6x = %#010llx\n",
1963 				   dw - start,
1964 				   dw[1], ((u64)(dw[3]) << 32 | (u64)(dw[2])));
1965 		else
1966 			drm_printf(p, "LRC[%#5tx]  =  - %*ph (%s)\n",
1967 				   dw - start, (int)sizeof(u32) * (numdw - 1),
1968 				   dw + 1, numdw < 4 ? "truncated" : "malformed");
1969 		return numdw;
1970 
1971 	case MI_FORCE_WAKEUP:
1972 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] MI_FORCE_WAKEUP\n",
1973 			   dw - start, inst_header);
1974 		return numdw;
1975 
1976 	default:
1977 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] unknown MI opcode %#x, likely %d dwords\n",
1978 			   dw - start, inst_header, opcode, numdw);
1979 		return numdw;
1980 	}
1981 }
1982 
1983 static int dump_gfxpipe_command(struct drm_printer *p,
1984 				struct xe_gt *gt,
1985 				u32 *start,
1986 				u32 *dw,
1987 				int remaining_dw)
1988 {
1989 	u32 numdw = instr_dw(*dw);
1990 	u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
1991 	u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
1992 	u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);
1993 
1994 	/*
1995 	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
1996 	 * remaining size of the LRC.
1997 	 */
1998 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
1999 		numdw = remaining_dw;
2000 
2001 	switch (*dw & GFXPIPE_MATCH_MASK) {
2002 #define MATCH(cmd) \
2003 	case cmd: \
2004 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] " #cmd " (%d dwords)\n", \
2005 			   dw - start, *dw, numdw); \
2006 		return numdw
2007 #define MATCH3D(cmd) \
2008 	case CMD_##cmd: \
2009 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] " #cmd " (%d dwords)\n", \
2010 			   dw - start, *dw, numdw); \
2011 		return numdw
2012 
2013 	MATCH(STATE_BASE_ADDRESS);
2014 	MATCH(STATE_SIP);
2015 	MATCH(GPGPU_CSR_BASE_ADDRESS);
2016 	MATCH(STATE_COMPUTE_MODE);
2017 	MATCH3D(3DSTATE_BTD);
2018 	MATCH(STATE_SYSTEM_MEM_FENCE_ADDRESS);
2019 	MATCH(STATE_CONTEXT_DATA_BASE_ADDRESS);
2020 
2021 	MATCH3D(3DSTATE_VF_STATISTICS);
2022 
2023 	MATCH(PIPELINE_SELECT);
2024 
2025 	MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST);
2026 	MATCH3D(3DSTATE_CUSTOM_SAMPLE_PATTERN);
2027 	MATCH3D(3DSTATE_CLEAR_PARAMS);
2028 	MATCH3D(3DSTATE_DEPTH_BUFFER);
2029 	MATCH3D(3DSTATE_STENCIL_BUFFER);
2030 	MATCH3D(3DSTATE_HIER_DEPTH_BUFFER);
2031 	MATCH3D(3DSTATE_VERTEX_BUFFERS);
2032 	MATCH3D(3DSTATE_VERTEX_ELEMENTS);
2033 	MATCH3D(3DSTATE_INDEX_BUFFER);
2034 	MATCH3D(3DSTATE_VF);
2035 	MATCH3D(3DSTATE_MULTISAMPLE);
2036 	MATCH3D(3DSTATE_CC_STATE_POINTERS);
2037 	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS);
2038 	MATCH3D(3DSTATE_VS);
2039 	MATCH3D(3DSTATE_GS);
2040 	MATCH3D(3DSTATE_CLIP);
2041 	MATCH3D(3DSTATE_SF);
2042 	MATCH3D(3DSTATE_WM);
2043 	MATCH3D(3DSTATE_CONSTANT_VS);
2044 	MATCH3D(3DSTATE_CONSTANT_GS);
2045 	MATCH3D(3DSTATE_CONSTANT_PS);
2046 	MATCH3D(3DSTATE_SAMPLE_MASK);
2047 	MATCH3D(3DSTATE_CONSTANT_HS);
2048 	MATCH3D(3DSTATE_CONSTANT_DS);
2049 	MATCH3D(3DSTATE_HS);
2050 	MATCH3D(3DSTATE_TE);
2051 	MATCH3D(3DSTATE_DS);
2052 	MATCH3D(3DSTATE_STREAMOUT);
2053 	MATCH3D(3DSTATE_SBE);
2054 	MATCH3D(3DSTATE_PS);
2055 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
2056 	MATCH3D(3DSTATE_CPS_POINTERS);
2057 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC);
2058 	MATCH3D(3DSTATE_BLEND_STATE_POINTERS);
2059 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS);
2060 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS);
2061 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS);
2062 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS);
2063 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS);
2064 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS);
2065 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS);
2066 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS);
2067 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS);
2068 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS);
2069 	MATCH3D(3DSTATE_VF_INSTANCING);
2070 	MATCH3D(3DSTATE_VF_SGVS);
2071 	MATCH3D(3DSTATE_VF_TOPOLOGY);
2072 	MATCH3D(3DSTATE_WM_CHROMAKEY);
2073 	MATCH3D(3DSTATE_PS_BLEND);
2074 	MATCH3D(3DSTATE_WM_DEPTH_STENCIL);
2075 	MATCH3D(3DSTATE_PS_EXTRA);
2076 	MATCH3D(3DSTATE_RASTER);
2077 	MATCH3D(3DSTATE_SBE_SWIZ);
2078 	MATCH3D(3DSTATE_WM_HZ_OP);
2079 	MATCH3D(3DSTATE_VF_COMPONENT_PACKING);
2080 	MATCH3D(3DSTATE_VF_SGVS_2);
2081 	MATCH3D(3DSTATE_VFG);
2082 	MATCH3D(3DSTATE_URB_ALLOC_VS);
2083 	MATCH3D(3DSTATE_URB_ALLOC_HS);
2084 	MATCH3D(3DSTATE_URB_ALLOC_DS);
2085 	MATCH3D(3DSTATE_URB_ALLOC_GS);
2086 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_0);
2087 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_1);
2088 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_2);
2089 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_3);
2090 	MATCH3D(3DSTATE_PRIMITIVE_REPLICATION);
2091 	MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO);
2092 	MATCH3D(3DSTATE_AMFS);
2093 	MATCH3D(3DSTATE_DEPTH_BOUNDS);
2094 	MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS);
2095 	MATCH3D(3DSTATE_CONSTANT_TS_POINTER);
2096 	MATCH3D(3DSTATE_MESH_CONTROL);
2097 	MATCH3D(3DSTATE_MESH_DISTRIB);
2098 	MATCH3D(3DSTATE_TASK_REDISTRIB);
2099 	MATCH3D(3DSTATE_MESH_SHADER);
2100 	MATCH3D(3DSTATE_MESH_SHADER_DATA);
2101 	MATCH3D(3DSTATE_TASK_CONTROL);
2102 	MATCH3D(3DSTATE_TASK_SHADER);
2103 	MATCH3D(3DSTATE_TASK_SHADER_DATA);
2104 	MATCH3D(3DSTATE_URB_ALLOC_MESH);
2105 	MATCH3D(3DSTATE_URB_ALLOC_TASK);
2106 	MATCH3D(3DSTATE_CLIP_MESH);
2107 	MATCH3D(3DSTATE_SBE_MESH);
2108 	MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER);
2109 	MATCH3D(3DSTATE_COARSE_PIXEL);
2110 	MATCH3D(3DSTATE_MESH_SHADER_DATA_EXT);
2111 	MATCH3D(3DSTATE_TASK_SHADER_DATA_EXT);
2112 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC_2);
2113 	MATCH3D(3DSTATE_CC_STATE_POINTERS_2);
2114 	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS_2);
2115 	MATCH3D(3DSTATE_BLEND_STATE_POINTERS_2);
2116 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP_2);
2117 
2118 	MATCH3D(3DSTATE_DRAWING_RECTANGLE);
2119 	MATCH3D(3DSTATE_URB_MEMORY);
2120 	MATCH3D(3DSTATE_CHROMA_KEY);
2121 	MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET);
2122 	MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN);
2123 	MATCH3D(3DSTATE_LINE_STIPPLE);
2124 	MATCH3D(3DSTATE_AA_LINE_PARAMETERS);
2125 	MATCH3D(3DSTATE_MONOFILTER_SIZE);
2126 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS);
2127 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS);
2128 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS);
2129 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS);
2130 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS);
2131 	MATCH3D(3DSTATE_SO_DECL_LIST);
2132 	MATCH3D(3DSTATE_SO_BUFFER);
2133 	MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC);
2134 	MATCH3D(3DSTATE_SAMPLE_PATTERN);
2135 	MATCH3D(3DSTATE_3D_MODE);
2136 	MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE);
2137 	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS);
2138 	MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO);
2139 	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTER_2);
2140 
2141 	default:
2142 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n",
2143 			   dw - start, *dw, pipeline, opcode, subopcode, numdw);
2144 		return numdw;
2145 	}
2146 }
2147 
2148 static int dump_gfx_state_command(struct drm_printer *p,
2149 				  struct xe_gt *gt,
2150 				  u32 *start,
2151 				  u32 *dw,
2152 				  int remaining_dw)
2153 {
2154 	u32 numdw = instr_dw(*dw);
2155 	u32 opcode = REG_FIELD_GET(GFX_STATE_OPCODE, *dw);
2156 
2157 	/*
2158 	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
2159 	 * remaining size of the LRC.
2160 	 */
2161 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
2162 		numdw = remaining_dw;
2163 
2164 	switch (*dw & (XE_INSTR_GFX_STATE | GFX_STATE_OPCODE)) {
2165 	MATCH(STATE_WRITE_INLINE);
2166 
2167 	default:
2168 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] unknown GFX_STATE command (opcode=%#x), likely %d dwords\n",
2169 			   dw - start, *dw, opcode, numdw);
2170 		return numdw;
2171 	}
2172 }
2173 
2174 void xe_lrc_dump_default(struct drm_printer *p,
2175 			 struct xe_gt *gt,
2176 			 enum xe_engine_class hwe_class)
2177 {
2178 	u32 *dw, *start;
2179 	int remaining_dw, num_dw;
2180 
2181 	if (!gt->default_lrc[hwe_class]) {
2182 		drm_printf(p, "No default LRC for class %d\n", hwe_class);
2183 		return;
2184 	}
2185 
2186 	/*
2187 	 * Skip the beginning of the LRC since it contains the per-process
2188 	 * hardware status page.
2189 	 */
2190 	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
2191 	start = dw;
2192 	remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;
2193 
2194 	while (remaining_dw > 0) {
2195 		if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
2196 			num_dw = dump_mi_command(p, gt, start, dw, remaining_dw);
2197 		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
2198 			num_dw = dump_gfxpipe_command(p, gt, start, dw, remaining_dw);
2199 		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFX_STATE) {
2200 			num_dw = dump_gfx_state_command(p, gt, start, dw, remaining_dw);
2201 		} else {
2202 			num_dw = min(instr_dw(*dw), remaining_dw);
2203 			drm_printf(p, "LRC[%#5tx]  =  [%#10x] Unknown instruction of type %#x, likely %d dwords\n",
2204 				   dw - start,
2205 				   *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw),
2206 				   num_dw);
2207 		}
2208 
2209 		dw += num_dw;
2210 		remaining_dw -= num_dw;
2211 	}
2212 }
2213 
/*
 * Search the payload of an MI_LOAD_REGISTER_IMM instruction for a register.
 *
 * @dword_pair points at @num_regs consecutive (offset, value) dword pairs.
 * On the first pair whose offset equals @offset, the paired value is stored
 * in @value and 0 is returned.  @value is left untouched otherwise.
 *
 * Return -ENOENT if the register is not present in the MI_LRI instruction.
 */
static int lookup_reg_in_mi_lri(u32 offset, u32 *value,
				const u32 *dword_pair, int num_regs)
{
	const u32 *pair = dword_pair;
	int left;

	for (left = num_regs; left > 0; left--, pair += 2) {
		if (pair[0] != offset)
			continue;

		*value = pair[1];
		return 0;
	}

	return -ENOENT;
}
2232 
2233 /*
2234  * Lookup the value of a register in a specific engine type's default LRC.
2235  *
2236  * Return -EINVAL if the default LRC doesn't exist, or ENOENT if the register
2237  * cannot be found in the default LRC.
2238  */
2239 int xe_lrc_lookup_default_reg_value(struct xe_gt *gt,
2240 				    enum xe_engine_class hwe_class,
2241 				    u32 offset,
2242 				    u32 *value)
2243 {
2244 	u32 *dw;
2245 	int remaining_dw, ret;
2246 
2247 	if (!gt->default_lrc[hwe_class])
2248 		return -EINVAL;
2249 
2250 	/*
2251 	 * Skip the beginning of the LRC since it contains the per-process
2252 	 * hardware status page.
2253 	 */
2254 	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
2255 	remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;
2256 
2257 	while (remaining_dw > 0) {
2258 		u32 num_dw = instr_dw(*dw);
2259 
2260 		if (num_dw > remaining_dw)
2261 			num_dw = remaining_dw;
2262 
2263 		switch (*dw & XE_INSTR_CMD_TYPE) {
2264 		case XE_INSTR_MI:
2265 			switch (*dw & MI_OPCODE) {
2266 			case MI_BATCH_BUFFER_END:
2267 				/* End of LRC; register not found */
2268 				return -ENOENT;
2269 
2270 			case MI_NOOP:
2271 			case MI_TOPOLOGY_FILTER:
2272 				/*
2273 				 * MI_NOOP and MI_TOPOLOGY_FILTER don't have
2274 				 * a length field and are always 1-dword
2275 				 * instructions.
2276 				 */
2277 				remaining_dw--;
2278 				dw++;
2279 				break;
2280 
2281 			case MI_LOAD_REGISTER_IMM:
2282 				ret = lookup_reg_in_mi_lri(offset, value,
2283 							   dw + 1, (num_dw - 1) / 2);
2284 				if (ret == 0)
2285 					return 0;
2286 
2287 				fallthrough;
2288 
2289 			default:
2290 				/*
2291 				 * Jump to next instruction based on length
2292 				 * field.
2293 				 */
2294 				remaining_dw -= num_dw;
2295 				dw += num_dw;
2296 				break;
2297 			}
2298 			break;
2299 
2300 		default:
2301 			/* Jump to next instruction based on length field. */
2302 			remaining_dw -= num_dw;
2303 			dw += num_dw;
2304 		}
2305 	}
2306 
2307 	return -ENOENT;
2308 }
2309 
2310 struct instr_state {
2311 	u32 instr;
2312 	u16 num_dw;
2313 };
2314 
2315 static const struct instr_state xe_hpg_svg_state[] = {
2316 	{ .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 },
2317 	{ .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 },
2318 	{ .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 },
2319 	{ .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 },
2320 	{ .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 },
2321 	{ .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 },
2322 	{ .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 },
2323 	{ .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 },
2324 	{ .instr = CMD_3DSTATE_VS, .num_dw = 9 },
2325 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 },
2326 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 },
2327 	{ .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 },
2328 	{ .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 },
2329 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 },
2330 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 },
2331 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 },
2332 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 },
2333 	{ .instr = CMD_3DSTATE_CLIP, .num_dw = 4 },
2334 	{ .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 },
2335 	{ .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 },
2336 	{ .instr = CMD_3DSTATE_SF, .num_dw = 4 },
2337 	{ .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 },
2338 	{ .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 },
2339 	{ .instr = CMD_3DSTATE_RASTER, .num_dw = 5 },
2340 	{ .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 },
2341 	{ .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 },
2342 	{ .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 },
2343 	{ .instr = CMD_3DSTATE_HS, .num_dw = 9 },
2344 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 },
2345 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 },
2346 	{ .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 },
2347 	{ .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 },
2348 	{ .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 },
2349 	{ .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 },
2350 	{ .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 },
2351 	{ .instr = CMD_3DSTATE_TE, .num_dw = 5 },
2352 	{ .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 },
2353 	{ .instr = CMD_3DSTATE_DS, .num_dw = 11 },
2354 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
2355 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
2356 	{ .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
2357 	{ .instr = CMD_3DSTATE_GS, .num_dw = 10 },
2358 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
2359 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
2360 	{ .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
2361 	{ .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
2362 	{ .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
2363 	{ .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
2364 	{ .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
2365 	{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
2366 };
2367 
2368 u32 *xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, u32 *cs)
2369 {
2370 	struct xe_gt *gt = q->hwe->gt;
2371 	struct xe_device *xe = gt_to_xe(gt);
2372 	const struct instr_state *state_table = NULL;
2373 	int state_table_size = 0;
2374 
2375 	/*
2376 	 * Wa_14019789679
2377 	 *
2378 	 * If the driver doesn't explicitly emit the SVG instructions while
2379 	 * setting up the default LRC, the context switch will write 0's
2380 	 * (noops) into the LRC memory rather than the expected instruction
2381 	 * headers.  Application contexts start out as a copy of the default
2382 	 * LRC, and if they also do not emit specific settings for some SVG
2383 	 * state, then on context restore they'll unintentionally inherit
2384 	 * whatever state setting the previous context had programmed into the
2385 	 * hardware (i.e., the lack of a 3DSTATE_* instruction in the LRC will
2386 	 * prevent the hardware from resetting that state back to any specific
2387 	 * value).
2388 	 *
2389 	 * The official workaround only requires emitting 3DSTATE_MESH_CONTROL
2390 	 * since that's a specific state setting that can easily cause GPU
2391 	 * hangs if unintentionally inherited.  However to be safe we'll
2392 	 * continue to emit all of the SVG state since it's best not to leak
2393 	 * any of the state between contexts, even if that leakage is harmless.
2394 	 */
2395 	if (XE_GT_WA(gt, 14019789679) && q->hwe->class == XE_ENGINE_CLASS_RENDER) {
2396 		state_table = xe_hpg_svg_state;
2397 		state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
2398 	}
2399 
2400 	if (!state_table) {
2401 		xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
2402 			  GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
2403 		return cs;
2404 	}
2405 
2406 	for (int i = 0; i < state_table_size; i++) {
2407 		u32 instr = state_table[i].instr;
2408 		u16 num_dw = state_table[i].num_dw;
2409 		bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);
2410 
2411 		xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
2412 		xe_gt_assert(gt, num_dw != 0);
2413 		xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));
2414 
2415 		/*
2416 		 * Xe2's SVG context is the same as the one on DG2 / MTL
2417 		 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
2418 		 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
2419 		 * Just make the replacement here rather than defining a
2420 		 * whole separate table for the single trivial change.
2421 		 */
2422 		if (GRAPHICS_VER(xe) >= 20 &&
2423 		    instr == CMD_3DSTATE_DRAWING_RECTANGLE)
2424 			instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;
2425 
2426 		*cs = instr;
2427 		if (!is_single_dw)
2428 			*cs |= (num_dw - 2);
2429 
2430 		cs += num_dw;
2431 	}
2432 
2433 	return cs;
2434 }
2435 
2436 struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
2437 {
2438 	struct xe_lrc_snapshot *snapshot = kmalloc_obj(*snapshot, GFP_NOWAIT);
2439 
2440 	if (!snapshot)
2441 		return NULL;
2442 
2443 	snapshot->context_desc = xe_lrc_ggtt_addr(lrc);
2444 	snapshot->ring_addr = __xe_lrc_ring_ggtt_addr(lrc);
2445 	snapshot->indirect_context_desc = xe_lrc_indirect_ring_ggtt_addr(lrc);
2446 	snapshot->head = xe_lrc_ring_head(lrc);
2447 	snapshot->tail.internal = lrc->ring.tail;
2448 	snapshot->tail.memory = xe_lrc_ring_tail(lrc);
2449 	snapshot->start = xe_lrc_ring_start(lrc);
2450 	snapshot->start_seqno = xe_lrc_start_seqno(lrc);
2451 	snapshot->seqno = xe_lrc_seqno(lrc);
2452 	snapshot->lrc_bo = xe_bo_get(lrc->bo);
2453 	snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
2454 	snapshot->lrc_size = lrc->size;
2455 	snapshot->replay_offset = 0;
2456 	snapshot->replay_size = lrc->replay_size;
2457 	snapshot->lrc_snapshot = NULL;
2458 	snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc));
2459 	snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc);
2460 	return snapshot;
2461 }
2462 
2463 void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot)
2464 {
2465 	struct xe_bo *bo;
2466 	struct iosys_map src;
2467 
2468 	if (!snapshot)
2469 		return;
2470 
2471 	bo = snapshot->lrc_bo;
2472 	snapshot->lrc_bo = NULL;
2473 
2474 	snapshot->lrc_snapshot = kvmalloc(snapshot->lrc_size, GFP_KERNEL);
2475 	if (!snapshot->lrc_snapshot)
2476 		goto put_bo;
2477 
2478 	xe_bo_lock(bo, false);
2479 	if (!ttm_bo_vmap(&bo->ttm, &src)) {
2480 		xe_map_memcpy_from(xe_bo_device(bo),
2481 				   snapshot->lrc_snapshot, &src, snapshot->lrc_offset,
2482 				   snapshot->lrc_size);
2483 		ttm_bo_vunmap(&bo->ttm, &src);
2484 	} else {
2485 		kvfree(snapshot->lrc_snapshot);
2486 		snapshot->lrc_snapshot = NULL;
2487 	}
2488 	xe_bo_unlock(bo);
2489 put_bo:
2490 	xe_bo_put(bo);
2491 }
2492 
2493 void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer *p)
2494 {
2495 	unsigned long i;
2496 
2497 	if (!snapshot)
2498 		return;
2499 
2500 	drm_printf(p, "\tHW Context Desc: 0x%08x\n", snapshot->context_desc);
2501 	drm_printf(p, "\tHW Ring address: 0x%08x\n",
2502 		   snapshot->ring_addr);
2503 	drm_printf(p, "\tHW Indirect Ring State: 0x%08x\n",
2504 		   snapshot->indirect_context_desc);
2505 	drm_printf(p, "\tLRC Head: (memory) %u\n", snapshot->head);
2506 	drm_printf(p, "\tLRC Tail: (internal) %u, (memory) %u\n",
2507 		   snapshot->tail.internal, snapshot->tail.memory);
2508 	drm_printf(p, "\tRing start: (memory) 0x%08x\n", snapshot->start);
2509 	drm_printf(p, "\tStart seqno: (memory) %d\n", snapshot->start_seqno);
2510 	drm_printf(p, "\tSeqno: (memory) %d\n", snapshot->seqno);
2511 	drm_printf(p, "\tTimestamp: 0x%08x\n", snapshot->ctx_timestamp);
2512 	drm_printf(p, "\tJob Timestamp: 0x%08x\n", snapshot->ctx_job_timestamp);
2513 
2514 	if (!snapshot->lrc_snapshot)
2515 		return;
2516 
2517 	drm_printf(p, "\t[HWSP].length: 0x%x\n", LRC_PPHWSP_SIZE);
2518 	drm_puts(p, "\t[HWSP].data: ");
2519 	for (i = 0; i < LRC_PPHWSP_SIZE; i += sizeof(u32)) {
2520 		u32 *val = snapshot->lrc_snapshot + i;
2521 		char dumped[ASCII85_BUFSZ];
2522 
2523 		drm_puts(p, ascii85_encode(*val, dumped));
2524 	}
2525 
2526 	drm_printf(p, "\n\t[HWCTX].length: 0x%lx\n", snapshot->lrc_size - LRC_PPHWSP_SIZE);
2527 	drm_printf(p, "\n\t[HWCTX].replay_offset: 0x%lx\n", snapshot->replay_offset);
2528 	drm_printf(p, "\n\t[HWCTX].replay_length: 0x%lx\n", snapshot->replay_size);
2529 
2530 	drm_puts(p, "\t[HWCTX].data: ");
2531 	for (; i < snapshot->lrc_size; i += sizeof(u32)) {
2532 		u32 *val = snapshot->lrc_snapshot + i;
2533 		char dumped[ASCII85_BUFSZ];
2534 
2535 		drm_puts(p, ascii85_encode(*val, dumped));
2536 	}
2537 	drm_puts(p, "\n");
2538 }
2539 
2540 void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
2541 {
2542 	if (!snapshot)
2543 		return;
2544 
2545 	kvfree(snapshot->lrc_snapshot);
2546 	if (snapshot->lrc_bo)
2547 		xe_bo_put(snapshot->lrc_bo);
2548 
2549 	kfree(snapshot);
2550 }
2551 
2552 static int get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id, u64 *reg_ctx_ts)
2553 {
2554 	u16 class = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id);
2555 	u16 instance = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id);
2556 	struct xe_hw_engine *hwe;
2557 	u64 val;
2558 
2559 	hwe = xe_gt_hw_engine(lrc->gt, class, instance, false);
2560 	if (xe_gt_WARN_ONCE(lrc->gt, !hwe || xe_hw_engine_is_reserved(hwe),
2561 			    "Unexpected engine class:instance %d:%d for context utilization\n",
2562 			    class, instance))
2563 		return -1;
2564 
2565 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
2566 		val = xe_mmio_read64_2x32(&hwe->gt->mmio,
2567 					  RING_CTX_TIMESTAMP(hwe->mmio_base));
2568 	else
2569 		val = xe_mmio_read32(&hwe->gt->mmio,
2570 				     RING_CTX_TIMESTAMP(hwe->mmio_base));
2571 
2572 	*reg_ctx_ts = val;
2573 
2574 	return 0;
2575 }
2576 
2577 /**
2578  * xe_lrc_timestamp() - Current ctx timestamp
2579  * @lrc: Pointer to the lrc.
2580  *
2581  * Return latest ctx timestamp. With support for active contexts, the
2582  * calculation may be slightly racy, so follow a read-again logic to ensure that
2583  * the context is still active before returning the right timestamp.
2584  *
2585  * Returns: New ctx timestamp value
2586  */
2587 u64 xe_lrc_timestamp(struct xe_lrc *lrc)
2588 {
2589 	u64 lrc_ts, reg_ts, new_ts = lrc->ctx_timestamp;
2590 	u32 engine_id;
2591 
2592 	lrc_ts = xe_lrc_ctx_timestamp(lrc);
2593 	/* CTX_TIMESTAMP mmio read is invalid on VF, so return the LRC value */
2594 	if (IS_SRIOV_VF(lrc_to_xe(lrc))) {
2595 		new_ts = lrc_ts;
2596 		goto done;
2597 	}
2598 
2599 	if (lrc_ts == CONTEXT_ACTIVE) {
2600 		engine_id = xe_lrc_engine_id(lrc);
2601 		if (!get_ctx_timestamp(lrc, engine_id, &reg_ts))
2602 			new_ts = reg_ts;
2603 
2604 		/* read lrc again to ensure context is still active */
2605 		lrc_ts = xe_lrc_ctx_timestamp(lrc);
2606 	}
2607 
2608 	/*
2609 	 * If context switched out, just use the lrc_ts. Note that this needs to
2610 	 * be a separate if condition.
2611 	 */
2612 	if (lrc_ts != CONTEXT_ACTIVE)
2613 		new_ts = lrc_ts;
2614 
2615 done:
2616 	return new_ts;
2617 }
2618 
2619 /**
2620  * xe_lrc_update_timestamp() - Update ctx timestamp
2621  * @lrc: Pointer to the lrc.
2622  * @old_ts: Old timestamp value
2623  *
2624  * Populate @old_ts current saved ctx timestamp, read new ctx timestamp and
2625  * update saved value.
2626  *
2627  * Returns: New ctx timestamp value
2628  */
2629 u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts)
2630 {
2631 	*old_ts = lrc->ctx_timestamp;
2632 	lrc->ctx_timestamp = xe_lrc_timestamp(lrc);
2633 
2634 	trace_xe_lrc_update_timestamp(lrc, *old_ts);
2635 
2636 	return lrc->ctx_timestamp;
2637 }
2638 
2639 /**
2640  * xe_lrc_ring_is_idle() - LRC is idle
2641  * @lrc: Pointer to the lrc.
2642  *
2643  * Compare LRC ring head and tail to determine if idle.
2644  *
2645  * Return: True is ring is idle, False otherwise
2646  */
2647 bool xe_lrc_ring_is_idle(struct xe_lrc *lrc)
2648 {
2649 	return xe_lrc_ring_head(lrc) == xe_lrc_ring_tail(lrc);
2650 }
2651