xref: /linux/drivers/gpu/drm/xe/xe_lrc.c (revision 13c072b8e91a5ccb5855ca1ba6fe3ea467dbf94d)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5 
6 #include "xe_lrc.h"
7 
8 #include <generated/xe_wa_oob.h>
9 
10 #include <linux/ascii85.h>
11 #include <linux/panic.h>
12 
13 #include "instructions/xe_mi_commands.h"
14 #include "instructions/xe_gfxpipe_commands.h"
15 #include "instructions/xe_gfx_state_commands.h"
16 #include "regs/xe_engine_regs.h"
17 #include "regs/xe_gt_regs.h"
18 #include "regs/xe_lrc_layout.h"
19 #include "xe_bb.h"
20 #include "xe_bo.h"
21 #include "xe_configfs.h"
22 #include "xe_device.h"
23 #include "xe_drm_client.h"
24 #include "xe_exec_queue_types.h"
25 #include "xe_gt.h"
26 #include "xe_gt_printk.h"
27 #include "xe_hw_fence.h"
28 #include "xe_map.h"
29 #include "xe_memirq.h"
30 #include "xe_mmio.h"
31 #include "xe_sriov.h"
32 #include "xe_trace_lrc.h"
33 #include "xe_vm.h"
34 #include "xe_wa.h"
35 
/*
 * Bit and field positions within a 64-bit value (BIT_ULL()/GENMASK_ULL()).
 * NOTE(review): presumably fields of the LRC descriptor; the users are
 * outside this part of the file - confirm.
 */
#define LRC_VALID				BIT_ULL(0)
#define LRC_PRIVILEGE				BIT_ULL(8)
#define LRC_ADDRESSING_MODE			GENMASK_ULL(4, 3)
#define LRC_LEGACY_64B_CONTEXT			3

#define LRC_ENGINE_CLASS			GENMASK_ULL(63, 61)
#define LRC_ENGINE_INSTANCE			GENMASK_ULL(53, 48)

/* Sizes of the fixed-size regions used by the LRC layout helpers below */
#define LRC_PPHWSP_SIZE				SZ_4K
#define LRC_INDIRECT_CTX_BO_SIZE		SZ_4K
#define LRC_INDIRECT_RING_STATE_SIZE		SZ_4K

/* NOTE(review): the LRC_PRIORITY_* values presumably go in the LRC_PRIORITY field */
#define LRC_PRIORITY				GENMASK_ULL(10, 9)
#define LRC_PRIORITY_LOW			0
#define LRC_PRIORITY_NORMAL			1
#define LRC_PRIORITY_HIGH			2
52 
53 /*
54  * Layout of the LRC and associated data allocated as
55  * lrc->bo:
56  *
57  *   Region                       Size
58  *  +============================+=================================+ <- __xe_lrc_ring_offset()
59  *  | Ring                       | ring_size, see                  |
60  *  |                            | xe_lrc_init()                   |
61  *  +============================+=================================+ <- __xe_lrc_pphwsp_offset()
62  *  | PPHWSP (includes SW state) | 4K                              |
63  *  +----------------------------+---------------------------------+ <- __xe_lrc_regs_offset()
64  *  | Engine Context Image       | n * 4K, see                     |
65  *  |                            | xe_gt_lrc_size()                |
66  *  +----------------------------+---------------------------------+ <- __xe_lrc_indirect_ring_offset()
67  *  | Indirect Ring State Page   | 0 or 4k, see                    |
68  *  |                            | XE_LRC_FLAG_INDIRECT_RING_STATE |
69  *  +============================+=================================+ <- __xe_lrc_indirect_ctx_offset()
70  *  | Indirect Context Page      | 0 or 4k, see                    |
71  *  |                            | XE_LRC_FLAG_INDIRECT_CTX        |
72  *  +============================+=================================+ <- __xe_lrc_wa_bb_offset()
73  *  | WA BB Per Ctx              | 4k                              |
74  *  +============================+=================================+ <- xe_bo_size(lrc->bo)
75  */
76 
77 static struct xe_device *
78 lrc_to_xe(struct xe_lrc *lrc)
79 {
80 	return gt_to_xe(lrc->fence_ctx.gt);
81 }
82 
83 static bool
84 gt_engine_needs_indirect_ctx(struct xe_gt *gt, enum xe_engine_class class)
85 {
86 	struct xe_device *xe = gt_to_xe(gt);
87 
88 	if (XE_GT_WA(gt, 16010904313) &&
89 	    (class == XE_ENGINE_CLASS_RENDER ||
90 	     class == XE_ENGINE_CLASS_COMPUTE))
91 		return true;
92 
93 	if (xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
94 					       class, NULL))
95 		return true;
96 
97 	return false;
98 }
99 
100 /**
101  * xe_gt_lrc_hang_replay_size() - Hang replay size
102  * @gt: The GT
103  * @class: Hardware engine class
104  *
105  * Determine size of GPU hang replay state for a GT and hardware engine class.
106  *
107  * Return: Size of GPU hang replay size
108  */
109 size_t xe_gt_lrc_hang_replay_size(struct xe_gt *gt, enum xe_engine_class class)
110 {
111 	struct xe_device *xe = gt_to_xe(gt);
112 	size_t size = 0;
113 
114 	/* Engine context image */
115 	switch (class) {
116 	case XE_ENGINE_CLASS_RENDER:
117 		if (GRAPHICS_VERx100(xe) >= 3510)
118 			size += 7 * SZ_4K;
119 		else if (GRAPHICS_VER(xe) >= 20)
120 			size += 3 * SZ_4K;
121 		else
122 			size += 13 * SZ_4K;
123 		break;
124 	case XE_ENGINE_CLASS_COMPUTE:
125 		if (GRAPHICS_VERx100(xe) >= 3510)
126 			size += 5 * SZ_4K;
127 		else if (GRAPHICS_VER(xe) >= 20)
128 			size += 2 * SZ_4K;
129 		else
130 			size += 13 * SZ_4K;
131 		break;
132 	default:
133 		WARN(1, "Unknown engine class: %d", class);
134 		fallthrough;
135 	case XE_ENGINE_CLASS_COPY:
136 	case XE_ENGINE_CLASS_VIDEO_DECODE:
137 	case XE_ENGINE_CLASS_VIDEO_ENHANCE:
138 	case XE_ENGINE_CLASS_OTHER:
139 		size += 1 * SZ_4K;
140 	}
141 
142 	return size;
143 }
144 
145 size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
146 {
147 	size_t size = xe_gt_lrc_hang_replay_size(gt, class);
148 
149 	/* Add indirect ring state page */
150 	if (xe_gt_has_indirect_ring_state(gt))
151 		size += LRC_INDIRECT_RING_STATE_SIZE;
152 
153 	return size + LRC_PPHWSP_SIZE;
154 }
155 
156 /*
157  * The per-platform tables are u8-encoded in @data. Decode @data and set the
158  * addresses' offset and commands in @regs. The following encoding is used
159  * for each byte. There are 2 steps: decoding commands and decoding addresses.
160  *
161  * Commands:
162  * [7]: create NOPs - number of NOPs are set in lower bits
163  * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set
164  *      MI_LRI_FORCE_POSTED
165  * [5:0]: Number of NOPs or registers to set values to in case of
166  *        MI_LOAD_REGISTER_IMM
167  *
168  * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
169  * number of registers. They are set by using the REG/REG16 macros: the former
170  * is used for offsets smaller than 0x200 while the latter is for values bigger
171  * than that. Those macros already set all the bits documented below correctly:
172  *
173  * [7]: When a register offset needs more than 6 bits, use additional bytes, to
174  *      follow, for the lower bits
175  * [6:0]: Register offset, without considering the engine base.
176  *
177  * This function only tweaks the commands and register offsets. Values are not
178  * filled out.
179  */
180 static void set_offsets(u32 *regs,
181 			const u8 *data,
182 			const struct xe_hw_engine *hwe)
183 #define NOP(x) (BIT(7) | (x))
184 #define LRI(count, flags) ((flags) << 6 | (count) | \
185 			   BUILD_BUG_ON_ZERO(count >= BIT(6)))
186 #define POSTED BIT(0)
187 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
188 #define REG16(x) \
189 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
190 	(((x) >> 2) & 0x7f)
191 {
192 	const u32 base = hwe->mmio_base;
193 
194 	while (*data) {
195 		u8 count, flags;
196 
197 		if (*data & BIT(7)) { /* skip */
198 			count = *data++ & ~BIT(7);
199 			regs += count;
200 			continue;
201 		}
202 
203 		count = *data & 0x3f;
204 		flags = *data >> 6;
205 		data++;
206 
207 		*regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
208 		if (flags & POSTED)
209 			*regs |= MI_LRI_FORCE_POSTED;
210 		*regs |= MI_LRI_LRM_CS_MMIO;
211 		regs++;
212 
213 		xe_gt_assert(hwe->gt, count);
214 		do {
215 			u32 offset = 0;
216 			u8 v;
217 
218 			do {
219 				v = *data++;
220 				offset <<= 7;
221 				offset |= v & ~BIT(7);
222 			} while (v & BIT(7));
223 
224 			regs[0] = base + (offset << 2);
225 			regs += 2;
226 		} while (--count);
227 	}
228 
229 	*regs = MI_BATCH_BUFFER_END | BIT(0);
230 }
231 
/*
 * Register table in the encoding decoded by set_offsets(). reg_offsets()
 * returns it for copy engines below graphics version 20 and for the other
 * non-render classes below version 12.55. The [0x..] annotations are the
 * dword index, within the decoded register state, of each command.
 */
static const u8 gen12_xcs_offsets[] = {
	NOP(1),			/* [0x00] */
	LRI(13, POSTED),	/* [0x01], registers at [0x02]..[0x1b] */
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),			/* [0x1c] */
	LRI(9, POSTED),		/* [0x21], registers at [0x22]..[0x33] */
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	0
};
263 
/*
 * Register table in the encoding decoded by set_offsets(). reg_offsets()
 * returns it for classes other than render and copy on graphics versions
 * from 12.55 up to, but excluding, 20. The [0x..] annotations are the dword
 * index, within the decoded register state, of each command.
 */
static const u8 dg2_xcs_offsets[] = {
	NOP(1),			/* [0x00] */
	LRI(15, POSTED),	/* [0x01], registers at [0x02]..[0x1f] */
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),			/* [0x20] */
	LRI(9, POSTED),		/* [0x21], registers at [0x22]..[0x33] */
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	0
};
297 
/*
 * Register table in the encoding decoded by set_offsets(). reg_offsets()
 * returns it for render engines below graphics version 12.50. The [0x..]
 * annotations are the dword index, within the decoded register state, of
 * each command.
 */
static const u8 gen12_rcs_offsets[] = {
	NOP(1),			/* [0x00] */
	LRI(13, POSTED),	/* [0x01], registers at [0x02]..[0x1b] */
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),			/* [0x1c] */
	LRI(9, POSTED),		/* [0x21], registers at [0x22]..[0x33] */
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),		/* [0x34], registers at [0x35]..[0x3a] */
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),			/* [0x3b] */
	LRI(1, 0),		/* [0x41], register at [0x42] */
	REG(0x0c8),
	NOP(3 + 9 + 1),		/* [0x44] */

	LRI(51, POSTED),	/* [0x51], registers at [0x52]..[0xb7] */
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),
	REG(0x084),
	NOP(1),			/* [0xb8] */

	0
};
393 
/*
 * Register table in the encoding decoded by set_offsets(). reg_offsets()
 * returns it for render engines on graphics versions from 12.50 up to, but
 * excluding, 12.55. The [0x..] annotations are the dword index, within the
 * decoded register state, of each command.
 */
static const u8 xehp_rcs_offsets[] = {
	NOP(1),			/* [0x00] */
	LRI(13, POSTED),	/* [0x01], registers at [0x02]..[0x1b] */
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),			/* [0x1c] */
	LRI(9, POSTED),		/* [0x21], registers at [0x22]..[0x33] */
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),		/* [0x34], registers at [0x35]..[0x3a] */
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),			/* [0x3b] */
	LRI(1, 0),		/* [0x41], register at [0x42] */
	REG(0x0c8),

	0
};
434 
/*
 * Register table in the encoding decoded by set_offsets(). reg_offsets()
 * returns it for render engines on graphics versions from 12.55 up to, but
 * excluding, 12.70. The [0x..] annotations are the dword index, within the
 * decoded register state, of each command.
 */
static const u8 dg2_rcs_offsets[] = {
	NOP(1),			/* [0x00] */
	LRI(15, POSTED),	/* [0x01], registers at [0x02]..[0x1f] */
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),			/* [0x20] */
	LRI(9, POSTED),		/* [0x21], registers at [0x22]..[0x33] */
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),		/* [0x34], registers at [0x35]..[0x3a] */
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),			/* [0x3b] */
	LRI(1, 0),		/* [0x41], register at [0x42] */
	REG(0x0c8),

	0
};
477 
/*
 * Register table in the encoding decoded by set_offsets(). reg_offsets()
 * returns it for render engines on graphics versions from 12.70 up to, but
 * excluding, 20. The [0x..] annotations are the dword index, within the
 * decoded register state, of each command.
 */
static const u8 mtl_rcs_offsets[] = {
	NOP(1),			/* [0x00] */
	LRI(15, POSTED),	/* [0x01], registers at [0x02]..[0x1f] */
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),			/* [0x20] */
	LRI(9, POSTED),		/* [0x21], registers at [0x22]..[0x33] */
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(2),			/* [0x34] */
	LRI(2, POSTED),		/* [0x36], registers at [0x37]..[0x3a] */
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),			/* [0x3b] */
	LRI(1, 0),		/* [0x41], register at [0x42] */
	REG(0x0c8),

	0
};
520 
/*
 * Leading part shared by the xe2_*_offsets tables, in the encoding decoded by
 * set_offsets(). The [0x..] annotations are the dword index, within the
 * decoded register state, of each entry; the decoded output of this part
 * covers dwords [0x00]..[0x33]. The expansion has no trailing comma, users
 * add their own.
 */
#define XE2_CTX_COMMON \
	NOP(1),                 /* [0x00] */ \
	LRI(15, POSTED),        /* [0x01] */ \
	REG16(0x244),           /* [0x02] CTXT_SR_CTL */ \
	REG(0x034),             /* [0x04] RING_BUFFER_HEAD */ \
	REG(0x030),             /* [0x06] RING_BUFFER_TAIL */ \
	REG(0x038),             /* [0x08] RING_BUFFER_START */ \
	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */ \
	REG(0x168),             /* [0x0c] BB_ADDR_UDW */ \
	REG(0x140),             /* [0x0e] BB_ADDR */ \
	REG(0x110),             /* [0x10] BB_STATE */ \
	REG(0x1c0),             /* [0x12] BB_PER_CTX_PTR */ \
	REG(0x1c4),             /* [0x14] RCS_INDIRECT_CTX */ \
	REG(0x1c8),             /* [0x16] RCS_INDIRECT_CTX_OFFSET */ \
	REG(0x180),             /* [0x18] CCID */ \
	REG16(0x2b4),           /* [0x1a] SEMAPHORE_TOKEN */ \
	REG(0x120),             /* [0x1c] PRT_BB_STATE */ \
	REG(0x124),             /* [0x1e] PRT_BB_STATE_UDW */ \
	\
	NOP(1),                 /* [0x20] */ \
	LRI(9, POSTED),         /* [0x21] */ \
	REG16(0x3a8),           /* [0x22] CTX_TIMESTAMP */ \
	REG16(0x3ac),           /* [0x24] CTX_TIMESTAMP_UDW */ \
	REG(0x108),             /* [0x26] INDIRECT_RING_STATE */ \
	REG16(0x284),           /* [0x28] dummy reg */ \
	REG16(0x280),           /* [0x2a] CS_ACC_CTR_THOLD */ \
	REG16(0x27c),           /* [0x2c] CS_CTX_SYS_PASID */ \
	REG16(0x278),           /* [0x2e] CS_CTX_ASID */ \
	REG16(0x274),           /* [0x30] PTBP_UDW */ \
	REG16(0x270)            /* [0x32] PTBP_LDW */
551 
/*
 * Register table returned by reg_offsets() for render engines on graphics
 * version 20 and later. The [0x..] annotations are the dword index of each
 * entry within the decoded register state.
 */
static const u8 xe2_rcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(2),                 /* [0x34] */
	LRI(2, POSTED),         /* [0x36] */
	REG16(0x5a8),           /* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
	REG16(0x5ac),           /* [0x39] PREEMPTION_STATUS */

	NOP(6),                 /* [0x3b] */
	LRI(1, 0),              /* [0x41] */
	REG(0x0c8),             /* [0x42] R_PWR_CLK_STATE */

	0
};
566 
/*
 * Register table returned by reg_offsets() for copy engines on graphics
 * version 20 and later. The [0x..] annotations are the dword index of each
 * entry within the decoded register state.
 */
static const u8 xe2_bcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(4 + 8 + 1),         /* [0x34] */
	LRI(2, POSTED),         /* [0x41] */
	REG16(0x200),           /* [0x42] BCS_SWCTRL */
	REG16(0x204),           /* [0x44] BLIT_CCTL */

	0
};
577 
/*
 * Register table returned by reg_offsets() for engine classes other than
 * render and copy on graphics version 20 and later: only the common part.
 */
static const u8 xe2_xcs_offsets[] = {
	XE2_CTX_COMMON,

	0
};
583 
/*
 * Register table for the indirect ring state page; empty_lrc_data() decodes
 * it with set_offsets() when the GT has indirect ring state. The [0x..]
 * annotations are the dword index of each entry within the decoded page.
 */
static const u8 xe2_indirect_ring_state_offsets[] = {
	NOP(1),                 /* [0x00] */
	LRI(5, POSTED),         /* [0x01] */
	REG(0x034),             /* [0x02] RING_BUFFER_HEAD */
	REG(0x030),             /* [0x04] RING_BUFFER_TAIL */
	REG(0x038),             /* [0x06] RING_BUFFER_START */
	REG(0x048),             /* [0x08] RING_BUFFER_START_UDW */
	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */

	NOP(5),                 /* [0x0c] */
	LRI(9, POSTED),         /* [0x11] */
	REG(0x168),             /* [0x12] BB_ADDR_UDW */
	REG(0x140),             /* [0x14] BB_ADDR */
	REG(0x110),             /* [0x16] BB_STATE */
	REG16(0x588),           /* [0x18] BB_STACK_WRITE_PORT */
	REG16(0x588),           /* [0x1a] BB_STACK_WRITE_PORT */
	REG16(0x588),           /* [0x1c] BB_STACK_WRITE_PORT */
	REG16(0x588),           /* [0x1e] BB_STACK_WRITE_PORT */
	REG16(0x588),           /* [0x20] BB_STACK_WRITE_PORT */
	REG16(0x588),           /* [0x22] BB_STACK_WRITE_PORT */

	NOP(12),                 /* [0x24] */

	0
};
609 
/* The table-encoding helpers are not needed past the tables above */
#undef REG16
#undef REG
#undef LRI
#undef NOP
614 
615 static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
616 {
617 	if (class == XE_ENGINE_CLASS_RENDER) {
618 		if (GRAPHICS_VER(xe) >= 20)
619 			return xe2_rcs_offsets;
620 		else if (GRAPHICS_VERx100(xe) >= 1270)
621 			return mtl_rcs_offsets;
622 		else if (GRAPHICS_VERx100(xe) >= 1255)
623 			return dg2_rcs_offsets;
624 		else if (GRAPHICS_VERx100(xe) >= 1250)
625 			return xehp_rcs_offsets;
626 		else
627 			return gen12_rcs_offsets;
628 	} else if (class == XE_ENGINE_CLASS_COPY) {
629 		if (GRAPHICS_VER(xe) >= 20)
630 			return xe2_bcs_offsets;
631 		else
632 			return gen12_xcs_offsets;
633 	} else {
634 		if (GRAPHICS_VER(xe) >= 20)
635 			return xe2_xcs_offsets;
636 		else if (GRAPHICS_VERx100(xe) >= 1255)
637 			return dg2_xcs_offsets;
638 		else
639 			return gen12_xcs_offsets;
640 	}
641 }
642 
643 static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
644 {
645 	regs[CTX_CONTEXT_CONTROL] = REG_MASKED_FIELD_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
646 							    CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
647 
648 	if (xe_gt_has_indirect_ring_state(hwe->gt))
649 		regs[CTX_CONTEXT_CONTROL] |=
650 			REG_MASKED_FIELD_ENABLE(CTX_CTRL_INDIRECT_RING_STATE_ENABLE);
651 }
652 
653 static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe)
654 {
655 	struct xe_memirq *memirq = &gt_to_tile(hwe->gt)->memirq;
656 	struct xe_device *xe = gt_to_xe(hwe->gt);
657 	u8 num_regs;
658 
659 	if (!xe_device_uses_memirq(xe))
660 		return;
661 
662 	regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM |
663 					MI_LRI_LRM_CS_MMIO | MI_LRM_USE_GGTT;
664 	regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr;
665 	regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq);
666 
667 	num_regs = xe_device_has_msix(xe) ? 3 : 2;
668 	regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(num_regs) |
669 				       MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED;
670 	regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr;
671 	regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq, hwe);
672 	regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr;
673 	regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq, hwe);
674 
675 	if (xe_device_has_msix(xe)) {
676 		regs[CTX_CS_INT_VEC_REG] = CS_INT_VEC(0).addr;
677 		/* CTX_CS_INT_VEC_DATA will be set in xe_lrc_init */
678 	}
679 }
680 
681 static int lrc_ring_mi_mode(struct xe_hw_engine *hwe)
682 {
683 	struct xe_device *xe = gt_to_xe(hwe->gt);
684 
685 	if (GRAPHICS_VERx100(xe) >= 1250)
686 		return 0x70;
687 	else
688 		return 0x60;
689 }
690 
691 static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe)
692 {
693 	int x;
694 
695 	x = lrc_ring_mi_mode(hwe);
696 	regs[x + 1] &= ~STOP_RING;
697 	regs[x + 1] |= STOP_RING << 16;
698 }
699 
700 static inline bool xe_lrc_has_indirect_ring_state(struct xe_lrc *lrc)
701 {
702 	return lrc->flags & XE_LRC_FLAG_INDIRECT_RING_STATE;
703 }
704 
705 static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
706 {
707 	return 0;
708 }
709 
710 u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
711 {
712 	return lrc->ring.size;
713 }
714 
/*
 * Make the magic macros work: DECL_MAP_ADDR_HELPERS() pastes together
 * __xe_lrc_<elem>_offset, while the pphwsp and regs offset helpers are
 * defined without the leading underscores.
 */
#define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset
#define __xe_lrc_regs_offset xe_lrc_regs_offset

/* Byte offsets, within the PPHWSP, of the driver-defined state */
#define LRC_CTX_JOB_TIMESTAMP_OFFSET 512
#define LRC_ENGINE_ID_PPHWSP_OFFSET 1024
#define LRC_PARALLEL_PPHWSP_OFFSET 2048

/* Byte offsets within lrc->seqno_bo */
#define LRC_SEQNO_OFFSET 0
#define LRC_START_SEQNO_OFFSET (LRC_SEQNO_OFFSET + 8)
725 
726 u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
727 {
728 	return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
729 }
730 
731 /**
732  * xe_lrc_reg_size() - Get size of the LRC registers area within queues
733  * @xe: the &xe_device struct instance
734  *
735  * Returns: Size of the LRC registers area for current platform
736  */
737 size_t xe_lrc_reg_size(struct xe_device *xe)
738 {
739 	if (GRAPHICS_VERx100(xe) >= 1250)
740 		return 96 * sizeof(u32);
741 	else
742 		return 80 * sizeof(u32);
743 }
744 
745 size_t xe_lrc_skip_size(struct xe_device *xe)
746 {
747 	return LRC_PPHWSP_SIZE + xe_lrc_reg_size(xe);
748 }
749 
750 static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
751 {
752 	return LRC_SEQNO_OFFSET;
753 }
754 
755 static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
756 {
757 	return LRC_START_SEQNO_OFFSET;
758 }
759 
760 static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc)
761 {
762 	/* This is stored in the driver-defined portion of PPHWSP */
763 	return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET;
764 }
765 
766 static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
767 {
768 	/* The parallel is stored in the driver-defined portion of PPHWSP */
769 	return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
770 }
771 
772 static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc)
773 {
774 	return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET;
775 }
776 
777 static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
778 {
779 	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
780 }
781 
782 static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc)
783 {
784 	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32);
785 }
786 
787 static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
788 {
789 	u32 offset = xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE -
790 		     LRC_INDIRECT_RING_STATE_SIZE;
791 
792 	if (lrc->flags & XE_LRC_FLAG_INDIRECT_CTX)
793 		offset -= LRC_INDIRECT_CTX_BO_SIZE;
794 
795 	return offset;
796 }
797 
798 static inline u32 __xe_lrc_indirect_ctx_offset(struct xe_lrc *lrc)
799 {
800 	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE - LRC_INDIRECT_CTX_BO_SIZE;
801 }
802 
803 static inline u32 __xe_lrc_wa_bb_offset(struct xe_lrc *lrc)
804 {
805 	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE;
806 }
807 
/*
 * DECL_MAP_ADDR_HELPERS(elem, bo_expr) generates two accessors for a region
 * whose offset is given by __xe_lrc_<elem>_offset(lrc):
 *
 *  - __xe_lrc_<elem>_map(lrc): a copy of the vmap of the BO named by
 *    @bo_expr, advanced by that offset (asserts the vmap is not null);
 *  - __xe_lrc_<elem>_ggtt_addr(lrc): the GGTT address of that BO plus the
 *    same offset.
 *
 * @bo_expr is evaluated with "lrc" in scope. The seqno helpers use
 * lrc->seqno_bo, all others use lrc->bo.
 */
#define DECL_MAP_ADDR_HELPERS(elem, bo_expr) \
static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \
{ \
	struct xe_bo *bo = (bo_expr); \
	struct iosys_map map = bo->vmap; \
\
	xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map));  \
	iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \
	return map; \
} \
static inline u32 __maybe_unused __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \
{ \
	struct xe_bo *bo = (bo_expr); \
\
	return xe_bo_ggtt_addr(bo) + __xe_lrc_##elem##_offset(lrc); \
} \

DECL_MAP_ADDR_HELPERS(ring, lrc->bo)
DECL_MAP_ADDR_HELPERS(pphwsp, lrc->bo)
DECL_MAP_ADDR_HELPERS(seqno, lrc->seqno_bo)
DECL_MAP_ADDR_HELPERS(regs, lrc->bo)
DECL_MAP_ADDR_HELPERS(start_seqno, lrc->seqno_bo)
DECL_MAP_ADDR_HELPERS(ctx_job_timestamp, lrc->bo)
DECL_MAP_ADDR_HELPERS(ctx_timestamp, lrc->bo)
DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw, lrc->bo)
DECL_MAP_ADDR_HELPERS(parallel, lrc->bo)
DECL_MAP_ADDR_HELPERS(indirect_ring, lrc->bo)
DECL_MAP_ADDR_HELPERS(engine_id, lrc->bo)

#undef DECL_MAP_ADDR_HELPERS
838 
839 /**
840  * xe_lrc_ctx_timestamp_ggtt_addr() - Get ctx timestamp GGTT address
841  * @lrc: Pointer to the lrc.
842  *
843  * Returns: ctx timestamp GGTT address
844  */
845 u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc)
846 {
847 	return __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
848 }
849 
850 /**
851  * xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address
852  * @lrc: Pointer to the lrc.
853  *
854  * Returns: ctx timestamp udw GGTT address
855  */
856 u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc)
857 {
858 	return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
859 }
860 
861 /**
862  * xe_lrc_ctx_timestamp() - Read ctx timestamp value
863  * @lrc: Pointer to the lrc.
864  *
865  * Returns: ctx timestamp value
866  */
867 static u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
868 {
869 	struct xe_device *xe = lrc_to_xe(lrc);
870 	struct iosys_map map;
871 	u32 ldw, udw = 0;
872 
873 	map = __xe_lrc_ctx_timestamp_map(lrc);
874 	ldw = xe_map_read32(xe, &map);
875 
876 	if (xe->info.has_64bit_timestamp) {
877 		map = __xe_lrc_ctx_timestamp_udw_map(lrc);
878 		udw = xe_map_read32(xe, &map);
879 	}
880 
881 	return (u64)udw << 32 | ldw;
882 }
883 
884 /**
885  * xe_lrc_ctx_job_timestamp_ggtt_addr() - Get ctx job timestamp GGTT address
886  * @lrc: Pointer to the lrc.
887  *
888  * Returns: ctx timestamp job GGTT address
889  */
890 u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc)
891 {
892 	return __xe_lrc_ctx_job_timestamp_ggtt_addr(lrc);
893 }
894 
895 /**
896  * xe_lrc_ctx_job_timestamp() - Read ctx job timestamp value
897  * @lrc: Pointer to the lrc.
898  *
899  * Returns: ctx timestamp job value
900  */
901 u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc)
902 {
903 	struct xe_device *xe = lrc_to_xe(lrc);
904 	struct iosys_map map;
905 
906 	map = __xe_lrc_ctx_job_timestamp_map(lrc);
907 	return xe_map_read32(xe, &map);
908 }
909 
910 u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
911 {
912 	return __xe_lrc_pphwsp_ggtt_addr(lrc);
913 }
914 
915 u32 xe_lrc_indirect_ring_ggtt_addr(struct xe_lrc *lrc)
916 {
917 	if (!xe_lrc_has_indirect_ring_state(lrc))
918 		return 0;
919 
920 	return __xe_lrc_indirect_ring_ggtt_addr(lrc);
921 }
922 
923 static u32 xe_lrc_read_indirect_ctx_reg(struct xe_lrc *lrc, int reg_nr)
924 {
925 	struct xe_device *xe = lrc_to_xe(lrc);
926 	struct iosys_map map;
927 
928 	map = __xe_lrc_indirect_ring_map(lrc);
929 	iosys_map_incr(&map, reg_nr * sizeof(u32));
930 	return xe_map_read32(xe, &map);
931 }
932 
933 static void xe_lrc_write_indirect_ctx_reg(struct xe_lrc *lrc,
934 					  int reg_nr, u32 val)
935 {
936 	struct xe_device *xe = lrc_to_xe(lrc);
937 	struct iosys_map map;
938 
939 	map = __xe_lrc_indirect_ring_map(lrc);
940 	iosys_map_incr(&map, reg_nr * sizeof(u32));
941 	xe_map_write32(xe, &map, val);
942 }
943 
944 u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
945 {
946 	struct xe_device *xe = lrc_to_xe(lrc);
947 	struct iosys_map map;
948 
949 	map = __xe_lrc_regs_map(lrc);
950 	iosys_map_incr(&map, reg_nr * sizeof(u32));
951 	return xe_map_read32(xe, &map);
952 }
953 
954 void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
955 {
956 	struct xe_device *xe = lrc_to_xe(lrc);
957 	struct iosys_map map;
958 
959 	map = __xe_lrc_regs_map(lrc);
960 	iosys_map_incr(&map, reg_nr * sizeof(u32));
961 	xe_map_write32(xe, &map, val);
962 }
963 
964 static void *empty_lrc_data(struct xe_hw_engine *hwe)
965 {
966 	struct xe_gt *gt = hwe->gt;
967 	void *data;
968 	u32 *regs;
969 
970 	data = kzalloc(xe_gt_lrc_size(gt, hwe->class), GFP_KERNEL);
971 	if (!data)
972 		return NULL;
973 
974 	/* 1st page: Per-Process of HW status Page */
975 	regs = data + LRC_PPHWSP_SIZE;
976 	set_offsets(regs, reg_offsets(gt_to_xe(gt), hwe->class), hwe);
977 	set_context_control(regs, hwe);
978 	set_memory_based_intr(regs, hwe);
979 	reset_stop_ring(regs, hwe);
980 	if (xe_gt_has_indirect_ring_state(gt)) {
981 		regs = data + xe_gt_lrc_size(gt, hwe->class) -
982 		       LRC_INDIRECT_RING_STATE_SIZE;
983 		set_offsets(regs, xe2_indirect_ring_state_offsets, hwe);
984 	}
985 
986 	return data;
987 }
988 
989 /**
990  * xe_default_lrc_update_memirq_regs_with_address - Re-compute GGTT references in default LRC
991  * of given engine.
992  * @hwe: the &xe_hw_engine struct instance
993  */
994 void xe_default_lrc_update_memirq_regs_with_address(struct xe_hw_engine *hwe)
995 {
996 	struct xe_gt *gt = hwe->gt;
997 	u32 *regs;
998 
999 	if (!gt->default_lrc[hwe->class])
1000 		return;
1001 
1002 	regs = gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE;
1003 	set_memory_based_intr(regs, hwe);
1004 }
1005 
1006 /**
1007  * xe_lrc_update_memirq_regs_with_address - Re-compute GGTT references in mem interrupt data
1008  * for given LRC.
1009  * @lrc: the &xe_lrc struct instance
1010  * @hwe: the &xe_hw_engine struct instance
1011  * @regs: scratch buffer to be used as temporary storage
1012  */
1013 void xe_lrc_update_memirq_regs_with_address(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
1014 					    u32 *regs)
1015 {
1016 	struct xe_gt *gt = hwe->gt;
1017 	struct iosys_map map;
1018 	size_t regs_len;
1019 
1020 	if (!xe_device_uses_memirq(gt_to_xe(gt)))
1021 		return;
1022 
1023 	map = __xe_lrc_regs_map(lrc);
1024 	regs_len = xe_lrc_reg_size(gt_to_xe(gt));
1025 	xe_map_memcpy_from(gt_to_xe(gt), regs, &map, 0, regs_len);
1026 	set_memory_based_intr(regs, hwe);
1027 	xe_map_memcpy_to(gt_to_xe(gt), &map, 0, regs, regs_len);
1028 }
1029 
1030 static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
1031 {
1032 	u64 desc = xe_vm_pdp4_descriptor(vm, gt_to_tile(lrc->gt));
1033 
1034 	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
1035 	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
1036 }
1037 
1038 static void xe_lrc_finish(struct xe_lrc *lrc)
1039 {
1040 	xe_hw_fence_ctx_finish(&lrc->fence_ctx);
1041 	xe_bo_unpin_map_no_vm(lrc->bo);
1042 	xe_bo_unpin_map_no_vm(lrc->seqno_bo);
1043 }
1044 
1045 /*
1046  * wa_bb_setup_utilization() - Write commands to wa bb to assist
1047  * in calculating active context run ticks.
1048  *
1049  * Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the
1050  * context, but only gets updated when the context switches out. In order to
1051  * check how long a context has been active before it switches out, two things
1052  * are required:
1053  *
1054  * (1) Determine if the context is running:
1055  * To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in
1056  * the LRC. The value chosen is 1 since 0 is the initial value when the LRC is
1057  * initialized. During a query, we just check for this value to determine if the
1058  * context is active. If the context switched out, it would overwrite this
1059  * location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as
1060  * the last part of context restore, so reusing this LRC location will not
1061  * clobber anything.
1062  *
1063  * (2) Calculate the time that the context has been active for:
1064  * The CTX_TIMESTAMP ticks only when the context is active. If a context is
1065  * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization.
1066  * While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific
1067  * engine instance. Since we do not know which instance the context is running
1068  * on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and
1069  * store it in the PPHSWP.
1070  */
1071 #define CONTEXT_ACTIVE 1ULL
1072 static ssize_t setup_utilization_wa(struct xe_lrc *lrc,
1073 				    struct xe_hw_engine *hwe,
1074 				    u32 *batch,
1075 				    size_t max_len)
1076 {
1077 	u32 *cmd = batch;
1078 
1079 	if (IS_SRIOV_VF(gt_to_xe(lrc->gt)))
1080 		return 0;
1081 
1082 	if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
1083 		return -ENOSPC;
1084 
1085 	*cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET;
1086 	*cmd++ = ENGINE_ID(0).addr;
1087 	*cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc);
1088 	*cmd++ = 0;
1089 
1090 	*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
1091 	*cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
1092 	*cmd++ = 0;
1093 	*cmd++ = lower_32_bits(CONTEXT_ACTIVE);
1094 
1095 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp) {
1096 		*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
1097 		*cmd++ = __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
1098 		*cmd++ = 0;
1099 		*cmd++ = upper_32_bits(CONTEXT_ACTIVE);
1100 	}
1101 
1102 	return cmd - batch;
1103 }
1104 
1105 static ssize_t setup_timestamp_wa(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
1106 				  u32 *batch, size_t max_len)
1107 {
1108 	const u32 ts_addr = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
1109 	u32 *cmd = batch;
1110 
1111 	if (!XE_GT_WA(lrc->gt, 16010904313) ||
1112 	    !(hwe->class == XE_ENGINE_CLASS_RENDER ||
1113 	      hwe->class == XE_ENGINE_CLASS_COMPUTE ||
1114 	      hwe->class == XE_ENGINE_CLASS_COPY ||
1115 	      hwe->class == XE_ENGINE_CLASS_VIDEO_DECODE ||
1116 	      hwe->class == XE_ENGINE_CLASS_VIDEO_ENHANCE))
1117 		return 0;
1118 
1119 	if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
1120 		return -ENOSPC;
1121 
1122 	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
1123 		 MI_LRM_ASYNC;
1124 	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
1125 	*cmd++ = ts_addr;
1126 	*cmd++ = 0;
1127 
1128 	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
1129 		 MI_LRM_ASYNC;
1130 	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
1131 	*cmd++ = ts_addr;
1132 	*cmd++ = 0;
1133 
1134 	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO;
1135 	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
1136 	*cmd++ = ts_addr;
1137 	*cmd++ = 0;
1138 
1139 	return cmd - batch;
1140 }
1141 
1142 static ssize_t setup_configfs_post_ctx_restore_bb(struct xe_lrc *lrc,
1143 						  struct xe_hw_engine *hwe,
1144 						  u32 *batch, size_t max_len)
1145 {
1146 	struct xe_device *xe = gt_to_xe(lrc->gt);
1147 	const u32 *user_batch;
1148 	u32 *cmd = batch;
1149 	u32 count;
1150 
1151 	count = xe_configfs_get_ctx_restore_post_bb(to_pci_dev(xe->drm.dev),
1152 						    hwe->class, &user_batch);
1153 	if (!count)
1154 		return 0;
1155 
1156 	if (count > max_len)
1157 		return -ENOSPC;
1158 
1159 	/*
1160 	 * This should be used only for tests and validation. Taint the kernel
1161 	 * as anything could be submitted directly in context switches
1162 	 */
1163 	add_taint(TAINT_TEST, LOCKDEP_STILL_OK);
1164 
1165 	memcpy(cmd, user_batch, count * sizeof(u32));
1166 	cmd += count;
1167 
1168 	return cmd - batch;
1169 }
1170 
1171 static ssize_t setup_configfs_mid_ctx_restore_bb(struct xe_lrc *lrc,
1172 						 struct xe_hw_engine *hwe,
1173 						 u32 *batch, size_t max_len)
1174 {
1175 	struct xe_device *xe = gt_to_xe(lrc->gt);
1176 	const u32 *user_batch;
1177 	u32 *cmd = batch;
1178 	u32 count;
1179 
1180 	count = xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
1181 						   hwe->class, &user_batch);
1182 	if (!count)
1183 		return 0;
1184 
1185 	if (count > max_len)
1186 		return -ENOSPC;
1187 
1188 	/*
1189 	 * This should be used only for tests and validation. Taint the kernel
1190 	 * as anything could be submitted directly in context switches
1191 	 */
1192 	add_taint(TAINT_TEST, LOCKDEP_STILL_OK);
1193 
1194 	memcpy(cmd, user_batch, count * sizeof(u32));
1195 	cmd += count;
1196 
1197 	return cmd - batch;
1198 }
1199 
1200 static ssize_t setup_invalidate_state_cache_wa(struct xe_lrc *lrc,
1201 					       struct xe_hw_engine *hwe,
1202 					       u32 *batch, size_t max_len)
1203 {
1204 	u32 *cmd = batch;
1205 
1206 	if (!XE_GT_WA(lrc->gt, 18022495364) ||
1207 	    hwe->class != XE_ENGINE_CLASS_RENDER)
1208 		return 0;
1209 
1210 	if (xe_gt_WARN_ON(lrc->gt, max_len < 3))
1211 		return -ENOSPC;
1212 
1213 	*cmd++ = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1);
1214 	*cmd++ = CS_DEBUG_MODE2(0).addr;
1215 	*cmd++ = REG_MASKED_FIELD_ENABLE(INSTRUCTION_STATE_CACHE_INVALIDATE);
1216 
1217 	return cmd - batch;
1218 }
1219 
1220 struct bo_setup {
1221 	ssize_t (*setup)(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
1222 			 u32 *batch, size_t max_size);
1223 };
1224 
1225 struct bo_setup_state {
1226 	/* Input: */
1227 	struct xe_lrc		*lrc;
1228 	struct xe_hw_engine	*hwe;
1229 	size_t			max_size;
1230 	size_t                  reserve_dw;
1231 	unsigned int		offset;
1232 	const struct bo_setup	*funcs;
1233 	unsigned int		num_funcs;
1234 
1235 	/* State: */
1236 	u32			*buffer;
1237 	u32			*ptr;
1238 	unsigned int		written;
1239 };
1240 
1241 static int setup_bo(struct bo_setup_state *state)
1242 {
1243 	ssize_t remain;
1244 
1245 	if (state->lrc->bo->vmap.is_iomem) {
1246 		xe_gt_assert(state->hwe->gt, state->buffer);
1247 		state->ptr = state->buffer;
1248 	} else {
1249 		state->ptr = state->lrc->bo->vmap.vaddr + state->offset;
1250 	}
1251 
1252 	remain = state->max_size / sizeof(u32);
1253 
1254 	for (size_t i = 0; i < state->num_funcs; i++) {
1255 		ssize_t len = state->funcs[i].setup(state->lrc, state->hwe,
1256 						    state->ptr, remain);
1257 
1258 		remain -= len;
1259 
1260 		/*
1261 		 * Caller has asked for at least reserve_dw to remain unused.
1262 		 */
1263 		if (len < 0 ||
1264 		    xe_gt_WARN_ON(state->lrc->gt, remain < state->reserve_dw))
1265 			goto fail;
1266 
1267 		state->ptr += len;
1268 		state->written += len;
1269 	}
1270 
1271 	return 0;
1272 
1273 fail:
1274 	return -ENOSPC;
1275 }
1276 
1277 static void finish_bo(struct bo_setup_state *state)
1278 {
1279 	if (!state->lrc->bo->vmap.is_iomem)
1280 		return;
1281 
1282 	xe_map_memcpy_to(gt_to_xe(state->lrc->gt), &state->lrc->bo->vmap,
1283 			 state->offset, state->buffer,
1284 			 state->written * sizeof(u32));
1285 }
1286 
1287 /**
1288  * xe_lrc_setup_wa_bb_with_scratch - Execute all wa bb setup callbacks.
1289  * @lrc: the &xe_lrc struct instance
1290  * @hwe: the &xe_hw_engine struct instance
1291  * @scratch: preallocated scratch buffer for temporary storage
1292  * Return: 0 on success, negative error code on failure
1293  */
1294 int xe_lrc_setup_wa_bb_with_scratch(struct xe_lrc *lrc, struct xe_hw_engine *hwe, u32 *scratch)
1295 {
1296 	static const struct bo_setup funcs[] = {
1297 		{ .setup = setup_timestamp_wa },
1298 		{ .setup = setup_invalidate_state_cache_wa },
1299 		{ .setup = setup_utilization_wa },
1300 		{ .setup = setup_configfs_post_ctx_restore_bb },
1301 	};
1302 	struct bo_setup_state state = {
1303 		.lrc = lrc,
1304 		.hwe = hwe,
1305 		.max_size = LRC_WA_BB_SIZE,
1306 		.buffer = scratch,
1307 		.reserve_dw = 1,
1308 		.offset = __xe_lrc_wa_bb_offset(lrc),
1309 		.funcs = funcs,
1310 		.num_funcs = ARRAY_SIZE(funcs),
1311 	};
1312 	int ret;
1313 
1314 	ret = setup_bo(&state);
1315 	if (ret)
1316 		return ret;
1317 
1318 	*state.ptr++ = MI_BATCH_BUFFER_END;
1319 	state.written++;
1320 
1321 	finish_bo(&state);
1322 
1323 	xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
1324 			     xe_bo_ggtt_addr(lrc->bo) + state.offset + 1);
1325 
1326 	return 0;
1327 }
1328 
1329 static int setup_wa_bb(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
1330 {
1331 	u32 *buf = NULL;
1332 	int ret;
1333 
1334 	if (lrc->bo->vmap.is_iomem) {
1335 		buf = kmalloc(LRC_WA_BB_SIZE, GFP_KERNEL);
1336 		if (!buf)
1337 			return -ENOMEM;
1338 	}
1339 
1340 	ret = xe_lrc_setup_wa_bb_with_scratch(lrc, hwe, buf);
1341 
1342 	kfree(buf);
1343 
1344 	return ret;
1345 }
1346 
1347 static int
1348 setup_indirect_ctx(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
1349 {
1350 	static const struct bo_setup rcs_funcs[] = {
1351 		{ .setup = setup_timestamp_wa },
1352 		{ .setup = setup_configfs_mid_ctx_restore_bb },
1353 	};
1354 	static const struct bo_setup xcs_funcs[] = {
1355 		{ .setup = setup_configfs_mid_ctx_restore_bb },
1356 	};
1357 	struct bo_setup_state state = {
1358 		.lrc = lrc,
1359 		.hwe = hwe,
1360 		.max_size = (63 * 64) /* max 63 cachelines */,
1361 		.buffer = NULL,
1362 		.offset = __xe_lrc_indirect_ctx_offset(lrc),
1363 	};
1364 	int ret;
1365 
1366 	if (!(lrc->flags & XE_LRC_FLAG_INDIRECT_CTX))
1367 		return 0;
1368 
1369 	if (hwe->class == XE_ENGINE_CLASS_RENDER ||
1370 	    hwe->class == XE_ENGINE_CLASS_COMPUTE) {
1371 		state.funcs = rcs_funcs;
1372 		state.num_funcs = ARRAY_SIZE(rcs_funcs);
1373 	} else {
1374 		state.funcs = xcs_funcs;
1375 		state.num_funcs = ARRAY_SIZE(xcs_funcs);
1376 	}
1377 
1378 	if (xe_gt_WARN_ON(lrc->gt, !state.funcs))
1379 		return 0;
1380 
1381 	if (lrc->bo->vmap.is_iomem) {
1382 		state.buffer = kmalloc(state.max_size, GFP_KERNEL);
1383 		if (!state.buffer)
1384 			return -ENOMEM;
1385 	}
1386 
1387 	ret = setup_bo(&state);
1388 	if (ret) {
1389 		kfree(state.buffer);
1390 		return ret;
1391 	}
1392 
1393 	/*
1394 	 * Align to 64B cacheline so there's no garbage at the end for CS to
1395 	 * execute: size for indirect ctx must be a multiple of 64.
1396 	 */
1397 	while (state.written & 0xf) {
1398 		*state.ptr++ = MI_NOOP;
1399 		state.written++;
1400 	}
1401 
1402 	finish_bo(&state);
1403 	kfree(state.buffer);
1404 
1405 	/*
1406 	 * Enable INDIRECT_CTX leaving INDIRECT_CTX_OFFSET at its default: it
1407 	 * varies per engine class, but the default is good enough
1408 	 */
1409 	xe_lrc_write_ctx_reg(lrc,
1410 			     CTX_CS_INDIRECT_CTX,
1411 			     (xe_bo_ggtt_addr(lrc->bo) + state.offset) |
1412 			     /* Size in CLs. */
1413 			     (state.written * sizeof(u32) / 64));
1414 
1415 	return 0;
1416 }
1417 
1418 static u8 xe_multi_queue_prio_to_lrc(struct xe_lrc *lrc, enum xe_multi_queue_priority priority)
1419 {
1420 	struct xe_device *xe = gt_to_xe(lrc->gt);
1421 
1422 	xe_assert(xe, (priority >= XE_MULTI_QUEUE_PRIORITY_LOW &&
1423 		       priority <= XE_MULTI_QUEUE_PRIORITY_HIGH));
1424 
1425 	/* xe_multi_queue_priority is directly mapped to LRC priority values */
1426 	return priority;
1427 }
1428 
1429 /**
1430  * xe_lrc_set_multi_queue_priority() - Set multi queue priority in LRC
1431  * @lrc: Logical Ring Context
1432  * @priority: Multi queue priority of the exec queue
1433  *
1434  * Convert @priority to LRC multi queue priority and update the @lrc descriptor
1435  */
1436 void xe_lrc_set_multi_queue_priority(struct xe_lrc *lrc, enum xe_multi_queue_priority priority)
1437 {
1438 	lrc->desc &= ~LRC_PRIORITY;
1439 	lrc->desc |= FIELD_PREP(LRC_PRIORITY, xe_multi_queue_prio_to_lrc(lrc, priority));
1440 }
1441 
1442 static int xe_lrc_ctx_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
1443 			   void *replay_state, u16 msix_vec, u32 init_flags)
1444 {
1445 	struct xe_gt *gt = hwe->gt;
1446 	struct xe_tile *tile = gt_to_tile(gt);
1447 	struct xe_device *xe = gt_to_xe(gt);
1448 	struct iosys_map map;
1449 	u32 arb_enable;
1450 	u32 state_cache_perf_fix[3];
1451 	int err;
1452 
1453 	/*
1454 	 * Init Per-Process of HW status Page, LRC / context state to known
1455 	 * values. If there's already a primed default_lrc, just copy it, otherwise
1456 	 * it's the early submission to record the lrc: build a new empty one from
1457 	 * scratch.
1458 	 */
1459 	map = __xe_lrc_pphwsp_map(lrc);
1460 	if (gt->default_lrc[hwe->class] || replay_state) {
1461 		xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE);	/* PPHWSP */
1462 		xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
1463 				 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
1464 				 lrc->size - LRC_PPHWSP_SIZE);
1465 		if (replay_state)
1466 			xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
1467 					 replay_state, lrc->replay_size);
1468 	} else {
1469 		void *init_data = empty_lrc_data(hwe);
1470 
1471 		if (!init_data) {
1472 			return -ENOMEM;
1473 		}
1474 
1475 		xe_map_memcpy_to(xe, &map, 0, init_data, lrc->size);
1476 		kfree(init_data);
1477 	}
1478 
1479 	if (vm)
1480 		xe_lrc_set_ppgtt(lrc, vm);
1481 
1482 	if (xe_device_has_msix(xe)) {
1483 		xe_lrc_write_ctx_reg(lrc, CTX_INT_STATUS_REPORT_PTR,
1484 				     xe_memirq_status_ptr(&tile->memirq, hwe));
1485 		xe_lrc_write_ctx_reg(lrc, CTX_INT_SRC_REPORT_PTR,
1486 				     xe_memirq_source_ptr(&tile->memirq, hwe));
1487 		xe_lrc_write_ctx_reg(lrc, CTX_CS_INT_VEC_DATA, msix_vec << 16 | msix_vec);
1488 	}
1489 
1490 	if (xe_gt_has_indirect_ring_state(gt)) {
1491 		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
1492 				     __xe_lrc_indirect_ring_ggtt_addr(lrc));
1493 
1494 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
1495 					      __xe_lrc_ring_ggtt_addr(lrc));
1496 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START_UDW, 0);
1497 
1498 		/* Match head and tail pointers */
1499 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, lrc->ring.tail);
1500 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, lrc->ring.tail);
1501 
1502 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_CTL,
1503 					      RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
1504 	} else {
1505 		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
1506 
1507 		/* Match head and tail pointers */
1508 		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, lrc->ring.tail);
1509 		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);
1510 
1511 		xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
1512 				     RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
1513 	}
1514 
1515 	if (init_flags & XE_LRC_CREATE_RUNALONE)
1516 		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
1517 				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
1518 				     REG_MASKED_FIELD_ENABLE(CTX_CTRL_RUN_ALONE));
1519 
1520 	if (init_flags & XE_LRC_CREATE_PXP)
1521 		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
1522 				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
1523 				     REG_MASKED_FIELD_ENABLE(CTX_CTRL_PXP_ENABLE));
1524 
1525 	lrc->ctx_timestamp = 0;
1526 	xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0);
1527 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
1528 		xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP_UDW, 0);
1529 
1530 	if (xe->info.has_asid && vm)
1531 		xe_lrc_write_ctx_reg(lrc, CTX_ASID, vm->usm.asid);
1532 
1533 	lrc->desc = LRC_VALID;
1534 	lrc->desc |= FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT);
1535 	/* TODO: Priority */
1536 
1537 	/* While this appears to have something about privileged batches or
1538 	 * some such, it really just means PPGTT mode.
1539 	 */
1540 	if (vm)
1541 		lrc->desc |= LRC_PRIVILEGE;
1542 
1543 	if (GRAPHICS_VERx100(xe) < 1250) {
1544 		lrc->desc |= FIELD_PREP(LRC_ENGINE_INSTANCE, hwe->instance);
1545 		lrc->desc |= FIELD_PREP(LRC_ENGINE_CLASS, hwe->class);
1546 	}
1547 
1548 	arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1549 	xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));
1550 
1551 	if (init_flags & XE_LRC_DISABLE_STATE_CACHE_PERF_FIX) {
1552 		state_cache_perf_fix[0] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1);
1553 		state_cache_perf_fix[1] = COMMON_SLICE_CHICKEN3.addr;
1554 		state_cache_perf_fix[2] = REG_MASKED_FIELD_ENABLE(DISABLE_STATE_CACHE_PERF_FIX);
1555 		xe_lrc_write_ring(lrc, state_cache_perf_fix, sizeof(state_cache_perf_fix));
1556 	}
1557 
1558 	map = __xe_lrc_seqno_map(lrc);
1559 	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
1560 
1561 	map = __xe_lrc_start_seqno_map(lrc);
1562 	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
1563 
1564 	err = setup_wa_bb(lrc, hwe);
1565 	if (err)
1566 		return err;
1567 
1568 	err = setup_indirect_ctx(lrc, hwe);
1569 
1570 	return err;
1571 }
1572 
1573 static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
1574 		       void *replay_state, u32 ring_size, u16 msix_vec, u32 init_flags)
1575 {
1576 	struct xe_gt *gt = hwe->gt;
1577 	const u32 lrc_size = xe_gt_lrc_size(gt, hwe->class);
1578 	u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
1579 	struct xe_tile *tile = gt_to_tile(gt);
1580 	struct xe_device *xe = gt_to_xe(gt);
1581 	struct xe_bo *bo;
1582 	u32 bo_flags;
1583 	int err;
1584 
1585 	kref_init(&lrc->refcount);
1586 	lrc->gt = gt;
1587 	lrc->replay_size = xe_gt_lrc_hang_replay_size(gt, hwe->class);
1588 	lrc->size = lrc_size;
1589 	lrc->flags = 0;
1590 	lrc->ring.size = ring_size;
1591 	lrc->ring.tail = 0;
1592 
1593 	if (gt_engine_needs_indirect_ctx(gt, hwe->class)) {
1594 		lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
1595 		bo_size += LRC_INDIRECT_CTX_BO_SIZE;
1596 	}
1597 
1598 	if (xe_gt_has_indirect_ring_state(gt))
1599 		lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;
1600 
1601 	bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
1602 		   XE_BO_FLAG_GGTT_INVALIDATE;
1603 
1604 	if ((vm && vm->xef) || init_flags & XE_LRC_CREATE_USER_CTX) /* userspace */
1605 		bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE | XE_BO_FLAG_FORCE_USER_VRAM;
1606 
1607 	bo = xe_bo_create_pin_map_novm(xe, tile, bo_size,
1608 				       ttm_bo_type_kernel,
1609 				       bo_flags, false);
1610 	if (IS_ERR(lrc->bo))
1611 		return PTR_ERR(lrc->bo);
1612 
1613 	lrc->bo = bo;
1614 
1615 	bo = xe_bo_create_pin_map_novm(xe, tile, PAGE_SIZE,
1616 				       ttm_bo_type_kernel,
1617 				       XE_BO_FLAG_GGTT |
1618 				       XE_BO_FLAG_GGTT_INVALIDATE |
1619 				       XE_BO_FLAG_SYSTEM, false);
1620 	if (IS_ERR(bo)) {
1621 		err = PTR_ERR(bo);
1622 		goto err_lrc_finish;
1623 	}
1624 	lrc->seqno_bo = bo;
1625 
1626 	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
1627 			     hwe->fence_irq, hwe->name);
1628 
1629 	err = xe_lrc_ctx_init(lrc, hwe, vm, replay_state, msix_vec, init_flags);
1630 	if (err)
1631 		goto err_lrc_finish;
1632 
1633 	if (vm && vm->xef)
1634 		xe_drm_client_add_bo(vm->xef->client, lrc->bo);
1635 
1636 	return 0;
1637 
1638 err_lrc_finish:
1639 	xe_lrc_finish(lrc);
1640 	return err;
1641 }
1642 
1643 /**
1644  * xe_lrc_create - Create a LRC
1645  * @hwe: Hardware Engine
1646  * @vm: The VM (address space)
1647  * @replay_state: GPU hang replay state
1648  * @ring_size: LRC ring size
1649  * @msix_vec: MSI-X interrupt vector (for platforms that support it)
1650  * @flags: LRC initialization flags
1651  *
1652  * Allocate and initialize the Logical Ring Context (LRC).
1653  *
1654  * Return pointer to created LRC upon success and an error pointer
1655  * upon failure.
1656  */
1657 struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
1658 			     void *replay_state, u32 ring_size, u16 msix_vec, u32 flags)
1659 {
1660 	struct xe_lrc *lrc;
1661 	int err;
1662 
1663 	lrc = kzalloc_obj(*lrc);
1664 	if (!lrc)
1665 		return ERR_PTR(-ENOMEM);
1666 
1667 	err = xe_lrc_init(lrc, hwe, vm, replay_state, ring_size, msix_vec, flags);
1668 	if (err) {
1669 		kfree(lrc);
1670 		return ERR_PTR(err);
1671 	}
1672 
1673 	return lrc;
1674 }
1675 
1676 /**
1677  * xe_lrc_destroy - Destroy the LRC
1678  * @ref: reference to LRC
1679  *
1680  * Called when ref == 0, release resources held by the Logical Ring Context
1681  * (LRC) and free the LRC memory.
1682  */
1683 void xe_lrc_destroy(struct kref *ref)
1684 {
1685 	struct xe_lrc *lrc = container_of(ref, struct xe_lrc, refcount);
1686 
1687 	xe_lrc_finish(lrc);
1688 	kfree(lrc);
1689 }
1690 
1691 /**
1692  * xe_lrc_update_hwctx_regs_with_address - Re-compute GGTT references within given LRC.
1693  * @lrc: the &xe_lrc struct instance
1694  */
1695 void xe_lrc_update_hwctx_regs_with_address(struct xe_lrc *lrc)
1696 {
1697 	if (xe_lrc_has_indirect_ring_state(lrc)) {
1698 		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
1699 				     __xe_lrc_indirect_ring_ggtt_addr(lrc));
1700 
1701 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
1702 					      __xe_lrc_ring_ggtt_addr(lrc));
1703 	} else {
1704 		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
1705 	}
1706 }
1707 
1708 void xe_lrc_set_ring_tail(struct xe_lrc *lrc, u32 tail)
1709 {
1710 	if (xe_lrc_has_indirect_ring_state(lrc))
1711 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, tail);
1712 	else
1713 		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, tail);
1714 }
1715 
1716 u32 xe_lrc_ring_tail(struct xe_lrc *lrc)
1717 {
1718 	if (xe_lrc_has_indirect_ring_state(lrc))
1719 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL) & TAIL_ADDR;
1720 	else
1721 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL) & TAIL_ADDR;
1722 }
1723 
1724 static u32 xe_lrc_ring_start(struct xe_lrc *lrc)
1725 {
1726 	if (xe_lrc_has_indirect_ring_state(lrc))
1727 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START);
1728 	else
1729 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_START);
1730 }
1731 
1732 void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
1733 {
1734 	if (xe_lrc_has_indirect_ring_state(lrc))
1735 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, head);
1736 	else
1737 		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
1738 }
1739 
1740 u32 xe_lrc_ring_head(struct xe_lrc *lrc)
1741 {
1742 	if (xe_lrc_has_indirect_ring_state(lrc))
1743 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD) & HEAD_ADDR;
1744 	else
1745 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
1746 }
1747 
1748 u32 xe_lrc_ring_space(struct xe_lrc *lrc)
1749 {
1750 	const u32 head = xe_lrc_ring_head(lrc);
1751 	const u32 tail = lrc->ring.tail;
1752 	const u32 size = lrc->ring.size;
1753 
1754 	return ((head - tail - 1) & (size - 1)) + 1;
1755 }
1756 
1757 static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
1758 				const void *data, size_t size)
1759 {
1760 	struct xe_device *xe = lrc_to_xe(lrc);
1761 
1762 	iosys_map_incr(&ring, lrc->ring.tail);
1763 	xe_map_memcpy_to(xe, &ring, 0, data, size);
1764 	lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
1765 }
1766 
1767 void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
1768 {
1769 	struct xe_device *xe = lrc_to_xe(lrc);
1770 	struct iosys_map ring;
1771 	u32 rhs;
1772 	size_t aligned_size;
1773 
1774 	xe_assert(xe, IS_ALIGNED(size, 4));
1775 	aligned_size = ALIGN(size, 8);
1776 
1777 	ring = __xe_lrc_ring_map(lrc);
1778 
1779 	xe_assert(xe, lrc->ring.tail < lrc->ring.size);
1780 	rhs = lrc->ring.size - lrc->ring.tail;
1781 	if (size > rhs) {
1782 		__xe_lrc_write_ring(lrc, ring, data, rhs);
1783 		__xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
1784 	} else {
1785 		__xe_lrc_write_ring(lrc, ring, data, size);
1786 	}
1787 
1788 	if (aligned_size > size) {
1789 		u32 noop = MI_NOOP;
1790 
1791 		__xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
1792 	}
1793 }
1794 
1795 u64 xe_lrc_descriptor(struct xe_lrc *lrc)
1796 {
1797 	return lrc->desc | xe_lrc_ggtt_addr(lrc);
1798 }
1799 
1800 u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
1801 {
1802 	return __xe_lrc_seqno_ggtt_addr(lrc);
1803 }
1804 
/**
 * xe_lrc_alloc_seqno_fence() - Allocate an lrc seqno fence.
 *
 * Thin wrapper around xe_hw_fence_alloc(). The fence is allocated but not
 * initialized; use xe_lrc_init_seqno_fence() for that, or
 * xe_lrc_free_seqno_fence() to drop it again.
 *
 * Return: Pointer to the allocated fence or
 * negative error pointer on error.
 */
struct dma_fence *xe_lrc_alloc_seqno_fence(void)
{
	return xe_hw_fence_alloc();
}
1817 
/**
 * xe_lrc_free_seqno_fence() - Free an lrc seqno fence.
 * @fence: Pointer to the fence to free.
 *
 * Thin wrapper around xe_hw_fence_free(), meant for an lrc seqno fence
 * that hasn't yet been initialized.
 */
void xe_lrc_free_seqno_fence(struct dma_fence *fence)
{
	xe_hw_fence_free(fence);
}
1829 
1830 /**
1831  * xe_lrc_init_seqno_fence() - Initialize an lrc seqno fence.
1832  * @lrc: Pointer to the lrc.
1833  * @fence: Pointer to the fence to initialize.
1834  *
1835  * Initializes a pre-allocated lrc seqno fence.
1836  * After initialization, the fence is subject to normal
1837  * dma-fence refcounting.
1838  */
1839 void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence)
1840 {
1841 	xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc));
1842 }
1843 
1844 s32 xe_lrc_seqno(struct xe_lrc *lrc)
1845 {
1846 	struct iosys_map map = __xe_lrc_seqno_map(lrc);
1847 
1848 	return xe_map_read32(lrc_to_xe(lrc), &map);
1849 }
1850 
1851 s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
1852 {
1853 	struct iosys_map map = __xe_lrc_start_seqno_map(lrc);
1854 
1855 	return xe_map_read32(lrc_to_xe(lrc), &map);
1856 }
1857 
1858 u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
1859 {
1860 	return __xe_lrc_start_seqno_ggtt_addr(lrc);
1861 }
1862 
1863 u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
1864 {
1865 	return __xe_lrc_parallel_ggtt_addr(lrc);
1866 }
1867 
1868 struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
1869 {
1870 	return __xe_lrc_parallel_map(lrc);
1871 }
1872 
1873 /**
1874  * xe_lrc_engine_id() - Read engine id value
1875  * @lrc: Pointer to the lrc.
1876  *
1877  * Returns: context id value
1878  */
1879 static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
1880 {
1881 	struct xe_device *xe = lrc_to_xe(lrc);
1882 	struct iosys_map map;
1883 
1884 	map = __xe_lrc_engine_id_map(lrc);
1885 	return xe_map_read32(xe, &map);
1886 }
1887 
1888 static int instr_dw(u32 cmd_header)
1889 {
1890 	/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
1891 	if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
1892 	    GFXPIPE_SINGLE_DW_CMD(0, 0))
1893 		return 1;
1894 
1895 	/* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
1896 	if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
1897 		return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;
1898 
1899 	/* Most instructions have the # of dwords (minus 2) in 7:0 */
1900 	return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
1901 }
1902 
1903 static int dump_mi_command(struct drm_printer *p,
1904 			   struct xe_gt *gt,
1905 			   u32 *dw,
1906 			   int remaining_dw)
1907 {
1908 	u32 inst_header = *dw;
1909 	u32 numdw = instr_dw(inst_header);
1910 	u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
1911 	int num_noop;
1912 
1913 	/* First check for commands that don't have/use a '# DW' field */
1914 	switch (inst_header & MI_OPCODE) {
1915 	case MI_NOOP:
1916 		num_noop = 1;
1917 		while (num_noop < remaining_dw &&
1918 		       (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
1919 			num_noop++;
1920 		drm_printf(p, "[%#010x] MI_NOOP (%d dwords)\n", inst_header, num_noop);
1921 		return num_noop;
1922 
1923 	case MI_TOPOLOGY_FILTER:
1924 		drm_printf(p, "[%#010x] MI_TOPOLOGY_FILTER\n", inst_header);
1925 		return 1;
1926 
1927 	case MI_BATCH_BUFFER_END:
1928 		drm_printf(p, "[%#010x] MI_BATCH_BUFFER_END\n", inst_header);
1929 		/* Return 'remaining_dw' to consume the rest of the LRC */
1930 		return remaining_dw;
1931 	}
1932 
1933 	/*
1934 	 * Any remaining commands include a # of dwords.  We should make sure
1935 	 * it doesn't exceed the remaining size of the LRC.
1936 	 */
1937 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
1938 		numdw = remaining_dw;
1939 
1940 	switch (inst_header & MI_OPCODE) {
1941 	case MI_LOAD_REGISTER_IMM:
1942 		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
1943 			   inst_header, (numdw - 1) / 2);
1944 		for (int i = 1; i < numdw; i += 2)
1945 			drm_printf(p, " - %#6x = %#010x\n", dw[i], dw[i + 1]);
1946 		return numdw;
1947 
1948 	case MI_LOAD_REGISTER_MEM & MI_OPCODE:
1949 		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_MEM: %s%s\n",
1950 			   inst_header,
1951 			   dw[0] & MI_LRI_LRM_CS_MMIO ? "CS_MMIO " : "",
1952 			   dw[0] & MI_LRM_USE_GGTT ? "USE_GGTT " : "");
1953 		if (numdw == 4)
1954 			drm_printf(p, " - %#6x = %#010llx\n",
1955 				   dw[1], ((u64)(dw[3]) << 32 | (u64)(dw[2])));
1956 		else
1957 			drm_printf(p, " - %*ph (%s)\n",
1958 				   (int)sizeof(u32) * (numdw - 1), dw + 1,
1959 				   numdw < 4 ? "truncated" : "malformed");
1960 		return numdw;
1961 
1962 	case MI_FORCE_WAKEUP:
1963 		drm_printf(p, "[%#010x] MI_FORCE_WAKEUP\n", inst_header);
1964 		return numdw;
1965 
1966 	default:
1967 		drm_printf(p, "[%#010x] unknown MI opcode %#x, likely %d dwords\n",
1968 			   inst_header, opcode, numdw);
1969 		return numdw;
1970 	}
1971 }
1972 
1973 static int dump_gfxpipe_command(struct drm_printer *p,
1974 				struct xe_gt *gt,
1975 				u32 *dw,
1976 				int remaining_dw)
1977 {
1978 	u32 numdw = instr_dw(*dw);
1979 	u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
1980 	u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
1981 	u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);
1982 
1983 	/*
1984 	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
1985 	 * remaining size of the LRC.
1986 	 */
1987 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
1988 		numdw = remaining_dw;
1989 
1990 	switch (*dw & GFXPIPE_MATCH_MASK) {
1991 #define MATCH(cmd) \
1992 	case cmd: \
1993 		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
1994 		return numdw
1995 #define MATCH3D(cmd) \
1996 	case CMD_##cmd: \
1997 		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
1998 		return numdw
1999 
2000 	MATCH(STATE_BASE_ADDRESS);
2001 	MATCH(STATE_SIP);
2002 	MATCH(GPGPU_CSR_BASE_ADDRESS);
2003 	MATCH(STATE_COMPUTE_MODE);
2004 	MATCH3D(3DSTATE_BTD);
2005 	MATCH(STATE_SYSTEM_MEM_FENCE_ADDRESS);
2006 	MATCH(STATE_CONTEXT_DATA_BASE_ADDRESS);
2007 
2008 	MATCH3D(3DSTATE_VF_STATISTICS);
2009 
2010 	MATCH(PIPELINE_SELECT);
2011 
2012 	MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST);
2013 	MATCH3D(3DSTATE_CUSTOM_SAMPLE_PATTERN);
2014 	MATCH3D(3DSTATE_CLEAR_PARAMS);
2015 	MATCH3D(3DSTATE_DEPTH_BUFFER);
2016 	MATCH3D(3DSTATE_STENCIL_BUFFER);
2017 	MATCH3D(3DSTATE_HIER_DEPTH_BUFFER);
2018 	MATCH3D(3DSTATE_VERTEX_BUFFERS);
2019 	MATCH3D(3DSTATE_VERTEX_ELEMENTS);
2020 	MATCH3D(3DSTATE_INDEX_BUFFER);
2021 	MATCH3D(3DSTATE_VF);
2022 	MATCH3D(3DSTATE_MULTISAMPLE);
2023 	MATCH3D(3DSTATE_CC_STATE_POINTERS);
2024 	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS);
2025 	MATCH3D(3DSTATE_VS);
2026 	MATCH3D(3DSTATE_GS);
2027 	MATCH3D(3DSTATE_CLIP);
2028 	MATCH3D(3DSTATE_SF);
2029 	MATCH3D(3DSTATE_WM);
2030 	MATCH3D(3DSTATE_CONSTANT_VS);
2031 	MATCH3D(3DSTATE_CONSTANT_GS);
2032 	MATCH3D(3DSTATE_CONSTANT_PS);
2033 	MATCH3D(3DSTATE_SAMPLE_MASK);
2034 	MATCH3D(3DSTATE_CONSTANT_HS);
2035 	MATCH3D(3DSTATE_CONSTANT_DS);
2036 	MATCH3D(3DSTATE_HS);
2037 	MATCH3D(3DSTATE_TE);
2038 	MATCH3D(3DSTATE_DS);
2039 	MATCH3D(3DSTATE_STREAMOUT);
2040 	MATCH3D(3DSTATE_SBE);
2041 	MATCH3D(3DSTATE_PS);
2042 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
2043 	MATCH3D(3DSTATE_CPS_POINTERS);
2044 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC);
2045 	MATCH3D(3DSTATE_BLEND_STATE_POINTERS);
2046 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS);
2047 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS);
2048 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS);
2049 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS);
2050 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS);
2051 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS);
2052 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS);
2053 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS);
2054 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS);
2055 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS);
2056 	MATCH3D(3DSTATE_VF_INSTANCING);
2057 	MATCH3D(3DSTATE_VF_SGVS);
2058 	MATCH3D(3DSTATE_VF_TOPOLOGY);
2059 	MATCH3D(3DSTATE_WM_CHROMAKEY);
2060 	MATCH3D(3DSTATE_PS_BLEND);
2061 	MATCH3D(3DSTATE_WM_DEPTH_STENCIL);
2062 	MATCH3D(3DSTATE_PS_EXTRA);
2063 	MATCH3D(3DSTATE_RASTER);
2064 	MATCH3D(3DSTATE_SBE_SWIZ);
2065 	MATCH3D(3DSTATE_WM_HZ_OP);
2066 	MATCH3D(3DSTATE_VF_COMPONENT_PACKING);
2067 	MATCH3D(3DSTATE_VF_SGVS_2);
2068 	MATCH3D(3DSTATE_VFG);
2069 	MATCH3D(3DSTATE_URB_ALLOC_VS);
2070 	MATCH3D(3DSTATE_URB_ALLOC_HS);
2071 	MATCH3D(3DSTATE_URB_ALLOC_DS);
2072 	MATCH3D(3DSTATE_URB_ALLOC_GS);
2073 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_0);
2074 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_1);
2075 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_2);
2076 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_3);
2077 	MATCH3D(3DSTATE_PRIMITIVE_REPLICATION);
2078 	MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO);
2079 	MATCH3D(3DSTATE_AMFS);
2080 	MATCH3D(3DSTATE_DEPTH_BOUNDS);
2081 	MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS);
2082 	MATCH3D(3DSTATE_CONSTANT_TS_POINTER);
2083 	MATCH3D(3DSTATE_MESH_CONTROL);
2084 	MATCH3D(3DSTATE_MESH_DISTRIB);
2085 	MATCH3D(3DSTATE_TASK_REDISTRIB);
2086 	MATCH3D(3DSTATE_MESH_SHADER);
2087 	MATCH3D(3DSTATE_MESH_SHADER_DATA);
2088 	MATCH3D(3DSTATE_TASK_CONTROL);
2089 	MATCH3D(3DSTATE_TASK_SHADER);
2090 	MATCH3D(3DSTATE_TASK_SHADER_DATA);
2091 	MATCH3D(3DSTATE_URB_ALLOC_MESH);
2092 	MATCH3D(3DSTATE_URB_ALLOC_TASK);
2093 	MATCH3D(3DSTATE_CLIP_MESH);
2094 	MATCH3D(3DSTATE_SBE_MESH);
2095 	MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER);
2096 	MATCH3D(3DSTATE_COARSE_PIXEL);
2097 	MATCH3D(3DSTATE_MESH_SHADER_DATA_EXT);
2098 	MATCH3D(3DSTATE_TASK_SHADER_DATA_EXT);
2099 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC_2);
2100 	MATCH3D(3DSTATE_CC_STATE_POINTERS_2);
2101 	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS_2);
2102 	MATCH3D(3DSTATE_BLEND_STATE_POINTERS_2);
2103 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP_2);
2104 
2105 	MATCH3D(3DSTATE_DRAWING_RECTANGLE);
2106 	MATCH3D(3DSTATE_URB_MEMORY);
2107 	MATCH3D(3DSTATE_CHROMA_KEY);
2108 	MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET);
2109 	MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN);
2110 	MATCH3D(3DSTATE_LINE_STIPPLE);
2111 	MATCH3D(3DSTATE_AA_LINE_PARAMETERS);
2112 	MATCH3D(3DSTATE_MONOFILTER_SIZE);
2113 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS);
2114 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS);
2115 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS);
2116 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS);
2117 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS);
2118 	MATCH3D(3DSTATE_SO_DECL_LIST);
2119 	MATCH3D(3DSTATE_SO_BUFFER);
2120 	MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC);
2121 	MATCH3D(3DSTATE_SAMPLE_PATTERN);
2122 	MATCH3D(3DSTATE_3D_MODE);
2123 	MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE);
2124 	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS);
2125 	MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO);
2126 	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTER_2);
2127 
2128 	default:
2129 		drm_printf(p, "[%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n",
2130 			   *dw, pipeline, opcode, subopcode, numdw);
2131 		return numdw;
2132 	}
2133 }
2134 
/*
 * Print a one-line description of the GFX_STATE instruction at @dw to @p.
 *
 * @remaining_dw is the number of dwords left in the buffer being parsed; the
 * instruction length decoded from the header is clamped to it, with a
 * warning against @gt when clamping was necessary.
 *
 * In the paths visible here the return value is that (possibly clamped)
 * dword count.
 */
static int dump_gfx_state_command(struct drm_printer *p,
				  struct xe_gt *gt,
				  u32 *dw,
				  int remaining_dw)
{
	u32 numdw = instr_dw(*dw);
	u32 opcode = REG_FIELD_GET(GFX_STATE_OPCODE, *dw);

	/*
	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
	 * remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (*dw & (XE_INSTR_GFX_STATE | GFX_STATE_OPCODE)) {
	/*
	 * NOTE(review): MATCH() is defined outside this block; presumably it
	 * expands to a case label that prints the command name and returns
	 * numdw -- confirm against the macro definition.
	 */
	MATCH(STATE_WRITE_INLINE);

	default:
		/* Not a command we know by name; report the raw opcode */
		drm_printf(p, "[%#010x] unknown GFX_STATE command (opcode=%#x), likely %d dwords\n",
			   *dw, opcode, numdw);
		return numdw;
	}
}
2159 
2160 void xe_lrc_dump_default(struct drm_printer *p,
2161 			 struct xe_gt *gt,
2162 			 enum xe_engine_class hwe_class)
2163 {
2164 	u32 *dw;
2165 	int remaining_dw, num_dw;
2166 
2167 	if (!gt->default_lrc[hwe_class]) {
2168 		drm_printf(p, "No default LRC for class %d\n", hwe_class);
2169 		return;
2170 	}
2171 
2172 	/*
2173 	 * Skip the beginning of the LRC since it contains the per-process
2174 	 * hardware status page.
2175 	 */
2176 	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
2177 	remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;
2178 
2179 	while (remaining_dw > 0) {
2180 		if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
2181 			num_dw = dump_mi_command(p, gt, dw, remaining_dw);
2182 		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
2183 			num_dw = dump_gfxpipe_command(p, gt, dw, remaining_dw);
2184 		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFX_STATE) {
2185 			num_dw = dump_gfx_state_command(p, gt, dw, remaining_dw);
2186 		} else {
2187 			num_dw = min(instr_dw(*dw), remaining_dw);
2188 			drm_printf(p, "[%#10x] Unknown instruction of type %#x, likely %d dwords\n",
2189 				   *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw),
2190 				   num_dw);
2191 		}
2192 
2193 		dw += num_dw;
2194 		remaining_dw -= num_dw;
2195 	}
2196 }
2197 
/*
 * Scan the offset/value dword pairs that make up the payload of an
 * MI_LOAD_REGISTER_IMM instruction for a given register offset.
 *
 * @dword_pair points at the first offset dword and @num_regs is the number
 * of pairs to inspect.  The first matching pair wins.
 *
 * Return 0 and store the paired value in *value on a match, or -ENOENT if
 * the register is not present in the MI_LRI instruction.
 */
static int lookup_reg_in_mi_lri(u32 offset, u32 *value,
				const u32 *dword_pair, int num_regs)
{
	const u32 *pair = dword_pair;
	int left = num_regs;

	while (left-- > 0) {
		/* pair[0] is the register offset, pair[1] its value */
		if (pair[0] == offset) {
			*value = pair[1];
			return 0;
		}

		pair += 2;
	}

	return -ENOENT;
}
2216 
2217 /*
2218  * Lookup the value of a register in a specific engine type's default LRC.
2219  *
2220  * Return -EINVAL if the default LRC doesn't exist, or ENOENT if the register
2221  * cannot be found in the default LRC.
2222  */
2223 int xe_lrc_lookup_default_reg_value(struct xe_gt *gt,
2224 				    enum xe_engine_class hwe_class,
2225 				    u32 offset,
2226 				    u32 *value)
2227 {
2228 	u32 *dw;
2229 	int remaining_dw, ret;
2230 
2231 	if (!gt->default_lrc[hwe_class])
2232 		return -EINVAL;
2233 
2234 	/*
2235 	 * Skip the beginning of the LRC since it contains the per-process
2236 	 * hardware status page.
2237 	 */
2238 	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
2239 	remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;
2240 
2241 	while (remaining_dw > 0) {
2242 		u32 num_dw = instr_dw(*dw);
2243 
2244 		if (num_dw > remaining_dw)
2245 			num_dw = remaining_dw;
2246 
2247 		switch (*dw & XE_INSTR_CMD_TYPE) {
2248 		case XE_INSTR_MI:
2249 			switch (*dw & MI_OPCODE) {
2250 			case MI_BATCH_BUFFER_END:
2251 				/* End of LRC; register not found */
2252 				return -ENOENT;
2253 
2254 			case MI_NOOP:
2255 			case MI_TOPOLOGY_FILTER:
2256 				/*
2257 				 * MI_NOOP and MI_TOPOLOGY_FILTER don't have
2258 				 * a length field and are always 1-dword
2259 				 * instructions.
2260 				 */
2261 				remaining_dw--;
2262 				dw++;
2263 				break;
2264 
2265 			case MI_LOAD_REGISTER_IMM:
2266 				ret = lookup_reg_in_mi_lri(offset, value,
2267 							   dw + 1, (num_dw - 1) / 2);
2268 				if (ret == 0)
2269 					return 0;
2270 
2271 				fallthrough;
2272 
2273 			default:
2274 				/*
2275 				 * Jump to next instruction based on length
2276 				 * field.
2277 				 */
2278 				remaining_dw -= num_dw;
2279 				dw += num_dw;
2280 				break;
2281 			}
2282 			break;
2283 
2284 		default:
2285 			/* Jump to next instruction based on length field. */
2286 			remaining_dw -= num_dw;
2287 			dw += num_dw;
2288 		}
2289 	}
2290 
2291 	return -ENOENT;
2292 }
2293 
/*
 * One entry of a table of state instructions, as consumed by
 * xe_lrc_emit_hwe_state_instructions().
 */
struct instr_state {
	/*
	 * Instruction header dword; the emitter ORs (num_dw - 2) into it for
	 * instructions that are not single-dword.
	 */
	u32 instr;
	/* Number of dwords the emitter advances by for this instruction */
	u16 num_dw;
};
2298 
/*
 * State instructions that xe_lrc_emit_hwe_state_instructions() emits for
 * render-class engines when Wa_14019789679 applies.  Each entry pairs an
 * instruction header with the number of dwords reserved for it.
 */
static const struct instr_state xe_hpg_svg_state[] = {
	{ .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 },
	{ .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_VS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_CLIP, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SF, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_RASTER, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_HS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 },
	{ .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TE, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_GS, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
	/* Swapped for 3DSTATE_DRAWING_RECTANGLE_FAST by the emitter on graphics version 20+ */
	{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
};
2351 
2352 u32 *xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, u32 *cs)
2353 {
2354 	struct xe_gt *gt = q->hwe->gt;
2355 	struct xe_device *xe = gt_to_xe(gt);
2356 	const struct instr_state *state_table = NULL;
2357 	int state_table_size = 0;
2358 
2359 	/*
2360 	 * Wa_14019789679
2361 	 *
2362 	 * If the driver doesn't explicitly emit the SVG instructions while
2363 	 * setting up the default LRC, the context switch will write 0's
2364 	 * (noops) into the LRC memory rather than the expected instruction
2365 	 * headers.  Application contexts start out as a copy of the default
2366 	 * LRC, and if they also do not emit specific settings for some SVG
2367 	 * state, then on context restore they'll unintentionally inherit
2368 	 * whatever state setting the previous context had programmed into the
2369 	 * hardware (i.e., the lack of a 3DSTATE_* instruction in the LRC will
2370 	 * prevent the hardware from resetting that state back to any specific
2371 	 * value).
2372 	 *
2373 	 * The official workaround only requires emitting 3DSTATE_MESH_CONTROL
2374 	 * since that's a specific state setting that can easily cause GPU
2375 	 * hangs if unintentionally inherited.  However to be safe we'll
2376 	 * continue to emit all of the SVG state since it's best not to leak
2377 	 * any of the state between contexts, even if that leakage is harmless.
2378 	 */
2379 	if (XE_GT_WA(gt, 14019789679) && q->hwe->class == XE_ENGINE_CLASS_RENDER) {
2380 		state_table = xe_hpg_svg_state;
2381 		state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
2382 	}
2383 
2384 	if (!state_table) {
2385 		xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
2386 			  GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
2387 		return cs;
2388 	}
2389 
2390 	for (int i = 0; i < state_table_size; i++) {
2391 		u32 instr = state_table[i].instr;
2392 		u16 num_dw = state_table[i].num_dw;
2393 		bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);
2394 
2395 		xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
2396 		xe_gt_assert(gt, num_dw != 0);
2397 		xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));
2398 
2399 		/*
2400 		 * Xe2's SVG context is the same as the one on DG2 / MTL
2401 		 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
2402 		 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
2403 		 * Just make the replacement here rather than defining a
2404 		 * whole separate table for the single trivial change.
2405 		 */
2406 		if (GRAPHICS_VER(xe) >= 20 &&
2407 		    instr == CMD_3DSTATE_DRAWING_RECTANGLE)
2408 			instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;
2409 
2410 		*cs = instr;
2411 		if (!is_single_dw)
2412 			*cs |= (num_dw - 2);
2413 
2414 		cs += num_dw;
2415 	}
2416 
2417 	return cs;
2418 }
2419 
/*
 * Record the current ring/context bookkeeping of @lrc in a newly allocated
 * snapshot.
 *
 * The snapshot is allocated with GFP_NOWAIT.  The LRC contents themselves
 * are not copied here: lrc_snapshot is left NULL and the BO obtained from
 * xe_bo_get() is stored in lrc_bo, to be consumed later by
 * xe_lrc_snapshot_capture_delayed() or released by xe_lrc_snapshot_free().
 *
 * Returns the snapshot, or NULL if the allocation failed.
 */
struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
{
	struct xe_lrc_snapshot *snapshot = kmalloc_obj(*snapshot, GFP_NOWAIT);

	if (!snapshot)
		return NULL;

	snapshot->context_desc = xe_lrc_ggtt_addr(lrc);
	snapshot->ring_addr = __xe_lrc_ring_ggtt_addr(lrc);
	snapshot->indirect_context_desc = xe_lrc_indirect_ring_ggtt_addr(lrc);
	snapshot->head = xe_lrc_ring_head(lrc);
	/* Tail is recorded twice: the driver's own copy and the one read back */
	snapshot->tail.internal = lrc->ring.tail;
	snapshot->tail.memory = xe_lrc_ring_tail(lrc);
	snapshot->start = xe_lrc_ring_start(lrc);
	snapshot->start_seqno = xe_lrc_start_seqno(lrc);
	snapshot->seqno = xe_lrc_seqno(lrc);
	snapshot->lrc_bo = xe_bo_get(lrc->bo);
	/* Offset and size of the region later copied out of lrc_bo */
	snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
	snapshot->lrc_size = lrc->size;
	snapshot->replay_offset = 0;
	snapshot->replay_size = lrc->replay_size;
	snapshot->lrc_snapshot = NULL;
	/* Only the low 32 bits of the context timestamp are kept */
	snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc));
	snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc);
	return snapshot;
}
2446 
2447 void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot)
2448 {
2449 	struct xe_bo *bo;
2450 	struct iosys_map src;
2451 
2452 	if (!snapshot)
2453 		return;
2454 
2455 	bo = snapshot->lrc_bo;
2456 	snapshot->lrc_bo = NULL;
2457 
2458 	snapshot->lrc_snapshot = kvmalloc(snapshot->lrc_size, GFP_KERNEL);
2459 	if (!snapshot->lrc_snapshot)
2460 		goto put_bo;
2461 
2462 	xe_bo_lock(bo, false);
2463 	if (!ttm_bo_vmap(&bo->ttm, &src)) {
2464 		xe_map_memcpy_from(xe_bo_device(bo),
2465 				   snapshot->lrc_snapshot, &src, snapshot->lrc_offset,
2466 				   snapshot->lrc_size);
2467 		ttm_bo_vunmap(&bo->ttm, &src);
2468 	} else {
2469 		kvfree(snapshot->lrc_snapshot);
2470 		snapshot->lrc_snapshot = NULL;
2471 	}
2472 	xe_bo_unlock(bo);
2473 put_bo:
2474 	xe_bo_put(bo);
2475 }
2476 
2477 void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer *p)
2478 {
2479 	unsigned long i;
2480 
2481 	if (!snapshot)
2482 		return;
2483 
2484 	drm_printf(p, "\tHW Context Desc: 0x%08x\n", snapshot->context_desc);
2485 	drm_printf(p, "\tHW Ring address: 0x%08x\n",
2486 		   snapshot->ring_addr);
2487 	drm_printf(p, "\tHW Indirect Ring State: 0x%08x\n",
2488 		   snapshot->indirect_context_desc);
2489 	drm_printf(p, "\tLRC Head: (memory) %u\n", snapshot->head);
2490 	drm_printf(p, "\tLRC Tail: (internal) %u, (memory) %u\n",
2491 		   snapshot->tail.internal, snapshot->tail.memory);
2492 	drm_printf(p, "\tRing start: (memory) 0x%08x\n", snapshot->start);
2493 	drm_printf(p, "\tStart seqno: (memory) %d\n", snapshot->start_seqno);
2494 	drm_printf(p, "\tSeqno: (memory) %d\n", snapshot->seqno);
2495 	drm_printf(p, "\tTimestamp: 0x%08x\n", snapshot->ctx_timestamp);
2496 	drm_printf(p, "\tJob Timestamp: 0x%08x\n", snapshot->ctx_job_timestamp);
2497 
2498 	if (!snapshot->lrc_snapshot)
2499 		return;
2500 
2501 	drm_printf(p, "\t[HWSP].length: 0x%x\n", LRC_PPHWSP_SIZE);
2502 	drm_puts(p, "\t[HWSP].data: ");
2503 	for (i = 0; i < LRC_PPHWSP_SIZE; i += sizeof(u32)) {
2504 		u32 *val = snapshot->lrc_snapshot + i;
2505 		char dumped[ASCII85_BUFSZ];
2506 
2507 		drm_puts(p, ascii85_encode(*val, dumped));
2508 	}
2509 
2510 	drm_printf(p, "\n\t[HWCTX].length: 0x%lx\n", snapshot->lrc_size - LRC_PPHWSP_SIZE);
2511 	drm_printf(p, "\n\t[HWCTX].replay_offset: 0x%lx\n", snapshot->replay_offset);
2512 	drm_printf(p, "\n\t[HWCTX].replay_length: 0x%lx\n", snapshot->replay_size);
2513 
2514 	drm_puts(p, "\t[HWCTX].data: ");
2515 	for (; i < snapshot->lrc_size; i += sizeof(u32)) {
2516 		u32 *val = snapshot->lrc_snapshot + i;
2517 		char dumped[ASCII85_BUFSZ];
2518 
2519 		drm_puts(p, ascii85_encode(*val, dumped));
2520 	}
2521 	drm_puts(p, "\n");
2522 }
2523 
2524 void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
2525 {
2526 	if (!snapshot)
2527 		return;
2528 
2529 	kvfree(snapshot->lrc_snapshot);
2530 	if (snapshot->lrc_bo)
2531 		xe_bo_put(snapshot->lrc_bo);
2532 
2533 	kfree(snapshot);
2534 }
2535 
2536 static int get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id, u64 *reg_ctx_ts)
2537 {
2538 	u16 class = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id);
2539 	u16 instance = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id);
2540 	struct xe_hw_engine *hwe;
2541 	u64 val;
2542 
2543 	hwe = xe_gt_hw_engine(lrc->gt, class, instance, false);
2544 	if (xe_gt_WARN_ONCE(lrc->gt, !hwe || xe_hw_engine_is_reserved(hwe),
2545 			    "Unexpected engine class:instance %d:%d for context utilization\n",
2546 			    class, instance))
2547 		return -1;
2548 
2549 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
2550 		val = xe_mmio_read64_2x32(&hwe->gt->mmio,
2551 					  RING_CTX_TIMESTAMP(hwe->mmio_base));
2552 	else
2553 		val = xe_mmio_read32(&hwe->gt->mmio,
2554 				     RING_CTX_TIMESTAMP(hwe->mmio_base));
2555 
2556 	*reg_ctx_ts = val;
2557 
2558 	return 0;
2559 }
2560 
2561 /**
2562  * xe_lrc_timestamp() - Current ctx timestamp
2563  * @lrc: Pointer to the lrc.
2564  *
2565  * Return latest ctx timestamp. With support for active contexts, the
2566  * calculation may bb slightly racy, so follow a read-again logic to ensure that
2567  * the context is still active before returning the right timestamp.
2568  *
2569  * Returns: New ctx timestamp value
2570  */
2571 u64 xe_lrc_timestamp(struct xe_lrc *lrc)
2572 {
2573 	u64 lrc_ts, reg_ts, new_ts;
2574 	u32 engine_id;
2575 
2576 	lrc_ts = xe_lrc_ctx_timestamp(lrc);
2577 	/* CTX_TIMESTAMP mmio read is invalid on VF, so return the LRC value */
2578 	if (IS_SRIOV_VF(lrc_to_xe(lrc))) {
2579 		new_ts = lrc_ts;
2580 		goto done;
2581 	}
2582 
2583 	if (lrc_ts == CONTEXT_ACTIVE) {
2584 		engine_id = xe_lrc_engine_id(lrc);
2585 		if (!get_ctx_timestamp(lrc, engine_id, &reg_ts))
2586 			new_ts = reg_ts;
2587 
2588 		/* read lrc again to ensure context is still active */
2589 		lrc_ts = xe_lrc_ctx_timestamp(lrc);
2590 	}
2591 
2592 	/*
2593 	 * If context switched out, just use the lrc_ts. Note that this needs to
2594 	 * be a separate if condition.
2595 	 */
2596 	if (lrc_ts != CONTEXT_ACTIVE)
2597 		new_ts = lrc_ts;
2598 
2599 done:
2600 	return new_ts;
2601 }
2602 
2603 /**
2604  * xe_lrc_update_timestamp() - Update ctx timestamp
2605  * @lrc: Pointer to the lrc.
2606  * @old_ts: Old timestamp value
2607  *
2608  * Populate @old_ts current saved ctx timestamp, read new ctx timestamp and
2609  * update saved value.
2610  *
2611  * Returns: New ctx timestamp value
2612  */
2613 u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts)
2614 {
2615 	*old_ts = lrc->ctx_timestamp;
2616 	lrc->ctx_timestamp = xe_lrc_timestamp(lrc);
2617 
2618 	trace_xe_lrc_update_timestamp(lrc, *old_ts);
2619 
2620 	return lrc->ctx_timestamp;
2621 }
2622 
2623 /**
2624  * xe_lrc_ring_is_idle() - LRC is idle
2625  * @lrc: Pointer to the lrc.
2626  *
2627  * Compare LRC ring head and tail to determine if idle.
2628  *
2629  * Return: True is ring is idle, False otherwise
2630  */
2631 bool xe_lrc_ring_is_idle(struct xe_lrc *lrc)
2632 {
2633 	return xe_lrc_ring_head(lrc) == xe_lrc_ring_tail(lrc);
2634 }
2635