xref: /linux/drivers/gpu/drm/xe/xe_lrc.c (revision 74ba587f402d5501af2c85e50cf1e4044263b6ca)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5 
6 #include "xe_lrc.h"
7 
8 #include <generated/xe_wa_oob.h>
9 
10 #include <linux/ascii85.h>
11 #include <linux/panic.h>
12 
13 #include "instructions/xe_mi_commands.h"
14 #include "instructions/xe_gfxpipe_commands.h"
15 #include "instructions/xe_gfx_state_commands.h"
16 #include "regs/xe_engine_regs.h"
17 #include "regs/xe_lrc_layout.h"
18 #include "xe_bb.h"
19 #include "xe_bo.h"
20 #include "xe_configfs.h"
21 #include "xe_device.h"
22 #include "xe_drm_client.h"
23 #include "xe_exec_queue_types.h"
24 #include "xe_gt.h"
25 #include "xe_gt_printk.h"
26 #include "xe_hw_fence.h"
27 #include "xe_map.h"
28 #include "xe_memirq.h"
29 #include "xe_mmio.h"
30 #include "xe_sriov.h"
31 #include "xe_trace_lrc.h"
32 #include "xe_vm.h"
33 #include "xe_wa.h"
34 
35 #define LRC_VALID				BIT_ULL(0)
36 #define LRC_PRIVILEGE				BIT_ULL(8)
37 #define LRC_ADDRESSING_MODE			GENMASK_ULL(4, 3)
38 #define LRC_LEGACY_64B_CONTEXT			3
39 
40 #define LRC_ENGINE_CLASS			GENMASK_ULL(63, 61)
41 #define LRC_ENGINE_INSTANCE			GENMASK_ULL(53, 48)
42 
43 #define LRC_PPHWSP_SIZE				SZ_4K
44 #define LRC_INDIRECT_CTX_BO_SIZE		SZ_4K
45 #define LRC_INDIRECT_RING_STATE_SIZE		SZ_4K
46 
47 /*
48  * Layout of the LRC and associated data allocated as
49  * lrc->bo:
50  *
51  *   Region                       Size
52  *  +============================+=================================+ <- __xe_lrc_ring_offset()
53  *  | Ring                       | ring_size, see                  |
54  *  |                            | xe_lrc_init()                   |
55  *  +============================+=================================+ <- __xe_lrc_pphwsp_offset()
56  *  | PPHWSP (includes SW state) | 4K                              |
57  *  +----------------------------+---------------------------------+ <- __xe_lrc_regs_offset()
58  *  | Engine Context Image       | n * 4K, see                     |
59  *  |                            | xe_gt_lrc_size()                |
60  *  +----------------------------+---------------------------------+ <- __xe_lrc_indirect_ring_offset()
61  *  | Indirect Ring State Page   | 0 or 4k, see                    |
62  *  |                            | XE_LRC_FLAG_INDIRECT_RING_STATE |
63  *  +============================+=================================+ <- __xe_lrc_indirect_ctx_offset()
64  *  | Indirect Context Page      | 0 or 4k, see                    |
65  *  |                            | XE_LRC_FLAG_INDIRECT_CTX        |
66  *  +============================+=================================+ <- __xe_lrc_wa_bb_offset()
67  *  | WA BB Per Ctx              | 4k                              |
68  *  +============================+=================================+ <- xe_bo_size(lrc->bo)
69  */
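/*
 * Purely illustrative example of the layout above: for a copy engine on a
 * pre-Xe2 GT (no indirect ring state and no indirect context page) with
 * ring_size = SZ_16K, xe_gt_lrc_size() is 4K (PPHWSP) + 4K (context image),
 * so the ring sits at 0x0000, the PPHWSP at 0x4000, the context registers at
 * 0x5000 and the 4k WA BB at the end of the BO.
 */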
70 
71 static struct xe_device *
72 lrc_to_xe(struct xe_lrc *lrc)
73 {
74 	return gt_to_xe(lrc->fence_ctx.gt);
75 }
76 
77 static bool
78 gt_engine_needs_indirect_ctx(struct xe_gt *gt, enum xe_engine_class class)
79 {
80 	struct xe_device *xe = gt_to_xe(gt);
81 
82 	if (XE_GT_WA(gt, 16010904313) &&
83 	    (class == XE_ENGINE_CLASS_RENDER ||
84 	     class == XE_ENGINE_CLASS_COMPUTE))
85 		return true;
86 
87 	if (xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
88 					       class, NULL))
89 		return true;
90 
91 	return false;
92 }
93 
94 size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
95 {
96 	struct xe_device *xe = gt_to_xe(gt);
97 	size_t size;
98 
99 	/* Per-process HW status page (PPHWSP) */
100 	size = LRC_PPHWSP_SIZE;
101 
102 	/* Engine context image */
103 	switch (class) {
104 	case XE_ENGINE_CLASS_RENDER:
105 		if (GRAPHICS_VER(xe) >= 20)
106 			size += 3 * SZ_4K;
107 		else
108 			size += 13 * SZ_4K;
109 		break;
110 	case XE_ENGINE_CLASS_COMPUTE:
111 		if (GRAPHICS_VER(xe) >= 20)
112 			size += 2 * SZ_4K;
113 		else
114 			size += 13 * SZ_4K;
115 		break;
116 	default:
117 		WARN(1, "Unknown engine class: %d", class);
118 		fallthrough;
119 	case XE_ENGINE_CLASS_COPY:
120 	case XE_ENGINE_CLASS_VIDEO_DECODE:
121 	case XE_ENGINE_CLASS_VIDEO_ENHANCE:
122 	case XE_ENGINE_CLASS_OTHER:
123 		size += 1 * SZ_4K;
124 	}
125 
126 	/* Add indirect ring state page */
127 	if (xe_gt_has_indirect_ring_state(gt))
128 		size += LRC_INDIRECT_RING_STATE_SIZE;
129 
130 	return size;
131 }
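/*
 * Illustrative sizes from the above: a render engine on a Xe2+ GT with
 * indirect ring state needs 4K (PPHWSP) + 3 * 4K (context image) + 4K
 * (indirect ring state) = 20K, while a copy engine on a pre-Xe2 GT without
 * indirect ring state needs 4K + 4K = 8K.
 */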
132 
133 /*
134  * The per-platform tables are u8-encoded in @data. Decode @data and set the
135  * addresses' offset and commands in @regs. The following encoding is used
136  * for each byte. There are 2 steps: decoding commands and decoding addresses.
137  *
138  * Commands:
139  * [7]: create NOPs - number of NOPs is set in lower bits
140  * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set
141  *      MI_LRI_FORCE_POSTED
142  * [5:0]: Number of NOPs or registers to set values to in case of
143  *        MI_LOAD_REGISTER_IMM
144  *
145  * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
146  * number of registers. They are set by using the REG/REG16 macros: the former
147  * is used for offsets smaller than 0x200 while the latter is for values bigger
148  * than that. Those macros already set all the bits documented below correctly:
149  *
150  * [7]: When a register offset needs more than 6 bits, additional bytes
151  *      follow for the lower bits
152  * [6:0]: Register offset, without considering the engine base.
153  *
154  * This function only tweaks the commands and register offsets. Values are not
155  * filled out.
156  */
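/*
 * Worked example (illustrative) using the first bytes of gen12_xcs_offsets
 * below: NOP(1) encodes as 0x81, so one dword of @regs is skipped.
 * LRI(13, POSTED) encodes as 0x4d, emitting MI_LOAD_REGISTER_IMM with 13
 * register slots and MI_LRI_FORCE_POSTED set. REG16(0x244) encodes as the
 * two bytes 0x81, 0x11, which decode to offset 0x91 and thus the register
 * address base + (0x91 << 2) = base + 0x244; REG(0x034) encodes as the
 * single byte 0x0d, giving base + 0x034.
 */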
157 static void set_offsets(u32 *regs,
158 			const u8 *data,
159 			const struct xe_hw_engine *hwe)
160 #define NOP(x) (BIT(7) | (x))
161 #define LRI(count, flags) ((flags) << 6 | (count) | \
162 			   BUILD_BUG_ON_ZERO(count >= BIT(6)))
163 #define POSTED BIT(0)
164 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
165 #define REG16(x) \
166 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
167 	(((x) >> 2) & 0x7f)
168 {
169 	const u32 base = hwe->mmio_base;
170 
171 	while (*data) {
172 		u8 count, flags;
173 
174 		if (*data & BIT(7)) { /* skip */
175 			count = *data++ & ~BIT(7);
176 			regs += count;
177 			continue;
178 		}
179 
180 		count = *data & 0x3f;
181 		flags = *data >> 6;
182 		data++;
183 
184 		*regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
185 		if (flags & POSTED)
186 			*regs |= MI_LRI_FORCE_POSTED;
187 		*regs |= MI_LRI_LRM_CS_MMIO;
188 		regs++;
189 
190 		xe_gt_assert(hwe->gt, count);
191 		do {
192 			u32 offset = 0;
193 			u8 v;
194 
195 			do {
196 				v = *data++;
197 				offset <<= 7;
198 				offset |= v & ~BIT(7);
199 			} while (v & BIT(7));
200 
201 			regs[0] = base + (offset << 2);
202 			regs += 2;
203 		} while (--count);
204 	}
205 
206 	*regs = MI_BATCH_BUFFER_END | BIT(0);
207 }
208 
209 static const u8 gen12_xcs_offsets[] = {
210 	NOP(1),
211 	LRI(13, POSTED),
212 	REG16(0x244),
213 	REG(0x034),
214 	REG(0x030),
215 	REG(0x038),
216 	REG(0x03c),
217 	REG(0x168),
218 	REG(0x140),
219 	REG(0x110),
220 	REG(0x1c0),
221 	REG(0x1c4),
222 	REG(0x1c8),
223 	REG(0x180),
224 	REG16(0x2b4),
225 
226 	NOP(5),
227 	LRI(9, POSTED),
228 	REG16(0x3a8),
229 	REG16(0x28c),
230 	REG16(0x288),
231 	REG16(0x284),
232 	REG16(0x280),
233 	REG16(0x27c),
234 	REG16(0x278),
235 	REG16(0x274),
236 	REG16(0x270),
237 
238 	0
239 };
240 
241 static const u8 dg2_xcs_offsets[] = {
242 	NOP(1),
243 	LRI(15, POSTED),
244 	REG16(0x244),
245 	REG(0x034),
246 	REG(0x030),
247 	REG(0x038),
248 	REG(0x03c),
249 	REG(0x168),
250 	REG(0x140),
251 	REG(0x110),
252 	REG(0x1c0),
253 	REG(0x1c4),
254 	REG(0x1c8),
255 	REG(0x180),
256 	REG16(0x2b4),
257 	REG(0x120),
258 	REG(0x124),
259 
260 	NOP(1),
261 	LRI(9, POSTED),
262 	REG16(0x3a8),
263 	REG16(0x28c),
264 	REG16(0x288),
265 	REG16(0x284),
266 	REG16(0x280),
267 	REG16(0x27c),
268 	REG16(0x278),
269 	REG16(0x274),
270 	REG16(0x270),
271 
272 	0
273 };
274 
275 static const u8 gen12_rcs_offsets[] = {
276 	NOP(1),
277 	LRI(13, POSTED),
278 	REG16(0x244),
279 	REG(0x034),
280 	REG(0x030),
281 	REG(0x038),
282 	REG(0x03c),
283 	REG(0x168),
284 	REG(0x140),
285 	REG(0x110),
286 	REG(0x1c0),
287 	REG(0x1c4),
288 	REG(0x1c8),
289 	REG(0x180),
290 	REG16(0x2b4),
291 
292 	NOP(5),
293 	LRI(9, POSTED),
294 	REG16(0x3a8),
295 	REG16(0x28c),
296 	REG16(0x288),
297 	REG16(0x284),
298 	REG16(0x280),
299 	REG16(0x27c),
300 	REG16(0x278),
301 	REG16(0x274),
302 	REG16(0x270),
303 
304 	LRI(3, POSTED),
305 	REG(0x1b0),
306 	REG16(0x5a8),
307 	REG16(0x5ac),
308 
309 	NOP(6),
310 	LRI(1, 0),
311 	REG(0x0c8),
312 	NOP(3 + 9 + 1),
313 
314 	LRI(51, POSTED),
315 	REG16(0x588),
316 	REG16(0x588),
317 	REG16(0x588),
318 	REG16(0x588),
319 	REG16(0x588),
320 	REG16(0x588),
321 	REG(0x028),
322 	REG(0x09c),
323 	REG(0x0c0),
324 	REG(0x178),
325 	REG(0x17c),
326 	REG16(0x358),
327 	REG(0x170),
328 	REG(0x150),
329 	REG(0x154),
330 	REG(0x158),
331 	REG16(0x41c),
332 	REG16(0x600),
333 	REG16(0x604),
334 	REG16(0x608),
335 	REG16(0x60c),
336 	REG16(0x610),
337 	REG16(0x614),
338 	REG16(0x618),
339 	REG16(0x61c),
340 	REG16(0x620),
341 	REG16(0x624),
342 	REG16(0x628),
343 	REG16(0x62c),
344 	REG16(0x630),
345 	REG16(0x634),
346 	REG16(0x638),
347 	REG16(0x63c),
348 	REG16(0x640),
349 	REG16(0x644),
350 	REG16(0x648),
351 	REG16(0x64c),
352 	REG16(0x650),
353 	REG16(0x654),
354 	REG16(0x658),
355 	REG16(0x65c),
356 	REG16(0x660),
357 	REG16(0x664),
358 	REG16(0x668),
359 	REG16(0x66c),
360 	REG16(0x670),
361 	REG16(0x674),
362 	REG16(0x678),
363 	REG16(0x67c),
364 	REG(0x068),
365 	REG(0x084),
366 	NOP(1),
367 
368 	0
369 };
370 
371 static const u8 xehp_rcs_offsets[] = {
372 	NOP(1),
373 	LRI(13, POSTED),
374 	REG16(0x244),
375 	REG(0x034),
376 	REG(0x030),
377 	REG(0x038),
378 	REG(0x03c),
379 	REG(0x168),
380 	REG(0x140),
381 	REG(0x110),
382 	REG(0x1c0),
383 	REG(0x1c4),
384 	REG(0x1c8),
385 	REG(0x180),
386 	REG16(0x2b4),
387 
388 	NOP(5),
389 	LRI(9, POSTED),
390 	REG16(0x3a8),
391 	REG16(0x28c),
392 	REG16(0x288),
393 	REG16(0x284),
394 	REG16(0x280),
395 	REG16(0x27c),
396 	REG16(0x278),
397 	REG16(0x274),
398 	REG16(0x270),
399 
400 	LRI(3, POSTED),
401 	REG(0x1b0),
402 	REG16(0x5a8),
403 	REG16(0x5ac),
404 
405 	NOP(6),
406 	LRI(1, 0),
407 	REG(0x0c8),
408 
409 	0
410 };
411 
412 static const u8 dg2_rcs_offsets[] = {
413 	NOP(1),
414 	LRI(15, POSTED),
415 	REG16(0x244),
416 	REG(0x034),
417 	REG(0x030),
418 	REG(0x038),
419 	REG(0x03c),
420 	REG(0x168),
421 	REG(0x140),
422 	REG(0x110),
423 	REG(0x1c0),
424 	REG(0x1c4),
425 	REG(0x1c8),
426 	REG(0x180),
427 	REG16(0x2b4),
428 	REG(0x120),
429 	REG(0x124),
430 
431 	NOP(1),
432 	LRI(9, POSTED),
433 	REG16(0x3a8),
434 	REG16(0x28c),
435 	REG16(0x288),
436 	REG16(0x284),
437 	REG16(0x280),
438 	REG16(0x27c),
439 	REG16(0x278),
440 	REG16(0x274),
441 	REG16(0x270),
442 
443 	LRI(3, POSTED),
444 	REG(0x1b0),
445 	REG16(0x5a8),
446 	REG16(0x5ac),
447 
448 	NOP(6),
449 	LRI(1, 0),
450 	REG(0x0c8),
451 
452 	0
453 };
454 
455 static const u8 mtl_rcs_offsets[] = {
456 	NOP(1),
457 	LRI(15, POSTED),
458 	REG16(0x244),
459 	REG(0x034),
460 	REG(0x030),
461 	REG(0x038),
462 	REG(0x03c),
463 	REG(0x168),
464 	REG(0x140),
465 	REG(0x110),
466 	REG(0x1c0),
467 	REG(0x1c4),
468 	REG(0x1c8),
469 	REG(0x180),
470 	REG16(0x2b4),
471 	REG(0x120),
472 	REG(0x124),
473 
474 	NOP(1),
475 	LRI(9, POSTED),
476 	REG16(0x3a8),
477 	REG16(0x28c),
478 	REG16(0x288),
479 	REG16(0x284),
480 	REG16(0x280),
481 	REG16(0x27c),
482 	REG16(0x278),
483 	REG16(0x274),
484 	REG16(0x270),
485 
486 	NOP(2),
487 	LRI(2, POSTED),
488 	REG16(0x5a8),
489 	REG16(0x5ac),
490 
491 	NOP(6),
492 	LRI(1, 0),
493 	REG(0x0c8),
494 
495 	0
496 };
497 
498 #define XE2_CTX_COMMON \
499 	NOP(1),                 /* [0x00] */ \
500 	LRI(15, POSTED),        /* [0x01] */ \
501 	REG16(0x244),           /* [0x02] CTXT_SR_CTL */ \
502 	REG(0x034),             /* [0x04] RING_BUFFER_HEAD */ \
503 	REG(0x030),             /* [0x06] RING_BUFFER_TAIL */ \
504 	REG(0x038),             /* [0x08] RING_BUFFER_START */ \
505 	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */ \
506 	REG(0x168),             /* [0x0c] BB_ADDR_UDW */ \
507 	REG(0x140),             /* [0x0e] BB_ADDR */ \
508 	REG(0x110),             /* [0x10] BB_STATE */ \
509 	REG(0x1c0),             /* [0x12] BB_PER_CTX_PTR */ \
510 	REG(0x1c4),             /* [0x14] RCS_INDIRECT_CTX */ \
511 	REG(0x1c8),             /* [0x16] RCS_INDIRECT_CTX_OFFSET */ \
512 	REG(0x180),             /* [0x18] CCID */ \
513 	REG16(0x2b4),           /* [0x1a] SEMAPHORE_TOKEN */ \
514 	REG(0x120),             /* [0x1c] PRT_BB_STATE */ \
515 	REG(0x124),             /* [0x1e] PRT_BB_STATE_UDW */ \
516 	\
517 	NOP(1),                 /* [0x20] */ \
518 	LRI(9, POSTED),         /* [0x21] */ \
519 	REG16(0x3a8),           /* [0x22] CTX_TIMESTAMP */ \
520 	REG16(0x3ac),           /* [0x24] CTX_TIMESTAMP_UDW */ \
521 	REG(0x108),             /* [0x26] INDIRECT_RING_STATE */ \
522 	REG16(0x284),           /* [0x28] dummy reg */ \
523 	REG16(0x280),           /* [0x2a] CS_ACC_CTR_THOLD */ \
524 	REG16(0x27c),           /* [0x2c] CS_CTX_SYS_PASID */ \
525 	REG16(0x278),           /* [0x2e] CS_CTX_ASID */ \
526 	REG16(0x274),           /* [0x30] PTBP_UDW */ \
527 	REG16(0x270)            /* [0x32] PTBP_LDW */
528 
529 static const u8 xe2_rcs_offsets[] = {
530 	XE2_CTX_COMMON,
531 
532 	NOP(2),                 /* [0x34] */
533 	LRI(2, POSTED),         /* [0x36] */
534 	REG16(0x5a8),           /* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
535 	REG16(0x5ac),           /* [0x39] PREEMPTION_STATUS */
536 
537 	NOP(6),                 /* [0x41] */
538 	LRI(1, 0),              /* [0x47] */
539 	REG(0x0c8),             /* [0x48] R_PWR_CLK_STATE */
540 
541 	0
542 };
543 
544 static const u8 xe2_bcs_offsets[] = {
545 	XE2_CTX_COMMON,
546 
547 	NOP(4 + 8 + 1),         /* [0x34] */
548 	LRI(2, POSTED),         /* [0x41] */
549 	REG16(0x200),           /* [0x42] BCS_SWCTRL */
550 	REG16(0x204),           /* [0x44] BLIT_CCTL */
551 
552 	0
553 };
554 
555 static const u8 xe2_xcs_offsets[] = {
556 	XE2_CTX_COMMON,
557 
558 	0
559 };
560 
561 static const u8 xe2_indirect_ring_state_offsets[] = {
562 	NOP(1),                 /* [0x00] */
563 	LRI(5, POSTED),         /* [0x01] */
564 	REG(0x034),             /* [0x02] RING_BUFFER_HEAD */
565 	REG(0x030),             /* [0x04] RING_BUFFER_TAIL */
566 	REG(0x038),             /* [0x06] RING_BUFFER_START */
567 	REG(0x048),             /* [0x08] RING_BUFFER_START_UDW */
568 	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */
569 
570 	NOP(5),                 /* [0x0c] */
571 	LRI(9, POSTED),         /* [0x11] */
572 	REG(0x168),             /* [0x12] BB_ADDR_UDW */
573 	REG(0x140),             /* [0x14] BB_ADDR */
574 	REG(0x110),             /* [0x16] BB_STATE */
575 	REG16(0x588),           /* [0x18] BB_STACK_WRITE_PORT */
576 	REG16(0x588),           /* [0x20] BB_STACK_WRITE_PORT */
577 	REG16(0x588),           /* [0x22] BB_STACK_WRITE_PORT */
578 	REG16(0x588),           /* [0x24] BB_STACK_WRITE_PORT */
579 	REG16(0x588),           /* [0x26] BB_STACK_WRITE_PORT */
580 	REG16(0x588),           /* [0x28] BB_STACK_WRITE_PORT */
581 
582 	NOP(12),                 /* [0x00] */
583 
584 	0
585 };
586 
587 #undef REG16
588 #undef REG
589 #undef LRI
590 #undef NOP
591 
592 static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
593 {
594 	if (class == XE_ENGINE_CLASS_RENDER) {
595 		if (GRAPHICS_VER(xe) >= 20)
596 			return xe2_rcs_offsets;
597 		else if (GRAPHICS_VERx100(xe) >= 1270)
598 			return mtl_rcs_offsets;
599 		else if (GRAPHICS_VERx100(xe) >= 1255)
600 			return dg2_rcs_offsets;
601 		else if (GRAPHICS_VERx100(xe) >= 1250)
602 			return xehp_rcs_offsets;
603 		else
604 			return gen12_rcs_offsets;
605 	} else if (class == XE_ENGINE_CLASS_COPY) {
606 		if (GRAPHICS_VER(xe) >= 20)
607 			return xe2_bcs_offsets;
608 		else
609 			return gen12_xcs_offsets;
610 	} else {
611 		if (GRAPHICS_VER(xe) >= 20)
612 			return xe2_xcs_offsets;
613 		else if (GRAPHICS_VERx100(xe) >= 1255)
614 			return dg2_xcs_offsets;
615 		else
616 			return gen12_xcs_offsets;
617 	}
618 }
619 
620 static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
621 {
622 	regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
623 						       CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
624 
625 	if (xe_gt_has_indirect_ring_state(hwe->gt))
626 		regs[CTX_CONTEXT_CONTROL] |=
627 			_MASKED_BIT_ENABLE(CTX_CTRL_INDIRECT_RING_STATE_ENABLE);
628 }
629 
630 static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe)
631 {
632 	struct xe_memirq *memirq = &gt_to_tile(hwe->gt)->memirq;
633 	struct xe_device *xe = gt_to_xe(hwe->gt);
634 	u8 num_regs;
635 
636 	if (!xe_device_uses_memirq(xe))
637 		return;
638 
639 	regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM |
640 					MI_LRI_LRM_CS_MMIO | MI_LRM_USE_GGTT;
641 	regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr;
642 	regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq);
643 
644 	num_regs = xe_device_has_msix(xe) ? 3 : 2;
645 	regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(num_regs) |
646 				       MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED;
647 	regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr;
648 	regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq, hwe);
649 	regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr;
650 	regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq, hwe);
651 
652 	if (xe_device_has_msix(xe)) {
653 		regs[CTX_CS_INT_VEC_REG] = CS_INT_VEC(0).addr;
654 		/* CTX_CS_INT_VEC_DATA will be set in xe_lrc_init */
655 	}
656 }
657 
658 static int lrc_ring_mi_mode(struct xe_hw_engine *hwe)
659 {
660 	struct xe_device *xe = gt_to_xe(hwe->gt);
661 
662 	if (GRAPHICS_VERx100(xe) >= 1250)
663 		return 0x70;
664 	else
665 		return 0x60;
666 }
667 
668 static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe)
669 {
670 	int x;
671 
672 	x = lrc_ring_mi_mode(hwe);
673 	regs[x + 1] &= ~STOP_RING;
674 	regs[x + 1] |= STOP_RING << 16;
675 }
676 
677 static inline bool xe_lrc_has_indirect_ring_state(struct xe_lrc *lrc)
678 {
679 	return lrc->flags & XE_LRC_FLAG_INDIRECT_RING_STATE;
680 }
681 
682 static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
683 {
684 	return 0;
685 }
686 
687 u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
688 {
689 	return lrc->ring.size;
690 }
691 
692 /* Make the magic macros work */
693 #define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset
694 #define __xe_lrc_regs_offset xe_lrc_regs_offset
695 
696 #define LRC_SEQNO_PPHWSP_OFFSET 512
697 #define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
698 #define LRC_CTX_JOB_TIMESTAMP_OFFSET (LRC_START_SEQNO_PPHWSP_OFFSET + 8)
699 #define LRC_ENGINE_ID_PPHWSP_OFFSET 1024
700 #define LRC_PARALLEL_PPHWSP_OFFSET 2048
701 
702 u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
703 {
704 	return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
705 }
706 
707 /**
708  * xe_lrc_reg_size() - Get size of the LRC registers area within queues
709  * @xe: the &xe_device struct instance
710  *
711  * Returns: Size of the LRC registers area for current platform
712  */
713 size_t xe_lrc_reg_size(struct xe_device *xe)
714 {
715 	if (GRAPHICS_VERx100(xe) >= 1250)
716 		return 96 * sizeof(u32);
717 	else
718 		return 80 * sizeof(u32);
719 }
720 
721 size_t xe_lrc_skip_size(struct xe_device *xe)
722 {
723 	return LRC_PPHWSP_SIZE + xe_lrc_reg_size(xe);
724 }
725 
726 static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
727 {
728 	/* The seqno is stored in the driver-defined portion of PPHWSP */
729 	return xe_lrc_pphwsp_offset(lrc) + LRC_SEQNO_PPHWSP_OFFSET;
730 }
731 
732 static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
733 {
734 	/* The start seqno is stored in the driver-defined portion of PPHWSP */
735 	return xe_lrc_pphwsp_offset(lrc) + LRC_START_SEQNO_PPHWSP_OFFSET;
736 }
737 
738 static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc)
739 {
740 	/* This is stored in the driver-defined portion of PPHWSP */
741 	return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET;
742 }
743 
744 static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
745 {
746 	/* The parallel area is stored in the driver-defined portion of PPHWSP */
747 	return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
748 }
749 
750 static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc)
751 {
752 	return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET;
753 }
754 
755 static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
756 {
757 	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
758 }
759 
760 static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc)
761 {
762 	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32);
763 }
764 
765 static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
766 {
767 	u32 offset = xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE -
768 		     LRC_INDIRECT_RING_STATE_SIZE;
769 
770 	if (lrc->flags & XE_LRC_FLAG_INDIRECT_CTX)
771 		offset -= LRC_INDIRECT_CTX_BO_SIZE;
772 
773 	return offset;
774 }
775 
776 static inline u32 __xe_lrc_indirect_ctx_offset(struct xe_lrc *lrc)
777 {
778 	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE - LRC_INDIRECT_CTX_BO_SIZE;
779 }
780 
781 static inline u32 __xe_lrc_wa_bb_offset(struct xe_lrc *lrc)
782 {
783 	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE;
784 }
785 
786 #define DECL_MAP_ADDR_HELPERS(elem) \
787 static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \
788 { \
789 	struct iosys_map map = lrc->bo->vmap; \
790 \
791 	xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map));  \
792 	iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \
793 	return map; \
794 } \
795 static inline u32 __maybe_unused __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \
796 { \
797 	return xe_bo_ggtt_addr(lrc->bo) + __xe_lrc_##elem##_offset(lrc); \
798 } \
799 
800 DECL_MAP_ADDR_HELPERS(ring)
801 DECL_MAP_ADDR_HELPERS(pphwsp)
802 DECL_MAP_ADDR_HELPERS(seqno)
803 DECL_MAP_ADDR_HELPERS(regs)
804 DECL_MAP_ADDR_HELPERS(start_seqno)
805 DECL_MAP_ADDR_HELPERS(ctx_job_timestamp)
806 DECL_MAP_ADDR_HELPERS(ctx_timestamp)
807 DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw)
808 DECL_MAP_ADDR_HELPERS(parallel)
809 DECL_MAP_ADDR_HELPERS(indirect_ring)
810 DECL_MAP_ADDR_HELPERS(engine_id)
811 
812 #undef DECL_MAP_ADDR_HELPERS
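/*
 * For reference, DECL_MAP_ADDR_HELPERS(seqno) above expands to roughly:
 *
 *	static inline struct iosys_map __xe_lrc_seqno_map(struct xe_lrc *lrc)
 *	{
 *		struct iosys_map map = lrc->bo->vmap;
 *
 *		xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map));
 *		iosys_map_incr(&map, __xe_lrc_seqno_offset(lrc));
 *		return map;
 *	}
 *
 *	static inline u32 __xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
 *	{
 *		return xe_bo_ggtt_addr(lrc->bo) + __xe_lrc_seqno_offset(lrc);
 *	}
 */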
813 
814 /**
815  * xe_lrc_ctx_timestamp_ggtt_addr() - Get ctx timestamp GGTT address
816  * @lrc: Pointer to the lrc.
817  *
818  * Returns: ctx timestamp GGTT address
819  */
820 u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc)
821 {
822 	return __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
823 }
824 
825 /**
826  * xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address
827  * @lrc: Pointer to the lrc.
828  *
829  * Returns: ctx timestamp udw GGTT address
830  */
831 u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc)
832 {
833 	return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
834 }
835 
836 /**
837  * xe_lrc_ctx_timestamp() - Read ctx timestamp value
838  * @lrc: Pointer to the lrc.
839  *
840  * Returns: ctx timestamp value
841  */
842 u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
843 {
844 	struct xe_device *xe = lrc_to_xe(lrc);
845 	struct iosys_map map;
846 	u32 ldw, udw = 0;
847 
848 	map = __xe_lrc_ctx_timestamp_map(lrc);
849 	ldw = xe_map_read32(xe, &map);
850 
851 	if (xe->info.has_64bit_timestamp) {
852 		map = __xe_lrc_ctx_timestamp_udw_map(lrc);
853 		udw = xe_map_read32(xe, &map);
854 	}
855 
856 	return (u64)udw << 32 | ldw;
857 }
858 
859 /**
860  * xe_lrc_ctx_job_timestamp_ggtt_addr() - Get ctx job timestamp GGTT address
861  * @lrc: Pointer to the lrc.
862  *
863  * Returns: ctx job timestamp GGTT address
864  */
865 u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc)
866 {
867 	return __xe_lrc_ctx_job_timestamp_ggtt_addr(lrc);
868 }
869 
870 /**
871  * xe_lrc_ctx_job_timestamp() - Read ctx job timestamp value
872  * @lrc: Pointer to the lrc.
873  *
874  * Returns: ctx job timestamp value
875  */
876 u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc)
877 {
878 	struct xe_device *xe = lrc_to_xe(lrc);
879 	struct iosys_map map;
880 
881 	map = __xe_lrc_ctx_job_timestamp_map(lrc);
882 	return xe_map_read32(xe, &map);
883 }
884 
885 u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
886 {
887 	return __xe_lrc_pphwsp_ggtt_addr(lrc);
888 }
889 
890 u32 xe_lrc_indirect_ring_ggtt_addr(struct xe_lrc *lrc)
891 {
892 	if (!xe_lrc_has_indirect_ring_state(lrc))
893 		return 0;
894 
895 	return __xe_lrc_indirect_ring_ggtt_addr(lrc);
896 }
897 
898 static u32 xe_lrc_read_indirect_ctx_reg(struct xe_lrc *lrc, int reg_nr)
899 {
900 	struct xe_device *xe = lrc_to_xe(lrc);
901 	struct iosys_map map;
902 
903 	map = __xe_lrc_indirect_ring_map(lrc);
904 	iosys_map_incr(&map, reg_nr * sizeof(u32));
905 	return xe_map_read32(xe, &map);
906 }
907 
908 static void xe_lrc_write_indirect_ctx_reg(struct xe_lrc *lrc,
909 					  int reg_nr, u32 val)
910 {
911 	struct xe_device *xe = lrc_to_xe(lrc);
912 	struct iosys_map map;
913 
914 	map = __xe_lrc_indirect_ring_map(lrc);
915 	iosys_map_incr(&map, reg_nr * sizeof(u32));
916 	xe_map_write32(xe, &map, val);
917 }
918 
919 u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
920 {
921 	struct xe_device *xe = lrc_to_xe(lrc);
922 	struct iosys_map map;
923 
924 	map = __xe_lrc_regs_map(lrc);
925 	iosys_map_incr(&map, reg_nr * sizeof(u32));
926 	return xe_map_read32(xe, &map);
927 }
928 
929 void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
930 {
931 	struct xe_device *xe = lrc_to_xe(lrc);
932 	struct iosys_map map;
933 
934 	map = __xe_lrc_regs_map(lrc);
935 	iosys_map_incr(&map, reg_nr * sizeof(u32));
936 	xe_map_write32(xe, &map, val);
937 }
938 
939 static void *empty_lrc_data(struct xe_hw_engine *hwe)
940 {
941 	struct xe_gt *gt = hwe->gt;
942 	void *data;
943 	u32 *regs;
944 
945 	data = kzalloc(xe_gt_lrc_size(gt, hwe->class), GFP_KERNEL);
946 	if (!data)
947 		return NULL;
948 
949 	/* 1st page: Per-Process HW Status Page (PPHWSP) */
950 	regs = data + LRC_PPHWSP_SIZE;
951 	set_offsets(regs, reg_offsets(gt_to_xe(gt), hwe->class), hwe);
952 	set_context_control(regs, hwe);
953 	set_memory_based_intr(regs, hwe);
954 	reset_stop_ring(regs, hwe);
955 	if (xe_gt_has_indirect_ring_state(gt)) {
956 		regs = data + xe_gt_lrc_size(gt, hwe->class) -
957 		       LRC_INDIRECT_RING_STATE_SIZE;
958 		set_offsets(regs, xe2_indirect_ring_state_offsets, hwe);
959 	}
960 
961 	return data;
962 }
963 
964 /**
965  * xe_default_lrc_update_memirq_regs_with_address - Re-compute GGTT references in default LRC
966  * of given engine.
967  * @hwe: the &xe_hw_engine struct instance
968  */
969 void xe_default_lrc_update_memirq_regs_with_address(struct xe_hw_engine *hwe)
970 {
971 	struct xe_gt *gt = hwe->gt;
972 	u32 *regs;
973 
974 	if (!gt->default_lrc[hwe->class])
975 		return;
976 
977 	regs = gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE;
978 	set_memory_based_intr(regs, hwe);
979 }
980 
981 /**
982  * xe_lrc_update_memirq_regs_with_address - Re-compute GGTT references in mem interrupt data
983  * for given LRC.
984  * @lrc: the &xe_lrc struct instance
985  * @hwe: the &xe_hw_engine struct instance
986  * @regs: scratch buffer to be used as temporary storage
987  */
988 void xe_lrc_update_memirq_regs_with_address(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
989 					    u32 *regs)
990 {
991 	struct xe_gt *gt = hwe->gt;
992 	struct iosys_map map;
993 	size_t regs_len;
994 
995 	if (!xe_device_uses_memirq(gt_to_xe(gt)))
996 		return;
997 
998 	map = __xe_lrc_regs_map(lrc);
999 	regs_len = xe_lrc_reg_size(gt_to_xe(gt));
1000 	xe_map_memcpy_from(gt_to_xe(gt), regs, &map, 0, regs_len);
1001 	set_memory_based_intr(regs, hwe);
1002 	xe_map_memcpy_to(gt_to_xe(gt), &map, 0, regs, regs_len);
1003 }
1004 
1005 static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
1006 {
1007 	u64 desc = xe_vm_pdp4_descriptor(vm, gt_to_tile(lrc->gt));
1008 
1009 	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
1010 	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
1011 }
1012 
1013 static void xe_lrc_finish(struct xe_lrc *lrc)
1014 {
1015 	xe_hw_fence_ctx_finish(&lrc->fence_ctx);
1016 	xe_bo_unpin_map_no_vm(lrc->bo);
1017 }
1018 
1019 /*
1020  * setup_utilization_wa() - Write commands to wa bb to assist
1021  * in calculating active context run ticks.
1022  *
1023  * Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the
1024  * context, but only gets updated when the context switches out. In order to
1025  * check how long a context has been active before it switches out, two things
1026  * are required:
1027  *
1028  * (1) Determine if the context is running:
1029  * To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in
1030  * the LRC. The value chosen is 1 since 0 is the initial value when the LRC is
1031  * initialized. During a query, we just check for this value to determine if the
1032  * context is active. If the context switched out, it would overwrite this
1033  * location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as
1034  * the last part of context restore, so reusing this LRC location will not
1035  * clobber anything.
1036  *
1037  * (2) Calculate the time that the context has been active for:
1038  * The CTX_TIMESTAMP ticks only when the context is active. If a context is
1039  * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization.
1040  * While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific
1041  * engine instance. Since we do not know which instance the context is running
1042  * on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and
1043  * store it in the PPHWSP.
1044  */
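/*
 * Illustrative query-side sketch (not the exact driver code): a reader first
 * loads the saved CTX_TIMESTAMP from the LRC; if it still equals
 * CONTEXT_ACTIVE the context has not switched out yet, so the current run
 * ticks are taken from the RING_CTX_TIMESTAMP MMIO of the engine instance
 * recorded in the PPHWSP by the WA BB; otherwise the LRC value is already
 * up to date.
 */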
1045 #define CONTEXT_ACTIVE 1ULL
1046 static ssize_t setup_utilization_wa(struct xe_lrc *lrc,
1047 				    struct xe_hw_engine *hwe,
1048 				    u32 *batch,
1049 				    size_t max_len)
1050 {
1051 	u32 *cmd = batch;
1052 
1053 	if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
1054 		return -ENOSPC;
1055 
1056 	*cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET;
1057 	*cmd++ = ENGINE_ID(0).addr;
1058 	*cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc);
1059 	*cmd++ = 0;
1060 
1061 	*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
1062 	*cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
1063 	*cmd++ = 0;
1064 	*cmd++ = lower_32_bits(CONTEXT_ACTIVE);
1065 
1066 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp) {
1067 		*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
1068 		*cmd++ = __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
1069 		*cmd++ = 0;
1070 		*cmd++ = upper_32_bits(CONTEXT_ACTIVE);
1071 	}
1072 
1073 	return cmd - batch;
1074 }
1075 
1076 static ssize_t setup_timestamp_wa(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
1077 				  u32 *batch, size_t max_len)
1078 {
1079 	const u32 ts_addr = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
1080 	u32 *cmd = batch;
1081 
1082 	if (!XE_GT_WA(lrc->gt, 16010904313) ||
1083 	    !(hwe->class == XE_ENGINE_CLASS_RENDER ||
1084 	      hwe->class == XE_ENGINE_CLASS_COMPUTE ||
1085 	      hwe->class == XE_ENGINE_CLASS_COPY ||
1086 	      hwe->class == XE_ENGINE_CLASS_VIDEO_DECODE ||
1087 	      hwe->class == XE_ENGINE_CLASS_VIDEO_ENHANCE))
1088 		return 0;
1089 
1090 	if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
1091 		return -ENOSPC;
1092 
1093 	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
1094 		 MI_LRM_ASYNC;
1095 	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
1096 	*cmd++ = ts_addr;
1097 	*cmd++ = 0;
1098 
1099 	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
1100 		 MI_LRM_ASYNC;
1101 	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
1102 	*cmd++ = ts_addr;
1103 	*cmd++ = 0;
1104 
1105 	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO;
1106 	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
1107 	*cmd++ = ts_addr;
1108 	*cmd++ = 0;
1109 
1110 	return cmd - batch;
1111 }
1112 
1113 static ssize_t setup_configfs_post_ctx_restore_bb(struct xe_lrc *lrc,
1114 						  struct xe_hw_engine *hwe,
1115 						  u32 *batch, size_t max_len)
1116 {
1117 	struct xe_device *xe = gt_to_xe(lrc->gt);
1118 	const u32 *user_batch;
1119 	u32 *cmd = batch;
1120 	u32 count;
1121 
1122 	count = xe_configfs_get_ctx_restore_post_bb(to_pci_dev(xe->drm.dev),
1123 						    hwe->class, &user_batch);
1124 	if (!count)
1125 		return 0;
1126 
1127 	if (count > max_len)
1128 		return -ENOSPC;
1129 
1130 	/*
1131 	 * This should be used only for tests and validation. Taint the kernel
1132 	 * as anything could be submitted directly in context switches
1133 	 */
1134 	add_taint(TAINT_TEST, LOCKDEP_STILL_OK);
1135 
1136 	memcpy(cmd, user_batch, count * sizeof(u32));
1137 	cmd += count;
1138 
1139 	return cmd - batch;
1140 }
1141 
1142 static ssize_t setup_configfs_mid_ctx_restore_bb(struct xe_lrc *lrc,
1143 						 struct xe_hw_engine *hwe,
1144 						 u32 *batch, size_t max_len)
1145 {
1146 	struct xe_device *xe = gt_to_xe(lrc->gt);
1147 	const u32 *user_batch;
1148 	u32 *cmd = batch;
1149 	u32 count;
1150 
1151 	count = xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
1152 						   hwe->class, &user_batch);
1153 	if (!count)
1154 		return 0;
1155 
1156 	if (count > max_len)
1157 		return -ENOSPC;
1158 
1159 	/*
1160 	 * This should be used only for tests and validation. Taint the kernel
1161 	 * as anything could be submitted directly in context switches
1162 	 */
1163 	add_taint(TAINT_TEST, LOCKDEP_STILL_OK);
1164 
1165 	memcpy(cmd, user_batch, count * sizeof(u32));
1166 	cmd += count;
1167 
1168 	return cmd - batch;
1169 }
1170 
1171 static ssize_t setup_invalidate_state_cache_wa(struct xe_lrc *lrc,
1172 					       struct xe_hw_engine *hwe,
1173 					       u32 *batch, size_t max_len)
1174 {
1175 	u32 *cmd = batch;
1176 
1177 	if (!XE_GT_WA(lrc->gt, 18022495364) ||
1178 	    hwe->class != XE_ENGINE_CLASS_RENDER)
1179 		return 0;
1180 
1181 	if (xe_gt_WARN_ON(lrc->gt, max_len < 3))
1182 		return -ENOSPC;
1183 
1184 	*cmd++ = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1);
1185 	*cmd++ = CS_DEBUG_MODE1(0).addr;
1186 	*cmd++ = _MASKED_BIT_ENABLE(INSTRUCTION_STATE_CACHE_INVALIDATE);
1187 
1188 	return cmd - batch;
1189 }
1190 
1191 struct bo_setup {
1192 	ssize_t (*setup)(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
1193 			 u32 *batch, size_t max_size);
1194 };
1195 
1196 struct bo_setup_state {
1197 	/* Input: */
1198 	struct xe_lrc		*lrc;
1199 	struct xe_hw_engine	*hwe;
1200 	size_t			max_size;
1201 	size_t                  reserve_dw;
1202 	unsigned int		offset;
1203 	const struct bo_setup	*funcs;
1204 	unsigned int		num_funcs;
1205 
1206 	/* State: */
1207 	u32			*buffer;
1208 	u32			*ptr;
1209 	unsigned int		written;
1210 };
1211 
1212 static int setup_bo(struct bo_setup_state *state)
1213 {
1214 	ssize_t remain;
1215 
1216 	if (state->lrc->bo->vmap.is_iomem) {
1217 		xe_gt_assert(state->hwe->gt, state->buffer);
1218 		state->ptr = state->buffer;
1219 	} else {
1220 		state->ptr = state->lrc->bo->vmap.vaddr + state->offset;
1221 	}
1222 
1223 	remain = state->max_size / sizeof(u32);
1224 
1225 	for (size_t i = 0; i < state->num_funcs; i++) {
1226 		ssize_t len = state->funcs[i].setup(state->lrc, state->hwe,
1227 						    state->ptr, remain);
1228 
1229 		remain -= len;
1230 
1231 		/*
1232 		 * Caller has asked for at least reserve_dw to remain unused.
1233 		 */
1234 		if (len < 0 ||
1235 		    xe_gt_WARN_ON(state->lrc->gt, remain < state->reserve_dw))
1236 			goto fail;
1237 
1238 		state->ptr += len;
1239 		state->written += len;
1240 	}
1241 
1242 	return 0;
1243 
1244 fail:
1245 	return -ENOSPC;
1246 }
1247 
1248 static void finish_bo(struct bo_setup_state *state)
1249 {
1250 	if (!state->lrc->bo->vmap.is_iomem)
1251 		return;
1252 
1253 	xe_map_memcpy_to(gt_to_xe(state->lrc->gt), &state->lrc->bo->vmap,
1254 			 state->offset, state->buffer,
1255 			 state->written * sizeof(u32));
1256 }
1257 
1258 /**
1259  * xe_lrc_setup_wa_bb_with_scratch - Execute all wa bb setup callbacks.
1260  * @lrc: the &xe_lrc struct instance
1261  * @hwe: the &xe_hw_engine struct instance
1262  * @scratch: preallocated scratch buffer for temporary storage
1263  * Return: 0 on success, negative error code on failure
1264  */
1265 int xe_lrc_setup_wa_bb_with_scratch(struct xe_lrc *lrc, struct xe_hw_engine *hwe, u32 *scratch)
1266 {
1267 	static const struct bo_setup funcs[] = {
1268 		{ .setup = setup_timestamp_wa },
1269 		{ .setup = setup_invalidate_state_cache_wa },
1270 		{ .setup = setup_utilization_wa },
1271 		{ .setup = setup_configfs_post_ctx_restore_bb },
1272 	};
1273 	struct bo_setup_state state = {
1274 		.lrc = lrc,
1275 		.hwe = hwe,
1276 		.max_size = LRC_WA_BB_SIZE,
1277 		.buffer = scratch,
1278 		.reserve_dw = 1,
1279 		.offset = __xe_lrc_wa_bb_offset(lrc),
1280 		.funcs = funcs,
1281 		.num_funcs = ARRAY_SIZE(funcs),
1282 	};
1283 	int ret;
1284 
1285 	ret = setup_bo(&state);
1286 	if (ret)
1287 		return ret;
1288 
1289 	*state.ptr++ = MI_BATCH_BUFFER_END;
1290 	state.written++;
1291 
1292 	finish_bo(&state);
1293 
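	/*
	 * Point BB_PER_CTX_PTR at the WA BB; bit 0 of the register marks the
	 * per-context batch as valid, hence the "+ 1" below.
	 */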
1294 	xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
1295 			     xe_bo_ggtt_addr(lrc->bo) + state.offset + 1);
1296 
1297 	return 0;
1298 }
1299 
1300 static int setup_wa_bb(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
1301 {
1302 	u32 *buf = NULL;
1303 	int ret;
1304 
1305 	if (lrc->bo->vmap.is_iomem) {
1306 		buf = kmalloc(LRC_WA_BB_SIZE, GFP_KERNEL);
1307 		if (!buf)
1308 			return -ENOMEM;
1309 	}
1310 
1311 	ret = xe_lrc_setup_wa_bb_with_scratch(lrc, hwe, buf);
1312 
1313 	kfree(buf);
1314 
1315 	return ret;
1316 }
1317 
1318 static int
1319 setup_indirect_ctx(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
1320 {
1321 	static const struct bo_setup rcs_funcs[] = {
1322 		{ .setup = setup_timestamp_wa },
1323 		{ .setup = setup_configfs_mid_ctx_restore_bb },
1324 	};
1325 	static const struct bo_setup xcs_funcs[] = {
1326 		{ .setup = setup_configfs_mid_ctx_restore_bb },
1327 	};
1328 	struct bo_setup_state state = {
1329 		.lrc = lrc,
1330 		.hwe = hwe,
1331 		.max_size = (63 * 64) /* max 63 cachelines */,
1332 		.buffer = NULL,
1333 		.offset = __xe_lrc_indirect_ctx_offset(lrc),
1334 	};
1335 	int ret;
1336 
1337 	if (!(lrc->flags & XE_LRC_FLAG_INDIRECT_CTX))
1338 		return 0;
1339 
1340 	if (hwe->class == XE_ENGINE_CLASS_RENDER ||
1341 	    hwe->class == XE_ENGINE_CLASS_COMPUTE) {
1342 		state.funcs = rcs_funcs;
1343 		state.num_funcs = ARRAY_SIZE(rcs_funcs);
1344 	} else {
1345 		state.funcs = xcs_funcs;
1346 		state.num_funcs = ARRAY_SIZE(xcs_funcs);
1347 	}
1348 
1349 	if (xe_gt_WARN_ON(lrc->gt, !state.funcs))
1350 		return 0;
1351 
1352 	if (lrc->bo->vmap.is_iomem) {
1353 		state.buffer = kmalloc(state.max_size, GFP_KERNEL);
1354 		if (!state.buffer)
1355 			return -ENOMEM;
1356 	}
1357 
1358 	ret = setup_bo(&state);
1359 	if (ret) {
1360 		kfree(state.buffer);
1361 		return ret;
1362 	}
1363 
1364 	/*
1365 	 * Align to 64B cacheline so there's no garbage at the end for CS to
1366 	 * execute: size for indirect ctx must be a multiple of 64.
1367 	 */
1368 	while (state.written & 0xf) {
1369 		*state.ptr++ = MI_NOOP;
1370 		state.written++;
1371 	}
1372 
1373 	finish_bo(&state);
1374 	kfree(state.buffer);
1375 
1376 	/*
1377 	 * Enable INDIRECT_CTX leaving INDIRECT_CTX_OFFSET at its default: it
1378 	 * varies per engine class, but the default is good enough
1379 	 */
1380 	xe_lrc_write_ctx_reg(lrc,
1381 			     CTX_CS_INDIRECT_CTX,
1382 			     (xe_bo_ggtt_addr(lrc->bo) + state.offset) |
1383 			     /* Size in CLs. */
1384 			     (state.written * sizeof(u32) / 64));
1385 
1386 	return 0;
1387 }
1388 
1389 static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
1390 		       struct xe_vm *vm, u32 ring_size, u16 msix_vec,
1391 		       u32 init_flags)
1392 {
1393 	struct xe_gt *gt = hwe->gt;
1394 	const u32 lrc_size = xe_gt_lrc_size(gt, hwe->class);
1395 	u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
1396 	struct xe_tile *tile = gt_to_tile(gt);
1397 	struct xe_device *xe = gt_to_xe(gt);
1398 	struct iosys_map map;
1399 	u32 arb_enable;
1400 	u32 bo_flags;
1401 	int err;
1402 
1403 	kref_init(&lrc->refcount);
1404 	lrc->gt = gt;
1405 	lrc->size = lrc_size;
1406 	lrc->flags = 0;
1407 	lrc->ring.size = ring_size;
1408 	lrc->ring.tail = 0;
1409 
1410 	if (gt_engine_needs_indirect_ctx(gt, hwe->class)) {
1411 		lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
1412 		bo_size += LRC_INDIRECT_CTX_BO_SIZE;
1413 	}
1414 
1415 	if (xe_gt_has_indirect_ring_state(gt))
1416 		lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;
1417 
1418 	bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
1419 		   XE_BO_FLAG_GGTT_INVALIDATE;
1420 
1421 	if ((vm && vm->xef) || init_flags & XE_LRC_CREATE_USER_CTX) /* userspace */
1422 		bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE | XE_BO_FLAG_FORCE_USER_VRAM;
1423 
1424 	lrc->bo = xe_bo_create_pin_map_novm(xe, tile,
1425 					    bo_size,
1426 					    ttm_bo_type_kernel,
1427 					    bo_flags, false);
1428 	if (IS_ERR(lrc->bo))
1429 		return PTR_ERR(lrc->bo);
1430 
1431 	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
1432 			     hwe->fence_irq, hwe->name);
1433 
1434 	/*
1435 	 * Init the Per-Process HW Status Page (PPHWSP) and the LRC/context state
1436 	 * to known values. If a primed default_lrc already exists, just copy it;
1437 	 * otherwise this is the early submission used to record the LRC, so build
1438 	 * a new empty one from scratch.
1439 	 */
1440 	map = __xe_lrc_pphwsp_map(lrc);
1441 	if (gt->default_lrc[hwe->class]) {
1442 		xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE);	/* PPHWSP */
1443 		xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
1444 				 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
1445 				 lrc_size - LRC_PPHWSP_SIZE);
1446 	} else {
1447 		void *init_data = empty_lrc_data(hwe);
1448 
1449 		if (!init_data) {
1450 			err = -ENOMEM;
1451 			goto err_lrc_finish;
1452 		}
1453 
1454 		xe_map_memcpy_to(xe, &map, 0, init_data, lrc_size);
1455 		kfree(init_data);
1456 	}
1457 
1458 	if (vm) {
1459 		xe_lrc_set_ppgtt(lrc, vm);
1460 
1461 		if (vm->xef)
1462 			xe_drm_client_add_bo(vm->xef->client, lrc->bo);
1463 	}
1464 
1465 	if (xe_device_has_msix(xe)) {
1466 		xe_lrc_write_ctx_reg(lrc, CTX_INT_STATUS_REPORT_PTR,
1467 				     xe_memirq_status_ptr(&tile->memirq, hwe));
1468 		xe_lrc_write_ctx_reg(lrc, CTX_INT_SRC_REPORT_PTR,
1469 				     xe_memirq_source_ptr(&tile->memirq, hwe));
1470 		xe_lrc_write_ctx_reg(lrc, CTX_CS_INT_VEC_DATA, msix_vec << 16 | msix_vec);
1471 	}
1472 
1473 	if (xe_gt_has_indirect_ring_state(gt)) {
1474 		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
1475 				     __xe_lrc_indirect_ring_ggtt_addr(lrc));
1476 
1477 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
1478 					      __xe_lrc_ring_ggtt_addr(lrc));
1479 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START_UDW, 0);
1480 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, 0);
1481 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, lrc->ring.tail);
1482 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_CTL,
1483 					      RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
1484 	} else {
1485 		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
1486 		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, 0);
1487 		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);
1488 		xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
1489 				     RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
1490 	}
1491 
1492 	if (init_flags & XE_LRC_CREATE_RUNALONE)
1493 		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
1494 				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
1495 				     _MASKED_BIT_ENABLE(CTX_CTRL_RUN_ALONE));
1496 
1497 	if (init_flags & XE_LRC_CREATE_PXP)
1498 		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
1499 				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
1500 				     _MASKED_BIT_ENABLE(CTX_CTRL_PXP_ENABLE));
1501 
1502 	lrc->ctx_timestamp = 0;
1503 	xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0);
1504 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
1505 		xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP_UDW, 0);
1506 
1507 	if (xe->info.has_asid && vm)
1508 		xe_lrc_write_ctx_reg(lrc, CTX_ASID, vm->usm.asid);
1509 
1510 	lrc->desc = LRC_VALID;
1511 	lrc->desc |= FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT);
1512 	/* TODO: Priority */
1513 
1514 	/* While this appears to have something about privileged batches or
1515 	 * some such, it really just means PPGTT mode.
1516 	 */
1517 	if (vm)
1518 		lrc->desc |= LRC_PRIVILEGE;
1519 
1520 	if (GRAPHICS_VERx100(xe) < 1250) {
1521 		lrc->desc |= FIELD_PREP(LRC_ENGINE_INSTANCE, hwe->instance);
1522 		lrc->desc |= FIELD_PREP(LRC_ENGINE_CLASS, hwe->class);
1523 	}
1524 
1525 	arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1526 	xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));
1527 
1528 	map = __xe_lrc_seqno_map(lrc);
1529 	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
1530 
1531 	map = __xe_lrc_start_seqno_map(lrc);
1532 	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
1533 
1534 	err = setup_wa_bb(lrc, hwe);
1535 	if (err)
1536 		goto err_lrc_finish;
1537 
1538 	err = setup_indirect_ctx(lrc, hwe);
1539 	if (err)
1540 		goto err_lrc_finish;
1541 
1542 	return 0;
1543 
1544 err_lrc_finish:
1545 	xe_lrc_finish(lrc);
1546 	return err;
1547 }
1548 
1549 /**
1550  * xe_lrc_create - Create a LRC
1551  * @hwe: Hardware Engine
1552  * @vm: The VM (address space)
1553  * @ring_size: LRC ring size
1554  * @msix_vec: MSI-X interrupt vector (for platforms that support it)
1555  * @flags: LRC initialization flags
1556  *
1557  * Allocate and initialize the Logical Ring Context (LRC).
1558  *
1559  * Return pointer to created LRC upon success and an error pointer
1560  * upon failure.
1561  */
1562 struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
1563 			     u32 ring_size, u16 msix_vec, u32 flags)
1564 {
1565 	struct xe_lrc *lrc;
1566 	int err;
1567 
1568 	lrc = kzalloc(sizeof(*lrc), GFP_KERNEL);
1569 	if (!lrc)
1570 		return ERR_PTR(-ENOMEM);
1571 
1572 	err = xe_lrc_init(lrc, hwe, vm, ring_size, msix_vec, flags);
1573 	if (err) {
1574 		kfree(lrc);
1575 		return ERR_PTR(err);
1576 	}
1577 
1578 	return lrc;
1579 }
1580 
1581 /**
1582  * xe_lrc_destroy - Destroy the LRC
1583  * @ref: reference to LRC
1584  *
1585  * Called when ref == 0, release resources held by the Logical Ring Context
1586  * (LRC) and free the LRC memory.
1587  */
1588 void xe_lrc_destroy(struct kref *ref)
1589 {
1590 	struct xe_lrc *lrc = container_of(ref, struct xe_lrc, refcount);
1591 
1592 	xe_lrc_finish(lrc);
1593 	kfree(lrc);
1594 }
1595 
1596 /**
1597  * xe_lrc_update_hwctx_regs_with_address - Re-compute GGTT references within given LRC.
1598  * @lrc: the &xe_lrc struct instance
1599  */
1600 void xe_lrc_update_hwctx_regs_with_address(struct xe_lrc *lrc)
1601 {
1602 	if (xe_lrc_has_indirect_ring_state(lrc)) {
1603 		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
1604 				     __xe_lrc_indirect_ring_ggtt_addr(lrc));
1605 
1606 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
1607 					      __xe_lrc_ring_ggtt_addr(lrc));
1608 	} else {
1609 		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
1610 	}
1611 }
1612 
1613 void xe_lrc_set_ring_tail(struct xe_lrc *lrc, u32 tail)
1614 {
1615 	if (xe_lrc_has_indirect_ring_state(lrc))
1616 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, tail);
1617 	else
1618 		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, tail);
1619 }
1620 
1621 u32 xe_lrc_ring_tail(struct xe_lrc *lrc)
1622 {
1623 	if (xe_lrc_has_indirect_ring_state(lrc))
1624 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL) & TAIL_ADDR;
1625 	else
1626 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL) & TAIL_ADDR;
1627 }
1628 
1629 static u32 xe_lrc_ring_start(struct xe_lrc *lrc)
1630 {
1631 	if (xe_lrc_has_indirect_ring_state(lrc))
1632 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START);
1633 	else
1634 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_START);
1635 }
1636 
1637 void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
1638 {
1639 	if (xe_lrc_has_indirect_ring_state(lrc))
1640 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, head);
1641 	else
1642 		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
1643 }
1644 
1645 u32 xe_lrc_ring_head(struct xe_lrc *lrc)
1646 {
1647 	if (xe_lrc_has_indirect_ring_state(lrc))
1648 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD) & HEAD_ADDR;
1649 	else
1650 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
1651 }
1652 
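/*
 * For illustration: with ring.size = SZ_16K (0x4000), head = 0x100 and
 * tail = 0x200, the computation below yields
 * ((0x100 - 0x200 - 1) & 0x3fff) + 1 = 0x3f00 bytes of free space; with
 * head == tail the whole ring is reported free.
 */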
1653 u32 xe_lrc_ring_space(struct xe_lrc *lrc)
1654 {
1655 	const u32 head = xe_lrc_ring_head(lrc);
1656 	const u32 tail = lrc->ring.tail;
1657 	const u32 size = lrc->ring.size;
1658 
1659 	return ((head - tail - 1) & (size - 1)) + 1;
1660 }
1661 
1662 static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
1663 				const void *data, size_t size)
1664 {
1665 	struct xe_device *xe = lrc_to_xe(lrc);
1666 
1667 	iosys_map_incr(&ring, lrc->ring.tail);
1668 	xe_map_memcpy_to(xe, &ring, 0, data, size);
1669 	lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
1670 }
1671 
1672 void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
1673 {
1674 	struct xe_device *xe = lrc_to_xe(lrc);
1675 	struct iosys_map ring;
1676 	u32 rhs;
1677 	size_t aligned_size;
1678 
1679 	xe_assert(xe, IS_ALIGNED(size, 4));
1680 	aligned_size = ALIGN(size, 8);
1681 
1682 	ring = __xe_lrc_ring_map(lrc);
1683 
1684 	xe_assert(xe, lrc->ring.tail < lrc->ring.size);
1685 	rhs = lrc->ring.size - lrc->ring.tail;
1686 	if (size > rhs) {
1687 		__xe_lrc_write_ring(lrc, ring, data, rhs);
1688 		__xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
1689 	} else {
1690 		__xe_lrc_write_ring(lrc, ring, data, size);
1691 	}
1692 
1693 	if (aligned_size > size) {
1694 		u32 noop = MI_NOOP;
1695 
1696 		__xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
1697 	}
1698 }
1699 
1700 u64 xe_lrc_descriptor(struct xe_lrc *lrc)
1701 {
1702 	return lrc->desc | xe_lrc_ggtt_addr(lrc);
1703 }
1704 
1705 u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
1706 {
1707 	return __xe_lrc_seqno_ggtt_addr(lrc);
1708 }
1709 
1710 /**
1711  * xe_lrc_alloc_seqno_fence() - Allocate an lrc seqno fence.
1712  *
1713  * Allocate but don't initialize an lrc seqno fence.
1714  *
1715  * Return: Pointer to the allocated fence or
1716  * negative error pointer on error.
1717  */
1718 struct dma_fence *xe_lrc_alloc_seqno_fence(void)
1719 {
1720 	return xe_hw_fence_alloc();
1721 }
1722 
1723 /**
1724  * xe_lrc_free_seqno_fence() - Free an lrc seqno fence.
1725  * @fence: Pointer to the fence to free.
1726  *
1727  * Frees an lrc seqno fence that hasn't yet been
1728  * initialized.
1729  */
1730 void xe_lrc_free_seqno_fence(struct dma_fence *fence)
1731 {
1732 	xe_hw_fence_free(fence);
1733 }
1734 
1735 /**
1736  * xe_lrc_init_seqno_fence() - Initialize an lrc seqno fence.
1737  * @lrc: Pointer to the lrc.
1738  * @fence: Pointer to the fence to initialize.
1739  *
1740  * Initializes a pre-allocated lrc seqno fence.
1741  * After initialization, the fence is subject to normal
1742  * dma-fence refcounting.
1743  */
1744 void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence)
1745 {
1746 	xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc));
1747 }
1748 
1749 s32 xe_lrc_seqno(struct xe_lrc *lrc)
1750 {
1751 	struct iosys_map map = __xe_lrc_seqno_map(lrc);
1752 
1753 	return xe_map_read32(lrc_to_xe(lrc), &map);
1754 }
1755 
1756 s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
1757 {
1758 	struct iosys_map map = __xe_lrc_start_seqno_map(lrc);
1759 
1760 	return xe_map_read32(lrc_to_xe(lrc), &map);
1761 }
1762 
1763 u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
1764 {
1765 	return __xe_lrc_start_seqno_ggtt_addr(lrc);
1766 }
1767 
1768 u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
1769 {
1770 	return __xe_lrc_parallel_ggtt_addr(lrc);
1771 }
1772 
1773 struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
1774 {
1775 	return __xe_lrc_parallel_map(lrc);
1776 }
1777 
1778 /**
1779  * xe_lrc_engine_id() - Read engine id value
1780  * @lrc: Pointer to the lrc.
1781  *
1782  * Returns: engine id value
1783  */
1784 static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
1785 {
1786 	struct xe_device *xe = lrc_to_xe(lrc);
1787 	struct iosys_map map;
1788 
1789 	map = __xe_lrc_engine_id_map(lrc);
1790 	return xe_map_read32(xe, &map);
1791 }
1792 
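/*
 * For example (illustrative): a GFXPIPE header that is neither a SINGLE_DW
 * opcode nor 3DSTATE_SO_DECL_LIST and carries the value 2 in its dword-length
 * field (bits 7:0) describes a 2 + 2 = 4 dword instruction.
 */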
1793 static int instr_dw(u32 cmd_header)
1794 {
1795 	/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
1796 	if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
1797 	    GFXPIPE_SINGLE_DW_CMD(0, 0))
1798 		return 1;
1799 
1800 	/* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
1801 	if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
1802 		return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;
1803 
1804 	/* Most instructions have the # of dwords (minus 2) in 7:0 */
1805 	return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
1806 }
1807 
1808 static int dump_mi_command(struct drm_printer *p,
1809 			   struct xe_gt *gt,
1810 			   u32 *dw,
1811 			   int remaining_dw)
1812 {
1813 	u32 inst_header = *dw;
1814 	u32 numdw = instr_dw(inst_header);
1815 	u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
1816 	int num_noop;
1817 
1818 	/* First check for commands that don't have/use a '# DW' field */
1819 	switch (inst_header & MI_OPCODE) {
1820 	case MI_NOOP:
1821 		num_noop = 1;
1822 		while (num_noop < remaining_dw &&
1823 		       (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
1824 			num_noop++;
1825 		drm_printf(p, "[%#010x] MI_NOOP (%d dwords)\n", inst_header, num_noop);
1826 		return num_noop;
1827 
1828 	case MI_TOPOLOGY_FILTER:
1829 		drm_printf(p, "[%#010x] MI_TOPOLOGY_FILTER\n", inst_header);
1830 		return 1;
1831 
1832 	case MI_BATCH_BUFFER_END:
1833 		drm_printf(p, "[%#010x] MI_BATCH_BUFFER_END\n", inst_header);
1834 		/* Return 'remaining_dw' to consume the rest of the LRC */
1835 		return remaining_dw;
1836 	}
1837 
1838 	/*
1839 	 * Any remaining commands include a # of dwords.  We should make sure
1840 	 * it doesn't exceed the remaining size of the LRC.
1841 	 */
1842 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
1843 		numdw = remaining_dw;
1844 
1845 	switch (inst_header & MI_OPCODE) {
1846 	case MI_LOAD_REGISTER_IMM:
1847 		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
1848 			   inst_header, (numdw - 1) / 2);
1849 		for (int i = 1; i < numdw; i += 2)
1850 			drm_printf(p, " - %#6x = %#010x\n", dw[i], dw[i + 1]);
1851 		return numdw;
1852 
1853 	case MI_LOAD_REGISTER_MEM & MI_OPCODE:
1854 		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_MEM: %s%s\n",
1855 			   inst_header,
1856 			   dw[0] & MI_LRI_LRM_CS_MMIO ? "CS_MMIO " : "",
1857 			   dw[0] & MI_LRM_USE_GGTT ? "USE_GGTT " : "");
1858 		if (numdw == 4)
1859 			drm_printf(p, " - %#6x = %#010llx\n",
1860 				   dw[1], ((u64)(dw[3]) << 32 | (u64)(dw[2])));
1861 		else
1862 			drm_printf(p, " - %*ph (%s)\n",
1863 				   (int)sizeof(u32) * (numdw - 1), dw + 1,
1864 				   numdw < 4 ? "truncated" : "malformed");
1865 		return numdw;
1866 
1867 	case MI_FORCE_WAKEUP:
1868 		drm_printf(p, "[%#010x] MI_FORCE_WAKEUP\n", inst_header);
1869 		return numdw;
1870 
1871 	default:
1872 		drm_printf(p, "[%#010x] unknown MI opcode %#x, likely %d dwords\n",
1873 			   inst_header, opcode, numdw);
1874 		return numdw;
1875 	}
1876 }
1877 
1878 static int dump_gfxpipe_command(struct drm_printer *p,
1879 				struct xe_gt *gt,
1880 				u32 *dw,
1881 				int remaining_dw)
1882 {
1883 	u32 numdw = instr_dw(*dw);
1884 	u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
1885 	u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
1886 	u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);
1887 
1888 	/*
1889 	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
1890 	 * remaining size of the LRC.
1891 	 */
1892 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
1893 		numdw = remaining_dw;
1894 
1895 	switch (*dw & GFXPIPE_MATCH_MASK) {
1896 #define MATCH(cmd) \
1897 	case cmd: \
1898 		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
1899 		return numdw
1900 #define MATCH3D(cmd) \
1901 	case CMD_##cmd: \
1902 		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
1903 		return numdw
1904 
1905 	MATCH(STATE_BASE_ADDRESS);
1906 	MATCH(STATE_SIP);
1907 	MATCH(GPGPU_CSR_BASE_ADDRESS);
1908 	MATCH(STATE_COMPUTE_MODE);
1909 	MATCH3D(3DSTATE_BTD);
1910 	MATCH(STATE_SYSTEM_MEM_FENCE_ADDRESS);
1911 	MATCH(STATE_CONTEXT_DATA_BASE_ADDRESS);
1912 
1913 	MATCH3D(3DSTATE_VF_STATISTICS);
1914 
1915 	MATCH(PIPELINE_SELECT);
1916 
1917 	MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST);
1918 	MATCH3D(3DSTATE_CLEAR_PARAMS);
1919 	MATCH3D(3DSTATE_DEPTH_BUFFER);
1920 	MATCH3D(3DSTATE_STENCIL_BUFFER);
1921 	MATCH3D(3DSTATE_HIER_DEPTH_BUFFER);
1922 	MATCH3D(3DSTATE_VERTEX_BUFFERS);
1923 	MATCH3D(3DSTATE_VERTEX_ELEMENTS);
1924 	MATCH3D(3DSTATE_INDEX_BUFFER);
1925 	MATCH3D(3DSTATE_VF);
1926 	MATCH3D(3DSTATE_MULTISAMPLE);
1927 	MATCH3D(3DSTATE_CC_STATE_POINTERS);
1928 	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS);
1929 	MATCH3D(3DSTATE_VS);
1930 	MATCH3D(3DSTATE_GS);
1931 	MATCH3D(3DSTATE_CLIP);
1932 	MATCH3D(3DSTATE_SF);
1933 	MATCH3D(3DSTATE_WM);
1934 	MATCH3D(3DSTATE_CONSTANT_VS);
1935 	MATCH3D(3DSTATE_CONSTANT_GS);
1936 	MATCH3D(3DSTATE_CONSTANT_PS);
1937 	MATCH3D(3DSTATE_SAMPLE_MASK);
1938 	MATCH3D(3DSTATE_CONSTANT_HS);
1939 	MATCH3D(3DSTATE_CONSTANT_DS);
1940 	MATCH3D(3DSTATE_HS);
1941 	MATCH3D(3DSTATE_TE);
1942 	MATCH3D(3DSTATE_DS);
1943 	MATCH3D(3DSTATE_STREAMOUT);
1944 	MATCH3D(3DSTATE_SBE);
1945 	MATCH3D(3DSTATE_PS);
1946 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
1947 	MATCH3D(3DSTATE_CPS_POINTERS);
1948 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC);
1949 	MATCH3D(3DSTATE_BLEND_STATE_POINTERS);
1950 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS);
1951 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS);
1952 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS);
1953 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS);
1954 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS);
1955 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS);
1956 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS);
1957 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS);
1958 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS);
1959 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS);
1960 	MATCH3D(3DSTATE_VF_INSTANCING);
1961 	MATCH3D(3DSTATE_VF_SGVS);
1962 	MATCH3D(3DSTATE_VF_TOPOLOGY);
1963 	MATCH3D(3DSTATE_WM_CHROMAKEY);
1964 	MATCH3D(3DSTATE_PS_BLEND);
1965 	MATCH3D(3DSTATE_WM_DEPTH_STENCIL);
1966 	MATCH3D(3DSTATE_PS_EXTRA);
1967 	MATCH3D(3DSTATE_RASTER);
1968 	MATCH3D(3DSTATE_SBE_SWIZ);
1969 	MATCH3D(3DSTATE_WM_HZ_OP);
1970 	MATCH3D(3DSTATE_VF_COMPONENT_PACKING);
1971 	MATCH3D(3DSTATE_VF_SGVS_2);
1972 	MATCH3D(3DSTATE_VFG);
1973 	MATCH3D(3DSTATE_URB_ALLOC_VS);
1974 	MATCH3D(3DSTATE_URB_ALLOC_HS);
1975 	MATCH3D(3DSTATE_URB_ALLOC_DS);
1976 	MATCH3D(3DSTATE_URB_ALLOC_GS);
1977 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_0);
1978 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_1);
1979 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_2);
1980 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_3);
1981 	MATCH3D(3DSTATE_PRIMITIVE_REPLICATION);
1982 	MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO);
1983 	MATCH3D(3DSTATE_AMFS);
1984 	MATCH3D(3DSTATE_DEPTH_BOUNDS);
1985 	MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS);
1986 	MATCH3D(3DSTATE_CONSTANT_TS_POINTER);
1987 	MATCH3D(3DSTATE_MESH_CONTROL);
1988 	MATCH3D(3DSTATE_MESH_DISTRIB);
1989 	MATCH3D(3DSTATE_TASK_REDISTRIB);
1990 	MATCH3D(3DSTATE_MESH_SHADER);
1991 	MATCH3D(3DSTATE_MESH_SHADER_DATA);
1992 	MATCH3D(3DSTATE_TASK_CONTROL);
1993 	MATCH3D(3DSTATE_TASK_SHADER);
1994 	MATCH3D(3DSTATE_TASK_SHADER_DATA);
1995 	MATCH3D(3DSTATE_URB_ALLOC_MESH);
1996 	MATCH3D(3DSTATE_URB_ALLOC_TASK);
1997 	MATCH3D(3DSTATE_CLIP_MESH);
1998 	MATCH3D(3DSTATE_SBE_MESH);
1999 	MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER);
2000 	MATCH3D(3DSTATE_COARSE_PIXEL);
2001 
2002 	MATCH3D(3DSTATE_DRAWING_RECTANGLE);
2003 	MATCH3D(3DSTATE_CHROMA_KEY);
2004 	MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET);
2005 	MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN);
2006 	MATCH3D(3DSTATE_LINE_STIPPLE);
2007 	MATCH3D(3DSTATE_AA_LINE_PARAMETERS);
2008 	MATCH3D(3DSTATE_MONOFILTER_SIZE);
2009 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS);
2010 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS);
2011 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS);
2012 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS);
2013 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS);
2014 	MATCH3D(3DSTATE_SO_DECL_LIST);
2015 	MATCH3D(3DSTATE_SO_BUFFER);
2016 	MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC);
2017 	MATCH3D(3DSTATE_SAMPLE_PATTERN);
2018 	MATCH3D(3DSTATE_3D_MODE);
2019 	MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE);
2020 	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS);
2021 	MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO);
2022 
2023 	default:
2024 		drm_printf(p, "[%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n",
2025 			   *dw, pipeline, opcode, subopcode, numdw);
2026 		return numdw;
2027 	}
2028 }
2029 
2030 static int dump_gfx_state_command(struct drm_printer *p,
2031 				  struct xe_gt *gt,
2032 				  u32 *dw,
2033 				  int remaining_dw)
2034 {
2035 	u32 numdw = instr_dw(*dw);
2036 	u32 opcode = REG_FIELD_GET(GFX_STATE_OPCODE, *dw);
2037 
2038 	/*
2039 	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
2040 	 * remaining size of the LRC.
2041 	 */
2042 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
2043 		numdw = remaining_dw;
2044 
2045 	switch (*dw & (XE_INSTR_GFX_STATE | GFX_STATE_OPCODE)) {
2046 	MATCH(STATE_WRITE_INLINE);
2047 
2048 	default:
2049 		drm_printf(p, "[%#010x] unknown GFX_STATE command (opcode=%#x), likely %d dwords\n",
2050 			   *dw, opcode, numdw);
2051 		return numdw;
2052 	}
2053 }
2054 
2055 void xe_lrc_dump_default(struct drm_printer *p,
2056 			 struct xe_gt *gt,
2057 			 enum xe_engine_class hwe_class)
2058 {
2059 	u32 *dw;
2060 	int remaining_dw, num_dw;
2061 
2062 	if (!gt->default_lrc[hwe_class]) {
2063 		drm_printf(p, "No default LRC for class %d\n", hwe_class);
2064 		return;
2065 	}
2066 
2067 	/*
2068 	 * Skip the beginning of the LRC since it contains the per-process
2069 	 * hardware status page.
2070 	 */
2071 	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
2072 	remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;
2073 
2074 	while (remaining_dw > 0) {
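		/* Dispatch on the command type field of the instruction header */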
2075 		if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
2076 			num_dw = dump_mi_command(p, gt, dw, remaining_dw);
2077 		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
2078 			num_dw = dump_gfxpipe_command(p, gt, dw, remaining_dw);
2079 		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFX_STATE) {
2080 			num_dw = dump_gfx_state_command(p, gt, dw, remaining_dw);
2081 		} else {
2082 			num_dw = min(instr_dw(*dw), remaining_dw);
2083 			drm_printf(p, "[%#010x] Unknown instruction of type %#x, likely %d dwords\n",
2084 				   *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw),
2085 				   num_dw);
2086 		}
2087 
2088 		dw += num_dw;
2089 		remaining_dw -= num_dw;
2090 	}
2091 }
2092 
2093 struct instr_state {
2094 	u32 instr;
2095 	u16 num_dw;
2096 };
2097 
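/*
 * SVG state instructions and their dword counts, emitted into the default
 * LRC by xe_lrc_emit_hwe_state_instructions() for Wa_14019789679.
 */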
2098 static const struct instr_state xe_hpg_svg_state[] = {
2099 	{ .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 },
2100 	{ .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 },
2101 	{ .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 },
2102 	{ .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 },
2103 	{ .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 },
2104 	{ .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 },
2105 	{ .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 },
2106 	{ .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 },
2107 	{ .instr = CMD_3DSTATE_VS, .num_dw = 9 },
2108 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 },
2109 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 },
2110 	{ .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 },
2111 	{ .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 },
2112 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 },
2113 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 },
2114 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 },
2115 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 },
2116 	{ .instr = CMD_3DSTATE_CLIP, .num_dw = 4 },
2117 	{ .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 },
2118 	{ .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 },
2119 	{ .instr = CMD_3DSTATE_SF, .num_dw = 4 },
2120 	{ .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 },
2121 	{ .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 },
2122 	{ .instr = CMD_3DSTATE_RASTER, .num_dw = 5 },
2123 	{ .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 },
2124 	{ .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 },
2125 	{ .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 },
2126 	{ .instr = CMD_3DSTATE_HS, .num_dw = 9 },
2127 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 },
2128 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 },
2129 	{ .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 },
2130 	{ .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 },
2131 	{ .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 },
2132 	{ .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 },
2133 	{ .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 },
2134 	{ .instr = CMD_3DSTATE_TE, .num_dw = 5 },
2135 	{ .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 },
2136 	{ .instr = CMD_3DSTATE_DS, .num_dw = 11 },
2137 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
2138 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
2139 	{ .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
2140 	{ .instr = CMD_3DSTATE_GS, .num_dw = 10 },
2141 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
2142 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
2143 	{ .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
2144 	{ .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
2145 	{ .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
2146 	{ .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
2147 	{ .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
2148 	{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
2149 };
2150 
2151 u32 *xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, u32 *cs)
2152 {
2153 	struct xe_gt *gt = q->hwe->gt;
2154 	struct xe_device *xe = gt_to_xe(gt);
2155 	const struct instr_state *state_table = NULL;
2156 	int state_table_size = 0;
2157 
2158 	/*
2159 	 * Wa_14019789679
2160 	 *
2161 	 * If the driver doesn't explicitly emit the SVG instructions while
2162 	 * setting up the default LRC, the context switch will write 0's
2163 	 * (noops) into the LRC memory rather than the expected instruction
2164 	 * headers.  Application contexts start out as a copy of the default
2165 	 * LRC, and if they also do not emit specific settings for some SVG
2166 	 * state, then on context restore they'll unintentionally inherit
2167 	 * whatever state setting the previous context had programmed into the
2168 	 * hardware (i.e., the lack of a 3DSTATE_* instruction in the LRC will
2169 	 * prevent the hardware from resetting that state back to any specific
2170 	 * value).
2171 	 *
2172 	 * The official workaround only requires emitting 3DSTATE_MESH_CONTROL
2173 	 * since that's a specific state setting that can easily cause GPU
2174  * hangs if unintentionally inherited.  However, to be safe, we'll
2175 	 * continue to emit all of the SVG state since it's best not to leak
2176 	 * any of the state between contexts, even if that leakage is harmless.
2177 	 */
2178 	if (XE_GT_WA(gt, 14019789679) && q->hwe->class == XE_ENGINE_CLASS_RENDER) {
2179 		state_table = xe_hpg_svg_state;
2180 		state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
2181 	}
2182 
2183 	if (!state_table) {
2184 		xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
2185 			  GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
2186 		return cs;
2187 	}
2188 
2189 	for (int i = 0; i < state_table_size; i++) {
2190 		u32 instr = state_table[i].instr;
2191 		u16 num_dw = state_table[i].num_dw;
2192 		bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);
2193 
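		/*
		 * Sanity check the table entry: it must be a GFXPIPE
		 * instruction, and single-dword opcodes must list exactly one
		 * dword while all others must list more than one.
		 */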
2194 		xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
2195 		xe_gt_assert(gt, num_dw != 0);
2196 		xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));
2197 
2198 		/*
2199 		 * Xe2's SVG context is the same as the one on DG2 / MTL
2200 		 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
2201 		 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
2202 		 * Just make the replacement here rather than defining a
2203 		 * whole separate table for the single trivial change.
2204 		 */
2205 		if (GRAPHICS_VER(xe) >= 20 &&
2206 		    instr == CMD_3DSTATE_DRAWING_RECTANGLE)
2207 			instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;
2208 
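		/*
		 * Emit the instruction header.  GFXPIPE dword-length fields
		 * are biased by two, so multi-dword instructions encode
		 * (num_dw - 2); single-dword instructions have no length
		 * field.
		 */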
2209 		*cs = instr;
2210 		if (!is_single_dw)
2211 			*cs |= (num_dw - 2);
2212 
2213 		cs += num_dw;
2214 	}
2215 
2216 	return cs;
2217 }
2218 
2219 struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
2220 {
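	/*
	 * Use a non-blocking allocation here; the bulk copy of the LRC
	 * contents is deferred to xe_lrc_snapshot_capture_delayed().
	 */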
2221 	struct xe_lrc_snapshot *snapshot = kmalloc(sizeof(*snapshot), GFP_NOWAIT);
2222 
2223 	if (!snapshot)
2224 		return NULL;
2225 
2226 	snapshot->context_desc = xe_lrc_ggtt_addr(lrc);
2227 	snapshot->ring_addr = __xe_lrc_ring_ggtt_addr(lrc);
2228 	snapshot->indirect_context_desc = xe_lrc_indirect_ring_ggtt_addr(lrc);
2229 	snapshot->head = xe_lrc_ring_head(lrc);
2230 	snapshot->tail.internal = lrc->ring.tail;
2231 	snapshot->tail.memory = xe_lrc_ring_tail(lrc);
2232 	snapshot->start = xe_lrc_ring_start(lrc);
2233 	snapshot->start_seqno = xe_lrc_start_seqno(lrc);
2234 	snapshot->seqno = xe_lrc_seqno(lrc);
2235 	snapshot->lrc_bo = xe_bo_get(lrc->bo);
2236 	snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
2237 	snapshot->lrc_size = lrc->size;
2238 	snapshot->lrc_snapshot = NULL;
2239 	snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc));
2240 	snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc);
2241 	return snapshot;
2242 }
2243 
2244 void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot)
2245 {
2246 	struct xe_bo *bo;
2247 	struct iosys_map src;
2248 
2249 	if (!snapshot)
2250 		return;
2251 
2252 	bo = snapshot->lrc_bo;
2253 	snapshot->lrc_bo = NULL;
2254 
2255 	snapshot->lrc_snapshot = kvmalloc(snapshot->lrc_size, GFP_KERNEL);
2256 	if (!snapshot->lrc_snapshot)
2257 		goto put_bo;
2258 
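	/*
	 * Map the LRC BO and copy out its contents; on mapping failure drop
	 * the partially-built snapshot.
	 */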
2259 	xe_bo_lock(bo, false);
2260 	if (!ttm_bo_vmap(&bo->ttm, &src)) {
2261 		xe_map_memcpy_from(xe_bo_device(bo),
2262 				   snapshot->lrc_snapshot, &src, snapshot->lrc_offset,
2263 				   snapshot->lrc_size);
2264 		ttm_bo_vunmap(&bo->ttm, &src);
2265 	} else {
2266 		kvfree(snapshot->lrc_snapshot);
2267 		snapshot->lrc_snapshot = NULL;
2268 	}
2269 	xe_bo_unlock(bo);
2270 put_bo:
2271 	xe_bo_put(bo);
2272 }
2273 
2274 void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer *p)
2275 {
2276 	unsigned long i;
2277 
2278 	if (!snapshot)
2279 		return;
2280 
2281 	drm_printf(p, "\tHW Context Desc: 0x%08x\n", snapshot->context_desc);
2282 	drm_printf(p, "\tHW Ring address: 0x%08x\n",
2283 		   snapshot->ring_addr);
2284 	drm_printf(p, "\tHW Indirect Ring State: 0x%08x\n",
2285 		   snapshot->indirect_context_desc);
2286 	drm_printf(p, "\tLRC Head: (memory) %u\n", snapshot->head);
2287 	drm_printf(p, "\tLRC Tail: (internal) %u, (memory) %u\n",
2288 		   snapshot->tail.internal, snapshot->tail.memory);
2289 	drm_printf(p, "\tRing start: (memory) 0x%08x\n", snapshot->start);
2290 	drm_printf(p, "\tStart seqno: (memory) %d\n", snapshot->start_seqno);
2291 	drm_printf(p, "\tSeqno: (memory) %d\n", snapshot->seqno);
2292 	drm_printf(p, "\tTimestamp: 0x%08x\n", snapshot->ctx_timestamp);
2293 	drm_printf(p, "\tJob Timestamp: 0x%08x\n", snapshot->ctx_job_timestamp);
2294 
2295 	if (!snapshot->lrc_snapshot)
2296 		return;
2297 
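	/* Dump the PPHWSP followed by the context image as ASCII85 data */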
2298 	drm_printf(p, "\t[HWSP].length: 0x%x\n", LRC_PPHWSP_SIZE);
2299 	drm_puts(p, "\t[HWSP].data: ");
2300 	for (i = 0; i < LRC_PPHWSP_SIZE; i += sizeof(u32)) {
2301 		u32 *val = snapshot->lrc_snapshot + i;
2302 		char dumped[ASCII85_BUFSZ];
2303 
2304 		drm_puts(p, ascii85_encode(*val, dumped));
2305 	}
2306 
2307 	drm_printf(p, "\n\t[HWCTX].length: 0x%lx\n", snapshot->lrc_size - LRC_PPHWSP_SIZE);
2308 	drm_puts(p, "\t[HWCTX].data: ");
2309 	for (; i < snapshot->lrc_size; i += sizeof(u32)) {
2310 		u32 *val = snapshot->lrc_snapshot + i;
2311 		char dumped[ASCII85_BUFSZ];
2312 
2313 		drm_puts(p, ascii85_encode(*val, dumped));
2314 	}
2315 	drm_puts(p, "\n");
2316 }
2317 
2318 void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
2319 {
2320 	if (!snapshot)
2321 		return;
2322 
2323 	kvfree(snapshot->lrc_snapshot);
2324 	if (snapshot->lrc_bo)
2325 		xe_bo_put(snapshot->lrc_bo);
2326 
2327 	kfree(snapshot);
2328 }
2329 
2330 static int get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id, u64 *reg_ctx_ts)
2331 {
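	/*
	 * Decode the engine class/instance recorded in the LRC and read that
	 * engine's RING_CTX_TIMESTAMP register.
	 */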
2332 	u16 class = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id);
2333 	u16 instance = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id);
2334 	struct xe_hw_engine *hwe;
2335 	u64 val;
2336 
2337 	hwe = xe_gt_hw_engine(lrc->gt, class, instance, false);
2338 	if (xe_gt_WARN_ONCE(lrc->gt, !hwe || xe_hw_engine_is_reserved(hwe),
2339 			    "Unexpected engine class:instance %d:%d for context utilization\n",
2340 			    class, instance))
2341 		return -1;
2342 
2343 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
2344 		val = xe_mmio_read64_2x32(&hwe->gt->mmio,
2345 					  RING_CTX_TIMESTAMP(hwe->mmio_base));
2346 	else
2347 		val = xe_mmio_read32(&hwe->gt->mmio,
2348 				     RING_CTX_TIMESTAMP(hwe->mmio_base));
2349 
2350 	*reg_ctx_ts = val;
2351 
2352 	return 0;
2353 }
2354 
2355 /**
2356  * xe_lrc_update_timestamp() - Update ctx timestamp
2357  * @lrc: Pointer to the lrc.
2358  * @old_ts: Old timestamp value
2359  *
2360  * Populate @old_ts with the current saved ctx timestamp, read the new ctx
2361  * timestamp and update the saved value. With support for active contexts,
2362  * the calculation may be slightly racy, so re-read the LRC value to ensure
2363  * the context is still active before returning the timestamp.
2364  *
2365  * Return: New ctx timestamp value
2366  */
2367 u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts)
2368 {
2369 	u64 lrc_ts, reg_ts;
2370 	u32 engine_id;
2371 
2372 	*old_ts = lrc->ctx_timestamp;
2373 
2374 	lrc_ts = xe_lrc_ctx_timestamp(lrc);
2375 	/* CTX_TIMESTAMP mmio read is invalid on VF, so return the LRC value */
2376 	if (IS_SRIOV_VF(lrc_to_xe(lrc))) {
2377 		lrc->ctx_timestamp = lrc_ts;
2378 		goto done;
2379 	}
2380 
2381 	if (lrc_ts == CONTEXT_ACTIVE) {
2382 		engine_id = xe_lrc_engine_id(lrc);
2383 		if (!get_ctx_timestamp(lrc, engine_id, &reg_ts))
2384 			lrc->ctx_timestamp = reg_ts;
2385 
2386 		/* read lrc again to ensure context is still active */
2387 		lrc_ts = xe_lrc_ctx_timestamp(lrc);
2388 	}
2389 
2390 	/*
2391 	 * If the context switched out, just use lrc_ts. This must be a separate
2392 	 * if-condition since the context may have switched out during the read above.
2393 	 */
2394 	if (lrc_ts != CONTEXT_ACTIVE)
2395 		lrc->ctx_timestamp = lrc_ts;
2396 
2397 done:
2398 	trace_xe_lrc_update_timestamp(lrc, *old_ts);
2399 
2400 	return lrc->ctx_timestamp;
2401 }
2402 
2403 /**
2404  * xe_lrc_ring_is_idle() - Check if the LRC ring is idle
2405  * @lrc: Pointer to the lrc.
2406  *
2407  * Compare the LRC ring head and tail to determine whether the ring is idle.
2408  *
2409  * Return: True if the ring is idle, False otherwise
2410  */
2411 bool xe_lrc_ring_is_idle(struct xe_lrc *lrc)
2412 {
2413 	return xe_lrc_ring_head(lrc) == xe_lrc_ring_tail(lrc);
2414 }
2415