xref: /linux/drivers/gpu/drm/xe/xe_lrc.c (revision 68a052239fc4b351e961f698b824f7654a346091)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5 
6 #include "xe_lrc.h"
7 
8 #include <generated/xe_wa_oob.h>
9 
10 #include <linux/ascii85.h>
11 #include <linux/panic.h>
12 
13 #include "instructions/xe_mi_commands.h"
14 #include "instructions/xe_gfxpipe_commands.h"
15 #include "instructions/xe_gfx_state_commands.h"
16 #include "regs/xe_engine_regs.h"
17 #include "regs/xe_lrc_layout.h"
18 #include "xe_bb.h"
19 #include "xe_bo.h"
20 #include "xe_configfs.h"
21 #include "xe_device.h"
22 #include "xe_drm_client.h"
23 #include "xe_exec_queue_types.h"
24 #include "xe_gt.h"
25 #include "xe_gt_printk.h"
26 #include "xe_hw_fence.h"
27 #include "xe_map.h"
28 #include "xe_memirq.h"
29 #include "xe_mmio.h"
30 #include "xe_sriov.h"
31 #include "xe_trace_lrc.h"
32 #include "xe_vm.h"
33 #include "xe_wa.h"
34 
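/*
 * Note (editorial, grounded in this file): the fields below are the
 * software-built portion of the 64-bit context descriptor;
 * xe_lrc_descriptor() combines them with the LRC's GGTT address (see
 * xe_lrc_ggtt_addr()) to form the full descriptor value.
 */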
35 #define LRC_VALID				BIT_ULL(0)
36 #define LRC_PRIVILEGE				BIT_ULL(8)
37 #define LRC_ADDRESSING_MODE			GENMASK_ULL(4, 3)
38 #define LRC_LEGACY_64B_CONTEXT			3
39 
40 #define LRC_ENGINE_CLASS			GENMASK_ULL(63, 61)
41 #define LRC_ENGINE_INSTANCE			GENMASK_ULL(53, 48)
42 
43 #define LRC_PPHWSP_SIZE				SZ_4K
44 #define LRC_INDIRECT_CTX_BO_SIZE		SZ_4K
45 #define LRC_INDIRECT_RING_STATE_SIZE		SZ_4K
46 
47 /*
48  * Layout of the LRC and associated data allocated as
49  * lrc->bo:
50  *
51  *   Region                       Size
52  *  +============================+=================================+ <- __xe_lrc_ring_offset()
53  *  | Ring                       | ring_size, see                  |
54  *  |                            | xe_lrc_init()                   |
55  *  +============================+=================================+ <- __xe_lrc_pphwsp_offset()
56  *  | PPHWSP (includes SW state) | 4K                              |
57  *  +----------------------------+---------------------------------+ <- __xe_lrc_regs_offset()
58  *  | Engine Context Image       | n * 4K, see                     |
59  *  |                            | xe_gt_lrc_size()                |
60  *  +----------------------------+---------------------------------+ <- __xe_lrc_indirect_ring_offset()
61  *  | Indirect Ring State Page   | 0 or 4K, see                    |
62  *  |                            | XE_LRC_FLAG_INDIRECT_RING_STATE |
63  *  +============================+=================================+ <- __xe_lrc_indirect_ctx_offset()
64  *  | Indirect Context Page      | 0 or 4K, see                    |
65  *  |                            | XE_LRC_FLAG_INDIRECT_CTX        |
66  *  +============================+=================================+ <- __xe_lrc_wa_bb_offset()
67  *  | WA BB Per Ctx              | 4K                              |
68  *  +============================+=================================+ <- xe_bo_size(lrc->bo)
69  */
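/*
 * Worked example (illustrative only; actual sizes depend on the platform and
 * on the ring size chosen by the caller): assuming a 16K ring on an Xe2
 * render engine with indirect ring state but no indirect context page, and
 * assuming LRC_WA_BB_SIZE is 4K, the BO is 16K + 20K (xe_gt_lrc_size()) +
 * 4K = 40K, giving: ring at 0x0, PPHWSP at 0x4000, engine context image at
 * 0x5000, indirect ring state at 0x8000 and the WA BB at 0x9000.
 */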
70 
71 static struct xe_device *
72 lrc_to_xe(struct xe_lrc *lrc)
73 {
74 	return gt_to_xe(lrc->fence_ctx.gt);
75 }
76 
77 static bool
78 gt_engine_needs_indirect_ctx(struct xe_gt *gt, enum xe_engine_class class)
79 {
80 	struct xe_device *xe = gt_to_xe(gt);
81 
82 	if (XE_GT_WA(gt, 16010904313) &&
83 	    (class == XE_ENGINE_CLASS_RENDER ||
84 	     class == XE_ENGINE_CLASS_COMPUTE))
85 		return true;
86 
87 	if (xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
88 					       class, NULL))
89 		return true;
90 
91 	return false;
92 }
93 
94 size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
95 {
96 	struct xe_device *xe = gt_to_xe(gt);
97 	size_t size;
98 
99 	/* Per-process HW status page (PPHWSP) */
100 	size = LRC_PPHWSP_SIZE;
101 
102 	/* Engine context image */
103 	switch (class) {
104 	case XE_ENGINE_CLASS_RENDER:
105 		if (GRAPHICS_VER(xe) >= 20)
106 			size += 3 * SZ_4K;
107 		else
108 			size += 13 * SZ_4K;
109 		break;
110 	case XE_ENGINE_CLASS_COMPUTE:
111 		if (GRAPHICS_VER(xe) >= 20)
112 			size += 2 * SZ_4K;
113 		else
114 			size += 13 * SZ_4K;
115 		break;
116 	default:
117 		WARN(1, "Unknown engine class: %d", class);
118 		fallthrough;
119 	case XE_ENGINE_CLASS_COPY:
120 	case XE_ENGINE_CLASS_VIDEO_DECODE:
121 	case XE_ENGINE_CLASS_VIDEO_ENHANCE:
122 	case XE_ENGINE_CLASS_OTHER:
123 		size += 1 * SZ_4K;
124 	}
125 
126 	/* Add indirect ring state page */
127 	if (xe_gt_has_indirect_ring_state(gt))
128 		size += LRC_INDIRECT_RING_STATE_SIZE;
129 
130 	return size;
131 }
132 
133 /*
134  * The per-platform tables are u8-encoded in @data. Decode @data and set the
135  * register offsets and commands in @regs. The following encoding is used for
136  * each byte. There are two steps: decoding commands and decoding addresses.
137  *
138  * Commands:
139  * [7]: create NOPs - the number of NOPs is encoded in the lower bits
140  * [6]: when creating an MI_LOAD_REGISTER_IMM command, allows setting
141  *      MI_LRI_FORCE_POSTED
142  * [5:0]: number of NOPs, or number of registers to set values for in the
143  *        MI_LOAD_REGISTER_IMM case
144  *
145  * Addresses: these are decoded after an MI_LOAD_REGISTER_IMM command, for
146  * "count" registers. They are set using the REG/REG16 macros: the former is
147  * used for offsets smaller than 0x200 while the latter is for values bigger
148  * than that. Those macros already set all the bits documented below correctly:
149  *
150  * [7]: when set, the offset does not fit in this byte alone and additional
151  *      bytes follow, carrying the lower bits
152  * [6:0]: register offset, without considering the engine base.
153  *
154  * This function only tweaks the commands and register offsets. Values are not
155  * filled out.
156  */
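/*
 * Worked example (illustrative): a table containing NOP(1), LRI(2, POSTED),
 * REG(0x034), REG16(0x2b4), 0 makes set_offsets() skip regs[0], write an
 * MI_LOAD_REGISTER_IMM header for two registers (with MI_LRI_FORCE_POSTED
 * and MI_LRI_LRM_CS_MMIO) at regs[1], store "mmio_base + 0x34" at regs[2]
 * and "mmio_base + 0x2b4" at regs[4], leave regs[3] and regs[5] untouched
 * for the values, and finally terminate with MI_BATCH_BUFFER_END at regs[6].
 */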
157 static void set_offsets(u32 *regs,
158 			const u8 *data,
159 			const struct xe_hw_engine *hwe)
160 #define NOP(x) (BIT(7) | (x))
161 #define LRI(count, flags) ((flags) << 6 | (count) | \
162 			   BUILD_BUG_ON_ZERO(count >= BIT(6)))
163 #define POSTED BIT(0)
164 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
165 #define REG16(x) \
166 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
167 	(((x) >> 2) & 0x7f)
168 {
169 	const u32 base = hwe->mmio_base;
170 
171 	while (*data) {
172 		u8 count, flags;
173 
174 		if (*data & BIT(7)) { /* skip */
175 			count = *data++ & ~BIT(7);
176 			regs += count;
177 			continue;
178 		}
179 
180 		count = *data & 0x3f;
181 		flags = *data >> 6;
182 		data++;
183 
184 		*regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
185 		if (flags & POSTED)
186 			*regs |= MI_LRI_FORCE_POSTED;
187 		*regs |= MI_LRI_LRM_CS_MMIO;
188 		regs++;
189 
190 		xe_gt_assert(hwe->gt, count);
191 		do {
192 			u32 offset = 0;
193 			u8 v;
194 
195 			do {
196 				v = *data++;
197 				offset <<= 7;
198 				offset |= v & ~BIT(7);
199 			} while (v & BIT(7));
200 
201 			regs[0] = base + (offset << 2);
202 			regs += 2;
203 		} while (--count);
204 	}
205 
206 	*regs = MI_BATCH_BUFFER_END | BIT(0);
207 }
208 
209 static const u8 gen12_xcs_offsets[] = {
210 	NOP(1),
211 	LRI(13, POSTED),
212 	REG16(0x244),
213 	REG(0x034),
214 	REG(0x030),
215 	REG(0x038),
216 	REG(0x03c),
217 	REG(0x168),
218 	REG(0x140),
219 	REG(0x110),
220 	REG(0x1c0),
221 	REG(0x1c4),
222 	REG(0x1c8),
223 	REG(0x180),
224 	REG16(0x2b4),
225 
226 	NOP(5),
227 	LRI(9, POSTED),
228 	REG16(0x3a8),
229 	REG16(0x28c),
230 	REG16(0x288),
231 	REG16(0x284),
232 	REG16(0x280),
233 	REG16(0x27c),
234 	REG16(0x278),
235 	REG16(0x274),
236 	REG16(0x270),
237 
238 	0
239 };
240 
241 static const u8 dg2_xcs_offsets[] = {
242 	NOP(1),
243 	LRI(15, POSTED),
244 	REG16(0x244),
245 	REG(0x034),
246 	REG(0x030),
247 	REG(0x038),
248 	REG(0x03c),
249 	REG(0x168),
250 	REG(0x140),
251 	REG(0x110),
252 	REG(0x1c0),
253 	REG(0x1c4),
254 	REG(0x1c8),
255 	REG(0x180),
256 	REG16(0x2b4),
257 	REG(0x120),
258 	REG(0x124),
259 
260 	NOP(1),
261 	LRI(9, POSTED),
262 	REG16(0x3a8),
263 	REG16(0x28c),
264 	REG16(0x288),
265 	REG16(0x284),
266 	REG16(0x280),
267 	REG16(0x27c),
268 	REG16(0x278),
269 	REG16(0x274),
270 	REG16(0x270),
271 
272 	0
273 };
274 
275 static const u8 gen12_rcs_offsets[] = {
276 	NOP(1),
277 	LRI(13, POSTED),
278 	REG16(0x244),
279 	REG(0x034),
280 	REG(0x030),
281 	REG(0x038),
282 	REG(0x03c),
283 	REG(0x168),
284 	REG(0x140),
285 	REG(0x110),
286 	REG(0x1c0),
287 	REG(0x1c4),
288 	REG(0x1c8),
289 	REG(0x180),
290 	REG16(0x2b4),
291 
292 	NOP(5),
293 	LRI(9, POSTED),
294 	REG16(0x3a8),
295 	REG16(0x28c),
296 	REG16(0x288),
297 	REG16(0x284),
298 	REG16(0x280),
299 	REG16(0x27c),
300 	REG16(0x278),
301 	REG16(0x274),
302 	REG16(0x270),
303 
304 	LRI(3, POSTED),
305 	REG(0x1b0),
306 	REG16(0x5a8),
307 	REG16(0x5ac),
308 
309 	NOP(6),
310 	LRI(1, 0),
311 	REG(0x0c8),
312 	NOP(3 + 9 + 1),
313 
314 	LRI(51, POSTED),
315 	REG16(0x588),
316 	REG16(0x588),
317 	REG16(0x588),
318 	REG16(0x588),
319 	REG16(0x588),
320 	REG16(0x588),
321 	REG(0x028),
322 	REG(0x09c),
323 	REG(0x0c0),
324 	REG(0x178),
325 	REG(0x17c),
326 	REG16(0x358),
327 	REG(0x170),
328 	REG(0x150),
329 	REG(0x154),
330 	REG(0x158),
331 	REG16(0x41c),
332 	REG16(0x600),
333 	REG16(0x604),
334 	REG16(0x608),
335 	REG16(0x60c),
336 	REG16(0x610),
337 	REG16(0x614),
338 	REG16(0x618),
339 	REG16(0x61c),
340 	REG16(0x620),
341 	REG16(0x624),
342 	REG16(0x628),
343 	REG16(0x62c),
344 	REG16(0x630),
345 	REG16(0x634),
346 	REG16(0x638),
347 	REG16(0x63c),
348 	REG16(0x640),
349 	REG16(0x644),
350 	REG16(0x648),
351 	REG16(0x64c),
352 	REG16(0x650),
353 	REG16(0x654),
354 	REG16(0x658),
355 	REG16(0x65c),
356 	REG16(0x660),
357 	REG16(0x664),
358 	REG16(0x668),
359 	REG16(0x66c),
360 	REG16(0x670),
361 	REG16(0x674),
362 	REG16(0x678),
363 	REG16(0x67c),
364 	REG(0x068),
365 	REG(0x084),
366 	NOP(1),
367 
368 	0
369 };
370 
371 static const u8 xehp_rcs_offsets[] = {
372 	NOP(1),
373 	LRI(13, POSTED),
374 	REG16(0x244),
375 	REG(0x034),
376 	REG(0x030),
377 	REG(0x038),
378 	REG(0x03c),
379 	REG(0x168),
380 	REG(0x140),
381 	REG(0x110),
382 	REG(0x1c0),
383 	REG(0x1c4),
384 	REG(0x1c8),
385 	REG(0x180),
386 	REG16(0x2b4),
387 
388 	NOP(5),
389 	LRI(9, POSTED),
390 	REG16(0x3a8),
391 	REG16(0x28c),
392 	REG16(0x288),
393 	REG16(0x284),
394 	REG16(0x280),
395 	REG16(0x27c),
396 	REG16(0x278),
397 	REG16(0x274),
398 	REG16(0x270),
399 
400 	LRI(3, POSTED),
401 	REG(0x1b0),
402 	REG16(0x5a8),
403 	REG16(0x5ac),
404 
405 	NOP(6),
406 	LRI(1, 0),
407 	REG(0x0c8),
408 
409 	0
410 };
411 
412 static const u8 dg2_rcs_offsets[] = {
413 	NOP(1),
414 	LRI(15, POSTED),
415 	REG16(0x244),
416 	REG(0x034),
417 	REG(0x030),
418 	REG(0x038),
419 	REG(0x03c),
420 	REG(0x168),
421 	REG(0x140),
422 	REG(0x110),
423 	REG(0x1c0),
424 	REG(0x1c4),
425 	REG(0x1c8),
426 	REG(0x180),
427 	REG16(0x2b4),
428 	REG(0x120),
429 	REG(0x124),
430 
431 	NOP(1),
432 	LRI(9, POSTED),
433 	REG16(0x3a8),
434 	REG16(0x28c),
435 	REG16(0x288),
436 	REG16(0x284),
437 	REG16(0x280),
438 	REG16(0x27c),
439 	REG16(0x278),
440 	REG16(0x274),
441 	REG16(0x270),
442 
443 	LRI(3, POSTED),
444 	REG(0x1b0),
445 	REG16(0x5a8),
446 	REG16(0x5ac),
447 
448 	NOP(6),
449 	LRI(1, 0),
450 	REG(0x0c8),
451 
452 	0
453 };
454 
455 static const u8 mtl_rcs_offsets[] = {
456 	NOP(1),
457 	LRI(15, POSTED),
458 	REG16(0x244),
459 	REG(0x034),
460 	REG(0x030),
461 	REG(0x038),
462 	REG(0x03c),
463 	REG(0x168),
464 	REG(0x140),
465 	REG(0x110),
466 	REG(0x1c0),
467 	REG(0x1c4),
468 	REG(0x1c8),
469 	REG(0x180),
470 	REG16(0x2b4),
471 	REG(0x120),
472 	REG(0x124),
473 
474 	NOP(1),
475 	LRI(9, POSTED),
476 	REG16(0x3a8),
477 	REG16(0x28c),
478 	REG16(0x288),
479 	REG16(0x284),
480 	REG16(0x280),
481 	REG16(0x27c),
482 	REG16(0x278),
483 	REG16(0x274),
484 	REG16(0x270),
485 
486 	NOP(2),
487 	LRI(2, POSTED),
488 	REG16(0x5a8),
489 	REG16(0x5ac),
490 
491 	NOP(6),
492 	LRI(1, 0),
493 	REG(0x0c8),
494 
495 	0
496 };
497 
498 #define XE2_CTX_COMMON \
499 	NOP(1),                 /* [0x00] */ \
500 	LRI(15, POSTED),        /* [0x01] */ \
501 	REG16(0x244),           /* [0x02] CTXT_SR_CTL */ \
502 	REG(0x034),             /* [0x04] RING_BUFFER_HEAD */ \
503 	REG(0x030),             /* [0x06] RING_BUFFER_TAIL */ \
504 	REG(0x038),             /* [0x08] RING_BUFFER_START */ \
505 	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */ \
506 	REG(0x168),             /* [0x0c] BB_ADDR_UDW */ \
507 	REG(0x140),             /* [0x0e] BB_ADDR */ \
508 	REG(0x110),             /* [0x10] BB_STATE */ \
509 	REG(0x1c0),             /* [0x12] BB_PER_CTX_PTR */ \
510 	REG(0x1c4),             /* [0x14] RCS_INDIRECT_CTX */ \
511 	REG(0x1c8),             /* [0x16] RCS_INDIRECT_CTX_OFFSET */ \
512 	REG(0x180),             /* [0x18] CCID */ \
513 	REG16(0x2b4),           /* [0x1a] SEMAPHORE_TOKEN */ \
514 	REG(0x120),             /* [0x1c] PRT_BB_STATE */ \
515 	REG(0x124),             /* [0x1e] PRT_BB_STATE_UDW */ \
516 	\
517 	NOP(1),                 /* [0x20] */ \
518 	LRI(9, POSTED),         /* [0x21] */ \
519 	REG16(0x3a8),           /* [0x22] CTX_TIMESTAMP */ \
520 	REG16(0x3ac),           /* [0x24] CTX_TIMESTAMP_UDW */ \
521 	REG(0x108),             /* [0x26] INDIRECT_RING_STATE */ \
522 	REG16(0x284),           /* [0x28] dummy reg */ \
523 	REG16(0x280),           /* [0x2a] CS_ACC_CTR_THOLD */ \
524 	REG16(0x27c),           /* [0x2c] CS_CTX_SYS_PASID */ \
525 	REG16(0x278),           /* [0x2e] CS_CTX_ASID */ \
526 	REG16(0x274),           /* [0x30] PTBP_UDW */ \
527 	REG16(0x270)            /* [0x32] PTBP_LDW */
528 
529 static const u8 xe2_rcs_offsets[] = {
530 	XE2_CTX_COMMON,
531 
532 	NOP(2),                 /* [0x34] */
533 	LRI(2, POSTED),         /* [0x36] */
534 	REG16(0x5a8),           /* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
535 	REG16(0x5ac),           /* [0x39] PREEMPTION_STATUS */
536 
537 	NOP(6),                 /* [0x3b] */
538 	LRI(1, 0),              /* [0x41] */
539 	REG(0x0c8),             /* [0x42] R_PWR_CLK_STATE */
540 
541 	0
542 };
543 
544 static const u8 xe2_bcs_offsets[] = {
545 	XE2_CTX_COMMON,
546 
547 	NOP(4 + 8 + 1),         /* [0x34] */
548 	LRI(2, POSTED),         /* [0x41] */
549 	REG16(0x200),           /* [0x42] BCS_SWCTRL */
550 	REG16(0x204),           /* [0x44] BLIT_CCTL */
551 
552 	0
553 };
554 
555 static const u8 xe2_xcs_offsets[] = {
556 	XE2_CTX_COMMON,
557 
558 	0
559 };
560 
561 static const u8 xe2_indirect_ring_state_offsets[] = {
562 	NOP(1),                 /* [0x00] */
563 	LRI(5, POSTED),         /* [0x01] */
564 	REG(0x034),             /* [0x02] RING_BUFFER_HEAD */
565 	REG(0x030),             /* [0x04] RING_BUFFER_TAIL */
566 	REG(0x038),             /* [0x06] RING_BUFFER_START */
567 	REG(0x048),             /* [0x08] RING_BUFFER_START_UDW */
568 	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */
569 
570 	NOP(5),                 /* [0x0c] */
571 	LRI(9, POSTED),         /* [0x11] */
572 	REG(0x168),             /* [0x12] BB_ADDR_UDW */
573 	REG(0x140),             /* [0x14] BB_ADDR */
574 	REG(0x110),             /* [0x16] BB_STATE */
575 	REG16(0x588),           /* [0x18] BB_STACK_WRITE_PORT */
576 	REG16(0x588),           /* [0x1a] BB_STACK_WRITE_PORT */
577 	REG16(0x588),           /* [0x1c] BB_STACK_WRITE_PORT */
578 	REG16(0x588),           /* [0x1e] BB_STACK_WRITE_PORT */
579 	REG16(0x588),           /* [0x20] BB_STACK_WRITE_PORT */
580 	REG16(0x588),           /* [0x22] BB_STACK_WRITE_PORT */
581 
582 	NOP(12),                 /* [0x24] */
583 
584 	0
585 };
586 
587 #undef REG16
588 #undef REG
589 #undef LRI
590 #undef NOP
591 
592 static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
593 {
594 	if (class == XE_ENGINE_CLASS_RENDER) {
595 		if (GRAPHICS_VER(xe) >= 20)
596 			return xe2_rcs_offsets;
597 		else if (GRAPHICS_VERx100(xe) >= 1270)
598 			return mtl_rcs_offsets;
599 		else if (GRAPHICS_VERx100(xe) >= 1255)
600 			return dg2_rcs_offsets;
601 		else if (GRAPHICS_VERx100(xe) >= 1250)
602 			return xehp_rcs_offsets;
603 		else
604 			return gen12_rcs_offsets;
605 	} else if (class == XE_ENGINE_CLASS_COPY) {
606 		if (GRAPHICS_VER(xe) >= 20)
607 			return xe2_bcs_offsets;
608 		else
609 			return gen12_xcs_offsets;
610 	} else {
611 		if (GRAPHICS_VER(xe) >= 20)
612 			return xe2_xcs_offsets;
613 		else if (GRAPHICS_VERx100(xe) >= 1255)
614 			return dg2_xcs_offsets;
615 		else
616 			return gen12_xcs_offsets;
617 	}
618 }
619 
620 static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
621 {
622 	regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
623 						       CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
624 
625 	if (xe_gt_has_indirect_ring_state(hwe->gt))
626 		regs[CTX_CONTEXT_CONTROL] |=
627 			_MASKED_BIT_ENABLE(CTX_CTRL_INDIRECT_RING_STATE_ENABLE);
628 }
629 
630 static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe)
631 {
632 	struct xe_memirq *memirq = &gt_to_tile(hwe->gt)->memirq;
633 	struct xe_device *xe = gt_to_xe(hwe->gt);
634 	u8 num_regs;
635 
636 	if (!xe_device_uses_memirq(xe))
637 		return;
638 
639 	regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM |
640 					MI_LRI_LRM_CS_MMIO | MI_LRM_USE_GGTT;
641 	regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr;
642 	regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq);
643 
644 	num_regs = xe_device_has_msix(xe) ? 3 : 2;
645 	regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(num_regs) |
646 				       MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED;
647 	regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr;
648 	regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq, hwe);
649 	regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr;
650 	regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq, hwe);
651 
652 	if (xe_device_has_msix(xe)) {
653 		regs[CTX_CS_INT_VEC_REG] = CS_INT_VEC(0).addr;
654 		/* CTX_CS_INT_VEC_DATA will be set in xe_lrc_init */
655 	}
656 }
657 
658 static int lrc_ring_mi_mode(struct xe_hw_engine *hwe)
659 {
660 	struct xe_device *xe = gt_to_xe(hwe->gt);
661 
662 	if (GRAPHICS_VERx100(xe) >= 1250)
663 		return 0x70;
664 	else
665 		return 0x60;
666 }
667 
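/*
 * lrc_ring_mi_mode() returns the dword index of the RING_MI_MODE register
 * offset within the context image; the value dword follows it, hence the
 * "x + 1" below, which performs a masked-bit disable of STOP_RING.
 */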
668 static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe)
669 {
670 	int x;
671 
672 	x = lrc_ring_mi_mode(hwe);
673 	regs[x + 1] &= ~STOP_RING;
674 	regs[x + 1] |= STOP_RING << 16;
675 }
676 
677 static inline bool xe_lrc_has_indirect_ring_state(struct xe_lrc *lrc)
678 {
679 	return lrc->flags & XE_LRC_FLAG_INDIRECT_RING_STATE;
680 }
681 
682 static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
683 {
684 	return 0;
685 }
686 
687 u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
688 {
689 	return lrc->ring.size;
690 }
691 
692 /* Make the magic macros work */
693 #define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset
694 #define __xe_lrc_regs_offset xe_lrc_regs_offset
695 
696 #define LRC_SEQNO_PPHWSP_OFFSET 512
697 #define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
698 #define LRC_CTX_JOB_TIMESTAMP_OFFSET (LRC_START_SEQNO_PPHWSP_OFFSET + 8)
699 #define LRC_ENGINE_ID_PPHWSP_OFFSET 1024
700 #define LRC_PARALLEL_PPHWSP_OFFSET 2048
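/*
 * Resulting layout of the driver-defined portion of the PPHWSP, in bytes
 * from the start of the PPHWSP: seqno at 512, start seqno at 520, ctx job
 * timestamp at 528, engine id at 1024 and parallel data at 2048.
 */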
701 
702 u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
703 {
704 	return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
705 }
706 
707 /**
708  * xe_lrc_reg_size() - Get size of the LRC registers area within queues
709  * @xe: the &xe_device struct instance
710  *
711  * Returns: Size of the LRC registers area for current platform
712  */
713 size_t xe_lrc_reg_size(struct xe_device *xe)
714 {
715 	if (GRAPHICS_VERx100(xe) >= 1250)
716 		return 96 * sizeof(u32);
717 	else
718 		return 80 * sizeof(u32);
719 }
720 
721 size_t xe_lrc_skip_size(struct xe_device *xe)
722 {
723 	return LRC_PPHWSP_SIZE + xe_lrc_reg_size(xe);
724 }
725 
726 static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
727 {
728 	/* The seqno is stored in the driver-defined portion of PPHWSP */
729 	return xe_lrc_pphwsp_offset(lrc) + LRC_SEQNO_PPHWSP_OFFSET;
730 }
731 
732 static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
733 {
734 	/* The start seqno is stored in the driver-defined portion of PPHWSP */
735 	return xe_lrc_pphwsp_offset(lrc) + LRC_START_SEQNO_PPHWSP_OFFSET;
736 }
737 
738 static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc)
739 {
740 	/* This is stored in the driver-defined portion of PPHWSP */
741 	return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET;
742 }
743 
744 static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
745 {
746 	/* The parallel data is stored in the driver-defined portion of PPHWSP */
747 	return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
748 }
749 
750 static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc)
751 {
752 	return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET;
753 }
754 
755 static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
756 {
757 	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
758 }
759 
760 static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc)
761 {
762 	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32);
763 }
764 
765 static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
766 {
767 	u32 offset = xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE -
768 		     LRC_INDIRECT_RING_STATE_SIZE;
769 
770 	if (lrc->flags & XE_LRC_FLAG_INDIRECT_CTX)
771 		offset -= LRC_INDIRECT_CTX_BO_SIZE;
772 
773 	return offset;
774 }
775 
776 static inline u32 __xe_lrc_indirect_ctx_offset(struct xe_lrc *lrc)
777 {
778 	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE - LRC_INDIRECT_CTX_BO_SIZE;
779 }
780 
781 static inline u32 __xe_lrc_wa_bb_offset(struct xe_lrc *lrc)
782 {
783 	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE;
784 }
785 
786 #define DECL_MAP_ADDR_HELPERS(elem) \
787 static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \
788 { \
789 	struct iosys_map map = lrc->bo->vmap; \
790 \
791 	xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map));  \
792 	iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \
793 	return map; \
794 } \
795 static inline u32 __maybe_unused __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \
796 { \
797 	return xe_bo_ggtt_addr(lrc->bo) + __xe_lrc_##elem##_offset(lrc); \
798 } \
799 
800 DECL_MAP_ADDR_HELPERS(ring)
801 DECL_MAP_ADDR_HELPERS(pphwsp)
802 DECL_MAP_ADDR_HELPERS(seqno)
803 DECL_MAP_ADDR_HELPERS(regs)
804 DECL_MAP_ADDR_HELPERS(start_seqno)
805 DECL_MAP_ADDR_HELPERS(ctx_job_timestamp)
806 DECL_MAP_ADDR_HELPERS(ctx_timestamp)
807 DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw)
808 DECL_MAP_ADDR_HELPERS(parallel)
809 DECL_MAP_ADDR_HELPERS(indirect_ring)
810 DECL_MAP_ADDR_HELPERS(engine_id)
811 
812 #undef DECL_MAP_ADDR_HELPERS
813 
814 /**
815  * xe_lrc_ctx_timestamp_ggtt_addr() - Get ctx timestamp GGTT address
816  * @lrc: Pointer to the lrc.
817  *
818  * Returns: ctx timestamp GGTT address
819  */
820 u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc)
821 {
822 	return __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
823 }
824 
825 /**
826  * xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address
827  * @lrc: Pointer to the lrc.
828  *
829  * Returns: ctx timestamp udw GGTT address
830  */
831 u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc)
832 {
833 	return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
834 }
835 
836 /**
837  * xe_lrc_ctx_timestamp() - Read ctx timestamp value
838  * @lrc: Pointer to the lrc.
839  *
840  * Returns: ctx timestamp value
841  */
842 u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
843 {
844 	struct xe_device *xe = lrc_to_xe(lrc);
845 	struct iosys_map map;
846 	u32 ldw, udw = 0;
847 
848 	map = __xe_lrc_ctx_timestamp_map(lrc);
849 	ldw = xe_map_read32(xe, &map);
850 
851 	if (xe->info.has_64bit_timestamp) {
852 		map = __xe_lrc_ctx_timestamp_udw_map(lrc);
853 		udw = xe_map_read32(xe, &map);
854 	}
855 
856 	return (u64)udw << 32 | ldw;
857 }
858 
859 /**
860  * xe_lrc_ctx_job_timestamp_ggtt_addr() - Get ctx job timestamp GGTT address
861  * @lrc: Pointer to the lrc.
862  *
863  * Returns: ctx job timestamp GGTT address
864  */
865 u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc)
866 {
867 	return __xe_lrc_ctx_job_timestamp_ggtt_addr(lrc);
868 }
869 
870 /**
871  * xe_lrc_ctx_job_timestamp() - Read ctx job timestamp value
872  * @lrc: Pointer to the lrc.
873  *
874  * Returns: ctx job timestamp value
875  */
876 u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc)
877 {
878 	struct xe_device *xe = lrc_to_xe(lrc);
879 	struct iosys_map map;
880 
881 	map = __xe_lrc_ctx_job_timestamp_map(lrc);
882 	return xe_map_read32(xe, &map);
883 }
884 
885 u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
886 {
887 	return __xe_lrc_pphwsp_ggtt_addr(lrc);
888 }
889 
890 u32 xe_lrc_indirect_ring_ggtt_addr(struct xe_lrc *lrc)
891 {
892 	if (!xe_lrc_has_indirect_ring_state(lrc))
893 		return 0;
894 
895 	return __xe_lrc_indirect_ring_ggtt_addr(lrc);
896 }
897 
898 static u32 xe_lrc_read_indirect_ctx_reg(struct xe_lrc *lrc, int reg_nr)
899 {
900 	struct xe_device *xe = lrc_to_xe(lrc);
901 	struct iosys_map map;
902 
903 	map = __xe_lrc_indirect_ring_map(lrc);
904 	iosys_map_incr(&map, reg_nr * sizeof(u32));
905 	return xe_map_read32(xe, &map);
906 }
907 
908 static void xe_lrc_write_indirect_ctx_reg(struct xe_lrc *lrc,
909 					  int reg_nr, u32 val)
910 {
911 	struct xe_device *xe = lrc_to_xe(lrc);
912 	struct iosys_map map;
913 
914 	map = __xe_lrc_indirect_ring_map(lrc);
915 	iosys_map_incr(&map, reg_nr * sizeof(u32));
916 	xe_map_write32(xe, &map, val);
917 }
918 
919 u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
920 {
921 	struct xe_device *xe = lrc_to_xe(lrc);
922 	struct iosys_map map;
923 
924 	map = __xe_lrc_regs_map(lrc);
925 	iosys_map_incr(&map, reg_nr * sizeof(u32));
926 	return xe_map_read32(xe, &map);
927 }
928 
929 void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
930 {
931 	struct xe_device *xe = lrc_to_xe(lrc);
932 	struct iosys_map map;
933 
934 	map = __xe_lrc_regs_map(lrc);
935 	iosys_map_incr(&map, reg_nr * sizeof(u32));
936 	xe_map_write32(xe, &map, val);
937 }
938 
939 static void *empty_lrc_data(struct xe_hw_engine *hwe)
940 {
941 	struct xe_gt *gt = hwe->gt;
942 	void *data;
943 	u32 *regs;
944 
945 	data = kzalloc(xe_gt_lrc_size(gt, hwe->class), GFP_KERNEL);
946 	if (!data)
947 		return NULL;
948 
949 	/* 1st page: Per-Process HW Status Page (PPHWSP) */
950 	regs = data + LRC_PPHWSP_SIZE;
951 	set_offsets(regs, reg_offsets(gt_to_xe(gt), hwe->class), hwe);
952 	set_context_control(regs, hwe);
953 	set_memory_based_intr(regs, hwe);
954 	reset_stop_ring(regs, hwe);
955 	if (xe_gt_has_indirect_ring_state(gt)) {
956 		regs = data + xe_gt_lrc_size(gt, hwe->class) -
957 		       LRC_INDIRECT_RING_STATE_SIZE;
958 		set_offsets(regs, xe2_indirect_ring_state_offsets, hwe);
959 	}
960 
961 	return data;
962 }
963 
964 /**
965  * xe_default_lrc_update_memirq_regs_with_address - Re-compute GGTT references in default LRC
966  * of given engine.
967  * @hwe: the &xe_hw_engine struct instance
968  */
969 void xe_default_lrc_update_memirq_regs_with_address(struct xe_hw_engine *hwe)
970 {
971 	struct xe_gt *gt = hwe->gt;
972 	u32 *regs;
973 
974 	if (!gt->default_lrc[hwe->class])
975 		return;
976 
977 	regs = gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE;
978 	set_memory_based_intr(regs, hwe);
979 }
980 
981 /**
982  * xe_lrc_update_memirq_regs_with_address - Re-compute GGTT references in mem interrupt data
983  * for given LRC.
984  * @lrc: the &xe_lrc struct instance
985  * @hwe: the &xe_hw_engine struct instance
986  * @regs: scratch buffer to be used as temporary storage
987  */
988 void xe_lrc_update_memirq_regs_with_address(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
989 					    u32 *regs)
990 {
991 	struct xe_gt *gt = hwe->gt;
992 	struct iosys_map map;
993 	size_t regs_len;
994 
995 	if (!xe_device_uses_memirq(gt_to_xe(gt)))
996 		return;
997 
998 	map = __xe_lrc_regs_map(lrc);
999 	regs_len = xe_lrc_reg_size(gt_to_xe(gt));
1000 	xe_map_memcpy_from(gt_to_xe(gt), regs, &map, 0, regs_len);
1001 	set_memory_based_intr(regs, hwe);
1002 	xe_map_memcpy_to(gt_to_xe(gt), &map, 0, regs, regs_len);
1003 }
1004 
1005 static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
1006 {
1007 	u64 desc = xe_vm_pdp4_descriptor(vm, gt_to_tile(lrc->gt));
1008 
1009 	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
1010 	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
1011 }
1012 
1013 static void xe_lrc_finish(struct xe_lrc *lrc)
1014 {
1015 	xe_hw_fence_ctx_finish(&lrc->fence_ctx);
1016 	xe_bo_unpin_map_no_vm(lrc->bo);
1017 }
1018 
1019 /*
1020  * setup_utilization_wa() - Write commands to wa bb to assist
1021  * in calculating active context run ticks.
1022  *
1023  * Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the
1024  * context, but only gets updated when the context switches out. In order to
1025  * check how long a context has been active before it switches out, two things
1026  * are required:
1027  *
1028  * (1) Determine if the context is running:
1029  * To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in
1030  * the LRC. The value chosen is 1 since 0 is the initial value when the LRC is
1031  * initialized. During a query, we just check for this value to determine if the
1032  * context is active. If the context switched out, it would overwrite this
1033  * location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as
1034  * the last part of context restore, so reusing this LRC location will not
1035  * clobber anything.
1036  *
1037  * (2) Calculate the time that the context has been active for:
1038  * The CTX_TIMESTAMP ticks only when the context is active. If a context is
1039  * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization.
1040  * While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific
1041  * engine instance. Since we do not know which instance the context is running
1042  * on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and
1043  * store it in the PPHWSP.
1044  */
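/*
 * Sketch of how a caller is expected to consume this (the utilization query
 * code elsewhere is authoritative): read back xe_lrc_ctx_timestamp(); if it
 * still holds CONTEXT_ACTIVE the context is currently running, so sample the
 * engine's RING_CTX_TIMESTAMP MMIO (picking the instance from the engine id
 * saved in the PPHWSP); otherwise the value read is the accumulated run
 * ticks written out on the last context switch.
 */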
1045 #define CONTEXT_ACTIVE 1ULL
1046 static ssize_t setup_utilization_wa(struct xe_lrc *lrc,
1047 				    struct xe_hw_engine *hwe,
1048 				    u32 *batch,
1049 				    size_t max_len)
1050 {
1051 	u32 *cmd = batch;
1052 
1053 	if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
1054 		return -ENOSPC;
1055 
1056 	*cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET;
1057 	*cmd++ = ENGINE_ID(0).addr;
1058 	*cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc);
1059 	*cmd++ = 0;
1060 
1061 	*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
1062 	*cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
1063 	*cmd++ = 0;
1064 	*cmd++ = lower_32_bits(CONTEXT_ACTIVE);
1065 
1066 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp) {
1067 		*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
1068 		*cmd++ = __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
1069 		*cmd++ = 0;
1070 		*cmd++ = upper_32_bits(CONTEXT_ACTIVE);
1071 	}
1072 
1073 	return cmd - batch;
1074 }
1075 
1076 static ssize_t setup_timestamp_wa(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
1077 				  u32 *batch, size_t max_len)
1078 {
1079 	const u32 ts_addr = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
1080 	u32 *cmd = batch;
1081 
1082 	if (!XE_GT_WA(lrc->gt, 16010904313) ||
1083 	    !(hwe->class == XE_ENGINE_CLASS_RENDER ||
1084 	      hwe->class == XE_ENGINE_CLASS_COMPUTE ||
1085 	      hwe->class == XE_ENGINE_CLASS_COPY ||
1086 	      hwe->class == XE_ENGINE_CLASS_VIDEO_DECODE ||
1087 	      hwe->class == XE_ENGINE_CLASS_VIDEO_ENHANCE))
1088 		return 0;
1089 
1090 	if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
1091 		return -ENOSPC;
1092 
1093 	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
1094 		 MI_LRM_ASYNC;
1095 	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
1096 	*cmd++ = ts_addr;
1097 	*cmd++ = 0;
1098 
1099 	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
1100 		 MI_LRM_ASYNC;
1101 	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
1102 	*cmd++ = ts_addr;
1103 	*cmd++ = 0;
1104 
1105 	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO;
1106 	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
1107 	*cmd++ = ts_addr;
1108 	*cmd++ = 0;
1109 
1110 	return cmd - batch;
1111 }
1112 
1113 static ssize_t setup_configfs_post_ctx_restore_bb(struct xe_lrc *lrc,
1114 						  struct xe_hw_engine *hwe,
1115 						  u32 *batch, size_t max_len)
1116 {
1117 	struct xe_device *xe = gt_to_xe(lrc->gt);
1118 	const u32 *user_batch;
1119 	u32 *cmd = batch;
1120 	u32 count;
1121 
1122 	count = xe_configfs_get_ctx_restore_post_bb(to_pci_dev(xe->drm.dev),
1123 						    hwe->class, &user_batch);
1124 	if (!count)
1125 		return 0;
1126 
1127 	if (count > max_len)
1128 		return -ENOSPC;
1129 
1130 	/*
1131 	 * This should be used only for tests and validation. Taint the kernel
1132 	 * as anything could be submitted directly in context switches
1133 	 */
1134 	add_taint(TAINT_TEST, LOCKDEP_STILL_OK);
1135 
1136 	memcpy(cmd, user_batch, count * sizeof(u32));
1137 	cmd += count;
1138 
1139 	return cmd - batch;
1140 }
1141 
1142 static ssize_t setup_configfs_mid_ctx_restore_bb(struct xe_lrc *lrc,
1143 						 struct xe_hw_engine *hwe,
1144 						 u32 *batch, size_t max_len)
1145 {
1146 	struct xe_device *xe = gt_to_xe(lrc->gt);
1147 	const u32 *user_batch;
1148 	u32 *cmd = batch;
1149 	u32 count;
1150 
1151 	count = xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
1152 						   hwe->class, &user_batch);
1153 	if (!count)
1154 		return 0;
1155 
1156 	if (count > max_len)
1157 		return -ENOSPC;
1158 
1159 	/*
1160 	 * This should be used only for tests and validation. Taint the kernel
1161 	 * as anything could be submitted directly in context switches
1162 	 */
1163 	add_taint(TAINT_TEST, LOCKDEP_STILL_OK);
1164 
1165 	memcpy(cmd, user_batch, count * sizeof(u32));
1166 	cmd += count;
1167 
1168 	return cmd - batch;
1169 }
1170 
1171 static ssize_t setup_invalidate_state_cache_wa(struct xe_lrc *lrc,
1172 					       struct xe_hw_engine *hwe,
1173 					       u32 *batch, size_t max_len)
1174 {
1175 	u32 *cmd = batch;
1176 
1177 	if (!XE_GT_WA(lrc->gt, 18022495364) ||
1178 	    hwe->class != XE_ENGINE_CLASS_RENDER)
1179 		return 0;
1180 
1181 	if (xe_gt_WARN_ON(lrc->gt, max_len < 3))
1182 		return -ENOSPC;
1183 
1184 	*cmd++ = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1);
1185 	*cmd++ = CS_DEBUG_MODE1(0).addr;
1186 	*cmd++ = _MASKED_BIT_ENABLE(INSTRUCTION_STATE_CACHE_INVALIDATE);
1187 
1188 	return cmd - batch;
1189 }
1190 
1191 struct bo_setup {
1192 	ssize_t (*setup)(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
1193 			 u32 *batch, size_t max_size);
1194 };
1195 
1196 struct bo_setup_state {
1197 	/* Input: */
1198 	struct xe_lrc		*lrc;
1199 	struct xe_hw_engine	*hwe;
1200 	size_t			max_size;
1201 	size_t                  reserve_dw;
1202 	unsigned int		offset;
1203 	const struct bo_setup	*funcs;
1204 	unsigned int		num_funcs;
1205 
1206 	/* State: */
1207 	u32			*buffer;
1208 	u32			*ptr;
1209 	unsigned int		written;
1210 };
1211 
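/*
 * Common flow for the WA BB and indirect context setup below: setup_bo()
 * runs each callback in turn, emitting dwords either directly into the BO's
 * CPU mapping or, when the mapping is I/O memory, into a caller-provided
 * scratch buffer that finish_bo() later copies into the BO.
 */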
1212 static int setup_bo(struct bo_setup_state *state)
1213 {
1214 	ssize_t remain;
1215 
1216 	if (state->lrc->bo->vmap.is_iomem) {
1217 		if (!state->buffer)
1218 			return -ENOMEM;
1219 		state->ptr = state->buffer;
1220 	} else {
1221 		state->ptr = state->lrc->bo->vmap.vaddr + state->offset;
1222 	}
1223 
1224 	remain = state->max_size / sizeof(u32);
1225 
1226 	for (size_t i = 0; i < state->num_funcs; i++) {
1227 		ssize_t len = state->funcs[i].setup(state->lrc, state->hwe,
1228 						    state->ptr, remain);
1229 
1230 		remain -= len;
1231 
1232 		/*
1233 		 * Caller has asked for at least reserve_dw to remain unused.
1234 		 */
1235 		if (len < 0 ||
1236 		    xe_gt_WARN_ON(state->lrc->gt, remain < state->reserve_dw))
1237 			goto fail;
1238 
1239 		state->ptr += len;
1240 		state->written += len;
1241 	}
1242 
1243 	return 0;
1244 
1245 fail:
1246 	return -ENOSPC;
1247 }
1248 
1249 static void finish_bo(struct bo_setup_state *state)
1250 {
1251 	if (!state->buffer)
1252 		return;
1253 
1254 	xe_map_memcpy_to(gt_to_xe(state->lrc->gt), &state->lrc->bo->vmap,
1255 			 state->offset, state->buffer,
1256 			 state->written * sizeof(u32));
1257 }
1258 
1259 /**
1260  * xe_lrc_setup_wa_bb_with_scratch - Execute all wa bb setup callbacks.
1261  * @lrc: the &xe_lrc struct instance
1262  * @hwe: the &xe_hw_engine struct instance
1263  * @scratch: preallocated scratch buffer for temporary storage
1264  * Return: 0 on success, negative error code on failure
1265  */
1266 int xe_lrc_setup_wa_bb_with_scratch(struct xe_lrc *lrc, struct xe_hw_engine *hwe, u32 *scratch)
1267 {
1268 	static const struct bo_setup funcs[] = {
1269 		{ .setup = setup_timestamp_wa },
1270 		{ .setup = setup_invalidate_state_cache_wa },
1271 		{ .setup = setup_utilization_wa },
1272 		{ .setup = setup_configfs_post_ctx_restore_bb },
1273 	};
1274 	struct bo_setup_state state = {
1275 		.lrc = lrc,
1276 		.hwe = hwe,
1277 		.max_size = LRC_WA_BB_SIZE,
1278 		.buffer = scratch,
1279 		.reserve_dw = 1,
1280 		.offset = __xe_lrc_wa_bb_offset(lrc),
1281 		.funcs = funcs,
1282 		.num_funcs = ARRAY_SIZE(funcs),
1283 	};
1284 	int ret;
1285 
1286 	ret = setup_bo(&state);
1287 	if (ret)
1288 		return ret;
1289 
1290 	*state.ptr++ = MI_BATCH_BUFFER_END;
1291 	state.written++;
1292 
1293 	finish_bo(&state);
1294 
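	/*
	 * Bit 0 of BB_PER_CTX_PTR appears to act as the enable/valid bit for
	 * the per-context batch, hence the "+ 1" ORed into the GGTT address.
	 */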
1295 	xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
1296 			     xe_bo_ggtt_addr(lrc->bo) + state.offset + 1);
1297 
1298 	return 0;
1299 }
1300 
1301 static int setup_wa_bb(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
1302 {
1303 	u32 *buf = NULL;
1304 	int ret;
1305 
1306 	if (lrc->bo->vmap.is_iomem)
1307 		buf = kmalloc(LRC_WA_BB_SIZE, GFP_KERNEL);
1308 
1309 	ret = xe_lrc_setup_wa_bb_with_scratch(lrc, hwe, buf);
1310 
1311 	kfree(buf);
1312 
1313 	return ret;
1314 }
1315 
1316 static int
1317 setup_indirect_ctx(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
1318 {
1319 	static const struct bo_setup rcs_funcs[] = {
1320 		{ .setup = setup_timestamp_wa },
1321 		{ .setup = setup_configfs_mid_ctx_restore_bb },
1322 	};
1323 	static const struct bo_setup xcs_funcs[] = {
1324 		{ .setup = setup_configfs_mid_ctx_restore_bb },
1325 	};
1326 	struct bo_setup_state state = {
1327 		.lrc = lrc,
1328 		.hwe = hwe,
1329 		.max_size = (63 * 64) /* max 63 cachelines */,
1330 		.buffer = NULL,
1331 		.offset = __xe_lrc_indirect_ctx_offset(lrc),
1332 	};
1333 	int ret;
1334 
1335 	if (!(lrc->flags & XE_LRC_FLAG_INDIRECT_CTX))
1336 		return 0;
1337 
1338 	if (hwe->class == XE_ENGINE_CLASS_RENDER ||
1339 	    hwe->class == XE_ENGINE_CLASS_COMPUTE) {
1340 		state.funcs = rcs_funcs;
1341 		state.num_funcs = ARRAY_SIZE(rcs_funcs);
1342 	} else {
1343 		state.funcs = xcs_funcs;
1344 		state.num_funcs = ARRAY_SIZE(xcs_funcs);
1345 	}
1346 
1347 	if (xe_gt_WARN_ON(lrc->gt, !state.funcs))
1348 		return 0;
1349 
1350 	if (lrc->bo->vmap.is_iomem)
1351 		state.buffer = kmalloc(state.max_size, GFP_KERNEL);
1352 
1353 	ret = setup_bo(&state);
1354 	if (ret) {
1355 		kfree(state.buffer);
1356 		return ret;
1357 	}
1358 
1359 	/*
1360 	 * Align to 64B cacheline so there's no garbage at the end for CS to
1361 	 * execute: size for indirect ctx must be a multiple of 64.
1362 	 */
1363 	while (state.written & 0xf) {
1364 		*state.ptr++ = MI_NOOP;
1365 		state.written++;
1366 	}
1367 
1368 	finish_bo(&state);
1369 	kfree(state.buffer);
1370 
1371 	/*
1372 	 * Enable INDIRECT_CTX leaving INDIRECT_CTX_OFFSET at its default: it
1373 	 * varies per engine class, but the default is good enough
1374 	 */
1375 	xe_lrc_write_ctx_reg(lrc,
1376 			     CTX_CS_INDIRECT_CTX,
1377 			     (xe_bo_ggtt_addr(lrc->bo) + state.offset) |
1378 			     /* Size in CLs. */
1379 			     (state.written * sizeof(u32) / 64));
1380 
1381 	return 0;
1382 }
1383 
1384 static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
1385 		       struct xe_vm *vm, u32 ring_size, u16 msix_vec,
1386 		       u32 init_flags)
1387 {
1388 	struct xe_gt *gt = hwe->gt;
1389 	const u32 lrc_size = xe_gt_lrc_size(gt, hwe->class);
1390 	u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
1391 	struct xe_tile *tile = gt_to_tile(gt);
1392 	struct xe_device *xe = gt_to_xe(gt);
1393 	struct iosys_map map;
1394 	u32 arb_enable;
1395 	u32 bo_flags;
1396 	int err;
1397 
1398 	kref_init(&lrc->refcount);
1399 	lrc->gt = gt;
1400 	lrc->size = lrc_size;
1401 	lrc->flags = 0;
1402 	lrc->ring.size = ring_size;
1403 	lrc->ring.tail = 0;
1404 
1405 	if (gt_engine_needs_indirect_ctx(gt, hwe->class)) {
1406 		lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
1407 		bo_size += LRC_INDIRECT_CTX_BO_SIZE;
1408 	}
1409 
1410 	if (xe_gt_has_indirect_ring_state(gt))
1411 		lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;
1412 
1413 	bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
1414 		   XE_BO_FLAG_GGTT_INVALIDATE;
1415 	if (vm && vm->xef) /* userspace */
1416 		bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE;
1417 
1418 	lrc->bo = xe_bo_create_pin_map_novm(xe, tile,
1419 					    bo_size,
1420 					    ttm_bo_type_kernel,
1421 					    bo_flags, false);
1422 	if (IS_ERR(lrc->bo))
1423 		return PTR_ERR(lrc->bo);
1424 
1425 	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
1426 			     hwe->fence_irq, hwe->name);
1427 
1428 	/*
1429 	 * Init the Per-Process HW Status Page (PPHWSP) and LRC / context state
1430 	 * to known values. If there's already a primed default_lrc, just copy
1431 	 * it; otherwise this is the early submission that records the LRC, so
1432 	 * build a new empty one from scratch.
1433 	 */
1434 	map = __xe_lrc_pphwsp_map(lrc);
1435 	if (gt->default_lrc[hwe->class]) {
1436 		xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE);	/* PPHWSP */
1437 		xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
1438 				 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
1439 				 lrc_size - LRC_PPHWSP_SIZE);
1440 	} else {
1441 		void *init_data = empty_lrc_data(hwe);
1442 
1443 		if (!init_data) {
1444 			err = -ENOMEM;
1445 			goto err_lrc_finish;
1446 		}
1447 
1448 		xe_map_memcpy_to(xe, &map, 0, init_data, lrc_size);
1449 		kfree(init_data);
1450 	}
1451 
1452 	if (vm) {
1453 		xe_lrc_set_ppgtt(lrc, vm);
1454 
1455 		if (vm->xef)
1456 			xe_drm_client_add_bo(vm->xef->client, lrc->bo);
1457 	}
1458 
1459 	if (xe_device_has_msix(xe)) {
1460 		xe_lrc_write_ctx_reg(lrc, CTX_INT_STATUS_REPORT_PTR,
1461 				     xe_memirq_status_ptr(&tile->memirq, hwe));
1462 		xe_lrc_write_ctx_reg(lrc, CTX_INT_SRC_REPORT_PTR,
1463 				     xe_memirq_source_ptr(&tile->memirq, hwe));
1464 		xe_lrc_write_ctx_reg(lrc, CTX_CS_INT_VEC_DATA, msix_vec << 16 | msix_vec);
1465 	}
1466 
1467 	if (xe_gt_has_indirect_ring_state(gt)) {
1468 		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
1469 				     __xe_lrc_indirect_ring_ggtt_addr(lrc));
1470 
1471 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
1472 					      __xe_lrc_ring_ggtt_addr(lrc));
1473 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START_UDW, 0);
1474 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, 0);
1475 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, lrc->ring.tail);
1476 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_CTL,
1477 					      RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
1478 	} else {
1479 		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
1480 		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, 0);
1481 		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);
1482 		xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
1483 				     RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
1484 	}
1485 
1486 	if (init_flags & XE_LRC_CREATE_RUNALONE)
1487 		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
1488 				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
1489 				     _MASKED_BIT_ENABLE(CTX_CTRL_RUN_ALONE));
1490 
1491 	if (init_flags & XE_LRC_CREATE_PXP)
1492 		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
1493 				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
1494 				     _MASKED_BIT_ENABLE(CTX_CTRL_PXP_ENABLE));
1495 
1496 	lrc->ctx_timestamp = 0;
1497 	xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0);
1498 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
1499 		xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP_UDW, 0);
1500 
1501 	if (xe->info.has_asid && vm)
1502 		xe_lrc_write_ctx_reg(lrc, CTX_ASID, vm->usm.asid);
1503 
1504 	lrc->desc = LRC_VALID;
1505 	lrc->desc |= FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT);
1506 	/* TODO: Priority */
1507 
1508 	/* While this appears to have something about privileged batches or
1509 	 * some such, it really just means PPGTT mode.
1510 	 */
1511 	if (vm)
1512 		lrc->desc |= LRC_PRIVILEGE;
1513 
1514 	if (GRAPHICS_VERx100(xe) < 1250) {
1515 		lrc->desc |= FIELD_PREP(LRC_ENGINE_INSTANCE, hwe->instance);
1516 		lrc->desc |= FIELD_PREP(LRC_ENGINE_CLASS, hwe->class);
1517 	}
1518 
1519 	arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1520 	xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));
1521 
1522 	map = __xe_lrc_seqno_map(lrc);
1523 	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
1524 
1525 	map = __xe_lrc_start_seqno_map(lrc);
1526 	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
1527 
1528 	err = setup_wa_bb(lrc, hwe);
1529 	if (err)
1530 		goto err_lrc_finish;
1531 
1532 	err = setup_indirect_ctx(lrc, hwe);
1533 	if (err)
1534 		goto err_lrc_finish;
1535 
1536 	return 0;
1537 
1538 err_lrc_finish:
1539 	xe_lrc_finish(lrc);
1540 	return err;
1541 }
1542 
1543 /**
1544  * xe_lrc_create - Create a LRC
1545  * @hwe: Hardware Engine
1546  * @vm: The VM (address space)
1547  * @ring_size: LRC ring size
1548  * @msix_vec: MSI-X interrupt vector (for platforms that support it)
1549  * @flags: LRC initialization flags
1550  *
1551  * Allocate and initialize the Logical Ring Context (LRC).
1552  *
1553  * Return: Pointer to the created LRC upon success, or an error pointer
1554  * upon failure.
1555  */
1556 struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
1557 			     u32 ring_size, u16 msix_vec, u32 flags)
1558 {
1559 	struct xe_lrc *lrc;
1560 	int err;
1561 
1562 	lrc = kzalloc(sizeof(*lrc), GFP_KERNEL);
1563 	if (!lrc)
1564 		return ERR_PTR(-ENOMEM);
1565 
1566 	err = xe_lrc_init(lrc, hwe, vm, ring_size, msix_vec, flags);
1567 	if (err) {
1568 		kfree(lrc);
1569 		return ERR_PTR(err);
1570 	}
1571 
1572 	return lrc;
1573 }
1574 
1575 /**
1576  * xe_lrc_destroy - Destroy the LRC
1577  * @ref: reference to LRC
1578  *
1579  * Called when ref == 0, release resources held by the Logical Ring Context
1580  * (LRC) and free the LRC memory.
1581  */
1582 void xe_lrc_destroy(struct kref *ref)
1583 {
1584 	struct xe_lrc *lrc = container_of(ref, struct xe_lrc, refcount);
1585 
1586 	xe_lrc_finish(lrc);
1587 	kfree(lrc);
1588 }
1589 
1590 /**
1591  * xe_lrc_update_hwctx_regs_with_address - Re-compute GGTT references within given LRC.
1592  * @lrc: the &xe_lrc struct instance
1593  */
1594 void xe_lrc_update_hwctx_regs_with_address(struct xe_lrc *lrc)
1595 {
1596 	if (xe_lrc_has_indirect_ring_state(lrc)) {
1597 		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
1598 				     __xe_lrc_indirect_ring_ggtt_addr(lrc));
1599 
1600 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
1601 					      __xe_lrc_ring_ggtt_addr(lrc));
1602 	} else {
1603 		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
1604 	}
1605 }
1606 
1607 void xe_lrc_set_ring_tail(struct xe_lrc *lrc, u32 tail)
1608 {
1609 	if (xe_lrc_has_indirect_ring_state(lrc))
1610 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, tail);
1611 	else
1612 		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, tail);
1613 }
1614 
1615 u32 xe_lrc_ring_tail(struct xe_lrc *lrc)
1616 {
1617 	if (xe_lrc_has_indirect_ring_state(lrc))
1618 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL) & TAIL_ADDR;
1619 	else
1620 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL) & TAIL_ADDR;
1621 }
1622 
1623 static u32 xe_lrc_ring_start(struct xe_lrc *lrc)
1624 {
1625 	if (xe_lrc_has_indirect_ring_state(lrc))
1626 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START);
1627 	else
1628 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_START);
1629 }
1630 
1631 void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
1632 {
1633 	if (xe_lrc_has_indirect_ring_state(lrc))
1634 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, head);
1635 	else
1636 		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
1637 }
1638 
1639 u32 xe_lrc_ring_head(struct xe_lrc *lrc)
1640 {
1641 	if (xe_lrc_has_indirect_ring_state(lrc))
1642 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD) & HEAD_ADDR;
1643 	else
1644 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
1645 }
1646 
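/*
 * Free-space calculation below: with head == tail the whole ring is reported
 * free; when the tail is N bytes ahead of the head (modulo the ring size,
 * which is assumed to be a power of two), size - N bytes are reported free.
 */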
1647 u32 xe_lrc_ring_space(struct xe_lrc *lrc)
1648 {
1649 	const u32 head = xe_lrc_ring_head(lrc);
1650 	const u32 tail = lrc->ring.tail;
1651 	const u32 size = lrc->ring.size;
1652 
1653 	return ((head - tail - 1) & (size - 1)) + 1;
1654 }
1655 
1656 static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
1657 				const void *data, size_t size)
1658 {
1659 	struct xe_device *xe = lrc_to_xe(lrc);
1660 
1661 	iosys_map_incr(&ring, lrc->ring.tail);
1662 	xe_map_memcpy_to(xe, &ring, 0, data, size);
1663 	lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
1664 }
1665 
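/*
 * Copy @data into the ring: the copy is split in two when it would run past
 * the end of the ring buffer, and a trailing MI_NOOP is added so the tail
 * stays qword (8-byte) aligned.
 */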
1666 void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
1667 {
1668 	struct xe_device *xe = lrc_to_xe(lrc);
1669 	struct iosys_map ring;
1670 	u32 rhs;
1671 	size_t aligned_size;
1672 
1673 	xe_assert(xe, IS_ALIGNED(size, 4));
1674 	aligned_size = ALIGN(size, 8);
1675 
1676 	ring = __xe_lrc_ring_map(lrc);
1677 
1678 	xe_assert(xe, lrc->ring.tail < lrc->ring.size);
1679 	rhs = lrc->ring.size - lrc->ring.tail;
1680 	if (size > rhs) {
1681 		__xe_lrc_write_ring(lrc, ring, data, rhs);
1682 		__xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
1683 	} else {
1684 		__xe_lrc_write_ring(lrc, ring, data, size);
1685 	}
1686 
1687 	if (aligned_size > size) {
1688 		u32 noop = MI_NOOP;
1689 
1690 		__xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
1691 	}
1692 }
1693 
1694 u64 xe_lrc_descriptor(struct xe_lrc *lrc)
1695 {
1696 	return lrc->desc | xe_lrc_ggtt_addr(lrc);
1697 }
1698 
1699 u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
1700 {
1701 	return __xe_lrc_seqno_ggtt_addr(lrc);
1702 }
1703 
1704 /**
1705  * xe_lrc_alloc_seqno_fence() - Allocate an lrc seqno fence.
1706  *
1707  * Allocate but don't initialize an lrc seqno fence.
1708  *
1709  * Return: Pointer to the allocated fence or
1710  * negative error pointer on error.
1711  */
1712 struct dma_fence *xe_lrc_alloc_seqno_fence(void)
1713 {
1714 	return xe_hw_fence_alloc();
1715 }
1716 
1717 /**
1718  * xe_lrc_free_seqno_fence() - Free an lrc seqno fence.
1719  * @fence: Pointer to the fence to free.
1720  *
1721  * Frees an lrc seqno fence that hasn't yet been
1722  * initialized.
1723  */
1724 void xe_lrc_free_seqno_fence(struct dma_fence *fence)
1725 {
1726 	xe_hw_fence_free(fence);
1727 }
1728 
1729 /**
1730  * xe_lrc_init_seqno_fence() - Initialize an lrc seqno fence.
1731  * @lrc: Pointer to the lrc.
1732  * @fence: Pointer to the fence to initialize.
1733  *
1734  * Initializes a pre-allocated lrc seqno fence.
1735  * After initialization, the fence is subject to normal
1736  * dma-fence refcounting.
1737  */
1738 void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence)
1739 {
1740 	xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc));
1741 }
1742 
1743 s32 xe_lrc_seqno(struct xe_lrc *lrc)
1744 {
1745 	struct iosys_map map = __xe_lrc_seqno_map(lrc);
1746 
1747 	return xe_map_read32(lrc_to_xe(lrc), &map);
1748 }
1749 
1750 s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
1751 {
1752 	struct iosys_map map = __xe_lrc_start_seqno_map(lrc);
1753 
1754 	return xe_map_read32(lrc_to_xe(lrc), &map);
1755 }
1756 
1757 u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
1758 {
1759 	return __xe_lrc_start_seqno_ggtt_addr(lrc);
1760 }
1761 
1762 u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
1763 {
1764 	return __xe_lrc_parallel_ggtt_addr(lrc);
1765 }
1766 
1767 struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
1768 {
1769 	return __xe_lrc_parallel_map(lrc);
1770 }
1771 
1772 /**
1773  * xe_lrc_engine_id() - Read engine id value
1774  * @lrc: Pointer to the lrc.
1775  *
1776  * Returns: engine id value
1777  */
1778 static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
1779 {
1780 	struct xe_device *xe = lrc_to_xe(lrc);
1781 	struct iosys_map map;
1782 
1783 	map = __xe_lrc_engine_id_map(lrc);
1784 	return xe_map_read32(xe, &map);
1785 }
1786 
1787 static int instr_dw(u32 cmd_header)
1788 {
1789 	/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
1790 	if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
1791 	    GFXPIPE_SINGLE_DW_CMD(0, 0))
1792 		return 1;
1793 
1794 	/* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
1795 	if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
1796 		return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;
1797 
1798 	/* Most instructions have the # of dwords (minus 2) in 7:0 */
1799 	return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
1800 }
1801 
1802 static int dump_mi_command(struct drm_printer *p,
1803 			   struct xe_gt *gt,
1804 			   u32 *dw,
1805 			   int remaining_dw)
1806 {
1807 	u32 inst_header = *dw;
1808 	u32 numdw = instr_dw(inst_header);
1809 	u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
1810 	int num_noop;
1811 
1812 	/* First check for commands that don't have/use a '# DW' field */
1813 	switch (inst_header & MI_OPCODE) {
1814 	case MI_NOOP:
1815 		num_noop = 1;
1816 		while (num_noop < remaining_dw &&
1817 		       (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
1818 			num_noop++;
1819 		drm_printf(p, "[%#010x] MI_NOOP (%d dwords)\n", inst_header, num_noop);
1820 		return num_noop;
1821 
1822 	case MI_TOPOLOGY_FILTER:
1823 		drm_printf(p, "[%#010x] MI_TOPOLOGY_FILTER\n", inst_header);
1824 		return 1;
1825 
1826 	case MI_BATCH_BUFFER_END:
1827 		drm_printf(p, "[%#010x] MI_BATCH_BUFFER_END\n", inst_header);
1828 		/* Return 'remaining_dw' to consume the rest of the LRC */
1829 		return remaining_dw;
1830 	}
1831 
1832 	/*
1833 	 * Any remaining commands include a # of dwords.  We should make sure
1834 	 * it doesn't exceed the remaining size of the LRC.
1835 	 */
1836 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
1837 		numdw = remaining_dw;
1838 
1839 	switch (inst_header & MI_OPCODE) {
1840 	case MI_LOAD_REGISTER_IMM:
1841 		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
1842 			   inst_header, (numdw - 1) / 2);
1843 		for (int i = 1; i < numdw; i += 2)
1844 			drm_printf(p, " - %#6x = %#010x\n", dw[i], dw[i + 1]);
1845 		return numdw;
1846 
1847 	case MI_LOAD_REGISTER_MEM & MI_OPCODE:
1848 		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_MEM: %s%s\n",
1849 			   inst_header,
1850 			   dw[0] & MI_LRI_LRM_CS_MMIO ? "CS_MMIO " : "",
1851 			   dw[0] & MI_LRM_USE_GGTT ? "USE_GGTT " : "");
1852 		if (numdw == 4)
1853 			drm_printf(p, " - %#6x = %#010llx\n",
1854 				   dw[1], ((u64)(dw[3]) << 32 | (u64)(dw[2])));
1855 		else
1856 			drm_printf(p, " - %*ph (%s)\n",
1857 				   (int)sizeof(u32) * (numdw - 1), dw + 1,
1858 				   numdw < 4 ? "truncated" : "malformed");
1859 		return numdw;
1860 
1861 	case MI_FORCE_WAKEUP:
1862 		drm_printf(p, "[%#010x] MI_FORCE_WAKEUP\n", inst_header);
1863 		return numdw;
1864 
1865 	default:
1866 		drm_printf(p, "[%#010x] unknown MI opcode %#x, likely %d dwords\n",
1867 			   inst_header, opcode, numdw);
1868 		return numdw;
1869 	}
1870 }
1871 
1872 static int dump_gfxpipe_command(struct drm_printer *p,
1873 				struct xe_gt *gt,
1874 				u32 *dw,
1875 				int remaining_dw)
1876 {
1877 	u32 numdw = instr_dw(*dw);
1878 	u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
1879 	u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
1880 	u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);
1881 
1882 	/*
1883 	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
1884 	 * remaining size of the LRC.
1885 	 */
1886 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
1887 		numdw = remaining_dw;
1888 
1889 	switch (*dw & GFXPIPE_MATCH_MASK) {
1890 #define MATCH(cmd) \
1891 	case cmd: \
1892 		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
1893 		return numdw
1894 #define MATCH3D(cmd) \
1895 	case CMD_##cmd: \
1896 		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
1897 		return numdw
1898 
1899 	MATCH(STATE_BASE_ADDRESS);
1900 	MATCH(STATE_SIP);
1901 	MATCH(GPGPU_CSR_BASE_ADDRESS);
1902 	MATCH(STATE_COMPUTE_MODE);
1903 	MATCH3D(3DSTATE_BTD);
1904 	MATCH(STATE_SYSTEM_MEM_FENCE_ADDRESS);
1905 	MATCH(STATE_CONTEXT_DATA_BASE_ADDRESS);
1906 
1907 	MATCH3D(3DSTATE_VF_STATISTICS);
1908 
1909 	MATCH(PIPELINE_SELECT);
1910 
1911 	MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST);
1912 	MATCH3D(3DSTATE_CLEAR_PARAMS);
1913 	MATCH3D(3DSTATE_DEPTH_BUFFER);
1914 	MATCH3D(3DSTATE_STENCIL_BUFFER);
1915 	MATCH3D(3DSTATE_HIER_DEPTH_BUFFER);
1916 	MATCH3D(3DSTATE_VERTEX_BUFFERS);
1917 	MATCH3D(3DSTATE_VERTEX_ELEMENTS);
1918 	MATCH3D(3DSTATE_INDEX_BUFFER);
1919 	MATCH3D(3DSTATE_VF);
1920 	MATCH3D(3DSTATE_MULTISAMPLE);
1921 	MATCH3D(3DSTATE_CC_STATE_POINTERS);
1922 	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS);
1923 	MATCH3D(3DSTATE_VS);
1924 	MATCH3D(3DSTATE_GS);
1925 	MATCH3D(3DSTATE_CLIP);
1926 	MATCH3D(3DSTATE_SF);
1927 	MATCH3D(3DSTATE_WM);
1928 	MATCH3D(3DSTATE_CONSTANT_VS);
1929 	MATCH3D(3DSTATE_CONSTANT_GS);
1930 	MATCH3D(3DSTATE_CONSTANT_PS);
1931 	MATCH3D(3DSTATE_SAMPLE_MASK);
1932 	MATCH3D(3DSTATE_CONSTANT_HS);
1933 	MATCH3D(3DSTATE_CONSTANT_DS);
1934 	MATCH3D(3DSTATE_HS);
1935 	MATCH3D(3DSTATE_TE);
1936 	MATCH3D(3DSTATE_DS);
1937 	MATCH3D(3DSTATE_STREAMOUT);
1938 	MATCH3D(3DSTATE_SBE);
1939 	MATCH3D(3DSTATE_PS);
1940 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
1941 	MATCH3D(3DSTATE_CPS_POINTERS);
1942 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC);
1943 	MATCH3D(3DSTATE_BLEND_STATE_POINTERS);
1944 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS);
1945 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS);
1946 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS);
1947 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS);
1948 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS);
1949 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS);
1950 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS);
1951 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS);
1952 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS);
1953 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS);
1954 	MATCH3D(3DSTATE_VF_INSTANCING);
1955 	MATCH3D(3DSTATE_VF_SGVS);
1956 	MATCH3D(3DSTATE_VF_TOPOLOGY);
1957 	MATCH3D(3DSTATE_WM_CHROMAKEY);
1958 	MATCH3D(3DSTATE_PS_BLEND);
1959 	MATCH3D(3DSTATE_WM_DEPTH_STENCIL);
1960 	MATCH3D(3DSTATE_PS_EXTRA);
1961 	MATCH3D(3DSTATE_RASTER);
1962 	MATCH3D(3DSTATE_SBE_SWIZ);
1963 	MATCH3D(3DSTATE_WM_HZ_OP);
1964 	MATCH3D(3DSTATE_VF_COMPONENT_PACKING);
1965 	MATCH3D(3DSTATE_VF_SGVS_2);
1966 	MATCH3D(3DSTATE_VFG);
1967 	MATCH3D(3DSTATE_URB_ALLOC_VS);
1968 	MATCH3D(3DSTATE_URB_ALLOC_HS);
1969 	MATCH3D(3DSTATE_URB_ALLOC_DS);
1970 	MATCH3D(3DSTATE_URB_ALLOC_GS);
1971 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_0);
1972 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_1);
1973 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_2);
1974 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_3);
1975 	MATCH3D(3DSTATE_PRIMITIVE_REPLICATION);
1976 	MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO);
1977 	MATCH3D(3DSTATE_AMFS);
1978 	MATCH3D(3DSTATE_DEPTH_BOUNDS);
1979 	MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS);
1980 	MATCH3D(3DSTATE_CONSTANT_TS_POINTER);
1981 	MATCH3D(3DSTATE_MESH_CONTROL);
1982 	MATCH3D(3DSTATE_MESH_DISTRIB);
1983 	MATCH3D(3DSTATE_TASK_REDISTRIB);
1984 	MATCH3D(3DSTATE_MESH_SHADER);
1985 	MATCH3D(3DSTATE_MESH_SHADER_DATA);
1986 	MATCH3D(3DSTATE_TASK_CONTROL);
1987 	MATCH3D(3DSTATE_TASK_SHADER);
1988 	MATCH3D(3DSTATE_TASK_SHADER_DATA);
1989 	MATCH3D(3DSTATE_URB_ALLOC_MESH);
1990 	MATCH3D(3DSTATE_URB_ALLOC_TASK);
1991 	MATCH3D(3DSTATE_CLIP_MESH);
1992 	MATCH3D(3DSTATE_SBE_MESH);
1993 	MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER);
1994 	MATCH3D(3DSTATE_COARSE_PIXEL);
1995 
1996 	MATCH3D(3DSTATE_DRAWING_RECTANGLE);
1997 	MATCH3D(3DSTATE_CHROMA_KEY);
1998 	MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET);
1999 	MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN);
2000 	MATCH3D(3DSTATE_LINE_STIPPLE);
2001 	MATCH3D(3DSTATE_AA_LINE_PARAMETERS);
2002 	MATCH3D(3DSTATE_MONOFILTER_SIZE);
2003 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS);
2004 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS);
2005 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS);
2006 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS);
2007 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS);
2008 	MATCH3D(3DSTATE_SO_DECL_LIST);
2009 	MATCH3D(3DSTATE_SO_BUFFER);
2010 	MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC);
2011 	MATCH3D(3DSTATE_SAMPLE_PATTERN);
2012 	MATCH3D(3DSTATE_3D_MODE);
2013 	MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE);
2014 	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS);
2015 	MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO);
2016 
2017 	default:
2018 		drm_printf(p, "[%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n",
2019 			   *dw, pipeline, opcode, subopcode, numdw);
2020 		return numdw;
2021 	}
2022 }
2023 
2024 static int dump_gfx_state_command(struct drm_printer *p,
2025 				  struct xe_gt *gt,
2026 				  u32 *dw,
2027 				  int remaining_dw)
2028 {
2029 	u32 numdw = instr_dw(*dw);
2030 	u32 opcode = REG_FIELD_GET(GFX_STATE_OPCODE, *dw);
2031 
2032 	/*
2033 	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
2034 	 * remaining size of the LRC.
2035 	 */
2036 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
2037 		numdw = remaining_dw;
2038 
2039 	switch (*dw & (XE_INSTR_GFX_STATE | GFX_STATE_OPCODE)) {
2040 	MATCH(STATE_WRITE_INLINE);
2041 
2042 	default:
2043 		drm_printf(p, "[%#010x] unknown GFX_STATE command (opcode=%#x), likely %d dwords\n",
2044 			   *dw, opcode, numdw);
2045 		return numdw;
2046 	}
2047 }
2048 
2049 void xe_lrc_dump_default(struct drm_printer *p,
2050 			 struct xe_gt *gt,
2051 			 enum xe_engine_class hwe_class)
2052 {
2053 	u32 *dw;
2054 	int remaining_dw, num_dw;
2055 
2056 	if (!gt->default_lrc[hwe_class]) {
2057 		drm_printf(p, "No default LRC for class %d\n", hwe_class);
2058 		return;
2059 	}
2060 
2061 	/*
2062 	 * Skip the beginning of the LRC since it contains the per-process
2063 	 * hardware status page.
2064 	 */
2065 	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
2066 	remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;
2067 
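	/*
	 * Walk the context image one instruction at a time, dispatching on
	 * the command type encoded in the instruction header.
	 */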
2068 	while (remaining_dw > 0) {
2069 		if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
2070 			num_dw = dump_mi_command(p, gt, dw, remaining_dw);
2071 		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
2072 			num_dw = dump_gfxpipe_command(p, gt, dw, remaining_dw);
2073 		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFX_STATE) {
2074 			num_dw = dump_gfx_state_command(p, gt, dw, remaining_dw);
2075 		} else {
2076 			num_dw = min(instr_dw(*dw), remaining_dw);
2077 			drm_printf(p, "[%#010x] unknown instruction of type %#x, likely %d dwords\n",
2078 				   *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw),
2079 				   num_dw);
2080 		}
2081 
2082 		dw += num_dw;
2083 		remaining_dw -= num_dw;
2084 	}
2085 }
2086 
2087 struct instr_state {
2088 	u32 instr;
2089 	u16 num_dw;
2090 };
2091 
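/*
 * SVG state instructions and their total dword counts, emitted into the LRC
 * for Wa_14019789679 by xe_lrc_emit_hwe_state_instructions() below.
 */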
2092 static const struct instr_state xe_hpg_svg_state[] = {
2093 	{ .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 },
2094 	{ .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 },
2095 	{ .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 },
2096 	{ .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 },
2097 	{ .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 },
2098 	{ .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 },
2099 	{ .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 },
2100 	{ .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 },
2101 	{ .instr = CMD_3DSTATE_VS, .num_dw = 9 },
2102 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 },
2103 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 },
2104 	{ .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 },
2105 	{ .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 },
2106 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 },
2107 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 },
2108 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 },
2109 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 },
2110 	{ .instr = CMD_3DSTATE_CLIP, .num_dw = 4 },
2111 	{ .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 },
2112 	{ .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 },
2113 	{ .instr = CMD_3DSTATE_SF, .num_dw = 4 },
2114 	{ .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 },
2115 	{ .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 },
2116 	{ .instr = CMD_3DSTATE_RASTER, .num_dw = 5 },
2117 	{ .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 },
2118 	{ .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 },
2119 	{ .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 },
2120 	{ .instr = CMD_3DSTATE_HS, .num_dw = 9 },
2121 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 },
2122 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 },
2123 	{ .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 },
2124 	{ .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 },
2125 	{ .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 },
2126 	{ .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 },
2127 	{ .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 },
2128 	{ .instr = CMD_3DSTATE_TE, .num_dw = 5 },
2129 	{ .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 },
2130 	{ .instr = CMD_3DSTATE_DS, .num_dw = 11 },
2131 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
2132 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
2133 	{ .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
2134 	{ .instr = CMD_3DSTATE_GS, .num_dw = 10 },
2135 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
2136 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
2137 	{ .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
2138 	{ .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
2139 	{ .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
2140 	{ .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
2141 	{ .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
2142 	{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
2143 };
2144 
2145 u32 *xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, u32 *cs)
2146 {
2147 	struct xe_gt *gt = q->hwe->gt;
2148 	struct xe_device *xe = gt_to_xe(gt);
2149 	const struct instr_state *state_table = NULL;
2150 	int state_table_size = 0;
2151 
2152 	/*
2153 	 * Wa_14019789679
2154 	 *
2155 	 * If the driver doesn't explicitly emit the SVG instructions while
2156 	 * setting up the default LRC, the context switch will write 0's
2157 	 * (noops) into the LRC memory rather than the expected instruction
2158 	 * headers.  Application contexts start out as a copy of the default
2159 	 * LRC, and if they also do not emit specific settings for some SVG
2160 	 * state, then on context restore they'll unintentionally inherit
2161 	 * whatever state setting the previous context had programmed into the
2162 	 * hardware (i.e., the lack of a 3DSTATE_* instruction in the LRC will
2163 	 * prevent the hardware from resetting that state back to any specific
2164 	 * value).
2165 	 *
2166 	 * The official workaround only requires emitting 3DSTATE_MESH_CONTROL
2167 	 * since that's a specific state setting that can easily cause GPU
2168 	 * hangs if unintentionally inherited.  However, to be safe we'll
2169 	 * continue to emit all of the SVG state since it's best not to leak
2170 	 * any of the state between contexts, even if that leakage is harmless.
2171 	 */
2172 	if (XE_GT_WA(gt, 14019789679) && q->hwe->class == XE_ENGINE_CLASS_RENDER) {
2173 		state_table = xe_hpg_svg_state;
2174 		state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
2175 	}
2176 
2177 	if (!state_table) {
2178 		xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
2179 			  GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
2180 		return cs;
2181 	}
2182 
2183 	for (int i = 0; i < state_table_size; i++) {
2184 		u32 instr = state_table[i].instr;
2185 		u16 num_dw = state_table[i].num_dw;
2186 		bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);
2187 
2188 		xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
2189 		xe_gt_assert(gt, num_dw != 0);
2190 		xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));
2191 
2192 		/*
2193 		 * Xe2's SVG context is the same as the one on DG2 / MTL
2194 		 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
2195 		 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
2196 		 * Just make the replacement here rather than defining a
2197 		 * whole separate table for the single trivial change.
2198 		 */
2199 		if (GRAPHICS_VER(xe) >= 20 &&
2200 		    instr == CMD_3DSTATE_DRAWING_RECTANGLE)
2201 			instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;
2202 
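		/*
		 * Only the instruction header is written here; the dword count
		 * field of a multi-dword GFXPIPE instruction encodes the total
		 * length minus two.  The payload dwords are left as-is.
		 */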
2203 		*cs = instr;
2204 		if (!is_single_dw)
2205 			*cs |= (num_dw - 2);
2206 
2207 		cs += num_dw;
2208 	}
2209 
2210 	return cs;
2211 }
2212 
2213 struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
2214 {
2215 	struct xe_lrc_snapshot *snapshot = kmalloc(sizeof(*snapshot), GFP_NOWAIT);
2216 
2217 	if (!snapshot)
2218 		return NULL;
2219 
2220 	snapshot->context_desc = xe_lrc_ggtt_addr(lrc);
2221 	snapshot->ring_addr = __xe_lrc_ring_ggtt_addr(lrc);
2222 	snapshot->indirect_context_desc = xe_lrc_indirect_ring_ggtt_addr(lrc);
2223 	snapshot->head = xe_lrc_ring_head(lrc);
2224 	snapshot->tail.internal = lrc->ring.tail;
2225 	snapshot->tail.memory = xe_lrc_ring_tail(lrc);
2226 	snapshot->start = xe_lrc_ring_start(lrc);
2227 	snapshot->start_seqno = xe_lrc_start_seqno(lrc);
2228 	snapshot->seqno = xe_lrc_seqno(lrc);
2229 	snapshot->lrc_bo = xe_bo_get(lrc->bo);
2230 	snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
2231 	snapshot->lrc_size = lrc->size;
2232 	snapshot->lrc_snapshot = NULL;
2233 	snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc));
2234 	snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc);
2235 	return snapshot;
2236 }
2237 
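/*
 * Second stage of LRC snapshot capture: copy the context image out of the BO
 * grabbed by xe_lrc_snapshot_capture().  This stage may sleep (kvmalloc,
 * vmap), unlike the initial capture which allocates with GFP_NOWAIT.
 */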
2238 void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot)
2239 {
2240 	struct xe_bo *bo;
2241 	struct iosys_map src;
2242 
2243 	if (!snapshot)
2244 		return;
2245 
2246 	bo = snapshot->lrc_bo;
2247 	snapshot->lrc_bo = NULL;
2248 
2249 	snapshot->lrc_snapshot = kvmalloc(snapshot->lrc_size, GFP_KERNEL);
2250 	if (!snapshot->lrc_snapshot)
2251 		goto put_bo;
2252 
2253 	xe_bo_lock(bo, false);
2254 	if (!ttm_bo_vmap(&bo->ttm, &src)) {
2255 		xe_map_memcpy_from(xe_bo_device(bo),
2256 				   snapshot->lrc_snapshot, &src, snapshot->lrc_offset,
2257 				   snapshot->lrc_size);
2258 		ttm_bo_vunmap(&bo->ttm, &src);
2259 	} else {
2260 		kvfree(snapshot->lrc_snapshot);
2261 		snapshot->lrc_snapshot = NULL;
2262 	}
2263 	xe_bo_unlock(bo);
2264 put_bo:
2265 	xe_bo_put(bo);
2266 }
2267 
2268 void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer *p)
2269 {
2270 	unsigned long i;
2271 
2272 	if (!snapshot)
2273 		return;
2274 
2275 	drm_printf(p, "\tHW Context Desc: 0x%08x\n", snapshot->context_desc);
2276 	drm_printf(p, "\tHW Ring address: 0x%08x\n",
2277 		   snapshot->ring_addr);
2278 	drm_printf(p, "\tHW Indirect Ring State: 0x%08x\n",
2279 		   snapshot->indirect_context_desc);
2280 	drm_printf(p, "\tLRC Head: (memory) %u\n", snapshot->head);
2281 	drm_printf(p, "\tLRC Tail: (internal) %u, (memory) %u\n",
2282 		   snapshot->tail.internal, snapshot->tail.memory);
2283 	drm_printf(p, "\tRing start: (memory) 0x%08x\n", snapshot->start);
2284 	drm_printf(p, "\tStart seqno: (memory) %d\n", snapshot->start_seqno);
2285 	drm_printf(p, "\tSeqno: (memory) %d\n", snapshot->seqno);
2286 	drm_printf(p, "\tTimestamp: 0x%08x\n", snapshot->ctx_timestamp);
2287 	drm_printf(p, "\tJob Timestamp: 0x%08x\n", snapshot->ctx_job_timestamp);
2288 
2289 	if (!snapshot->lrc_snapshot)
2290 		return;
2291 
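	/*
	 * Dump the PPHWSP followed by the rest of the context image,
	 * ascii85-encoded one dword at a time.
	 */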
2292 	drm_printf(p, "\t[HWSP].length: 0x%x\n", LRC_PPHWSP_SIZE);
2293 	drm_puts(p, "\t[HWSP].data: ");
2294 	for (i = 0; i < LRC_PPHWSP_SIZE; i += sizeof(u32)) {
2295 		u32 *val = snapshot->lrc_snapshot + i;
2296 		char dumped[ASCII85_BUFSZ];
2297 
2298 		drm_puts(p, ascii85_encode(*val, dumped));
2299 	}
2300 
2301 	drm_printf(p, "\n\t[HWCTX].length: 0x%lx\n", snapshot->lrc_size - LRC_PPHWSP_SIZE);
2302 	drm_puts(p, "\t[HWCTX].data: ");
2303 	for (; i < snapshot->lrc_size; i += sizeof(u32)) {
2304 		u32 *val = snapshot->lrc_snapshot + i;
2305 		char dumped[ASCII85_BUFSZ];
2306 
2307 		drm_puts(p, ascii85_encode(*val, dumped));
2308 	}
2309 	drm_puts(p, "\n");
2310 }
2311 
2312 void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
2313 {
2314 	if (!snapshot)
2315 		return;
2316 
2317 	kvfree(snapshot->lrc_snapshot);
2318 	if (snapshot->lrc_bo)
2319 		xe_bo_put(snapshot->lrc_bo);
2320 
2321 	kfree(snapshot);
2322 }
2323 
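/*
 * Read RING_CTX_TIMESTAMP for the engine encoded in @engine_id.  Returns 0 on
 * success, -1 if the engine lookup fails or the engine is reserved.
 */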
2324 static int get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id, u64 *reg_ctx_ts)
2325 {
2326 	u16 class = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id);
2327 	u16 instance = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id);
2328 	struct xe_hw_engine *hwe;
2329 	u64 val;
2330 
2331 	hwe = xe_gt_hw_engine(lrc->gt, class, instance, false);
2332 	if (xe_gt_WARN_ONCE(lrc->gt, !hwe || xe_hw_engine_is_reserved(hwe),
2333 			    "Unexpected engine class:instance %d:%d for context utilization\n",
2334 			    class, instance))
2335 		return -1;
2336 
2337 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
2338 		val = xe_mmio_read64_2x32(&hwe->gt->mmio,
2339 					  RING_CTX_TIMESTAMP(hwe->mmio_base));
2340 	else
2341 		val = xe_mmio_read32(&hwe->gt->mmio,
2342 				     RING_CTX_TIMESTAMP(hwe->mmio_base));
2343 
2344 	*reg_ctx_ts = val;
2345 
2346 	return 0;
2347 }
2348 
2349 /**
2350  * xe_lrc_update_timestamp() - Update ctx timestamp
2351  * @lrc: Pointer to the lrc.
2352  * @old_ts: Old timestamp value
2353  *
2354  * Populate @old_ts with the current saved ctx timestamp, read the new ctx
2355  * timestamp and update the saved value. With support for active contexts, the
2356  * calculation may be slightly racy, so follow a read-again logic to ensure the
2357  * context is still active before returning the right timestamp.
2358  *
2359  * Returns: New ctx timestamp value
2360  */
2361 u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts)
2362 {
2363 	u64 lrc_ts, reg_ts;
2364 	u32 engine_id;
2365 
2366 	*old_ts = lrc->ctx_timestamp;
2367 
2368 	lrc_ts = xe_lrc_ctx_timestamp(lrc);
2369 	/* CTX_TIMESTAMP mmio read is invalid on VF, so return the LRC value */
2370 	if (IS_SRIOV_VF(lrc_to_xe(lrc))) {
2371 		lrc->ctx_timestamp = lrc_ts;
2372 		goto done;
2373 	}
2374 
2375 	if (lrc_ts == CONTEXT_ACTIVE) {
2376 		engine_id = xe_lrc_engine_id(lrc);
2377 		if (!get_ctx_timestamp(lrc, engine_id, &reg_ts))
2378 			lrc->ctx_timestamp = reg_ts;
2379 
2380 		/* read lrc again to ensure context is still active */
2381 		lrc_ts = xe_lrc_ctx_timestamp(lrc);
2382 	}
2383 
2384 	/*
2385 	 * If the context has switched out, just use lrc_ts. This must stay a
2386 	 * separate if condition since lrc_ts may have been re-read above.
2387 	 */
2388 	if (lrc_ts != CONTEXT_ACTIVE)
2389 		lrc->ctx_timestamp = lrc_ts;
2390 
2391 done:
2392 	trace_xe_lrc_update_timestamp(lrc, *old_ts);
2393 
2394 	return lrc->ctx_timestamp;
2395 }
2396 
2397 /**
2398  * xe_lrc_ring_is_idle() - LRC is idle
2399  * @lrc: Pointer to the lrc.
2400  *
2401  * Compare LRC ring head and tail to determine if idle.
2402  *
2403  * Return: True if ring is idle, False otherwise
2404  */
2405 bool xe_lrc_ring_is_idle(struct xe_lrc *lrc)
2406 {
2407 	return xe_lrc_ring_head(lrc) == xe_lrc_ring_tail(lrc);
2408 }
2409