xref: /linux/drivers/gpu/drm/xe/xe_lrc.c (revision 1f20a5769446a1acae67ac9e63d07a594829a789)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5 
6 #include "xe_lrc.h"
7 
8 #include "instructions/xe_mi_commands.h"
9 #include "instructions/xe_gfxpipe_commands.h"
10 #include "regs/xe_engine_regs.h"
11 #include "regs/xe_gpu_commands.h"
12 #include "regs/xe_lrc_layout.h"
13 #include "xe_bb.h"
14 #include "xe_bo.h"
15 #include "xe_device.h"
16 #include "xe_drm_client.h"
17 #include "xe_exec_queue_types.h"
18 #include "xe_gt.h"
19 #include "xe_gt_printk.h"
20 #include "xe_hw_fence.h"
21 #include "xe_map.h"
22 #include "xe_memirq.h"
23 #include "xe_sriov.h"
24 #include "xe_vm.h"
25 
/* Flag bits and field encodings that xe_lrc_init() ORs into lrc->desc */
26 #define LRC_VALID				(1 << 0)
27 #define LRC_PRIVILEGE				(1 << 8)
28 #define LRC_ADDRESSING_MODE_SHIFT		3
29 #define LRC_LEGACY_64B_CONTEXT			3
30 
/*
 * Bit positions at which xe_lrc_init() places hwe->class / hwe->instance in
 * lrc->desc (only done when GRAPHICS_VERx100 < 1250).
 */
31 #define ENGINE_CLASS_SHIFT			61
32 #define ENGINE_INSTANCE_SHIFT			48
33 
34 static struct xe_device *
35 lrc_to_xe(struct xe_lrc *lrc)
36 {
37 	return gt_to_xe(lrc->fence_ctx.gt);
38 }
39 
40 size_t xe_lrc_size(struct xe_device *xe, enum xe_engine_class class)
41 {
42 	switch (class) {
43 	case XE_ENGINE_CLASS_RENDER:
44 		if (GRAPHICS_VER(xe) >= 20)
45 			return 4 * SZ_4K;
46 		else
47 			return 14 * SZ_4K;
48 	case XE_ENGINE_CLASS_COMPUTE:
49 		/* 14 pages since graphics_ver == 11 */
50 		if (GRAPHICS_VER(xe) >= 20)
51 			return 3 * SZ_4K;
52 		else
53 			return 14 * SZ_4K;
54 	default:
55 		WARN(1, "Unknown engine class: %d", class);
56 		fallthrough;
57 	case XE_ENGINE_CLASS_COPY:
58 	case XE_ENGINE_CLASS_VIDEO_DECODE:
59 	case XE_ENGINE_CLASS_VIDEO_ENHANCE:
60 	case XE_ENGINE_CLASS_OTHER:
61 		return 2 * SZ_4K;
62 	}
63 }
64 
65 /*
66  * The per-platform tables are u8-encoded in @data. Decode @data and set the
67  * addresses' offset and commands in @regs. The following encoding is used
68  * for each byte. There are 2 steps: decoding commands and decoding addresses.
69  *
70  * Commands:
71  * [7]: create NOPs - number of NOPs are set in lower bits
72  * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set
73  *      MI_LRI_FORCE_POSTED
74  * [5:0]: Number of NOPs or registers to set values to in case of
75  *        MI_LOAD_REGISTER_IMM
76  *
77  * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
78  * number of registers. They are set by using the REG/REG16 macros: the former
79  * is used for offsets smaller than 0x200 while the latter is for values bigger
80  * than that. Those macros already set all the bits documented below correctly:
81  *
82  * [7]: When a register offset needs more than 6 bits, use additional bytes, to
83  *      follow, for the lower bits
84  * [6:0]: Register offset, without considering the engine base.
85  *
86  * This function only tweaks the commands and register offsets. Values are not
87  * filled out.
88  */
89 static void set_offsets(u32 *regs,
90 			const u8 *data,
91 			const struct xe_hw_engine *hwe)
92 #define NOP(x) (BIT(7) | (x))
93 #define LRI(count, flags) ((flags) << 6 | (count) | \
94 			   BUILD_BUG_ON_ZERO(count >= BIT(6)))
95 #define POSTED BIT(0)
96 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
97 #define REG16(x) \
98 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
99 	(((x) >> 2) & 0x7f)
100 {
101 	const u32 base = hwe->mmio_base;
102 
103 	while (*data) {
104 		u8 count, flags;
105 
106 		if (*data & BIT(7)) { /* skip */
107 			count = *data++ & ~BIT(7);
108 			regs += count;
109 			continue;
110 		}
111 
112 		count = *data & 0x3f;
113 		flags = *data >> 6;
114 		data++;
115 
116 		*regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
117 		if (flags & POSTED)
118 			*regs |= MI_LRI_FORCE_POSTED;
119 		*regs |= MI_LRI_LRM_CS_MMIO;
120 		regs++;
121 
122 		xe_gt_assert(hwe->gt, count);
123 		do {
124 			u32 offset = 0;
125 			u8 v;
126 
127 			do {
128 				v = *data++;
129 				offset <<= 7;
130 				offset |= v & ~BIT(7);
131 			} while (v & BIT(7));
132 
133 			regs[0] = base + (offset << 2);
134 			regs += 2;
135 		} while (--count);
136 	}
137 
138 	*regs = MI_BATCH_BUFFER_END | BIT(0);
139 }
140 
141 static const u8 gen12_xcs_offsets[] = {
142 	NOP(1),
143 	LRI(13, POSTED),
144 	REG16(0x244),
145 	REG(0x034),
146 	REG(0x030),
147 	REG(0x038),
148 	REG(0x03c),
149 	REG(0x168),
150 	REG(0x140),
151 	REG(0x110),
152 	REG(0x1c0),
153 	REG(0x1c4),
154 	REG(0x1c8),
155 	REG(0x180),
156 	REG16(0x2b4),
157 
158 	NOP(5),
159 	LRI(9, POSTED),
160 	REG16(0x3a8),
161 	REG16(0x28c),
162 	REG16(0x288),
163 	REG16(0x284),
164 	REG16(0x280),
165 	REG16(0x27c),
166 	REG16(0x278),
167 	REG16(0x274),
168 	REG16(0x270),
169 
170 	0
171 };
172 
173 static const u8 dg2_xcs_offsets[] = {
174 	NOP(1),
175 	LRI(15, POSTED),
176 	REG16(0x244),
177 	REG(0x034),
178 	REG(0x030),
179 	REG(0x038),
180 	REG(0x03c),
181 	REG(0x168),
182 	REG(0x140),
183 	REG(0x110),
184 	REG(0x1c0),
185 	REG(0x1c4),
186 	REG(0x1c8),
187 	REG(0x180),
188 	REG16(0x2b4),
189 	REG(0x120),
190 	REG(0x124),
191 
192 	NOP(1),
193 	LRI(9, POSTED),
194 	REG16(0x3a8),
195 	REG16(0x28c),
196 	REG16(0x288),
197 	REG16(0x284),
198 	REG16(0x280),
199 	REG16(0x27c),
200 	REG16(0x278),
201 	REG16(0x274),
202 	REG16(0x270),
203 
204 	0
205 };
206 
207 static const u8 gen12_rcs_offsets[] = {
208 	NOP(1),
209 	LRI(13, POSTED),
210 	REG16(0x244),
211 	REG(0x034),
212 	REG(0x030),
213 	REG(0x038),
214 	REG(0x03c),
215 	REG(0x168),
216 	REG(0x140),
217 	REG(0x110),
218 	REG(0x1c0),
219 	REG(0x1c4),
220 	REG(0x1c8),
221 	REG(0x180),
222 	REG16(0x2b4),
223 
224 	NOP(5),
225 	LRI(9, POSTED),
226 	REG16(0x3a8),
227 	REG16(0x28c),
228 	REG16(0x288),
229 	REG16(0x284),
230 	REG16(0x280),
231 	REG16(0x27c),
232 	REG16(0x278),
233 	REG16(0x274),
234 	REG16(0x270),
235 
236 	LRI(3, POSTED),
237 	REG(0x1b0),
238 	REG16(0x5a8),
239 	REG16(0x5ac),
240 
241 	NOP(6),
242 	LRI(1, 0),
243 	REG(0x0c8),
244 	NOP(3 + 9 + 1),
245 
246 	LRI(51, POSTED),
247 	REG16(0x588),
248 	REG16(0x588),
249 	REG16(0x588),
250 	REG16(0x588),
251 	REG16(0x588),
252 	REG16(0x588),
253 	REG(0x028),
254 	REG(0x09c),
255 	REG(0x0c0),
256 	REG(0x178),
257 	REG(0x17c),
258 	REG16(0x358),
259 	REG(0x170),
260 	REG(0x150),
261 	REG(0x154),
262 	REG(0x158),
263 	REG16(0x41c),
264 	REG16(0x600),
265 	REG16(0x604),
266 	REG16(0x608),
267 	REG16(0x60c),
268 	REG16(0x610),
269 	REG16(0x614),
270 	REG16(0x618),
271 	REG16(0x61c),
272 	REG16(0x620),
273 	REG16(0x624),
274 	REG16(0x628),
275 	REG16(0x62c),
276 	REG16(0x630),
277 	REG16(0x634),
278 	REG16(0x638),
279 	REG16(0x63c),
280 	REG16(0x640),
281 	REG16(0x644),
282 	REG16(0x648),
283 	REG16(0x64c),
284 	REG16(0x650),
285 	REG16(0x654),
286 	REG16(0x658),
287 	REG16(0x65c),
288 	REG16(0x660),
289 	REG16(0x664),
290 	REG16(0x668),
291 	REG16(0x66c),
292 	REG16(0x670),
293 	REG16(0x674),
294 	REG16(0x678),
295 	REG16(0x67c),
296 	REG(0x068),
297 	REG(0x084),
298 	NOP(1),
299 
300 	0
301 };
302 
303 static const u8 xehp_rcs_offsets[] = {
304 	NOP(1),
305 	LRI(13, POSTED),
306 	REG16(0x244),
307 	REG(0x034),
308 	REG(0x030),
309 	REG(0x038),
310 	REG(0x03c),
311 	REG(0x168),
312 	REG(0x140),
313 	REG(0x110),
314 	REG(0x1c0),
315 	REG(0x1c4),
316 	REG(0x1c8),
317 	REG(0x180),
318 	REG16(0x2b4),
319 
320 	NOP(5),
321 	LRI(9, POSTED),
322 	REG16(0x3a8),
323 	REG16(0x28c),
324 	REG16(0x288),
325 	REG16(0x284),
326 	REG16(0x280),
327 	REG16(0x27c),
328 	REG16(0x278),
329 	REG16(0x274),
330 	REG16(0x270),
331 
332 	LRI(3, POSTED),
333 	REG(0x1b0),
334 	REG16(0x5a8),
335 	REG16(0x5ac),
336 
337 	NOP(6),
338 	LRI(1, 0),
339 	REG(0x0c8),
340 
341 	0
342 };
343 
344 static const u8 dg2_rcs_offsets[] = {
345 	NOP(1),
346 	LRI(15, POSTED),
347 	REG16(0x244),
348 	REG(0x034),
349 	REG(0x030),
350 	REG(0x038),
351 	REG(0x03c),
352 	REG(0x168),
353 	REG(0x140),
354 	REG(0x110),
355 	REG(0x1c0),
356 	REG(0x1c4),
357 	REG(0x1c8),
358 	REG(0x180),
359 	REG16(0x2b4),
360 	REG(0x120),
361 	REG(0x124),
362 
363 	NOP(1),
364 	LRI(9, POSTED),
365 	REG16(0x3a8),
366 	REG16(0x28c),
367 	REG16(0x288),
368 	REG16(0x284),
369 	REG16(0x280),
370 	REG16(0x27c),
371 	REG16(0x278),
372 	REG16(0x274),
373 	REG16(0x270),
374 
375 	LRI(3, POSTED),
376 	REG(0x1b0),
377 	REG16(0x5a8),
378 	REG16(0x5ac),
379 
380 	NOP(6),
381 	LRI(1, 0),
382 	REG(0x0c8),
383 
384 	0
385 };
386 
387 static const u8 mtl_rcs_offsets[] = {
388 	NOP(1),
389 	LRI(15, POSTED),
390 	REG16(0x244),
391 	REG(0x034),
392 	REG(0x030),
393 	REG(0x038),
394 	REG(0x03c),
395 	REG(0x168),
396 	REG(0x140),
397 	REG(0x110),
398 	REG(0x1c0),
399 	REG(0x1c4),
400 	REG(0x1c8),
401 	REG(0x180),
402 	REG16(0x2b4),
403 	REG(0x120),
404 	REG(0x124),
405 
406 	NOP(1),
407 	LRI(9, POSTED),
408 	REG16(0x3a8),
409 	REG16(0x28c),
410 	REG16(0x288),
411 	REG16(0x284),
412 	REG16(0x280),
413 	REG16(0x27c),
414 	REG16(0x278),
415 	REG16(0x274),
416 	REG16(0x270),
417 
418 	NOP(2),
419 	LRI(2, POSTED),
420 	REG16(0x5a8),
421 	REG16(0x5ac),
422 
423 	NOP(6),
424 	LRI(1, 0),
425 	REG(0x0c8),
426 
427 	0
428 };
429 
/*
 * Leading entries shared by the xe2_{rcs,bcs,xcs}_offsets tables, in the
 * set_offsets() encoding.  The bracketed number in each comment is the dword
 * index in the context register state that the entry lands on (a NOP(n)
 * advances by n dwords, an LRI header takes one dword and every register two).
 * The expansion ends at dword index 0x34 and has no trailing comma.
 */
430 #define XE2_CTX_COMMON \
431 	NOP(1),                 /* [0x00] */ \
432 	LRI(15, POSTED),        /* [0x01] */ \
433 	REG16(0x244),           /* [0x02] CTXT_SR_CTL */ \
434 	REG(0x034),             /* [0x04] RING_BUFFER_HEAD */ \
435 	REG(0x030),             /* [0x06] RING_BUFFER_TAIL */ \
436 	REG(0x038),             /* [0x08] RING_BUFFER_START */ \
437 	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */ \
438 	REG(0x168),             /* [0x0c] BB_ADDR_UDW */ \
439 	REG(0x140),             /* [0x0e] BB_ADDR */ \
440 	REG(0x110),             /* [0x10] BB_STATE */ \
441 	REG(0x1c0),             /* [0x12] BB_PER_CTX_PTR */ \
442 	REG(0x1c4),             /* [0x14] RCS_INDIRECT_CTX */ \
443 	REG(0x1c8),             /* [0x16] RCS_INDIRECT_CTX_OFFSET */ \
444 	REG(0x180),             /* [0x18] CCID */ \
445 	REG16(0x2b4),           /* [0x1a] SEMAPHORE_TOKEN */ \
446 	REG(0x120),             /* [0x1c] PRT_BB_STATE */ \
447 	REG(0x124),             /* [0x1e] PRT_BB_STATE_UDW */ \
448 	\
449 	NOP(1),                 /* [0x20] */ \
450 	LRI(9, POSTED),         /* [0x21] */ \
451 	REG16(0x3a8),           /* [0x22] CTX_TIMESTAMP */ \
452 	REG16(0x3ac),           /* [0x24] CTX_TIMESTAMP_UDW */ \
453 	REG(0x108),             /* [0x26] INDIRECT_RING_STATE */ \
454 	REG16(0x284),           /* [0x28] dummy reg */ \
455 	REG16(0x280),           /* [0x2a] CS_ACC_CTR_THOLD */ \
456 	REG16(0x27c),           /* [0x2c] CS_CTX_SYS_PASID */ \
457 	REG16(0x278),           /* [0x2e] CS_CTX_ASID */ \
458 	REG16(0x274),           /* [0x30] PTBP_UDW */ \
459 	REG16(0x270)            /* [0x32] PTBP_LDW */
460 
/*
 * Render engine layout for graphics version 20 and later (see reg_offsets()),
 * in the set_offsets() encoding.  Bracketed numbers are dword indices in the
 * context register state.
 */
461 static const u8 xe2_rcs_offsets[] = {
462 	XE2_CTX_COMMON,
463 
464 	NOP(2),                 /* [0x34] */
465 	LRI(2, POSTED),         /* [0x36] */
466 	REG16(0x5a8),           /* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
467 	REG16(0x5ac),           /* [0x39] PREEMPTION_STATUS */
468 
469 	NOP(6),                 /* [0x3b] */
470 	LRI(1, 0),              /* [0x41] */
471 	REG(0x0c8),             /* [0x42] R_PWR_CLK_STATE */
472 
473 	0
474 };
475 
/*
 * Copy engine layout for graphics version 20 and later (see reg_offsets()),
 * in the set_offsets() encoding.  Bracketed numbers are dword indices in the
 * context register state.
 */
476 static const u8 xe2_bcs_offsets[] = {
477 	XE2_CTX_COMMON,
478 
479 	NOP(4 + 8 + 1),         /* [0x34] */
480 	LRI(2, POSTED),         /* [0x41] */
481 	REG16(0x200),           /* [0x42] BCS_SWCTRL */
482 	REG16(0x204),           /* [0x44] BLIT_CCTL */
483 
484 	0
485 };
486 
/*
 * Layout for engine classes other than render and copy on graphics version 20
 * and later (see reg_offsets()): only the common Xe2 entries are used.
 */
487 static const u8 xe2_xcs_offsets[] = {
488 	XE2_CTX_COMMON,
489 
490 	0
491 };
492 
493 #undef REG16
494 #undef REG
495 #undef LRI
496 #undef NOP
497 
498 static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
499 {
500 	if (class == XE_ENGINE_CLASS_RENDER) {
501 		if (GRAPHICS_VER(xe) >= 20)
502 			return xe2_rcs_offsets;
503 		else if (GRAPHICS_VERx100(xe) >= 1270)
504 			return mtl_rcs_offsets;
505 		else if (GRAPHICS_VERx100(xe) >= 1255)
506 			return dg2_rcs_offsets;
507 		else if (GRAPHICS_VERx100(xe) >= 1250)
508 			return xehp_rcs_offsets;
509 		else
510 			return gen12_rcs_offsets;
511 	} else if (class == XE_ENGINE_CLASS_COPY) {
512 		if (GRAPHICS_VER(xe) >= 20)
513 			return xe2_bcs_offsets;
514 		else
515 			return gen12_xcs_offsets;
516 	} else {
517 		if (GRAPHICS_VER(xe) >= 20)
518 			return xe2_xcs_offsets;
519 		else if (GRAPHICS_VERx100(xe) >= 1255)
520 			return dg2_xcs_offsets;
521 		else
522 			return gen12_xcs_offsets;
523 	}
524 }
525 
526 static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
527 {
528 	regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
529 						       CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
530 
531 	/* TODO: Timestamp */
532 }
533 
534 static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe)
535 {
536 	struct xe_memirq *memirq = &gt_to_tile(hwe->gt)->sriov.vf.memirq;
537 	struct xe_device *xe = gt_to_xe(hwe->gt);
538 
539 	if (!IS_SRIOV_VF(xe) || !xe_device_has_memirq(xe))
540 		return;
541 
542 	regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM |
543 					MI_LRI_LRM_CS_MMIO | MI_LRM_USE_GGTT;
544 	regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr;
545 	regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq);
546 
547 	regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(2) |
548 				       MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED;
549 	regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr;
550 	regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq);
551 	regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr;
552 	regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq);
553 }
554 
555 static int lrc_ring_mi_mode(struct xe_hw_engine *hwe)
556 {
557 	struct xe_device *xe = gt_to_xe(hwe->gt);
558 
559 	if (GRAPHICS_VERx100(xe) >= 1250)
560 		return 0x70;
561 	else
562 		return 0x60;
563 }
564 
565 static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe)
566 {
567 	int x;
568 
569 	x = lrc_ring_mi_mode(hwe);
570 	regs[x + 1] &= ~STOP_RING;
571 	regs[x + 1] |= STOP_RING << 16;
572 }
573 
/* Byte offset of the ring inside the LRC BO: the ring sits at the very start. */
574 static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
575 {
576 	return 0;
577 }
578 
/*
 * Byte offset of the per-process HW status page (PPHWSP) inside the LRC BO.
 * It equals the ring size, i.e. the PPHWSP directly follows the ring.
 */
579 u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
580 {
581 	return lrc->ring.size;
582 }
583 
584 /* DECL_MAP_ADDR_HELPERS() pastes __xe_lrc_<elem>_offset; alias the public helper so "pphwsp" resolves */
585 #define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset
586 
/* Byte offsets, relative to the start of the PPHWSP, of the driver-owned fields */
587 #define LRC_SEQNO_PPHWSP_OFFSET 512
588 #define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
589 #define LRC_PARALLEL_PPHWSP_OFFSET 2048
/* Size of the PPHWSP; the context register state starts right after it */
590 #define LRC_PPHWSP_SIZE SZ_4K
591 
592 static size_t lrc_reg_size(struct xe_device *xe)
593 {
594 	if (GRAPHICS_VERx100(xe) >= 1250)
595 		return 96 * sizeof(u32);
596 	else
597 		return 80 * sizeof(u32);
598 }
599 
600 size_t xe_lrc_skip_size(struct xe_device *xe)
601 {
602 	return LRC_PPHWSP_SIZE + lrc_reg_size(xe);
603 }
604 
605 static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
606 {
607 	/* The seqno is stored in the driver-defined portion of PPHWSP */
608 	return xe_lrc_pphwsp_offset(lrc) + LRC_SEQNO_PPHWSP_OFFSET;
609 }
610 
611 static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
612 {
613 	/* The start seqno is stored in the driver-defined portion of PPHWSP */
614 	return xe_lrc_pphwsp_offset(lrc) + LRC_START_SEQNO_PPHWSP_OFFSET;
615 }
616 
617 static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
618 {
619 	/* The parallel is stored in the driver-defined portion of PPHWSP */
620 	return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
621 }
622 
623 static inline u32 __xe_lrc_regs_offset(struct xe_lrc *lrc)
624 {
625 	return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
626 }
627 
628 #define DECL_MAP_ADDR_HELPERS(elem) \
629 static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \
630 { \
631 	struct iosys_map map = lrc->bo->vmap; \
632 \
633 	xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map));  \
634 	iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \
635 	return map; \
636 } \
637 static inline u32 __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \
638 { \
639 	return xe_bo_ggtt_addr(lrc->bo) + __xe_lrc_##elem##_offset(lrc); \
640 } \
641 
642 DECL_MAP_ADDR_HELPERS(ring)
643 DECL_MAP_ADDR_HELPERS(pphwsp)
644 DECL_MAP_ADDR_HELPERS(seqno)
645 DECL_MAP_ADDR_HELPERS(regs)
646 DECL_MAP_ADDR_HELPERS(start_seqno)
647 DECL_MAP_ADDR_HELPERS(parallel)
648 
649 #undef DECL_MAP_ADDR_HELPERS
650 
/*
 * GGTT address of the LRC.  This is the address of the PPHWSP, not of the
 * start of the backing BO (the ring comes first in the BO).
 */
651 u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
652 {
653 	return __xe_lrc_pphwsp_ggtt_addr(lrc);
654 }
655 
656 u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
657 {
658 	struct xe_device *xe = lrc_to_xe(lrc);
659 	struct iosys_map map;
660 
661 	map = __xe_lrc_regs_map(lrc);
662 	iosys_map_incr(&map, reg_nr * sizeof(u32));
663 	return xe_map_read32(xe, &map);
664 }
665 
666 void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
667 {
668 	struct xe_device *xe = lrc_to_xe(lrc);
669 	struct iosys_map map;
670 
671 	map = __xe_lrc_regs_map(lrc);
672 	iosys_map_incr(&map, reg_nr * sizeof(u32));
673 	xe_map_write32(xe, &map, val);
674 }
675 
676 static void *empty_lrc_data(struct xe_hw_engine *hwe)
677 {
678 	struct xe_device *xe = gt_to_xe(hwe->gt);
679 	void *data;
680 	u32 *regs;
681 
682 	data = kzalloc(xe_lrc_size(xe, hwe->class), GFP_KERNEL);
683 	if (!data)
684 		return NULL;
685 
686 	/* 1st page: Per-Process of HW status Page */
687 	regs = data + LRC_PPHWSP_SIZE;
688 	set_offsets(regs, reg_offsets(xe, hwe->class), hwe);
689 	set_context_control(regs, hwe);
690 	set_memory_based_intr(regs, hwe);
691 	reset_stop_ring(regs, hwe);
692 
693 	return data;
694 }
695 
696 static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
697 {
698 	u64 desc = xe_vm_pdp4_descriptor(vm, lrc->tile);
699 
700 	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
701 	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
702 }
703 
/*
 * Context register dword indices as passed to xe_lrc_write_ctx_reg().
 * NOTE(review): the "+ 1" presumably selects the value slot that follows the
 * register-address slot of the LRI entry - confirm against the layout tables.
 */
704 #define PVC_CTX_ASID		(0x2e + 1)
705 #define PVC_CTX_ACC_CTR_THOLD	(0x2a + 1)
706 
707 int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
708 		struct xe_exec_queue *q, struct xe_vm *vm, u32 ring_size)
709 {
710 	struct xe_gt *gt = hwe->gt;
711 	struct xe_tile *tile = gt_to_tile(gt);
712 	struct xe_device *xe = gt_to_xe(gt);
713 	struct iosys_map map;
714 	void *init_data = NULL;
715 	u32 arb_enable;
716 	int err;
717 
718 	lrc->flags = 0;
719 
720 	/*
721 	 * FIXME: Perma-pinning LRC as we don't yet support moving GGTT address
722 	 * via VM bind calls.
723 	 */
724 	lrc->bo = xe_bo_create_pin_map(xe, tile, vm,
725 				      ring_size + xe_lrc_size(xe, hwe->class),
726 				      ttm_bo_type_kernel,
727 				      XE_BO_CREATE_VRAM_IF_DGFX(tile) |
728 				      XE_BO_CREATE_GGTT_BIT);
729 	if (IS_ERR(lrc->bo))
730 		return PTR_ERR(lrc->bo);
731 
732 	lrc->tile = gt_to_tile(hwe->gt);
733 	lrc->ring.size = ring_size;
734 	lrc->ring.tail = 0;
735 
736 	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
737 			     hwe->fence_irq, hwe->name);
738 
739 	if (!gt->default_lrc[hwe->class]) {
740 		init_data = empty_lrc_data(hwe);
741 		if (!init_data) {
742 			err = -ENOMEM;
743 			goto err_lrc_finish;
744 		}
745 	}
746 
747 	/*
748 	 * Init Per-Process of HW status Page, LRC / context state to known
749 	 * values
750 	 */
751 	map = __xe_lrc_pphwsp_map(lrc);
752 	if (!init_data) {
753 		xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE);	/* PPHWSP */
754 		xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
755 				 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
756 				 xe_lrc_size(xe, hwe->class) - LRC_PPHWSP_SIZE);
757 	} else {
758 		xe_map_memcpy_to(xe, &map, 0, init_data,
759 				 xe_lrc_size(xe, hwe->class));
760 		kfree(init_data);
761 	}
762 
763 	if (vm) {
764 		xe_lrc_set_ppgtt(lrc, vm);
765 
766 		if (vm->xef)
767 			xe_drm_client_add_bo(vm->xef->client, lrc->bo);
768 	}
769 
770 	xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
771 	xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, 0);
772 	xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);
773 	xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
774 			     RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
775 	if (xe->info.has_asid && vm)
776 		xe_lrc_write_ctx_reg(lrc, PVC_CTX_ASID, vm->usm.asid);
777 
778 	lrc->desc = LRC_VALID;
779 	lrc->desc |= LRC_LEGACY_64B_CONTEXT << LRC_ADDRESSING_MODE_SHIFT;
780 	/* TODO: Priority */
781 
782 	/* While this appears to have something about privileged batches or
783 	 * some such, it really just means PPGTT mode.
784 	 */
785 	if (vm)
786 		lrc->desc |= LRC_PRIVILEGE;
787 
788 	if (GRAPHICS_VERx100(xe) < 1250) {
789 		lrc->desc |= (u64)hwe->instance << ENGINE_INSTANCE_SHIFT;
790 		lrc->desc |= (u64)hwe->class << ENGINE_CLASS_SHIFT;
791 	}
792 
793 	arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
794 	xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));
795 
796 	map = __xe_lrc_seqno_map(lrc);
797 	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
798 
799 	map = __xe_lrc_start_seqno_map(lrc);
800 	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
801 
802 	return 0;
803 
804 err_lrc_finish:
805 	xe_lrc_finish(lrc);
806 	return err;
807 }
808 
809 void xe_lrc_finish(struct xe_lrc *lrc)
810 {
811 	xe_hw_fence_ctx_finish(&lrc->fence_ctx);
812 	xe_bo_lock(lrc->bo, false);
813 	xe_bo_unpin(lrc->bo);
814 	xe_bo_unlock(lrc->bo);
815 	xe_bo_put(lrc->bo);
816 }
817 
/* Store @head into the CTX_RING_HEAD context register of @lrc. */
818 void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
819 {
820 	xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
821 }
822 
823 u32 xe_lrc_ring_head(struct xe_lrc *lrc)
824 {
825 	return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
826 }
827 
828 u32 xe_lrc_ring_space(struct xe_lrc *lrc)
829 {
830 	const u32 head = xe_lrc_ring_head(lrc);
831 	const u32 tail = lrc->ring.tail;
832 	const u32 size = lrc->ring.size;
833 
834 	return ((head - tail - 1) & (size - 1)) + 1;
835 }
836 
837 static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
838 				const void *data, size_t size)
839 {
840 	struct xe_device *xe = lrc_to_xe(lrc);
841 
842 	iosys_map_incr(&ring, lrc->ring.tail);
843 	xe_map_memcpy_to(xe, &ring, 0, data, size);
844 	lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
845 }
846 
847 void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
848 {
849 	struct xe_device *xe = lrc_to_xe(lrc);
850 	struct iosys_map ring;
851 	u32 rhs;
852 	size_t aligned_size;
853 
854 	xe_assert(xe, IS_ALIGNED(size, 4));
855 	aligned_size = ALIGN(size, 8);
856 
857 	ring = __xe_lrc_ring_map(lrc);
858 
859 	xe_assert(xe, lrc->ring.tail < lrc->ring.size);
860 	rhs = lrc->ring.size - lrc->ring.tail;
861 	if (size > rhs) {
862 		__xe_lrc_write_ring(lrc, ring, data, rhs);
863 		__xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
864 	} else {
865 		__xe_lrc_write_ring(lrc, ring, data, size);
866 	}
867 
868 	if (aligned_size > size) {
869 		u32 noop = MI_NOOP;
870 
871 		__xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
872 	}
873 }
874 
875 u64 xe_lrc_descriptor(struct xe_lrc *lrc)
876 {
877 	return lrc->desc | xe_lrc_ggtt_addr(lrc);
878 }
879 
/* GGTT address of the seqno slot of @lrc. */
880 u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
881 {
882 	return __xe_lrc_seqno_ggtt_addr(lrc);
883 }
884 
/*
 * Create a HW fence in the fence context of @lrc that is backed by the LRC's
 * seqno slot, and return its embedded dma_fence.
 *
 * NOTE(review): the result of xe_hw_fence_create() is used without a check;
 * how a failure is propagated depends on that helper's return convention and
 * on where ->dma sits in the fence structure - confirm against its definition.
 */
885 struct dma_fence *xe_lrc_create_seqno_fence(struct xe_lrc *lrc)
886 {
887 	return &xe_hw_fence_create(&lrc->fence_ctx,
888 				   __xe_lrc_seqno_map(lrc))->dma;
889 }
890 
891 s32 xe_lrc_seqno(struct xe_lrc *lrc)
892 {
893 	struct iosys_map map = __xe_lrc_seqno_map(lrc);
894 
895 	return xe_map_read32(lrc_to_xe(lrc), &map);
896 }
897 
898 s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
899 {
900 	struct iosys_map map = __xe_lrc_start_seqno_map(lrc);
901 
902 	return xe_map_read32(lrc_to_xe(lrc), &map);
903 }
904 
/* GGTT address of the start seqno slot of @lrc. */
905 u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
906 {
907 	return __xe_lrc_start_seqno_ggtt_addr(lrc);
908 }
909 
/* GGTT address of the parallel area of @lrc. */
910 u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
911 {
912 	return __xe_lrc_parallel_ggtt_addr(lrc);
913 }
914 
/* CPU mapping of the parallel area of @lrc, returned by value. */
915 struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
916 {
917 	return __xe_lrc_parallel_map(lrc);
918 }
919 
920 static int instr_dw(u32 cmd_header)
921 {
922 	/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
923 	if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
924 	    GFXPIPE_SINGLE_DW_CMD(0, 0))
925 		return 1;
926 
927 	/* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
928 	if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
929 		return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;
930 
931 	/* Most instructions have the # of dwords (minus 2) in 7:0 */
932 	return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
933 }
934 
935 static int dump_mi_command(struct drm_printer *p,
936 			   struct xe_gt *gt,
937 			   u32 *dw,
938 			   int remaining_dw)
939 {
940 	u32 inst_header = *dw;
941 	u32 numdw = instr_dw(inst_header);
942 	u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
943 	int num_noop;
944 
945 	/* First check for commands that don't have/use a '# DW' field */
946 	switch (inst_header & MI_OPCODE) {
947 	case MI_NOOP:
948 		num_noop = 1;
949 		while (num_noop < remaining_dw &&
950 		       (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
951 			num_noop++;
952 		drm_printf(p, "[%#010x] MI_NOOP (%d dwords)\n", inst_header, num_noop);
953 		return num_noop;
954 
955 	case MI_TOPOLOGY_FILTER:
956 		drm_printf(p, "[%#010x] MI_TOPOLOGY_FILTER\n", inst_header);
957 		return 1;
958 
959 	case MI_BATCH_BUFFER_END:
960 		drm_printf(p, "[%#010x] MI_BATCH_BUFFER_END\n", inst_header);
961 		/* Return 'remaining_dw' to consume the rest of the LRC */
962 		return remaining_dw;
963 	}
964 
965 	/*
966 	 * Any remaining commands include a # of dwords.  We should make sure
967 	 * it doesn't exceed the remaining size of the LRC.
968 	 */
969 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
970 		numdw = remaining_dw;
971 
972 	switch (inst_header & MI_OPCODE) {
973 	case MI_LOAD_REGISTER_IMM:
974 		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
975 			   inst_header, (numdw - 1) / 2);
976 		for (int i = 1; i < numdw; i += 2)
977 			drm_printf(p, " - %#6x = %#010x\n", dw[i], dw[i + 1]);
978 		return numdw;
979 
980 	case MI_LOAD_REGISTER_MEM & MI_OPCODE:
981 		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_MEM: %s%s\n",
982 			   inst_header,
983 			   dw[0] & MI_LRI_LRM_CS_MMIO ? "CS_MMIO " : "",
984 			   dw[0] & MI_LRM_USE_GGTT ? "USE_GGTT " : "");
985 		if (numdw == 4)
986 			drm_printf(p, " - %#6x = %#010llx\n",
987 				   dw[1], ((u64)(dw[3]) << 32 | (u64)(dw[2])));
988 		else
989 			drm_printf(p, " - %*ph (%s)\n",
990 				   (int)sizeof(u32) * (numdw - 1), dw + 1,
991 				   numdw < 4 ? "truncated" : "malformed");
992 		return numdw;
993 
994 	case MI_FORCE_WAKEUP:
995 		drm_printf(p, "[%#010x] MI_FORCE_WAKEUP\n", inst_header);
996 		return numdw;
997 
998 	default:
999 		drm_printf(p, "[%#010x] unknown MI opcode %#x, likely %d dwords\n",
1000 			   inst_header, opcode, numdw);
1001 		return numdw;
1002 	}
1003 }
1004 
1005 static int dump_gfxpipe_command(struct drm_printer *p,
1006 				struct xe_gt *gt,
1007 				u32 *dw,
1008 				int remaining_dw)
1009 {
1010 	u32 numdw = instr_dw(*dw);
1011 	u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
1012 	u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
1013 	u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);
1014 
1015 	/*
1016 	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
1017 	 * remaining size of the LRC.
1018 	 */
1019 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
1020 		numdw = remaining_dw;
1021 
1022 	switch (*dw & GFXPIPE_MATCH_MASK) {
1023 #define MATCH(cmd) \
1024 	case cmd: \
1025 		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
1026 		return numdw
1027 #define MATCH3D(cmd) \
1028 	case CMD_##cmd: \
1029 		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
1030 		return numdw
1031 
1032 	MATCH(STATE_BASE_ADDRESS);
1033 	MATCH(STATE_SIP);
1034 	MATCH(GPGPU_CSR_BASE_ADDRESS);
1035 	MATCH(STATE_COMPUTE_MODE);
1036 	MATCH3D(3DSTATE_BTD);
1037 
1038 	MATCH3D(3DSTATE_VF_STATISTICS);
1039 
1040 	MATCH(PIPELINE_SELECT);
1041 
1042 	MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST);
1043 	MATCH3D(3DSTATE_CLEAR_PARAMS);
1044 	MATCH3D(3DSTATE_DEPTH_BUFFER);
1045 	MATCH3D(3DSTATE_STENCIL_BUFFER);
1046 	MATCH3D(3DSTATE_HIER_DEPTH_BUFFER);
1047 	MATCH3D(3DSTATE_VERTEX_BUFFERS);
1048 	MATCH3D(3DSTATE_VERTEX_ELEMENTS);
1049 	MATCH3D(3DSTATE_INDEX_BUFFER);
1050 	MATCH3D(3DSTATE_VF);
1051 	MATCH3D(3DSTATE_MULTISAMPLE);
1052 	MATCH3D(3DSTATE_CC_STATE_POINTERS);
1053 	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS);
1054 	MATCH3D(3DSTATE_VS);
1055 	MATCH3D(3DSTATE_GS);
1056 	MATCH3D(3DSTATE_CLIP);
1057 	MATCH3D(3DSTATE_SF);
1058 	MATCH3D(3DSTATE_WM);
1059 	MATCH3D(3DSTATE_CONSTANT_VS);
1060 	MATCH3D(3DSTATE_CONSTANT_GS);
1061 	MATCH3D(3DSTATE_SAMPLE_MASK);
1062 	MATCH3D(3DSTATE_CONSTANT_HS);
1063 	MATCH3D(3DSTATE_CONSTANT_DS);
1064 	MATCH3D(3DSTATE_HS);
1065 	MATCH3D(3DSTATE_TE);
1066 	MATCH3D(3DSTATE_DS);
1067 	MATCH3D(3DSTATE_STREAMOUT);
1068 	MATCH3D(3DSTATE_SBE);
1069 	MATCH3D(3DSTATE_PS);
1070 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
1071 	MATCH3D(3DSTATE_CPS_POINTERS);
1072 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC);
1073 	MATCH3D(3DSTATE_BLEND_STATE_POINTERS);
1074 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS);
1075 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS);
1076 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS);
1077 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS);
1078 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS);
1079 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS);
1080 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS);
1081 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS);
1082 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS);
1083 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS);
1084 	MATCH3D(3DSTATE_VF_INSTANCING);
1085 	MATCH3D(3DSTATE_VF_SGVS);
1086 	MATCH3D(3DSTATE_VF_TOPOLOGY);
1087 	MATCH3D(3DSTATE_WM_CHROMAKEY);
1088 	MATCH3D(3DSTATE_PS_BLEND);
1089 	MATCH3D(3DSTATE_WM_DEPTH_STENCIL);
1090 	MATCH3D(3DSTATE_PS_EXTRA);
1091 	MATCH3D(3DSTATE_RASTER);
1092 	MATCH3D(3DSTATE_SBE_SWIZ);
1093 	MATCH3D(3DSTATE_WM_HZ_OP);
1094 	MATCH3D(3DSTATE_VF_COMPONENT_PACKING);
1095 	MATCH3D(3DSTATE_VF_SGVS_2);
1096 	MATCH3D(3DSTATE_VFG);
1097 	MATCH3D(3DSTATE_URB_ALLOC_VS);
1098 	MATCH3D(3DSTATE_URB_ALLOC_HS);
1099 	MATCH3D(3DSTATE_URB_ALLOC_DS);
1100 	MATCH3D(3DSTATE_URB_ALLOC_GS);
1101 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_0);
1102 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_1);
1103 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_2);
1104 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_3);
1105 	MATCH3D(3DSTATE_PRIMITIVE_REPLICATION);
1106 	MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO);
1107 	MATCH3D(3DSTATE_AMFS);
1108 	MATCH3D(3DSTATE_DEPTH_BOUNDS);
1109 	MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS);
1110 	MATCH3D(3DSTATE_CONSTANT_TS_POINTER);
1111 	MATCH3D(3DSTATE_MESH_CONTROL);
1112 	MATCH3D(3DSTATE_MESH_DISTRIB);
1113 	MATCH3D(3DSTATE_TASK_REDISTRIB);
1114 	MATCH3D(3DSTATE_MESH_SHADER);
1115 	MATCH3D(3DSTATE_MESH_SHADER_DATA);
1116 	MATCH3D(3DSTATE_TASK_CONTROL);
1117 	MATCH3D(3DSTATE_TASK_SHADER);
1118 	MATCH3D(3DSTATE_TASK_SHADER_DATA);
1119 	MATCH3D(3DSTATE_URB_ALLOC_MESH);
1120 	MATCH3D(3DSTATE_URB_ALLOC_TASK);
1121 	MATCH3D(3DSTATE_CLIP_MESH);
1122 	MATCH3D(3DSTATE_SBE_MESH);
1123 	MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER);
1124 
1125 	MATCH3D(3DSTATE_DRAWING_RECTANGLE);
1126 	MATCH3D(3DSTATE_CHROMA_KEY);
1127 	MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET);
1128 	MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN);
1129 	MATCH3D(3DSTATE_LINE_STIPPLE);
1130 	MATCH3D(3DSTATE_AA_LINE_PARAMETERS);
1131 	MATCH3D(3DSTATE_MONOFILTER_SIZE);
1132 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS);
1133 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS);
1134 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS);
1135 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS);
1136 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS);
1137 	MATCH3D(3DSTATE_SO_DECL_LIST);
1138 	MATCH3D(3DSTATE_SO_BUFFER);
1139 	MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC);
1140 	MATCH3D(3DSTATE_SAMPLE_PATTERN);
1141 	MATCH3D(3DSTATE_3D_MODE);
1142 	MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE);
1143 	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS);
1144 	MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO);
1145 
1146 	default:
1147 		drm_printf(p, "[%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n",
1148 			   *dw, pipeline, opcode, subopcode, numdw);
1149 		return numdw;
1150 	}
1151 }
1152 
1153 void xe_lrc_dump_default(struct drm_printer *p,
1154 			 struct xe_gt *gt,
1155 			 enum xe_engine_class hwe_class)
1156 {
1157 	u32 *dw;
1158 	int remaining_dw, num_dw;
1159 
1160 	if (!gt->default_lrc[hwe_class]) {
1161 		drm_printf(p, "No default LRC for class %d\n", hwe_class);
1162 		return;
1163 	}
1164 
1165 	/*
1166 	 * Skip the beginning of the LRC since it contains the per-process
1167 	 * hardware status page.
1168 	 */
1169 	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
1170 	remaining_dw = (xe_lrc_size(gt_to_xe(gt), hwe_class) - LRC_PPHWSP_SIZE) / 4;
1171 
1172 	while (remaining_dw > 0) {
1173 		if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
1174 			num_dw = dump_mi_command(p, gt, dw, remaining_dw);
1175 		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
1176 			num_dw = dump_gfxpipe_command(p, gt, dw, remaining_dw);
1177 		} else {
1178 			num_dw = min(instr_dw(*dw), remaining_dw);
1179 			drm_printf(p, "[%#10x] Unknown instruction of type %#x, likely %d dwords\n",
1180 				   *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw),
1181 				   num_dw);
1182 		}
1183 
1184 		dw += num_dw;
1185 		remaining_dw -= num_dw;
1186 	}
1187 }
1188 
/**
 * struct instr_state - one state instruction and the space it occupies
 * @instr: instruction header dword as stored in the table;
 *	xe_lrc_emit_hwe_state_instructions() ORs (@num_dw - 2) into it for
 *	instructions that are not single-dword
 * @num_dw: number of dwords the batch buffer length is advanced by for this
 *	instruction, header included
 */
struct instr_state {
	u32 instr;
	u16 num_dw;
};
1193 
/*
 * State instructions that xe_lrc_emit_hwe_state_instructions() emits, in
 * table order, on render engines for graphics versions 12.55 and 12.70
 * through 20.04. Each entry pairs an instruction header with the number of
 * dwords reserved for it in the batch buffer.
 */
static const struct instr_state xe_hpg_svg_state[] = {
	{ .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 },
	{ .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_VS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_CLIP, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SF, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_RASTER, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_HS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 },
	{ .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TE, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_GS, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
	/*
	 * Emitted as 3DSTATE_DRAWING_RECTANGLE_FAST instead when
	 * GRAPHICS_VER >= 20; the emitter makes that swap.
	 */
	{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
};
1246 
1247 void xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, struct xe_bb *bb)
1248 {
1249 	struct xe_gt *gt = q->hwe->gt;
1250 	struct xe_device *xe = gt_to_xe(gt);
1251 	const struct instr_state *state_table = NULL;
1252 	int state_table_size = 0;
1253 
1254 	/*
1255 	 * At the moment we only need to emit non-register state for the RCS
1256 	 * engine.
1257 	 */
1258 	if (q->hwe->class != XE_ENGINE_CLASS_RENDER)
1259 		return;
1260 
1261 	switch (GRAPHICS_VERx100(xe)) {
1262 	case 1255:
1263 	case 1270 ... 2004:
1264 		state_table = xe_hpg_svg_state;
1265 		state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
1266 		break;
1267 	default:
1268 		xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
1269 			  GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
1270 		return;
1271 	}
1272 
1273 	for (int i = 0; i < state_table_size; i++) {
1274 		u32 instr = state_table[i].instr;
1275 		u16 num_dw = state_table[i].num_dw;
1276 		bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);
1277 
1278 		xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
1279 		xe_gt_assert(gt, num_dw != 0);
1280 		xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));
1281 
1282 		/*
1283 		 * Xe2's SVG context is the same as the one on DG2 / MTL
1284 		 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
1285 		 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
1286 		 * Just make the replacement here rather than defining a
1287 		 * whole separate table for the single trivial change.
1288 		 */
1289 		if (GRAPHICS_VER(xe) >= 20 &&
1290 		    instr == CMD_3DSTATE_DRAWING_RECTANGLE)
1291 			instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;
1292 
1293 		bb->cs[bb->len] = instr;
1294 		if (!is_single_dw)
1295 			bb->cs[bb->len] |= (num_dw - 2);
1296 
1297 		bb->len += num_dw;
1298 	}
1299 }
1300