// SPDX-License-Identifier: MIT
/*
 * Copyright © 2021 Intel Corporation
 */

#include "xe_lrc.h"

#include "instructions/xe_mi_commands.h"
#include "instructions/xe_gfxpipe_commands.h"
#include "regs/xe_engine_regs.h"
#include "regs/xe_gpu_commands.h"
#include "regs/xe_lrc_layout.h"
#include "xe_bb.h"
#include "xe_bo.h"
#include "xe_device.h"
#include "xe_drm_client.h"
#include "xe_exec_queue_types.h"
#include "xe_gt.h"
#include "xe_gt_printk.h"
#include "xe_hw_fence.h"
#include "xe_map.h"
#include "xe_memirq.h"
#include "xe_sriov.h"
#include "xe_vm.h"

#define LRC_VALID				(1 << 0)
#define LRC_PRIVILEGE				(1 << 8)
#define LRC_ADDRESSING_MODE_SHIFT		3
#define LRC_LEGACY_64B_CONTEXT			3

#define ENGINE_CLASS_SHIFT			61
#define ENGINE_INSTANCE_SHIFT			48

static struct xe_device *
lrc_to_xe(struct xe_lrc *lrc)
{
	return gt_to_xe(lrc->fence_ctx.gt);
}

size_t xe_lrc_size(struct xe_device *xe, enum xe_engine_class class)
{
	switch (class) {
	case XE_ENGINE_CLASS_RENDER:
		if (GRAPHICS_VER(xe) >= 20)
			return 4 * SZ_4K;
		else
			return 14 * SZ_4K;
	case XE_ENGINE_CLASS_COMPUTE:
		/* 14 pages since graphics_ver == 11 */
		if (GRAPHICS_VER(xe) >= 20)
			return 3 * SZ_4K;
		else
			return 14 * SZ_4K;
	default:
		WARN(1, "Unknown engine class: %d", class);
		fallthrough;
	case XE_ENGINE_CLASS_COPY:
	case XE_ENGINE_CLASS_VIDEO_DECODE:
	case XE_ENGINE_CLASS_VIDEO_ENHANCE:
	case XE_ENGINE_CLASS_OTHER:
		return 2 * SZ_4K;
	}
}
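
/*
 * Note: the size returned here covers the whole context image, i.e. the
 * per-process HW status page (the first 4K, LRC_PPHWSP_SIZE) followed by
 * the engine context registers; empty_lrc_data() below carves the
 * register area out of an allocation of exactly this size.
 */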

/*
 * The per-platform tables are u8-encoded in @data. Decode @data and set the
 * addresses' offset and commands in @regs. The following encoding is used
 * for each byte. There are 2 steps: decoding commands and decoding addresses.
 *
 * Commands:
 * [7]: create NOPs - the number of NOPs is set in the lower bits
 * [6]: when creating a MI_LOAD_REGISTER_IMM command, allow setting
 *      MI_LRI_FORCE_POSTED
 * [5:0]: number of NOPs, or number of registers to set values for in the
 *        case of MI_LOAD_REGISTER_IMM
 *
 * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
 * number of registers. They are set by using the REG/REG16 macros: the former
 * is used for offsets smaller than 0x200 while the latter is for offsets bigger
 * than that. Those macros already set all the bits documented below correctly:
 *
 * [7]: when a register offset needs more than 6 bits, additional bytes
 *      follow, carrying the lower bits
 * [6:0]: register offset, without considering the engine base.
 *
 * This function only tweaks the commands and register offsets. Values are not
 * filled out.
 */
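/*
 * Worked example (illustrative bytes, not taken from one of the real
 * tables below): the sequence NOP(1), LRI(2, POSTED), REG16(0x244),
 * REG(0x034) encodes as
 *
 *   0x81       NOP: BIT(7) | 1          -> skip one dword in @regs
 *   0x42       LRI: (POSTED << 6) | 2   -> MI_LOAD_REGISTER_IMM, 2 regs,
 *                                          MI_LRI_FORCE_POSTED
 *   0x81 0x11  REG16(0x244): (0x244 >> 9) | BIT(7), (0x244 >> 2) & 0x7f
 *   0x0d       REG(0x034): 0x034 >> 2
 *
 * and decodes into regs[1] = LRI header, regs[2] = mmio_base + 0x244,
 * regs[4] = mmio_base + 0x034, leaving the value dwords untouched.
 */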
static void set_offsets(u32 *regs,
			const u8 *data,
			const struct xe_hw_engine *hwe)
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | \
			   BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
#define END 0
{
	const u32 base = hwe->mmio_base;

	while (*data) {
		u8 count, flags;

		if (*data & BIT(7)) { /* skip */
			count = *data++ & ~BIT(7);
			regs += count;
			continue;
		}

		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		*regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		*regs |= MI_LRI_LRM_CS_MMIO;
		regs++;

		xe_gt_assert(hwe->gt, count);
		do {
			u32 offset = 0;
			u8 v;

			do {
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			regs[0] = base + (offset << 2);
			regs += 2;
		} while (--count);
	}

	*regs = MI_BATCH_BUFFER_END | BIT(0);
}

static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};

static const u8 dg2_xcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};

static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),
	NOP(3 + 9 + 1),

	LRI(51, POSTED),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),
	REG(0x084),
	NOP(1),

	END
};

static const u8 xehp_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 dg2_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 mtl_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(2),
	LRI(2, POSTED),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};

#define XE2_CTX_COMMON \
	NOP(1),                 /* [0x00] */ \
	LRI(15, POSTED),        /* [0x01] */ \
	REG16(0x244),           /* [0x02] CTXT_SR_CTL */ \
	REG(0x034),             /* [0x04] RING_BUFFER_HEAD */ \
	REG(0x030),             /* [0x06] RING_BUFFER_TAIL */ \
	REG(0x038),             /* [0x08] RING_BUFFER_START */ \
	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */ \
	REG(0x168),             /* [0x0c] BB_ADDR_UDW */ \
	REG(0x140),             /* [0x0e] BB_ADDR */ \
	REG(0x110),             /* [0x10] BB_STATE */ \
	REG(0x1c0),             /* [0x12] BB_PER_CTX_PTR */ \
	REG(0x1c4),             /* [0x14] RCS_INDIRECT_CTX */ \
	REG(0x1c8),             /* [0x16] RCS_INDIRECT_CTX_OFFSET */ \
	REG(0x180),             /* [0x18] CCID */ \
	REG16(0x2b4),           /* [0x1a] SEMAPHORE_TOKEN */ \
	REG(0x120),             /* [0x1c] PRT_BB_STATE */ \
	REG(0x124),             /* [0x1e] PRT_BB_STATE_UDW */ \
	\
	NOP(1),                 /* [0x20] */ \
	LRI(9, POSTED),         /* [0x21] */ \
	REG16(0x3a8),           /* [0x22] CTX_TIMESTAMP */ \
	REG16(0x3ac),           /* [0x24] CTX_TIMESTAMP_UDW */ \
	REG(0x108),             /* [0x26] INDIRECT_RING_STATE */ \
	REG16(0x284),           /* [0x28] dummy reg */ \
	REG16(0x280),           /* [0x2a] CS_ACC_CTR_THOLD */ \
	REG16(0x27c),           /* [0x2c] CS_CTX_SYS_PASID */ \
	REG16(0x278),           /* [0x2e] CS_CTX_ASID */ \
	REG16(0x274),           /* [0x30] PTBP_UDW */ \
	REG16(0x270)            /* [0x32] PTBP_LDW */

static const u8 xe2_rcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(2),                 /* [0x34] */
	LRI(2, POSTED),         /* [0x36] */
	REG16(0x5a8),           /* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
	REG16(0x5ac),           /* [0x39] PREEMPTION_STATUS */

	NOP(6),                 /* [0x41] */
	LRI(1, 0),              /* [0x47] */
	REG(0x0c8),             /* [0x48] R_PWR_CLK_STATE */

	END
};

static const u8 xe2_bcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(4 + 8 + 1),         /* [0x34] */
	LRI(2, POSTED),         /* [0x41] */
	REG16(0x200),           /* [0x42] BCS_SWCTRL */
	REG16(0x204),           /* [0x44] BLIT_CCTL */

	END
};

static const u8 xe2_xcs_offsets[] = {
	XE2_CTX_COMMON,

	END
};

#undef END
#undef REG16
#undef REG
#undef LRI
#undef NOP

static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
{
	if (class == XE_ENGINE_CLASS_RENDER) {
		if (GRAPHICS_VER(xe) >= 20)
			return xe2_rcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1270)
			return mtl_rcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1255)
			return dg2_rcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1250)
			return xehp_rcs_offsets;
		else
			return gen12_rcs_offsets;
	} else if (class == XE_ENGINE_CLASS_COPY) {
		if (GRAPHICS_VER(xe) >= 20)
			return xe2_bcs_offsets;
		else
			return gen12_xcs_offsets;
	} else {
		if (GRAPHICS_VER(xe) >= 20)
			return xe2_xcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1255)
			return dg2_xcs_offsets;
		else
			return gen12_xcs_offsets;
	}
}

static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
{
	regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH) |
				    _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);

	/* TODO: Timestamp */
}

static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe)
{
	struct xe_memirq *memirq = &gt_to_tile(hwe->gt)->sriov.vf.memirq;
	struct xe_device *xe = gt_to_xe(hwe->gt);

	if (!IS_SRIOV_VF(xe) || !xe_device_has_memirq(xe))
		return;

	regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM |
					MI_LRI_LRM_CS_MMIO | MI_LRM_USE_GGTT;
	regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr;
	regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq);

	regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(2) |
				       MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED;
	regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr;
	regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq);
	regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr;
	regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq);
}

static int lrc_ring_mi_mode(struct xe_hw_engine *hwe)
{
	struct xe_device *xe = gt_to_xe(hwe->gt);

	if (GRAPHICS_VERx100(xe) >= 1250)
		return 0x70;
	else
		return 0x60;
}

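/*
 * RING_MI_MODE is a masked register: the upper 16 bits of a write select
 * which of the lower 16 bits take effect. Clearing STOP_RING in the low
 * half while setting STOP_RING << 16 in the high half therefore makes the
 * context restore clear (only) the stop bit.
 */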
static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe)
{
	int x;

	x = lrc_ring_mi_mode(hwe);
	regs[x + 1] &= ~STOP_RING;
	regs[x + 1] |= STOP_RING << 16;
}

static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
{
	return 0;
}

u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
{
	return lrc->ring.size;
}

/* Make the magic macros work */
#define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset

#define LRC_SEQNO_PPHWSP_OFFSET 512
#define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
#define LRC_PARALLEL_PPHWSP_OFFSET 2048
#define LRC_PPHWSP_SIZE SZ_4K

static size_t lrc_reg_size(struct xe_device *xe)
{
	if (GRAPHICS_VERx100(xe) >= 1250)
		return 96 * sizeof(u32);
	else
		return 80 * sizeof(u32);
}

size_t xe_lrc_skip_size(struct xe_device *xe)
{
	return LRC_PPHWSP_SIZE + lrc_reg_size(xe);
}

static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
{
	/* The seqno is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_SEQNO_PPHWSP_OFFSET;
}

static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
{
	/* The start seqno is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_START_SEQNO_PPHWSP_OFFSET;
}

static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
{
	/* The parallel is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
}

static inline u32 __xe_lrc_regs_offset(struct xe_lrc *lrc)
{
	return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
}
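
/*
 * Summary of the LRC BO layout implied by the offset helpers above:
 *
 *   +------------------------------+ 0
 *   | ring buffer (ring.size)      |
 *   +------------------------------+ xe_lrc_pphwsp_offset()
 *   | PPHWSP (4K), with the driver |
 *   | defined seqno at 512, start  |
 *   | seqno at 520 and parallel    |
 *   | scratch at 2048              |
 *   +------------------------------+ __xe_lrc_regs_offset()
 *   | context register state       |
 *   +------------------------------+
 */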

#define DECL_MAP_ADDR_HELPERS(elem) \
static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \
{ \
	struct iosys_map map = lrc->bo->vmap; \
\
	xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map));  \
	iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \
	return map; \
} \
static inline u32 __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \
{ \
	return xe_bo_ggtt_addr(lrc->bo) + __xe_lrc_##elem##_offset(lrc); \
} \

DECL_MAP_ADDR_HELPERS(ring)
DECL_MAP_ADDR_HELPERS(pphwsp)
DECL_MAP_ADDR_HELPERS(seqno)
DECL_MAP_ADDR_HELPERS(regs)
DECL_MAP_ADDR_HELPERS(start_seqno)
DECL_MAP_ADDR_HELPERS(parallel)

#undef DECL_MAP_ADDR_HELPERS
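
/*
 * For reference, each DECL_MAP_ADDR_HELPERS(elem) use above emits a pair
 * of helpers, e.g. for "seqno":
 *
 *   __xe_lrc_seqno_map(lrc)       - iosys_map into the LRC BO at the
 *                                   element's offset
 *   __xe_lrc_seqno_ggtt_addr(lrc) - GGTT address of the same element
 */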

u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_pphwsp_ggtt_addr(lrc);
}

u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_regs_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	return xe_map_read32(xe, &map);
}

void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_regs_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	xe_map_write32(xe, &map, val);
}

static void *empty_lrc_data(struct xe_hw_engine *hwe)
{
	struct xe_device *xe = gt_to_xe(hwe->gt);
	void *data;
	u32 *regs;

	data = kzalloc(xe_lrc_size(xe, hwe->class), GFP_KERNEL);
	if (!data)
		return NULL;

	/* 1st page: Per-Process of HW status Page */
	regs = data + LRC_PPHWSP_SIZE;
	set_offsets(regs, reg_offsets(xe, hwe->class), hwe);
	set_context_control(regs, hwe);
	set_memory_based_intr(regs, hwe);
	reset_stop_ring(regs, hwe);

	return data;
}

static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
{
	u64 desc = xe_vm_pdp4_descriptor(vm, lrc->tile);

	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
}

#define PVC_CTX_ASID		(0x2e + 1)
#define PVC_CTX_ACC_CTR_THOLD	(0x2a + 1)
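
/*
 * These are dword indices into the context image: 0x2e and 0x2a are the
 * positions of CS_CTX_ASID and CS_ACC_CTR_THOLD as annotated in the
 * offset tables above (i.e. the register-offset dword of an LRI pair),
 * and the +1 lands on the paired value dword that xe_lrc_write_ctx_reg()
 * must target.
 */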

int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
		struct xe_exec_queue *q, struct xe_vm *vm, u32 ring_size)
{
	struct xe_gt *gt = hwe->gt;
	struct xe_tile *tile = gt_to_tile(gt);
	struct xe_device *xe = gt_to_xe(gt);
	struct iosys_map map;
	void *init_data = NULL;
	u32 arb_enable;
	int err;

	lrc->flags = 0;

	/*
	 * FIXME: Perma-pinning LRC as we don't yet support moving GGTT address
	 * via VM bind calls.
	 */
	lrc->bo = xe_bo_create_pin_map(xe, tile, vm,
				      ring_size + xe_lrc_size(xe, hwe->class),
				      ttm_bo_type_kernel,
				      XE_BO_CREATE_VRAM_IF_DGFX(tile) |
				      XE_BO_CREATE_GGTT_BIT);
	if (IS_ERR(lrc->bo))
		return PTR_ERR(lrc->bo);

	lrc->tile = gt_to_tile(hwe->gt);
	lrc->ring.size = ring_size;
	lrc->ring.tail = 0;

	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
			     hwe->fence_irq, hwe->name);

	if (!gt->default_lrc[hwe->class]) {
		init_data = empty_lrc_data(hwe);
		if (!init_data) {
			err = -ENOMEM;
			goto err_lrc_finish;
		}
	}

	/*
	 * Init Per-Process of HW status Page, LRC / context state to known
	 * values
	 */
	map = __xe_lrc_pphwsp_map(lrc);
	if (!init_data) {
		xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE);	/* PPHWSP */
		xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
				 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
				 xe_lrc_size(xe, hwe->class) - LRC_PPHWSP_SIZE);
	} else {
		xe_map_memcpy_to(xe, &map, 0, init_data,
				 xe_lrc_size(xe, hwe->class));
		kfree(init_data);
	}

	if (vm) {
		xe_lrc_set_ppgtt(lrc, vm);

		if (vm->xef)
			xe_drm_client_add_bo(vm->xef->client, lrc->bo);
	}

	xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
	xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, 0);
	xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);
	xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
			     RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
	if (xe->info.has_asid && vm)
		xe_lrc_write_ctx_reg(lrc, PVC_CTX_ASID, vm->usm.asid);

	lrc->desc = LRC_VALID;
	lrc->desc |= LRC_LEGACY_64B_CONTEXT << LRC_ADDRESSING_MODE_SHIFT;
	/* TODO: Priority */

	/* While this appears to have something about privileged batches or
	 * some such, it really just means PPGTT mode.
	 */
	if (vm)
		lrc->desc |= LRC_PRIVILEGE;

	if (GRAPHICS_VERx100(xe) < 1250) {
		lrc->desc |= (u64)hwe->instance << ENGINE_INSTANCE_SHIFT;
		lrc->desc |= (u64)hwe->class << ENGINE_CLASS_SHIFT;
	}

	arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
	xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));

	map = __xe_lrc_seqno_map(lrc);
	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);

	map = __xe_lrc_start_seqno_map(lrc);
	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);

	return 0;

err_lrc_finish:
	xe_lrc_finish(lrc);
	return err;
}

void xe_lrc_finish(struct xe_lrc *lrc)
{
	xe_hw_fence_ctx_finish(&lrc->fence_ctx);
	xe_bo_lock(lrc->bo, false);
	xe_bo_unpin(lrc->bo);
	xe_bo_unlock(lrc->bo);
	xe_bo_put(lrc->bo);
}

void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
{
	xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
}

u32 xe_lrc_ring_head(struct xe_lrc *lrc)
{
	return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
}
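
/*
 * Standard power-of-two ring arithmetic: e.g. with head == tail and
 * size == SZ_16K, ((head - tail - 1) & (size - 1)) + 1 == SZ_16K, i.e.
 * an idle ring reports its full size as writable.
 */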
u32 xe_lrc_ring_space(struct xe_lrc *lrc)
{
	const u32 head = xe_lrc_ring_head(lrc);
	const u32 tail = lrc->ring.tail;
	const u32 size = lrc->ring.size;

	return ((head - tail - 1) & (size - 1)) + 1;
}

static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
				const void *data, size_t size)
{
	struct xe_device *xe = lrc_to_xe(lrc);

	iosys_map_incr(&ring, lrc->ring.tail);
	xe_map_memcpy_to(xe, &ring, 0, data, size);
	lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
}

void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map ring;
	u32 rhs;
	size_t aligned_size;

	xe_assert(xe, IS_ALIGNED(size, 4));
	aligned_size = ALIGN(size, 8);

	ring = __xe_lrc_ring_map(lrc);

	xe_assert(xe, lrc->ring.tail < lrc->ring.size);
	rhs = lrc->ring.size - lrc->ring.tail;
	if (size > rhs) {
		__xe_lrc_write_ring(lrc, ring, data, rhs);
		__xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
	} else {
		__xe_lrc_write_ring(lrc, ring, data, size);
	}

	if (aligned_size > size) {
		u32 noop = MI_NOOP;

		__xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
	}
}
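
/*
 * Example usage, mirroring the MI_ARB_ON_OFF emission in xe_lrc_init():
 *
 *	u32 arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
 *
 *	xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));
 *
 * The dwords land at the current tail, wrap transparently, and a
 * trailing MI_NOOP is appended when needed to keep the tail
 * qword-aligned.
 */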

u64 xe_lrc_descriptor(struct xe_lrc *lrc)
{
	return lrc->desc | xe_lrc_ggtt_addr(lrc);
}

u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_seqno_ggtt_addr(lrc);
}

struct dma_fence *xe_lrc_create_seqno_fence(struct xe_lrc *lrc)
{
	return &xe_hw_fence_create(&lrc->fence_ctx,
				   __xe_lrc_seqno_map(lrc))->dma;
}

s32 xe_lrc_seqno(struct xe_lrc *lrc)
{
	struct iosys_map map = __xe_lrc_seqno_map(lrc);

	return xe_map_read32(lrc_to_xe(lrc), &map);
}

s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
{
	struct iosys_map map = __xe_lrc_start_seqno_map(lrc);

	return xe_map_read32(lrc_to_xe(lrc), &map);
}

u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_start_seqno_ggtt_addr(lrc);
}

u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_parallel_ggtt_addr(lrc);
}

struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
{
	return __xe_lrc_parallel_map(lrc);
}

static int instr_dw(u32 cmd_header)
{
	/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
	if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
	    GFXPIPE_SINGLE_DW_CMD(0, 0))
		return 1;

	/* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
	if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
		return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;

	/* Most instructions have the # of dwords (minus 2) in 7:0 */
	return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
}
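
/*
 * For example, an MI_LOAD_REGISTER_IMM writing two registers is 5 dwords
 * long (header plus two offset/value pairs), so its header carries
 * 5 - 2 = 3 in the length field and instr_dw() recovers 3 + 2 = 5.
 */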

static int dump_mi_command(struct drm_printer *p,
			   struct xe_gt *gt,
			   u32 *dw,
			   int remaining_dw)
{
	u32 inst_header = *dw;
	u32 numdw = instr_dw(inst_header);
	u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
	int num_noop;

	/* First check for commands that don't have/use a '# DW' field */
	switch (inst_header & MI_OPCODE) {
	case MI_NOOP:
		num_noop = 1;
		while (num_noop < remaining_dw &&
		       (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
			num_noop++;
		drm_printf(p, "[%#010x] MI_NOOP (%d dwords)\n", inst_header, num_noop);
		return num_noop;

	case MI_TOPOLOGY_FILTER:
		drm_printf(p, "[%#010x] MI_TOPOLOGY_FILTER\n", inst_header);
		return 1;

	case MI_BATCH_BUFFER_END:
		drm_printf(p, "[%#010x] MI_BATCH_BUFFER_END\n", inst_header);
		/* Return 'remaining_dw' to consume the rest of the LRC */
		return remaining_dw;
	}

	/*
	 * Any remaining commands include a # of dwords.  We should make sure
	 * it doesn't exceed the remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (inst_header & MI_OPCODE) {
	case MI_LOAD_REGISTER_IMM:
		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
			   inst_header, (numdw - 1) / 2);
		for (int i = 1; i < numdw; i += 2)
			drm_printf(p, " - %#6x = %#010x\n", dw[i], dw[i + 1]);
		return numdw;

	case MI_LOAD_REGISTER_MEM & MI_OPCODE:
		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_MEM: %s%s\n",
			   inst_header,
			   dw[0] & MI_LRI_LRM_CS_MMIO ? "CS_MMIO " : "",
			   dw[0] & MI_LRM_USE_GGTT ? "USE_GGTT " : "");
		if (numdw == 4)
			drm_printf(p, " - %#6x = %#010llx\n",
				   dw[1], ((u64)(dw[3]) << 32 | (u64)(dw[2])));
		else
			drm_printf(p, " - %*ph (%s)\n",
				   (int)sizeof(u32) * (numdw - 1), dw + 1,
				   numdw < 4 ? "truncated" : "malformed");
		return numdw;

	case MI_FORCE_WAKEUP:
		drm_printf(p, "[%#010x] MI_FORCE_WAKEUP\n", inst_header);
		return numdw;

	default:
		drm_printf(p, "[%#010x] unknown MI opcode %#x, likely %d dwords\n",
			   inst_header, opcode, numdw);
		return numdw;
	}
}

static int dump_gfxpipe_command(struct drm_printer *p,
				struct xe_gt *gt,
				u32 *dw,
				int remaining_dw)
{
	u32 numdw = instr_dw(*dw);
	u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
	u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
	u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);

	/*
	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
	 * remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (*dw & GFXPIPE_MATCH_MASK) {
#define MATCH(cmd) \
	case cmd: \
		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
		return numdw
#define MATCH3D(cmd) \
	case CMD_##cmd: \
		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
		return numdw

	MATCH(STATE_BASE_ADDRESS);
	MATCH(STATE_SIP);
	MATCH(GPGPU_CSR_BASE_ADDRESS);
	MATCH(STATE_COMPUTE_MODE);
	MATCH3D(3DSTATE_BTD);

	MATCH3D(3DSTATE_VF_STATISTICS);

	MATCH(PIPELINE_SELECT);

	MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST);
	MATCH3D(3DSTATE_CLEAR_PARAMS);
	MATCH3D(3DSTATE_DEPTH_BUFFER);
	MATCH3D(3DSTATE_STENCIL_BUFFER);
	MATCH3D(3DSTATE_HIER_DEPTH_BUFFER);
	MATCH3D(3DSTATE_VERTEX_BUFFERS);
	MATCH3D(3DSTATE_VERTEX_ELEMENTS);
	MATCH3D(3DSTATE_INDEX_BUFFER);
	MATCH3D(3DSTATE_VF);
	MATCH3D(3DSTATE_MULTISAMPLE);
	MATCH3D(3DSTATE_CC_STATE_POINTERS);
	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS);
	MATCH3D(3DSTATE_VS);
	MATCH3D(3DSTATE_GS);
	MATCH3D(3DSTATE_CLIP);
	MATCH3D(3DSTATE_SF);
	MATCH3D(3DSTATE_WM);
	MATCH3D(3DSTATE_CONSTANT_VS);
	MATCH3D(3DSTATE_CONSTANT_GS);
	MATCH3D(3DSTATE_SAMPLE_MASK);
	MATCH3D(3DSTATE_CONSTANT_HS);
	MATCH3D(3DSTATE_CONSTANT_DS);
	MATCH3D(3DSTATE_HS);
	MATCH3D(3DSTATE_TE);
	MATCH3D(3DSTATE_DS);
	MATCH3D(3DSTATE_STREAMOUT);
	MATCH3D(3DSTATE_SBE);
	MATCH3D(3DSTATE_PS);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
	MATCH3D(3DSTATE_CPS_POINTERS);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC);
	MATCH3D(3DSTATE_BLEND_STATE_POINTERS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS);
	MATCH3D(3DSTATE_VF_INSTANCING);
	MATCH3D(3DSTATE_VF_SGVS);
	MATCH3D(3DSTATE_VF_TOPOLOGY);
	MATCH3D(3DSTATE_WM_CHROMAKEY);
	MATCH3D(3DSTATE_PS_BLEND);
	MATCH3D(3DSTATE_WM_DEPTH_STENCIL);
	MATCH3D(3DSTATE_PS_EXTRA);
	MATCH3D(3DSTATE_RASTER);
	MATCH3D(3DSTATE_SBE_SWIZ);
	MATCH3D(3DSTATE_WM_HZ_OP);
	MATCH3D(3DSTATE_VF_COMPONENT_PACKING);
	MATCH3D(3DSTATE_VF_SGVS_2);
	MATCH3D(3DSTATE_VFG);
	MATCH3D(3DSTATE_URB_ALLOC_VS);
	MATCH3D(3DSTATE_URB_ALLOC_HS);
	MATCH3D(3DSTATE_URB_ALLOC_DS);
	MATCH3D(3DSTATE_URB_ALLOC_GS);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_0);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_1);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_2);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_3);
	MATCH3D(3DSTATE_PRIMITIVE_REPLICATION);
	MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO);
	MATCH3D(3DSTATE_AMFS);
	MATCH3D(3DSTATE_DEPTH_BOUNDS);
	MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS);
	MATCH3D(3DSTATE_CONSTANT_TS_POINTER);
	MATCH3D(3DSTATE_MESH_CONTROL);
	MATCH3D(3DSTATE_MESH_DISTRIB);
	MATCH3D(3DSTATE_TASK_REDISTRIB);
	MATCH3D(3DSTATE_MESH_SHADER);
	MATCH3D(3DSTATE_MESH_SHADER_DATA);
	MATCH3D(3DSTATE_TASK_CONTROL);
	MATCH3D(3DSTATE_TASK_SHADER);
	MATCH3D(3DSTATE_TASK_SHADER_DATA);
	MATCH3D(3DSTATE_URB_ALLOC_MESH);
	MATCH3D(3DSTATE_URB_ALLOC_TASK);
	MATCH3D(3DSTATE_CLIP_MESH);
	MATCH3D(3DSTATE_SBE_MESH);
	MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER);

	MATCH3D(3DSTATE_DRAWING_RECTANGLE);
	MATCH3D(3DSTATE_CHROMA_KEY);
	MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET);
	MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN);
	MATCH3D(3DSTATE_LINE_STIPPLE);
	MATCH3D(3DSTATE_AA_LINE_PARAMETERS);
	MATCH3D(3DSTATE_MONOFILTER_SIZE);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS);
	MATCH3D(3DSTATE_SO_DECL_LIST);
	MATCH3D(3DSTATE_SO_BUFFER);
	MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC);
	MATCH3D(3DSTATE_SAMPLE_PATTERN);
	MATCH3D(3DSTATE_3D_MODE);
	MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE);
	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS);
	MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO);

	default:
		drm_printf(p, "[%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n",
			   *dw, pipeline, opcode, subopcode, numdw);
		return numdw;
	}
}

void xe_lrc_dump_default(struct drm_printer *p,
			 struct xe_gt *gt,
			 enum xe_engine_class hwe_class)
{
	u32 *dw;
	int remaining_dw, num_dw;

	if (!gt->default_lrc[hwe_class]) {
		drm_printf(p, "No default LRC for class %d\n", hwe_class);
		return;
	}

	/*
	 * Skip the beginning of the LRC since it contains the per-process
	 * hardware status page.
	 */
	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
	remaining_dw = (xe_lrc_size(gt_to_xe(gt), hwe_class) - LRC_PPHWSP_SIZE) / 4;

	while (remaining_dw > 0) {
		if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
			num_dw = dump_mi_command(p, gt, dw, remaining_dw);
		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
			num_dw = dump_gfxpipe_command(p, gt, dw, remaining_dw);
		} else {
			num_dw = min(instr_dw(*dw), remaining_dw);
			drm_printf(p, "[%#10x] Unknown instruction of type %#x, likely %d dwords\n",
				   *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw),
				   num_dw);
		}

		dw += num_dw;
		remaining_dw -= num_dw;
	}
}

struct instr_state {
	u32 instr;
	u16 num_dw;
};

static const struct instr_state xe_hpg_svg_state[] = {
	{ .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 },
	{ .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_VS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_CLIP, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SF, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_RASTER, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_HS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 },
	{ .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TE, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_GS, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
};

void xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, struct xe_bb *bb)
{
	struct xe_gt *gt = q->hwe->gt;
	struct xe_device *xe = gt_to_xe(gt);
	const struct instr_state *state_table = NULL;
	int state_table_size = 0;

	/*
	 * At the moment we only need to emit non-register state for the RCS
	 * engine.
	 */
	if (q->hwe->class != XE_ENGINE_CLASS_RENDER)
		return;

	switch (GRAPHICS_VERx100(xe)) {
	case 1255:
	case 1270 ... 2004:
		state_table = xe_hpg_svg_state;
		state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
		break;
	default:
		xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
			  GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
		return;
	}

	for (int i = 0; i < state_table_size; i++) {
		u32 instr = state_table[i].instr;
		u16 num_dw = state_table[i].num_dw;
		bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);

		xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
		xe_gt_assert(gt, num_dw != 0);
		xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));

		/*
		 * Xe2's SVG context is the same as the one on DG2 / MTL
		 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
		 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
		 * Just make the replacement here rather than defining a
		 * whole separate table for the single trivial change.
		 */
		if (GRAPHICS_VER(xe) >= 20 &&
		    instr == CMD_3DSTATE_DRAWING_RECTANGLE)
			instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;

		bb->cs[bb->len] = instr;
		if (!is_single_dw)
			bb->cs[bb->len] |= (num_dw - 2);

		bb->len += num_dw;
	}
}
1303