xref: /linux/drivers/gpu/drm/xe/xe_lrc.c (revision 90d32e92011eaae8e70a9169b4e7acf4ca8f9d3a)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5 
6 #include "xe_lrc.h"
7 
8 #include <linux/ascii85.h>
9 
10 #include "instructions/xe_mi_commands.h"
11 #include "instructions/xe_gfxpipe_commands.h"
12 #include "instructions/xe_gfx_state_commands.h"
13 #include "regs/xe_engine_regs.h"
14 #include "regs/xe_gpu_commands.h"
15 #include "regs/xe_lrc_layout.h"
16 #include "xe_bb.h"
17 #include "xe_bo.h"
18 #include "xe_device.h"
19 #include "xe_drm_client.h"
20 #include "xe_exec_queue_types.h"
21 #include "xe_gt.h"
22 #include "xe_gt_printk.h"
23 #include "xe_hw_fence.h"
24 #include "xe_map.h"
25 #include "xe_memirq.h"
26 #include "xe_sriov.h"
27 #include "xe_vm.h"
28 
/*
 * Fields of the LRC descriptor. xe_lrc_init() assembles them into lrc->desc
 * and xe_lrc_descriptor() ORs the result with the LRC's GGTT address.
 */
#define LRC_VALID				BIT_ULL(0)
/* Set by xe_lrc_init() only when the LRC is created with a VM */
#define LRC_PRIVILEGE				BIT_ULL(8)
#define LRC_ADDRESSING_MODE			GENMASK_ULL(4, 3)
/* Value programmed into the LRC_ADDRESSING_MODE field */
#define LRC_LEGACY_64B_CONTEXT			3

/* Only filled in by xe_lrc_init() when GRAPHICS_VERx100 < 1250 */
#define LRC_ENGINE_CLASS			GENMASK_ULL(63, 61)
#define LRC_ENGINE_INSTANCE			GENMASK_ULL(53, 48)
36 
/*
 * Captured state of an LRC.
 *
 * NOTE(review): the code that fills in and consumes this structure is not
 * part of this section of the file; the per-field notes below are inferred
 * from the field names only -- confirm against the snapshot helpers.
 */
struct xe_lrc_snapshot {
	/* presumably the BO backing the LRC, and a copy of its contents */
	struct xe_bo *lrc_bo;
	void *lrc_snapshot;
	unsigned long lrc_size, lrc_offset;

	u32 context_desc;
	/* presumably the ring head at capture time */
	u32 head;
	struct {
		/* presumably the driver-tracked tail vs. the tail in the context image */
		u32 internal;
		u32 memory;
	} tail;
	u32 start_seqno;
	u32 seqno;
};
51 
52 static struct xe_device *
53 lrc_to_xe(struct xe_lrc *lrc)
54 {
55 	return gt_to_xe(lrc->fence_ctx.gt);
56 }
57 
58 size_t xe_lrc_size(struct xe_device *xe, enum xe_engine_class class)
59 {
60 	switch (class) {
61 	case XE_ENGINE_CLASS_RENDER:
62 		if (GRAPHICS_VER(xe) >= 20)
63 			return 4 * SZ_4K;
64 		else
65 			return 14 * SZ_4K;
66 	case XE_ENGINE_CLASS_COMPUTE:
67 		/* 14 pages since graphics_ver == 11 */
68 		if (GRAPHICS_VER(xe) >= 20)
69 			return 3 * SZ_4K;
70 		else
71 			return 14 * SZ_4K;
72 	default:
73 		WARN(1, "Unknown engine class: %d", class);
74 		fallthrough;
75 	case XE_ENGINE_CLASS_COPY:
76 	case XE_ENGINE_CLASS_VIDEO_DECODE:
77 	case XE_ENGINE_CLASS_VIDEO_ENHANCE:
78 	case XE_ENGINE_CLASS_OTHER:
79 		return 2 * SZ_4K;
80 	}
81 }
82 
83 /*
84  * The per-platform tables are u8-encoded in @data. Decode @data and set the
85  * addresses' offset and commands in @regs. The following encoding is used
86  * for each byte. There are 2 steps: decoding commands and decoding addresses.
87  *
88  * Commands:
89  * [7]: create NOPs - number of NOPs are set in lower bits
90  * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set
91  *      MI_LRI_FORCE_POSTED
92  * [5:0]: Number of NOPs or registers to set values to in case of
93  *        MI_LOAD_REGISTER_IMM
94  *
95  * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
96  * number of registers. They are set by using the REG/REG16 macros: the former
97  * is used for offsets smaller than 0x200 while the latter is for values bigger
98  * than that. Those macros already set all the bits documented below correctly:
99  *
100  * [7]: When a register offset needs more than 6 bits, use additional bytes, to
101  *      follow, for the lower bits
102  * [6:0]: Register offset, without considering the engine base.
103  *
104  * This function only tweaks the commands and register offsets. Values are not
105  * filled out.
106  */
107 static void set_offsets(u32 *regs,
108 			const u8 *data,
109 			const struct xe_hw_engine *hwe)
110 #define NOP(x) (BIT(7) | (x))
111 #define LRI(count, flags) ((flags) << 6 | (count) | \
112 			   BUILD_BUG_ON_ZERO(count >= BIT(6)))
113 #define POSTED BIT(0)
114 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
115 #define REG16(x) \
116 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
117 	(((x) >> 2) & 0x7f)
118 {
119 	const u32 base = hwe->mmio_base;
120 
121 	while (*data) {
122 		u8 count, flags;
123 
124 		if (*data & BIT(7)) { /* skip */
125 			count = *data++ & ~BIT(7);
126 			regs += count;
127 			continue;
128 		}
129 
130 		count = *data & 0x3f;
131 		flags = *data >> 6;
132 		data++;
133 
134 		*regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
135 		if (flags & POSTED)
136 			*regs |= MI_LRI_FORCE_POSTED;
137 		*regs |= MI_LRI_LRM_CS_MMIO;
138 		regs++;
139 
140 		xe_gt_assert(hwe->gt, count);
141 		do {
142 			u32 offset = 0;
143 			u8 v;
144 
145 			do {
146 				v = *data++;
147 				offset <<= 7;
148 				offset |= v & ~BIT(7);
149 			} while (v & BIT(7));
150 
151 			regs[0] = base + (offset << 2);
152 			regs += 2;
153 		} while (--count);
154 	}
155 
156 	*regs = MI_BATCH_BUFFER_END | BIT(0);
157 }
158 
/*
 * Encoded context register layout (see set_offsets() for the encoding).
 * reg_offsets() returns this table for the copy class when GRAPHICS_VER < 20
 * and for the remaining non-render classes when GRAPHICS_VERx100 < 1255.
 */
static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	/* Table terminator */
	0
};
190 
/*
 * Encoded context register layout (see set_offsets() for the encoding).
 * reg_offsets() returns this table for classes other than render and copy
 * when GRAPHICS_VERx100 >= 1255 and GRAPHICS_VER < 20.
 */
static const u8 dg2_xcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	/* Table terminator */
	0
};
224 
/*
 * Encoded context register layout (see set_offsets() for the encoding).
 * reg_offsets() returns this table for the render class when
 * GRAPHICS_VERx100 < 1250.
 */
static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),
	NOP(3 + 9 + 1),

	LRI(51, POSTED),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),
	REG(0x084),
	NOP(1),

	/* Table terminator */
	0
};
320 
/*
 * Encoded context register layout (see set_offsets() for the encoding).
 * reg_offsets() returns this table for the render class when
 * 1250 <= GRAPHICS_VERx100 < 1255.
 */
static const u8 xehp_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	/* Table terminator */
	0
};
361 
/*
 * Encoded context register layout (see set_offsets() for the encoding).
 * reg_offsets() returns this table for the render class when
 * 1255 <= GRAPHICS_VERx100 < 1270.
 */
static const u8 dg2_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	/* Table terminator */
	0
};
404 
/*
 * Encoded context register layout (see set_offsets() for the encoding).
 * reg_offsets() returns this table for the render class when
 * GRAPHICS_VERx100 >= 1270 and GRAPHICS_VER < 20.
 */
static const u8 mtl_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(2),
	LRI(2, POSTED),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	/* Table terminator */
	0
};
447 
/*
 * Table entries shared by all xe2_*_offsets tables; each of them starts with
 * this sequence. The bracketed number on each line is the dword index, within
 * the decoded register state, at which set_offsets() places that entry.
 */
#define XE2_CTX_COMMON \
	NOP(1),                 /* [0x00] */ \
	LRI(15, POSTED),        /* [0x01] */ \
	REG16(0x244),           /* [0x02] CTXT_SR_CTL */ \
	REG(0x034),             /* [0x04] RING_BUFFER_HEAD */ \
	REG(0x030),             /* [0x06] RING_BUFFER_TAIL */ \
	REG(0x038),             /* [0x08] RING_BUFFER_START */ \
	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */ \
	REG(0x168),             /* [0x0c] BB_ADDR_UDW */ \
	REG(0x140),             /* [0x0e] BB_ADDR */ \
	REG(0x110),             /* [0x10] BB_STATE */ \
	REG(0x1c0),             /* [0x12] BB_PER_CTX_PTR */ \
	REG(0x1c4),             /* [0x14] RCS_INDIRECT_CTX */ \
	REG(0x1c8),             /* [0x16] RCS_INDIRECT_CTX_OFFSET */ \
	REG(0x180),             /* [0x18] CCID */ \
	REG16(0x2b4),           /* [0x1a] SEMAPHORE_TOKEN */ \
	REG(0x120),             /* [0x1c] PRT_BB_STATE */ \
	REG(0x124),             /* [0x1e] PRT_BB_STATE_UDW */ \
	\
	NOP(1),                 /* [0x20] */ \
	LRI(9, POSTED),         /* [0x21] */ \
	REG16(0x3a8),           /* [0x22] CTX_TIMESTAMP */ \
	REG16(0x3ac),           /* [0x24] CTX_TIMESTAMP_UDW */ \
	REG(0x108),             /* [0x26] INDIRECT_RING_STATE */ \
	REG16(0x284),           /* [0x28] dummy reg */ \
	REG16(0x280),           /* [0x2a] CS_ACC_CTR_THOLD */ \
	REG16(0x27c),           /* [0x2c] CS_CTX_SYS_PASID */ \
	REG16(0x278),           /* [0x2e] CS_CTX_ASID */ \
	REG16(0x274),           /* [0x30] PTBP_UDW */ \
	REG16(0x270)            /* [0x32] PTBP_LDW */
478 
/*
 * Encoded context register layout (see set_offsets() for the encoding).
 * reg_offsets() returns this table for the render class when
 * GRAPHICS_VER >= 20. The bracketed numbers are dword indices in the decoded
 * register state.
 */
static const u8 xe2_rcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(2),                 /* [0x34] */
	LRI(2, POSTED),         /* [0x36] */
	REG16(0x5a8),           /* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
	REG16(0x5ac),           /* [0x39] PREEMPTION_STATUS */

	NOP(6),                 /* [0x3b] */
	LRI(1, 0),              /* [0x41] */
	REG(0x0c8),             /* [0x42] R_PWR_CLK_STATE */

	/* Table terminator */
	0
};
493 
/*
 * Encoded context register layout (see set_offsets() for the encoding).
 * reg_offsets() returns this table for the copy class when
 * GRAPHICS_VER >= 20. The bracketed numbers are dword indices in the decoded
 * register state.
 */
static const u8 xe2_bcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(4 + 8 + 1),         /* [0x34] */
	LRI(2, POSTED),         /* [0x41] */
	REG16(0x200),           /* [0x42] BCS_SWCTRL */
	REG16(0x204),           /* [0x44] BLIT_CCTL */

	/* Table terminator */
	0
};
504 
/*
 * Encoded context register layout (see set_offsets() for the encoding).
 * reg_offsets() returns this table for classes other than render and copy
 * when GRAPHICS_VER >= 20; it holds only the common xe2 entries.
 */
static const u8 xe2_xcs_offsets[] = {
	XE2_CTX_COMMON,

	/* Table terminator */
	0
};
510 
/*
 * The table-encoding helpers are not needed past the tables above.
 * NOTE(review): POSTED is defined alongside them but is not undefined here --
 * confirm whether that is intentional.
 */
#undef REG16
#undef REG
#undef LRI
#undef NOP
515 
516 static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
517 {
518 	if (class == XE_ENGINE_CLASS_RENDER) {
519 		if (GRAPHICS_VER(xe) >= 20)
520 			return xe2_rcs_offsets;
521 		else if (GRAPHICS_VERx100(xe) >= 1270)
522 			return mtl_rcs_offsets;
523 		else if (GRAPHICS_VERx100(xe) >= 1255)
524 			return dg2_rcs_offsets;
525 		else if (GRAPHICS_VERx100(xe) >= 1250)
526 			return xehp_rcs_offsets;
527 		else
528 			return gen12_rcs_offsets;
529 	} else if (class == XE_ENGINE_CLASS_COPY) {
530 		if (GRAPHICS_VER(xe) >= 20)
531 			return xe2_bcs_offsets;
532 		else
533 			return gen12_xcs_offsets;
534 	} else {
535 		if (GRAPHICS_VER(xe) >= 20)
536 			return xe2_xcs_offsets;
537 		else if (GRAPHICS_VERx100(xe) >= 1255)
538 			return dg2_xcs_offsets;
539 		else
540 			return gen12_xcs_offsets;
541 	}
542 }
543 
544 static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
545 {
546 	regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
547 						       CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
548 
549 	/* TODO: Timestamp */
550 }
551 
552 static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe)
553 {
554 	struct xe_memirq *memirq = &gt_to_tile(hwe->gt)->sriov.vf.memirq;
555 	struct xe_device *xe = gt_to_xe(hwe->gt);
556 
557 	if (!IS_SRIOV_VF(xe) || !xe_device_has_memirq(xe))
558 		return;
559 
560 	regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM |
561 					MI_LRI_LRM_CS_MMIO | MI_LRM_USE_GGTT;
562 	regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr;
563 	regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq);
564 
565 	regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(2) |
566 				       MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED;
567 	regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr;
568 	regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq);
569 	regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr;
570 	regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq);
571 }
572 
573 static int lrc_ring_mi_mode(struct xe_hw_engine *hwe)
574 {
575 	struct xe_device *xe = gt_to_xe(hwe->gt);
576 
577 	if (GRAPHICS_VERx100(xe) >= 1250)
578 		return 0x70;
579 	else
580 		return 0x60;
581 }
582 
583 static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe)
584 {
585 	int x;
586 
587 	x = lrc_ring_mi_mode(hwe);
588 	regs[x + 1] &= ~STOP_RING;
589 	regs[x + 1] |= STOP_RING << 16;
590 }
591 
592 static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
593 {
594 	return 0;
595 }
596 
597 u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
598 {
599 	return lrc->ring.size;
600 }
601 
/*
 * Make the magic macros work: DECL_MAP_ADDR_HELPERS() pastes together
 * __xe_lrc_<elem>_offset, so give the public pphwsp helper that name too.
 */
#define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset

/* Byte offsets, relative to the start of the PPHWSP, of driver-stored data */
#define LRC_SEQNO_PPHWSP_OFFSET 512
#define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
#define LRC_PARALLEL_PPHWSP_OFFSET 2048
/* Size of the PPHWSP; the context registers start right after it */
#define LRC_PPHWSP_SIZE SZ_4K
609 
610 static size_t lrc_reg_size(struct xe_device *xe)
611 {
612 	if (GRAPHICS_VERx100(xe) >= 1250)
613 		return 96 * sizeof(u32);
614 	else
615 		return 80 * sizeof(u32);
616 }
617 
618 size_t xe_lrc_skip_size(struct xe_device *xe)
619 {
620 	return LRC_PPHWSP_SIZE + lrc_reg_size(xe);
621 }
622 
623 static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
624 {
625 	/* The seqno is stored in the driver-defined portion of PPHWSP */
626 	return xe_lrc_pphwsp_offset(lrc) + LRC_SEQNO_PPHWSP_OFFSET;
627 }
628 
629 static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
630 {
631 	/* The start seqno is stored in the driver-defined portion of PPHWSP */
632 	return xe_lrc_pphwsp_offset(lrc) + LRC_START_SEQNO_PPHWSP_OFFSET;
633 }
634 
635 static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
636 {
637 	/* The parallel is stored in the driver-defined portion of PPHWSP */
638 	return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
639 }
640 
641 static inline u32 __xe_lrc_regs_offset(struct xe_lrc *lrc)
642 {
643 	return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
644 }
645 
/*
 * For a given @elem, generate two helpers on top of __xe_lrc_<elem>_offset():
 *
 *  - __xe_lrc_<elem>_map(): a copy of the LRC BO's vmap advanced to the
 *    element's offset (asserts that the BO mapping is not null);
 *  - __xe_lrc_<elem>_ggtt_addr(): the BO's GGTT address plus the element's
 *    offset. Marked __maybe_unused since not every element needs it.
 */
#define DECL_MAP_ADDR_HELPERS(elem) \
static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \
{ \
	struct iosys_map map = lrc->bo->vmap; \
\
	xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map));  \
	iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \
	return map; \
} \
static inline u32 __maybe_unused __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \
{ \
	return xe_bo_ggtt_addr(lrc->bo) + __xe_lrc_##elem##_offset(lrc); \
} \

DECL_MAP_ADDR_HELPERS(ring)
DECL_MAP_ADDR_HELPERS(pphwsp)
DECL_MAP_ADDR_HELPERS(seqno)
DECL_MAP_ADDR_HELPERS(regs)
DECL_MAP_ADDR_HELPERS(start_seqno)
DECL_MAP_ADDR_HELPERS(parallel)

#undef DECL_MAP_ADDR_HELPERS
668 
669 u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
670 {
671 	return __xe_lrc_pphwsp_ggtt_addr(lrc);
672 }
673 
674 u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
675 {
676 	struct xe_device *xe = lrc_to_xe(lrc);
677 	struct iosys_map map;
678 
679 	map = __xe_lrc_regs_map(lrc);
680 	iosys_map_incr(&map, reg_nr * sizeof(u32));
681 	return xe_map_read32(xe, &map);
682 }
683 
684 void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
685 {
686 	struct xe_device *xe = lrc_to_xe(lrc);
687 	struct iosys_map map;
688 
689 	map = __xe_lrc_regs_map(lrc);
690 	iosys_map_incr(&map, reg_nr * sizeof(u32));
691 	xe_map_write32(xe, &map, val);
692 }
693 
694 static void *empty_lrc_data(struct xe_hw_engine *hwe)
695 {
696 	struct xe_device *xe = gt_to_xe(hwe->gt);
697 	void *data;
698 	u32 *regs;
699 
700 	data = kzalloc(xe_lrc_size(xe, hwe->class), GFP_KERNEL);
701 	if (!data)
702 		return NULL;
703 
704 	/* 1st page: Per-Process of HW status Page */
705 	regs = data + LRC_PPHWSP_SIZE;
706 	set_offsets(regs, reg_offsets(xe, hwe->class), hwe);
707 	set_context_control(regs, hwe);
708 	set_memory_based_intr(regs, hwe);
709 	reset_stop_ring(regs, hwe);
710 
711 	return data;
712 }
713 
714 static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
715 {
716 	u64 desc = xe_vm_pdp4_descriptor(vm, lrc->tile);
717 
718 	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
719 	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
720 }
721 
/*
 * Dword indices into the context registers, for xe_lrc_write_ctx_reg(). The
 * "+ 1" selects the value slot that follows the register address written by
 * set_offsets().
 */
#define PVC_CTX_ASID		(0x2e + 1)
#define PVC_CTX_ACC_CTR_THOLD	(0x2a + 1)
724 
725 int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
726 		struct xe_exec_queue *q, struct xe_vm *vm, u32 ring_size)
727 {
728 	struct xe_gt *gt = hwe->gt;
729 	struct xe_tile *tile = gt_to_tile(gt);
730 	struct xe_device *xe = gt_to_xe(gt);
731 	struct iosys_map map;
732 	void *init_data = NULL;
733 	u32 arb_enable;
734 	int err;
735 
736 	lrc->flags = 0;
737 
738 	/*
739 	 * FIXME: Perma-pinning LRC as we don't yet support moving GGTT address
740 	 * via VM bind calls.
741 	 */
742 	lrc->bo = xe_bo_create_pin_map(xe, tile, vm,
743 				      ring_size + xe_lrc_size(xe, hwe->class),
744 				      ttm_bo_type_kernel,
745 				      XE_BO_FLAG_VRAM_IF_DGFX(tile) |
746 				      XE_BO_FLAG_GGTT |
747 				      XE_BO_FLAG_GGTT_INVALIDATE);
748 	if (IS_ERR(lrc->bo))
749 		return PTR_ERR(lrc->bo);
750 
751 	lrc->tile = gt_to_tile(hwe->gt);
752 	lrc->ring.size = ring_size;
753 	lrc->ring.tail = 0;
754 
755 	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
756 			     hwe->fence_irq, hwe->name);
757 
758 	if (!gt->default_lrc[hwe->class]) {
759 		init_data = empty_lrc_data(hwe);
760 		if (!init_data) {
761 			err = -ENOMEM;
762 			goto err_lrc_finish;
763 		}
764 	}
765 
766 	/*
767 	 * Init Per-Process of HW status Page, LRC / context state to known
768 	 * values
769 	 */
770 	map = __xe_lrc_pphwsp_map(lrc);
771 	if (!init_data) {
772 		xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE);	/* PPHWSP */
773 		xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
774 				 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
775 				 xe_lrc_size(xe, hwe->class) - LRC_PPHWSP_SIZE);
776 	} else {
777 		xe_map_memcpy_to(xe, &map, 0, init_data,
778 				 xe_lrc_size(xe, hwe->class));
779 		kfree(init_data);
780 	}
781 
782 	if (vm) {
783 		xe_lrc_set_ppgtt(lrc, vm);
784 
785 		if (vm->xef)
786 			xe_drm_client_add_bo(vm->xef->client, lrc->bo);
787 	}
788 
789 	xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
790 	xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, 0);
791 	xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);
792 	xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
793 			     RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
794 	if (xe->info.has_asid && vm)
795 		xe_lrc_write_ctx_reg(lrc, PVC_CTX_ASID, vm->usm.asid);
796 
797 	lrc->desc = LRC_VALID;
798 	lrc->desc |= FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT);
799 	/* TODO: Priority */
800 
801 	/* While this appears to have something about privileged batches or
802 	 * some such, it really just means PPGTT mode.
803 	 */
804 	if (vm)
805 		lrc->desc |= LRC_PRIVILEGE;
806 
807 	if (GRAPHICS_VERx100(xe) < 1250) {
808 		lrc->desc |= FIELD_PREP(LRC_ENGINE_INSTANCE, hwe->instance);
809 		lrc->desc |= FIELD_PREP(LRC_ENGINE_CLASS, hwe->class);
810 	}
811 
812 	arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
813 	xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));
814 
815 	map = __xe_lrc_seqno_map(lrc);
816 	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
817 
818 	map = __xe_lrc_start_seqno_map(lrc);
819 	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
820 
821 	return 0;
822 
823 err_lrc_finish:
824 	xe_lrc_finish(lrc);
825 	return err;
826 }
827 
828 void xe_lrc_finish(struct xe_lrc *lrc)
829 {
830 	xe_hw_fence_ctx_finish(&lrc->fence_ctx);
831 	xe_bo_lock(lrc->bo, false);
832 	xe_bo_unpin(lrc->bo);
833 	xe_bo_unlock(lrc->bo);
834 	xe_bo_put(lrc->bo);
835 }
836 
/*
 * xe_lrc_set_ring_head() - store a new ring head in the context image
 * @lrc: the LRC
 * @head: value written to the CTX_RING_HEAD context register
 */
void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
{
	xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
}
841 
842 u32 xe_lrc_ring_head(struct xe_lrc *lrc)
843 {
844 	return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
845 }
846 
847 u32 xe_lrc_ring_space(struct xe_lrc *lrc)
848 {
849 	const u32 head = xe_lrc_ring_head(lrc);
850 	const u32 tail = lrc->ring.tail;
851 	const u32 size = lrc->ring.size;
852 
853 	return ((head - tail - 1) & (size - 1)) + 1;
854 }
855 
856 static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
857 				const void *data, size_t size)
858 {
859 	struct xe_device *xe = lrc_to_xe(lrc);
860 
861 	iosys_map_incr(&ring, lrc->ring.tail);
862 	xe_map_memcpy_to(xe, &ring, 0, data, size);
863 	lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
864 }
865 
866 void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
867 {
868 	struct xe_device *xe = lrc_to_xe(lrc);
869 	struct iosys_map ring;
870 	u32 rhs;
871 	size_t aligned_size;
872 
873 	xe_assert(xe, IS_ALIGNED(size, 4));
874 	aligned_size = ALIGN(size, 8);
875 
876 	ring = __xe_lrc_ring_map(lrc);
877 
878 	xe_assert(xe, lrc->ring.tail < lrc->ring.size);
879 	rhs = lrc->ring.size - lrc->ring.tail;
880 	if (size > rhs) {
881 		__xe_lrc_write_ring(lrc, ring, data, rhs);
882 		__xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
883 	} else {
884 		__xe_lrc_write_ring(lrc, ring, data, size);
885 	}
886 
887 	if (aligned_size > size) {
888 		u32 noop = MI_NOOP;
889 
890 		__xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
891 	}
892 }
893 
894 u64 xe_lrc_descriptor(struct xe_lrc *lrc)
895 {
896 	return lrc->desc | xe_lrc_ggtt_addr(lrc);
897 }
898 
899 u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
900 {
901 	return __xe_lrc_seqno_ggtt_addr(lrc);
902 }
903 
904 struct dma_fence *xe_lrc_create_seqno_fence(struct xe_lrc *lrc)
905 {
906 	return &xe_hw_fence_create(&lrc->fence_ctx,
907 				   __xe_lrc_seqno_map(lrc))->dma;
908 }
909 
910 s32 xe_lrc_seqno(struct xe_lrc *lrc)
911 {
912 	struct iosys_map map = __xe_lrc_seqno_map(lrc);
913 
914 	return xe_map_read32(lrc_to_xe(lrc), &map);
915 }
916 
917 s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
918 {
919 	struct iosys_map map = __xe_lrc_start_seqno_map(lrc);
920 
921 	return xe_map_read32(lrc_to_xe(lrc), &map);
922 }
923 
924 u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
925 {
926 	return __xe_lrc_start_seqno_ggtt_addr(lrc);
927 }
928 
929 u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
930 {
931 	return __xe_lrc_parallel_ggtt_addr(lrc);
932 }
933 
934 struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
935 {
936 	return __xe_lrc_parallel_map(lrc);
937 }
938 
939 static int instr_dw(u32 cmd_header)
940 {
941 	/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
942 	if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
943 	    GFXPIPE_SINGLE_DW_CMD(0, 0))
944 		return 1;
945 
946 	/* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
947 	if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
948 		return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;
949 
950 	/* Most instructions have the # of dwords (minus 2) in 7:0 */
951 	return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
952 }
953 
954 static int dump_mi_command(struct drm_printer *p,
955 			   struct xe_gt *gt,
956 			   u32 *dw,
957 			   int remaining_dw)
958 {
959 	u32 inst_header = *dw;
960 	u32 numdw = instr_dw(inst_header);
961 	u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
962 	int num_noop;
963 
964 	/* First check for commands that don't have/use a '# DW' field */
965 	switch (inst_header & MI_OPCODE) {
966 	case MI_NOOP:
967 		num_noop = 1;
968 		while (num_noop < remaining_dw &&
969 		       (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
970 			num_noop++;
971 		drm_printf(p, "[%#010x] MI_NOOP (%d dwords)\n", inst_header, num_noop);
972 		return num_noop;
973 
974 	case MI_TOPOLOGY_FILTER:
975 		drm_printf(p, "[%#010x] MI_TOPOLOGY_FILTER\n", inst_header);
976 		return 1;
977 
978 	case MI_BATCH_BUFFER_END:
979 		drm_printf(p, "[%#010x] MI_BATCH_BUFFER_END\n", inst_header);
980 		/* Return 'remaining_dw' to consume the rest of the LRC */
981 		return remaining_dw;
982 	}
983 
984 	/*
985 	 * Any remaining commands include a # of dwords.  We should make sure
986 	 * it doesn't exceed the remaining size of the LRC.
987 	 */
988 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
989 		numdw = remaining_dw;
990 
991 	switch (inst_header & MI_OPCODE) {
992 	case MI_LOAD_REGISTER_IMM:
993 		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
994 			   inst_header, (numdw - 1) / 2);
995 		for (int i = 1; i < numdw; i += 2)
996 			drm_printf(p, " - %#6x = %#010x\n", dw[i], dw[i + 1]);
997 		return numdw;
998 
999 	case MI_LOAD_REGISTER_MEM & MI_OPCODE:
1000 		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_MEM: %s%s\n",
1001 			   inst_header,
1002 			   dw[0] & MI_LRI_LRM_CS_MMIO ? "CS_MMIO " : "",
1003 			   dw[0] & MI_LRM_USE_GGTT ? "USE_GGTT " : "");
1004 		if (numdw == 4)
1005 			drm_printf(p, " - %#6x = %#010llx\n",
1006 				   dw[1], ((u64)(dw[3]) << 32 | (u64)(dw[2])));
1007 		else
1008 			drm_printf(p, " - %*ph (%s)\n",
1009 				   (int)sizeof(u32) * (numdw - 1), dw + 1,
1010 				   numdw < 4 ? "truncated" : "malformed");
1011 		return numdw;
1012 
1013 	case MI_FORCE_WAKEUP:
1014 		drm_printf(p, "[%#010x] MI_FORCE_WAKEUP\n", inst_header);
1015 		return numdw;
1016 
1017 	default:
1018 		drm_printf(p, "[%#010x] unknown MI opcode %#x, likely %d dwords\n",
1019 			   inst_header, opcode, numdw);
1020 		return numdw;
1021 	}
1022 }
1023 
/*
 * Decode and print the GFXPIPE command starting at @dw.
 * @p: printer to emit the decoded text to
 * @gt: GT, used for the WARN on oversized commands
 * @dw: first dword of the command
 * @remaining_dw: number of dwords left in the LRC, starting at @dw
 *
 * Known commands are printed by name together with their length; anything
 * else is printed with its pipeline, opcode and subopcode fields. The length
 * comes from instr_dw(), clamped to @remaining_dw.
 *
 * Note that the MATCH()/MATCH3D() helper macros defined inside the switch
 * below stay defined after this function and refer to the local names 'p',
 * 'dw' and 'numdw'.
 *
 * Return: number of dwords consumed by the command.
 */
static int dump_gfxpipe_command(struct drm_printer *p,
				struct xe_gt *gt,
				u32 *dw,
				int remaining_dw)
{
	u32 numdw = instr_dw(*dw);
	u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
	u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
	u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);

	/*
	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
	 * remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (*dw & GFXPIPE_MATCH_MASK) {
	/* MATCH(cmd): 'cmd' is both the case value and the printed name */
#define MATCH(cmd) \
	case cmd: \
		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
		return numdw
	/* MATCH3D(cmd): the case value is CMD_<cmd>, the printed name is 'cmd' */
#define MATCH3D(cmd) \
	case CMD_##cmd: \
		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
		return numdw

	MATCH(STATE_BASE_ADDRESS);
	MATCH(STATE_SIP);
	MATCH(GPGPU_CSR_BASE_ADDRESS);
	MATCH(STATE_COMPUTE_MODE);
	MATCH3D(3DSTATE_BTD);
	MATCH(STATE_SYSTEM_MEM_FENCE_ADDRESS);
	MATCH(STATE_CONTEXT_DATA_BASE_ADDRESS);

	MATCH3D(3DSTATE_VF_STATISTICS);

	MATCH(PIPELINE_SELECT);

	MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST);
	MATCH3D(3DSTATE_CLEAR_PARAMS);
	MATCH3D(3DSTATE_DEPTH_BUFFER);
	MATCH3D(3DSTATE_STENCIL_BUFFER);
	MATCH3D(3DSTATE_HIER_DEPTH_BUFFER);
	MATCH3D(3DSTATE_VERTEX_BUFFERS);
	MATCH3D(3DSTATE_VERTEX_ELEMENTS);
	MATCH3D(3DSTATE_INDEX_BUFFER);
	MATCH3D(3DSTATE_VF);
	MATCH3D(3DSTATE_MULTISAMPLE);
	MATCH3D(3DSTATE_CC_STATE_POINTERS);
	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS);
	MATCH3D(3DSTATE_VS);
	MATCH3D(3DSTATE_GS);
	MATCH3D(3DSTATE_CLIP);
	MATCH3D(3DSTATE_SF);
	MATCH3D(3DSTATE_WM);
	MATCH3D(3DSTATE_CONSTANT_VS);
	MATCH3D(3DSTATE_CONSTANT_GS);
	MATCH3D(3DSTATE_CONSTANT_PS);
	MATCH3D(3DSTATE_SAMPLE_MASK);
	MATCH3D(3DSTATE_CONSTANT_HS);
	MATCH3D(3DSTATE_CONSTANT_DS);
	MATCH3D(3DSTATE_HS);
	MATCH3D(3DSTATE_TE);
	MATCH3D(3DSTATE_DS);
	MATCH3D(3DSTATE_STREAMOUT);
	MATCH3D(3DSTATE_SBE);
	MATCH3D(3DSTATE_PS);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
	MATCH3D(3DSTATE_CPS_POINTERS);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC);
	MATCH3D(3DSTATE_BLEND_STATE_POINTERS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS);
	MATCH3D(3DSTATE_VF_INSTANCING);
	MATCH3D(3DSTATE_VF_SGVS);
	MATCH3D(3DSTATE_VF_TOPOLOGY);
	MATCH3D(3DSTATE_WM_CHROMAKEY);
	MATCH3D(3DSTATE_PS_BLEND);
	MATCH3D(3DSTATE_WM_DEPTH_STENCIL);
	MATCH3D(3DSTATE_PS_EXTRA);
	MATCH3D(3DSTATE_RASTER);
	MATCH3D(3DSTATE_SBE_SWIZ);
	MATCH3D(3DSTATE_WM_HZ_OP);
	MATCH3D(3DSTATE_VF_COMPONENT_PACKING);
	MATCH3D(3DSTATE_VF_SGVS_2);
	MATCH3D(3DSTATE_VFG);
	MATCH3D(3DSTATE_URB_ALLOC_VS);
	MATCH3D(3DSTATE_URB_ALLOC_HS);
	MATCH3D(3DSTATE_URB_ALLOC_DS);
	MATCH3D(3DSTATE_URB_ALLOC_GS);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_0);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_1);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_2);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_3);
	MATCH3D(3DSTATE_PRIMITIVE_REPLICATION);
	MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO);
	MATCH3D(3DSTATE_AMFS);
	MATCH3D(3DSTATE_DEPTH_BOUNDS);
	MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS);
	MATCH3D(3DSTATE_CONSTANT_TS_POINTER);
	MATCH3D(3DSTATE_MESH_CONTROL);
	MATCH3D(3DSTATE_MESH_DISTRIB);
	MATCH3D(3DSTATE_TASK_REDISTRIB);
	MATCH3D(3DSTATE_MESH_SHADER);
	MATCH3D(3DSTATE_MESH_SHADER_DATA);
	MATCH3D(3DSTATE_TASK_CONTROL);
	MATCH3D(3DSTATE_TASK_SHADER);
	MATCH3D(3DSTATE_TASK_SHADER_DATA);
	MATCH3D(3DSTATE_URB_ALLOC_MESH);
	MATCH3D(3DSTATE_URB_ALLOC_TASK);
	MATCH3D(3DSTATE_CLIP_MESH);
	MATCH3D(3DSTATE_SBE_MESH);
	MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER);

	MATCH3D(3DSTATE_DRAWING_RECTANGLE);
	MATCH3D(3DSTATE_CHROMA_KEY);
	MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET);
	MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN);
	MATCH3D(3DSTATE_LINE_STIPPLE);
	MATCH3D(3DSTATE_AA_LINE_PARAMETERS);
	MATCH3D(3DSTATE_MONOFILTER_SIZE);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS);
	MATCH3D(3DSTATE_SO_DECL_LIST);
	MATCH3D(3DSTATE_SO_BUFFER);
	MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC);
	MATCH3D(3DSTATE_SAMPLE_PATTERN);
	MATCH3D(3DSTATE_3D_MODE);
	MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE);
	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS);
	MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO);

	default:
		drm_printf(p, "[%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n",
			   *dw, pipeline, opcode, subopcode, numdw);
		return numdw;
	}
}
1174 
/*
 * Print a description of the GFX_STATE instruction whose header is at @dw.
 *
 * @p: printer receiving the output
 * @gt: GT used to raise a warning if the decoded length looks wrong
 * @dw: pointer to the instruction header dword
 * @remaining_dw: number of dwords left in the buffer being decoded; the
 *                decoded instruction length is clamped to this value
 *
 * For an unrecognized opcode this prints the raw header, the opcode and the
 * (clamped) length, and returns that length in dwords.  Recognized opcodes
 * are handled by MATCH(), which is defined outside this function and
 * presumably prints the command name and returns the same length - confirm
 * against the macro definition.
 */
static int dump_gfx_state_command(struct drm_printer *p,
				  struct xe_gt *gt,
				  u32 *dw,
				  int remaining_dw)
{
	u32 numdw = instr_dw(*dw);
	u32 opcode = REG_FIELD_GET(GFX_STATE_OPCODE, *dw);

	/*
	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
	 * remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	/* Match on command type plus opcode; the length bits are ignored. */
	switch (*dw & (XE_INSTR_GFX_STATE | GFX_STATE_OPCODE)) {
	MATCH(STATE_WRITE_INLINE);

	default:
		drm_printf(p, "[%#010x] unknown GFX_STATE command (opcode=%#x), likely %d dwords\n",
			   *dw, opcode, numdw);
		return numdw;
	}
}
1199 
1200 void xe_lrc_dump_default(struct drm_printer *p,
1201 			 struct xe_gt *gt,
1202 			 enum xe_engine_class hwe_class)
1203 {
1204 	u32 *dw;
1205 	int remaining_dw, num_dw;
1206 
1207 	if (!gt->default_lrc[hwe_class]) {
1208 		drm_printf(p, "No default LRC for class %d\n", hwe_class);
1209 		return;
1210 	}
1211 
1212 	/*
1213 	 * Skip the beginning of the LRC since it contains the per-process
1214 	 * hardware status page.
1215 	 */
1216 	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
1217 	remaining_dw = (xe_lrc_size(gt_to_xe(gt), hwe_class) - LRC_PPHWSP_SIZE) / 4;
1218 
1219 	while (remaining_dw > 0) {
1220 		if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
1221 			num_dw = dump_mi_command(p, gt, dw, remaining_dw);
1222 		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
1223 			num_dw = dump_gfxpipe_command(p, gt, dw, remaining_dw);
1224 		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFX_STATE) {
1225 			num_dw = dump_gfx_state_command(p, gt, dw, remaining_dw);
1226 		} else {
1227 			num_dw = min(instr_dw(*dw), remaining_dw);
1228 			drm_printf(p, "[%#10x] Unknown instruction of type %#x, likely %d dwords\n",
1229 				   *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw),
1230 				   num_dw);
1231 		}
1232 
1233 		dw += num_dw;
1234 		remaining_dw -= num_dw;
1235 	}
1236 }
1237 
/*
 * One entry of a table of instructions to emit into a batch buffer.
 *
 * @instr holds the instruction header without its length field; the emitter
 * ORs in (num_dw - 2) for instructions that are not single-dword.
 * @num_dw is the total number of dwords the instruction occupies, header
 * included; the emitter advances the batch buffer length by this amount.
 */
struct instr_state {
	u32 instr;
	u16 num_dw;
};
1242 
/*
 * Instructions emitted by xe_lrc_emit_hwe_state_instructions() for render
 * engines on graphics versions 12.55 and 12.70 through 20.04, each paired
 * with its total size in dwords.  Entries are emitted in table order.
 *
 * On graphics version 20 and later the emitter substitutes
 * 3DSTATE_DRAWING_RECTANGLE_FAST for the final 3DSTATE_DRAWING_RECTANGLE
 * entry instead of using a separate table.
 */
static const struct instr_state xe_hpg_svg_state[] = {
	{ .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 },
	{ .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_VS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_CLIP, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SF, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_RASTER, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_HS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 },
	{ .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TE, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_GS, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
};
1295 
1296 void xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, struct xe_bb *bb)
1297 {
1298 	struct xe_gt *gt = q->hwe->gt;
1299 	struct xe_device *xe = gt_to_xe(gt);
1300 	const struct instr_state *state_table = NULL;
1301 	int state_table_size = 0;
1302 
1303 	/*
1304 	 * At the moment we only need to emit non-register state for the RCS
1305 	 * engine.
1306 	 */
1307 	if (q->hwe->class != XE_ENGINE_CLASS_RENDER)
1308 		return;
1309 
1310 	switch (GRAPHICS_VERx100(xe)) {
1311 	case 1255:
1312 	case 1270 ... 2004:
1313 		state_table = xe_hpg_svg_state;
1314 		state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
1315 		break;
1316 	default:
1317 		xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
1318 			  GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
1319 		return;
1320 	}
1321 
1322 	for (int i = 0; i < state_table_size; i++) {
1323 		u32 instr = state_table[i].instr;
1324 		u16 num_dw = state_table[i].num_dw;
1325 		bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);
1326 
1327 		xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
1328 		xe_gt_assert(gt, num_dw != 0);
1329 		xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));
1330 
1331 		/*
1332 		 * Xe2's SVG context is the same as the one on DG2 / MTL
1333 		 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
1334 		 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
1335 		 * Just make the replacement here rather than defining a
1336 		 * whole separate table for the single trivial change.
1337 		 */
1338 		if (GRAPHICS_VER(xe) >= 20 &&
1339 		    instr == CMD_3DSTATE_DRAWING_RECTANGLE)
1340 			instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;
1341 
1342 		bb->cs[bb->len] = instr;
1343 		if (!is_single_dw)
1344 			bb->cs[bb->len] |= (num_dw - 2);
1345 
1346 		bb->len += num_dw;
1347 	}
1348 }
1349 
1350 struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
1351 {
1352 	struct xe_lrc_snapshot *snapshot = kmalloc(sizeof(*snapshot), GFP_NOWAIT);
1353 
1354 	if (!snapshot)
1355 		return NULL;
1356 
1357 	snapshot->context_desc = lower_32_bits(xe_lrc_ggtt_addr(lrc));
1358 	snapshot->head = xe_lrc_ring_head(lrc);
1359 	snapshot->tail.internal = lrc->ring.tail;
1360 	snapshot->tail.memory = xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL);
1361 	snapshot->start_seqno = xe_lrc_start_seqno(lrc);
1362 	snapshot->seqno = xe_lrc_seqno(lrc);
1363 	snapshot->lrc_bo = xe_bo_get(lrc->bo);
1364 	snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
1365 	snapshot->lrc_size = lrc->bo->size - snapshot->lrc_offset;
1366 	snapshot->lrc_snapshot = NULL;
1367 	return snapshot;
1368 }
1369 
1370 void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot)
1371 {
1372 	struct xe_bo *bo;
1373 	struct iosys_map src;
1374 
1375 	if (!snapshot)
1376 		return;
1377 
1378 	bo = snapshot->lrc_bo;
1379 	snapshot->lrc_bo = NULL;
1380 
1381 	snapshot->lrc_snapshot = kvmalloc(snapshot->lrc_size, GFP_KERNEL);
1382 	if (!snapshot->lrc_snapshot)
1383 		goto put_bo;
1384 
1385 	dma_resv_lock(bo->ttm.base.resv, NULL);
1386 	if (!ttm_bo_vmap(&bo->ttm, &src)) {
1387 		xe_map_memcpy_from(xe_bo_device(bo),
1388 				   snapshot->lrc_snapshot, &src, snapshot->lrc_offset,
1389 				   snapshot->lrc_size);
1390 		ttm_bo_vunmap(&bo->ttm, &src);
1391 	} else {
1392 		kvfree(snapshot->lrc_snapshot);
1393 		snapshot->lrc_snapshot = NULL;
1394 	}
1395 	dma_resv_unlock(bo->ttm.base.resv);
1396 put_bo:
1397 	xe_bo_put(bo);
1398 }
1399 
1400 void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer *p)
1401 {
1402 	unsigned long i;
1403 
1404 	if (!snapshot)
1405 		return;
1406 
1407 	drm_printf(p, "\tHW Context Desc: 0x%08x\n", snapshot->context_desc);
1408 	drm_printf(p, "\tLRC Head: (memory) %u\n", snapshot->head);
1409 	drm_printf(p, "\tLRC Tail: (internal) %u, (memory) %u\n",
1410 		   snapshot->tail.internal, snapshot->tail.memory);
1411 	drm_printf(p, "\tStart seqno: (memory) %d\n", snapshot->start_seqno);
1412 	drm_printf(p, "\tSeqno: (memory) %d\n", snapshot->seqno);
1413 
1414 	if (!snapshot->lrc_snapshot)
1415 		return;
1416 
1417 	drm_printf(p, "\t[HWSP].length: 0x%x\n", LRC_PPHWSP_SIZE);
1418 	drm_puts(p, "\t[HWSP].data: ");
1419 	for (i = 0; i < LRC_PPHWSP_SIZE; i += sizeof(u32)) {
1420 		u32 *val = snapshot->lrc_snapshot + i;
1421 		char dumped[ASCII85_BUFSZ];
1422 
1423 		drm_puts(p, ascii85_encode(*val, dumped));
1424 	}
1425 
1426 	drm_printf(p, "\n\t[HWCTX].length: 0x%lx\n", snapshot->lrc_size - LRC_PPHWSP_SIZE);
1427 	drm_puts(p, "\t[HWCTX].data: ");
1428 	for (; i < snapshot->lrc_size; i += sizeof(u32)) {
1429 		u32 *val = snapshot->lrc_snapshot + i;
1430 		char dumped[ASCII85_BUFSZ];
1431 
1432 		drm_puts(p, ascii85_encode(*val, dumped));
1433 	}
1434 	drm_puts(p, "\n");
1435 }
1436 
1437 void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
1438 {
1439 	if (!snapshot)
1440 		return;
1441 
1442 	kvfree(snapshot->lrc_snapshot);
1443 	if (snapshot->lrc_bo)
1444 		xe_bo_put(snapshot->lrc_bo);
1445 	kfree(snapshot);
1446 }
1447