xref: /linux/drivers/gpu/drm/xe/xe_lrc.c (revision 47cf96fbe393839b125a9b694a8cfdd3f4216baa)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5 
6 #include "xe_lrc.h"
7 
8 #include <generated/xe_wa_oob.h>
9 
10 #include <linux/ascii85.h>
11 
12 #include "instructions/xe_mi_commands.h"
13 #include "instructions/xe_gfxpipe_commands.h"
14 #include "instructions/xe_gfx_state_commands.h"
15 #include "regs/xe_engine_regs.h"
16 #include "regs/xe_lrc_layout.h"
17 #include "xe_bb.h"
18 #include "xe_bo.h"
19 #include "xe_device.h"
20 #include "xe_drm_client.h"
21 #include "xe_exec_queue_types.h"
22 #include "xe_gt.h"
23 #include "xe_gt_printk.h"
24 #include "xe_hw_fence.h"
25 #include "xe_map.h"
26 #include "xe_memirq.h"
27 #include "xe_mmio.h"
28 #include "xe_sriov.h"
29 #include "xe_trace_lrc.h"
30 #include "xe_vm.h"
31 #include "xe_wa.h"
32 
33 #define LRC_VALID				BIT_ULL(0)
34 #define LRC_PRIVILEGE				BIT_ULL(8)
35 #define LRC_ADDRESSING_MODE			GENMASK_ULL(4, 3)
36 #define LRC_LEGACY_64B_CONTEXT			3
37 
38 #define LRC_ENGINE_CLASS			GENMASK_ULL(63, 61)
39 #define LRC_ENGINE_INSTANCE			GENMASK_ULL(53, 48)
40 
41 #define LRC_PPHWSP_SIZE				SZ_4K
42 #define LRC_INDIRECT_RING_STATE_SIZE		SZ_4K
43 
44 static struct xe_device *
45 lrc_to_xe(struct xe_lrc *lrc)
46 {
47 	return gt_to_xe(lrc->fence_ctx.gt);
48 }
49 
50 size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
51 {
52 	struct xe_device *xe = gt_to_xe(gt);
53 	size_t size;
54 
55 	/* Per-process HW status page (PPHWSP) */
56 	size = LRC_PPHWSP_SIZE;
57 
58 	/* Engine context image */
59 	switch (class) {
60 	case XE_ENGINE_CLASS_RENDER:
61 		if (GRAPHICS_VER(xe) >= 20)
62 			size += 3 * SZ_4K;
63 		else
64 			size += 13 * SZ_4K;
65 		break;
66 	case XE_ENGINE_CLASS_COMPUTE:
67 		if (GRAPHICS_VER(xe) >= 20)
68 			size += 2 * SZ_4K;
69 		else
70 			size += 13 * SZ_4K;
71 		break;
72 	default:
73 		WARN(1, "Unknown engine class: %d", class);
74 		fallthrough;
75 	case XE_ENGINE_CLASS_COPY:
76 	case XE_ENGINE_CLASS_VIDEO_DECODE:
77 	case XE_ENGINE_CLASS_VIDEO_ENHANCE:
78 	case XE_ENGINE_CLASS_OTHER:
79 		size += 1 * SZ_4K;
80 	}
81 
82 	/* Add indirect ring state page */
83 	if (xe_gt_has_indirect_ring_state(gt))
84 		size += LRC_INDIRECT_RING_STATE_SIZE;
85 
86 	return size;
87 }
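
/*
 * Illustrative sizing example (not authoritative): for a render engine on a
 * GRAPHICS_VER >= 20 GT with indirect ring state, the image above works out
 * to LRC_PPHWSP_SIZE (4K) + 3 * SZ_4K + LRC_INDIRECT_RING_STATE_SIZE (4K) =
 * 20K, while a pre-Xe2 render engine without indirect ring state needs
 * 4K + 13 * 4K = 56K. The ring buffer is allocated on top of this size,
 * see xe_lrc_init().
 */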
88 
89 /*
90  * The per-platform tables are u8-encoded in @data. Decode @data and write
91  * the commands and register offsets into @regs. The following encoding is used
92  * for each byte. There are 2 steps: decoding commands and decoding addresses.
93  *
94  * Commands:
95  * [7]: create NOPs - the number of NOPs is set in the lower bits
96  * [6]: When creating a MI_LOAD_REGISTER_IMM command, also set
97  *      MI_LRI_FORCE_POSTED
98  * [5:0]: Number of NOPs or registers to set values to in case of
99  *        MI_LOAD_REGISTER_IMM
100  *
101  * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
102  * number of registers. They are set by using the REG/REG16 macros: the former
103  * is used for offsets smaller than 0x200 while the latter is for values bigger
104  * than that. Those macros already set all the bits documented below correctly:
105  *
106  * [7]: When a register offset needs more than 6 bits, set this bit;
107  *      additional bytes follow with the lower bits
108  * [6:0]: Register offset, without considering the engine base.
109  *
110  * This function only tweaks the commands and register offsets. Values are not
111  * filled out.
112  */
113 static void set_offsets(u32 *regs,
114 			const u8 *data,
115 			const struct xe_hw_engine *hwe)
116 #define NOP(x) (BIT(7) | (x))
117 #define LRI(count, flags) ((flags) << 6 | (count) | \
118 			   BUILD_BUG_ON_ZERO(count >= BIT(6)))
119 #define POSTED BIT(0)
120 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
121 #define REG16(x) \
122 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
123 	(((x) >> 2) & 0x7f)
124 {
125 	const u32 base = hwe->mmio_base;
126 
127 	while (*data) {
128 		u8 count, flags;
129 
130 		if (*data & BIT(7)) { /* skip */
131 			count = *data++ & ~BIT(7);
132 			regs += count;
133 			continue;
134 		}
135 
136 		count = *data & 0x3f;
137 		flags = *data >> 6;
138 		data++;
139 
140 		*regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
141 		if (flags & POSTED)
142 			*regs |= MI_LRI_FORCE_POSTED;
143 		*regs |= MI_LRI_LRM_CS_MMIO;
144 		regs++;
145 
146 		xe_gt_assert(hwe->gt, count);
147 		do {
148 			u32 offset = 0;
149 			u8 v;
150 
151 			do {
152 				v = *data++;
153 				offset <<= 7;
154 				offset |= v & ~BIT(7);
155 			} while (v & BIT(7));
156 
157 			regs[0] = base + (offset << 2);
158 			regs += 2;
159 		} while (--count);
160 	}
161 
162 	*regs = MI_BATCH_BUFFER_END | BIT(0);
163 }
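
/*
 * Worked example of the encoding above (illustrative only): the byte sequence
 *
 *	NOP(1), LRI(2, POSTED), REG16(0x244), REG(0x034), 0
 *
 * is decoded by set_offsets() as "skip one dword", then a
 * MI_LOAD_REGISTER_IMM header for 2 registers with MI_LRI_FORCE_POSTED and
 * MI_LRI_LRM_CS_MMIO set, followed by the register offsets
 * hwe->mmio_base + 0x244 and hwe->mmio_base + 0x034, each leaving its value
 * dword untouched (values are filled in elsewhere). The terminating 0 ends
 * the table and an MI_BATCH_BUFFER_END is written after the last entry.
 */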
164 
165 static const u8 gen12_xcs_offsets[] = {
166 	NOP(1),
167 	LRI(13, POSTED),
168 	REG16(0x244),
169 	REG(0x034),
170 	REG(0x030),
171 	REG(0x038),
172 	REG(0x03c),
173 	REG(0x168),
174 	REG(0x140),
175 	REG(0x110),
176 	REG(0x1c0),
177 	REG(0x1c4),
178 	REG(0x1c8),
179 	REG(0x180),
180 	REG16(0x2b4),
181 
182 	NOP(5),
183 	LRI(9, POSTED),
184 	REG16(0x3a8),
185 	REG16(0x28c),
186 	REG16(0x288),
187 	REG16(0x284),
188 	REG16(0x280),
189 	REG16(0x27c),
190 	REG16(0x278),
191 	REG16(0x274),
192 	REG16(0x270),
193 
194 	0
195 };
196 
197 static const u8 dg2_xcs_offsets[] = {
198 	NOP(1),
199 	LRI(15, POSTED),
200 	REG16(0x244),
201 	REG(0x034),
202 	REG(0x030),
203 	REG(0x038),
204 	REG(0x03c),
205 	REG(0x168),
206 	REG(0x140),
207 	REG(0x110),
208 	REG(0x1c0),
209 	REG(0x1c4),
210 	REG(0x1c8),
211 	REG(0x180),
212 	REG16(0x2b4),
213 	REG(0x120),
214 	REG(0x124),
215 
216 	NOP(1),
217 	LRI(9, POSTED),
218 	REG16(0x3a8),
219 	REG16(0x28c),
220 	REG16(0x288),
221 	REG16(0x284),
222 	REG16(0x280),
223 	REG16(0x27c),
224 	REG16(0x278),
225 	REG16(0x274),
226 	REG16(0x270),
227 
228 	0
229 };
230 
231 static const u8 gen12_rcs_offsets[] = {
232 	NOP(1),
233 	LRI(13, POSTED),
234 	REG16(0x244),
235 	REG(0x034),
236 	REG(0x030),
237 	REG(0x038),
238 	REG(0x03c),
239 	REG(0x168),
240 	REG(0x140),
241 	REG(0x110),
242 	REG(0x1c0),
243 	REG(0x1c4),
244 	REG(0x1c8),
245 	REG(0x180),
246 	REG16(0x2b4),
247 
248 	NOP(5),
249 	LRI(9, POSTED),
250 	REG16(0x3a8),
251 	REG16(0x28c),
252 	REG16(0x288),
253 	REG16(0x284),
254 	REG16(0x280),
255 	REG16(0x27c),
256 	REG16(0x278),
257 	REG16(0x274),
258 	REG16(0x270),
259 
260 	LRI(3, POSTED),
261 	REG(0x1b0),
262 	REG16(0x5a8),
263 	REG16(0x5ac),
264 
265 	NOP(6),
266 	LRI(1, 0),
267 	REG(0x0c8),
268 	NOP(3 + 9 + 1),
269 
270 	LRI(51, POSTED),
271 	REG16(0x588),
272 	REG16(0x588),
273 	REG16(0x588),
274 	REG16(0x588),
275 	REG16(0x588),
276 	REG16(0x588),
277 	REG(0x028),
278 	REG(0x09c),
279 	REG(0x0c0),
280 	REG(0x178),
281 	REG(0x17c),
282 	REG16(0x358),
283 	REG(0x170),
284 	REG(0x150),
285 	REG(0x154),
286 	REG(0x158),
287 	REG16(0x41c),
288 	REG16(0x600),
289 	REG16(0x604),
290 	REG16(0x608),
291 	REG16(0x60c),
292 	REG16(0x610),
293 	REG16(0x614),
294 	REG16(0x618),
295 	REG16(0x61c),
296 	REG16(0x620),
297 	REG16(0x624),
298 	REG16(0x628),
299 	REG16(0x62c),
300 	REG16(0x630),
301 	REG16(0x634),
302 	REG16(0x638),
303 	REG16(0x63c),
304 	REG16(0x640),
305 	REG16(0x644),
306 	REG16(0x648),
307 	REG16(0x64c),
308 	REG16(0x650),
309 	REG16(0x654),
310 	REG16(0x658),
311 	REG16(0x65c),
312 	REG16(0x660),
313 	REG16(0x664),
314 	REG16(0x668),
315 	REG16(0x66c),
316 	REG16(0x670),
317 	REG16(0x674),
318 	REG16(0x678),
319 	REG16(0x67c),
320 	REG(0x068),
321 	REG(0x084),
322 	NOP(1),
323 
324 	0
325 };
326 
327 static const u8 xehp_rcs_offsets[] = {
328 	NOP(1),
329 	LRI(13, POSTED),
330 	REG16(0x244),
331 	REG(0x034),
332 	REG(0x030),
333 	REG(0x038),
334 	REG(0x03c),
335 	REG(0x168),
336 	REG(0x140),
337 	REG(0x110),
338 	REG(0x1c0),
339 	REG(0x1c4),
340 	REG(0x1c8),
341 	REG(0x180),
342 	REG16(0x2b4),
343 
344 	NOP(5),
345 	LRI(9, POSTED),
346 	REG16(0x3a8),
347 	REG16(0x28c),
348 	REG16(0x288),
349 	REG16(0x284),
350 	REG16(0x280),
351 	REG16(0x27c),
352 	REG16(0x278),
353 	REG16(0x274),
354 	REG16(0x270),
355 
356 	LRI(3, POSTED),
357 	REG(0x1b0),
358 	REG16(0x5a8),
359 	REG16(0x5ac),
360 
361 	NOP(6),
362 	LRI(1, 0),
363 	REG(0x0c8),
364 
365 	0
366 };
367 
368 static const u8 dg2_rcs_offsets[] = {
369 	NOP(1),
370 	LRI(15, POSTED),
371 	REG16(0x244),
372 	REG(0x034),
373 	REG(0x030),
374 	REG(0x038),
375 	REG(0x03c),
376 	REG(0x168),
377 	REG(0x140),
378 	REG(0x110),
379 	REG(0x1c0),
380 	REG(0x1c4),
381 	REG(0x1c8),
382 	REG(0x180),
383 	REG16(0x2b4),
384 	REG(0x120),
385 	REG(0x124),
386 
387 	NOP(1),
388 	LRI(9, POSTED),
389 	REG16(0x3a8),
390 	REG16(0x28c),
391 	REG16(0x288),
392 	REG16(0x284),
393 	REG16(0x280),
394 	REG16(0x27c),
395 	REG16(0x278),
396 	REG16(0x274),
397 	REG16(0x270),
398 
399 	LRI(3, POSTED),
400 	REG(0x1b0),
401 	REG16(0x5a8),
402 	REG16(0x5ac),
403 
404 	NOP(6),
405 	LRI(1, 0),
406 	REG(0x0c8),
407 
408 	0
409 };
410 
411 static const u8 mtl_rcs_offsets[] = {
412 	NOP(1),
413 	LRI(15, POSTED),
414 	REG16(0x244),
415 	REG(0x034),
416 	REG(0x030),
417 	REG(0x038),
418 	REG(0x03c),
419 	REG(0x168),
420 	REG(0x140),
421 	REG(0x110),
422 	REG(0x1c0),
423 	REG(0x1c4),
424 	REG(0x1c8),
425 	REG(0x180),
426 	REG16(0x2b4),
427 	REG(0x120),
428 	REG(0x124),
429 
430 	NOP(1),
431 	LRI(9, POSTED),
432 	REG16(0x3a8),
433 	REG16(0x28c),
434 	REG16(0x288),
435 	REG16(0x284),
436 	REG16(0x280),
437 	REG16(0x27c),
438 	REG16(0x278),
439 	REG16(0x274),
440 	REG16(0x270),
441 
442 	NOP(2),
443 	LRI(2, POSTED),
444 	REG16(0x5a8),
445 	REG16(0x5ac),
446 
447 	NOP(6),
448 	LRI(1, 0),
449 	REG(0x0c8),
450 
451 	0
452 };
453 
454 #define XE2_CTX_COMMON \
455 	NOP(1),                 /* [0x00] */ \
456 	LRI(15, POSTED),        /* [0x01] */ \
457 	REG16(0x244),           /* [0x02] CTXT_SR_CTL */ \
458 	REG(0x034),             /* [0x04] RING_BUFFER_HEAD */ \
459 	REG(0x030),             /* [0x06] RING_BUFFER_TAIL */ \
460 	REG(0x038),             /* [0x08] RING_BUFFER_START */ \
461 	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */ \
462 	REG(0x168),             /* [0x0c] BB_ADDR_UDW */ \
463 	REG(0x140),             /* [0x0e] BB_ADDR */ \
464 	REG(0x110),             /* [0x10] BB_STATE */ \
465 	REG(0x1c0),             /* [0x12] BB_PER_CTX_PTR */ \
466 	REG(0x1c4),             /* [0x14] RCS_INDIRECT_CTX */ \
467 	REG(0x1c8),             /* [0x16] RCS_INDIRECT_CTX_OFFSET */ \
468 	REG(0x180),             /* [0x18] CCID */ \
469 	REG16(0x2b4),           /* [0x1a] SEMAPHORE_TOKEN */ \
470 	REG(0x120),             /* [0x1c] PRT_BB_STATE */ \
471 	REG(0x124),             /* [0x1e] PRT_BB_STATE_UDW */ \
472 	\
473 	NOP(1),                 /* [0x20] */ \
474 	LRI(9, POSTED),         /* [0x21] */ \
475 	REG16(0x3a8),           /* [0x22] CTX_TIMESTAMP */ \
476 	REG16(0x3ac),           /* [0x24] CTX_TIMESTAMP_UDW */ \
477 	REG(0x108),             /* [0x26] INDIRECT_RING_STATE */ \
478 	REG16(0x284),           /* [0x28] dummy reg */ \
479 	REG16(0x280),           /* [0x2a] CS_ACC_CTR_THOLD */ \
480 	REG16(0x27c),           /* [0x2c] CS_CTX_SYS_PASID */ \
481 	REG16(0x278),           /* [0x2e] CS_CTX_ASID */ \
482 	REG16(0x274),           /* [0x30] PTBP_UDW */ \
483 	REG16(0x270)            /* [0x32] PTBP_LDW */
484 
485 static const u8 xe2_rcs_offsets[] = {
486 	XE2_CTX_COMMON,
487 
488 	NOP(2),                 /* [0x34] */
489 	LRI(2, POSTED),         /* [0x36] */
490 	REG16(0x5a8),           /* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
491 	REG16(0x5ac),           /* [0x39] PREEMPTION_STATUS */
492 
493 	NOP(6),                 /* [0x41] */
494 	LRI(1, 0),              /* [0x47] */
495 	REG(0x0c8),             /* [0x48] R_PWR_CLK_STATE */
496 
497 	0
498 };
499 
500 static const u8 xe2_bcs_offsets[] = {
501 	XE2_CTX_COMMON,
502 
503 	NOP(4 + 8 + 1),         /* [0x34] */
504 	LRI(2, POSTED),         /* [0x41] */
505 	REG16(0x200),           /* [0x42] BCS_SWCTRL */
506 	REG16(0x204),           /* [0x44] BLIT_CCTL */
507 
508 	0
509 };
510 
511 static const u8 xe2_xcs_offsets[] = {
512 	XE2_CTX_COMMON,
513 
514 	0
515 };
516 
517 static const u8 xe2_indirect_ring_state_offsets[] = {
518 	NOP(1),                 /* [0x00] */
519 	LRI(5, POSTED),         /* [0x01] */
520 	REG(0x034),             /* [0x02] RING_BUFFER_HEAD */
521 	REG(0x030),             /* [0x04] RING_BUFFER_TAIL */
522 	REG(0x038),             /* [0x06] RING_BUFFER_START */
523 	REG(0x048),             /* [0x08] RING_BUFFER_START_UDW */
524 	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */
525 
526 	NOP(5),                 /* [0x0c] */
527 	LRI(9, POSTED),         /* [0x11] */
528 	REG(0x168),             /* [0x12] BB_ADDR_UDW */
529 	REG(0x140),             /* [0x14] BB_ADDR */
530 	REG(0x110),             /* [0x16] BB_STATE */
531 	REG16(0x588),           /* [0x18] BB_STACK_WRITE_PORT */
532 	REG16(0x588),           /* [0x20] BB_STACK_WRITE_PORT */
533 	REG16(0x588),           /* [0x22] BB_STACK_WRITE_PORT */
534 	REG16(0x588),           /* [0x24] BB_STACK_WRITE_PORT */
535 	REG16(0x588),           /* [0x26] BB_STACK_WRITE_PORT */
536 	REG16(0x588),           /* [0x28] BB_STACK_WRITE_PORT */
537 
538 	NOP(12),                 /* [0x00] */
539 
540 	0
541 };
542 
543 #undef REG16
544 #undef REG
545 #undef LRI
546 #undef NOP
547 
548 static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
549 {
550 	if (class == XE_ENGINE_CLASS_RENDER) {
551 		if (GRAPHICS_VER(xe) >= 20)
552 			return xe2_rcs_offsets;
553 		else if (GRAPHICS_VERx100(xe) >= 1270)
554 			return mtl_rcs_offsets;
555 		else if (GRAPHICS_VERx100(xe) >= 1255)
556 			return dg2_rcs_offsets;
557 		else if (GRAPHICS_VERx100(xe) >= 1250)
558 			return xehp_rcs_offsets;
559 		else
560 			return gen12_rcs_offsets;
561 	} else if (class == XE_ENGINE_CLASS_COPY) {
562 		if (GRAPHICS_VER(xe) >= 20)
563 			return xe2_bcs_offsets;
564 		else
565 			return gen12_xcs_offsets;
566 	} else {
567 		if (GRAPHICS_VER(xe) >= 20)
568 			return xe2_xcs_offsets;
569 		else if (GRAPHICS_VERx100(xe) >= 1255)
570 			return dg2_xcs_offsets;
571 		else
572 			return gen12_xcs_offsets;
573 	}
574 }
575 
576 static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
577 {
578 	regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
579 						       CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
580 
581 	if (xe_gt_has_indirect_ring_state(hwe->gt))
582 		regs[CTX_CONTEXT_CONTROL] |=
583 			_MASKED_BIT_ENABLE(CTX_CTRL_INDIRECT_RING_STATE_ENABLE);
584 
585 	/* TODO: Timestamp */
586 }
587 
588 static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe)
589 {
590 	struct xe_memirq *memirq = &gt_to_tile(hwe->gt)->memirq;
591 	struct xe_device *xe = gt_to_xe(hwe->gt);
592 	u8 num_regs;
593 
594 	if (!xe_device_uses_memirq(xe))
595 		return;
596 
597 	regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM |
598 					MI_LRI_LRM_CS_MMIO | MI_LRM_USE_GGTT;
599 	regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr;
600 	regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq);
601 
602 	num_regs = xe_device_has_msix(xe) ? 3 : 2;
603 	regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(num_regs) |
604 				       MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED;
605 	regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr;
606 	regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq, hwe);
607 	regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr;
608 	regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq, hwe);
609 
610 	if (xe_device_has_msix(xe)) {
611 		regs[CTX_CS_INT_VEC_REG] = CS_INT_VEC(0).addr;
612 		/* CTX_CS_INT_VEC_DATA will be set in xe_lrc_init */
613 	}
614 }
615 
616 static int lrc_ring_mi_mode(struct xe_hw_engine *hwe)
617 {
618 	struct xe_device *xe = gt_to_xe(hwe->gt);
619 
620 	if (GRAPHICS_VERx100(xe) >= 1250)
621 		return 0x70;
622 	else
623 		return 0x60;
624 }
625 
626 static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe)
627 {
628 	int x;
629 
630 	x = lrc_ring_mi_mode(hwe);
631 	regs[x + 1] &= ~STOP_RING;
632 	regs[x + 1] |= STOP_RING << 16;
633 }
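
/*
 * Background note (not derived from this file): MI_MODE is a "masked"
 * register, i.e. only bits whose mask in the upper 16 bits of the written
 * value is set are updated. Clearing the STOP_RING value bit while setting
 * STOP_RING << 16 above therefore forces STOP_RING to 0 when the context
 * image is loaded, regardless of the register's previous contents.
 */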
634 
635 static inline bool xe_lrc_has_indirect_ring_state(struct xe_lrc *lrc)
636 {
637 	return lrc->flags & XE_LRC_FLAG_INDIRECT_RING_STATE;
638 }
639 
640 static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
641 {
642 	return 0;
643 }
644 
645 u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
646 {
647 	return lrc->ring.size;
648 }
649 
650 /* Make the magic macros work */
651 #define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset
652 #define __xe_lrc_regs_offset xe_lrc_regs_offset
653 
654 #define LRC_SEQNO_PPHWSP_OFFSET 512
655 #define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
656 #define LRC_CTX_JOB_TIMESTAMP_OFFSET (LRC_START_SEQNO_PPHWSP_OFFSET + 8)
657 #define LRC_PARALLEL_PPHWSP_OFFSET 2048
658 #define LRC_ENGINE_ID_PPHWSP_OFFSET 2096
659 
660 u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
661 {
662 	return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
663 }
664 
665 static size_t lrc_reg_size(struct xe_device *xe)
666 {
667 	if (GRAPHICS_VERx100(xe) >= 1250)
668 		return 96 * sizeof(u32);
669 	else
670 		return 80 * sizeof(u32);
671 }
672 
673 size_t xe_lrc_skip_size(struct xe_device *xe)
674 {
675 	return LRC_PPHWSP_SIZE + lrc_reg_size(xe);
676 }
677 
678 static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
679 {
680 	/* The seqno is stored in the driver-defined portion of PPHWSP */
681 	return xe_lrc_pphwsp_offset(lrc) + LRC_SEQNO_PPHWSP_OFFSET;
682 }
683 
684 static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
685 {
686 	/* The start seqno is stored in the driver-defined portion of PPHWSP */
687 	return xe_lrc_pphwsp_offset(lrc) + LRC_START_SEQNO_PPHWSP_OFFSET;
688 }
689 
690 static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc)
691 {
692 	/* This is stored in the driver-defined portion of PPHWSP */
693 	return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET;
694 }
695 
696 static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
697 {
698 	/* The parallel is stored in the driver-defined portion of PPHWSP */
699 	return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
700 }
701 
702 static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc)
703 {
704 	return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET;
705 }
706 
707 static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
708 {
709 	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
710 }
711 
712 static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc)
713 {
714 	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32);
715 }
716 
717 static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
718 {
719 	/* Indirect ring state page is at the very end of LRC */
720 	return lrc->size - LRC_INDIRECT_RING_STATE_SIZE;
721 }
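
/*
 * Putting the offset helpers above together, the LRC BO layout is roughly:
 *
 *	offset 0:			ring buffer (lrc->ring.size bytes)
 *	xe_lrc_pphwsp_offset():		PPHWSP (seqnos, parallel area,
 *					engine id, ...)
 *	xe_lrc_regs_offset():		context registers (CTX_*)
 *	lrc->size - 4K:			indirect ring state, if supported
 */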
722 
723 #define DECL_MAP_ADDR_HELPERS(elem) \
724 static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \
725 { \
726 	struct iosys_map map = lrc->bo->vmap; \
727 \
728 	xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map));  \
729 	iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \
730 	return map; \
731 } \
732 static inline u32 __maybe_unused __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \
733 { \
734 	return xe_bo_ggtt_addr(lrc->bo) + __xe_lrc_##elem##_offset(lrc); \
735 } \
736 
737 DECL_MAP_ADDR_HELPERS(ring)
738 DECL_MAP_ADDR_HELPERS(pphwsp)
739 DECL_MAP_ADDR_HELPERS(seqno)
740 DECL_MAP_ADDR_HELPERS(regs)
741 DECL_MAP_ADDR_HELPERS(start_seqno)
742 DECL_MAP_ADDR_HELPERS(ctx_job_timestamp)
743 DECL_MAP_ADDR_HELPERS(ctx_timestamp)
744 DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw)
745 DECL_MAP_ADDR_HELPERS(parallel)
746 DECL_MAP_ADDR_HELPERS(indirect_ring)
747 DECL_MAP_ADDR_HELPERS(engine_id)
748 
749 #undef DECL_MAP_ADDR_HELPERS
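
/*
 * For reference, DECL_MAP_ADDR_HELPERS(seqno) above expands to roughly the
 * following (__maybe_unused omitted):
 *
 *	static inline struct iosys_map __xe_lrc_seqno_map(struct xe_lrc *lrc)
 *	{
 *		struct iosys_map map = lrc->bo->vmap;
 *
 *		xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map));
 *		iosys_map_incr(&map, __xe_lrc_seqno_offset(lrc));
 *		return map;
 *	}
 *
 *	static inline u32 __xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
 *	{
 *		return xe_bo_ggtt_addr(lrc->bo) + __xe_lrc_seqno_offset(lrc);
 *	}
 *
 * i.e. one CPU-mapping helper and one GGTT-address helper per LRC element.
 */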
750 
751 /**
752  * xe_lrc_ctx_timestamp_ggtt_addr() - Get ctx timestamp GGTT address
753  * @lrc: Pointer to the lrc.
754  *
755  * Returns: ctx timestamp GGTT address
756  */
757 u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc)
758 {
759 	return __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
760 }
761 
762 /**
763  * xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address
764  * @lrc: Pointer to the lrc.
765  *
766  * Returns: ctx timestamp udw GGTT address
767  */
768 u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc)
769 {
770 	return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
771 }
772 
773 /**
774  * xe_lrc_ctx_timestamp() - Read ctx timestamp value
775  * @lrc: Pointer to the lrc.
776  *
777  * Returns: ctx timestamp value
778  */
779 u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
780 {
781 	struct xe_device *xe = lrc_to_xe(lrc);
782 	struct iosys_map map;
783 	u32 ldw, udw = 0;
784 
785 	map = __xe_lrc_ctx_timestamp_map(lrc);
786 	ldw = xe_map_read32(xe, &map);
787 
788 	if (xe->info.has_64bit_timestamp) {
789 		map = __xe_lrc_ctx_timestamp_udw_map(lrc);
790 		udw = xe_map_read32(xe, &map);
791 	}
792 
793 	return (u64)udw << 32 | ldw;
794 }
795 
796 /**
797  * xe_lrc_ctx_job_timestamp_ggtt_addr() - Get ctx job timestamp GGTT address
798  * @lrc: Pointer to the lrc.
799  *
800  * Returns: ctx job timestamp GGTT address
801  */
802 u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc)
803 {
804 	return __xe_lrc_ctx_job_timestamp_ggtt_addr(lrc);
805 }
806 
807 /**
808  * xe_lrc_ctx_job_timestamp() - Read ctx job timestamp value
809  * @lrc: Pointer to the lrc.
810  *
811  * Returns: ctx job timestamp value
812  */
813 u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc)
814 {
815 	struct xe_device *xe = lrc_to_xe(lrc);
816 	struct iosys_map map;
817 
818 	map = __xe_lrc_ctx_job_timestamp_map(lrc);
819 	return xe_map_read32(xe, &map);
820 }
821 
822 u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
823 {
824 	return __xe_lrc_pphwsp_ggtt_addr(lrc);
825 }
826 
827 u32 xe_lrc_indirect_ring_ggtt_addr(struct xe_lrc *lrc)
828 {
829 	if (!xe_lrc_has_indirect_ring_state(lrc))
830 		return 0;
831 
832 	return __xe_lrc_indirect_ring_ggtt_addr(lrc);
833 }
834 
835 static u32 xe_lrc_read_indirect_ctx_reg(struct xe_lrc *lrc, int reg_nr)
836 {
837 	struct xe_device *xe = lrc_to_xe(lrc);
838 	struct iosys_map map;
839 
840 	map = __xe_lrc_indirect_ring_map(lrc);
841 	iosys_map_incr(&map, reg_nr * sizeof(u32));
842 	return xe_map_read32(xe, &map);
843 }
844 
845 static void xe_lrc_write_indirect_ctx_reg(struct xe_lrc *lrc,
846 					  int reg_nr, u32 val)
847 {
848 	struct xe_device *xe = lrc_to_xe(lrc);
849 	struct iosys_map map;
850 
851 	map = __xe_lrc_indirect_ring_map(lrc);
852 	iosys_map_incr(&map, reg_nr * sizeof(u32));
853 	xe_map_write32(xe, &map, val);
854 }
855 
856 u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
857 {
858 	struct xe_device *xe = lrc_to_xe(lrc);
859 	struct iosys_map map;
860 
861 	map = __xe_lrc_regs_map(lrc);
862 	iosys_map_incr(&map, reg_nr * sizeof(u32));
863 	return xe_map_read32(xe, &map);
864 }
865 
866 void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
867 {
868 	struct xe_device *xe = lrc_to_xe(lrc);
869 	struct iosys_map map;
870 
871 	map = __xe_lrc_regs_map(lrc);
872 	iosys_map_incr(&map, reg_nr * sizeof(u32));
873 	xe_map_write32(xe, &map, val);
874 }
875 
876 static void *empty_lrc_data(struct xe_hw_engine *hwe)
877 {
878 	struct xe_gt *gt = hwe->gt;
879 	void *data;
880 	u32 *regs;
881 
882 	data = kzalloc(xe_gt_lrc_size(gt, hwe->class), GFP_KERNEL);
883 	if (!data)
884 		return NULL;
885 
886 	/* 1st page: Per-Process HW Status Page (PPHWSP) */
887 	regs = data + LRC_PPHWSP_SIZE;
888 	set_offsets(regs, reg_offsets(gt_to_xe(gt), hwe->class), hwe);
889 	set_context_control(regs, hwe);
890 	set_memory_based_intr(regs, hwe);
891 	reset_stop_ring(regs, hwe);
892 	if (xe_gt_has_indirect_ring_state(gt)) {
893 		regs = data + xe_gt_lrc_size(gt, hwe->class) -
894 		       LRC_INDIRECT_RING_STATE_SIZE;
895 		set_offsets(regs, xe2_indirect_ring_state_offsets, hwe);
896 	}
897 
898 	return data;
899 }
900 
901 static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
902 {
903 	u64 desc = xe_vm_pdp4_descriptor(vm, gt_to_tile(lrc->gt));
904 
905 	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
906 	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
907 }
908 
909 static void xe_lrc_finish(struct xe_lrc *lrc)
910 {
911 	xe_hw_fence_ctx_finish(&lrc->fence_ctx);
912 	xe_bo_lock(lrc->bo, false);
913 	xe_bo_unpin(lrc->bo);
914 	xe_bo_unlock(lrc->bo);
915 	xe_bo_put(lrc->bo);
916 	xe_bo_unpin_map_no_vm(lrc->bb_per_ctx_bo);
917 }
918 
919 /*
920  * xe_lrc_setup_utilization() - Setup wa bb to assist in calculating active
921  * context run ticks.
922  * @lrc: Pointer to the lrc.
923  *
924  * Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the
925  * context, but only gets updated when the context switches out. In order to
926  * check how long a context has been active before it switches out, two things
927  * are required:
928  *
929  * (1) Determine if the context is running:
930  * To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in
931  * the LRC. The value chosen is 1 since 0 is the initial value when the LRC is
932  * initialized. During a query, we just check for this value to determine if the
933  * context is active. If the context switched out, it would overwrite this
934  * location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as
935  * the last part of context restore, so reusing this LRC location will not
936  * clobber anything.
937  *
938  * (2) Calculate the time that the context has been active for:
939  * The CTX_TIMESTAMP ticks only when the context is active. If a context is
940  * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization.
941  * While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific
942  * engine instance. Since we do not know which instance the context is running
943  * on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and
944  * store it in the PPHWSP.
945  */
946 #define CONTEXT_ACTIVE 1ULL
947 static void xe_lrc_setup_utilization(struct xe_lrc *lrc)
948 {
949 	u32 *cmd;
950 
951 	cmd = lrc->bb_per_ctx_bo->vmap.vaddr;
952 
953 	*cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET;
954 	*cmd++ = ENGINE_ID(0).addr;
955 	*cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc);
956 	*cmd++ = 0;
957 
958 	*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
959 	*cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
960 	*cmd++ = 0;
961 	*cmd++ = lower_32_bits(CONTEXT_ACTIVE);
962 
963 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp) {
964 		*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
965 		*cmd++ = __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
966 		*cmd++ = 0;
967 		*cmd++ = upper_32_bits(CONTEXT_ACTIVE);
968 	}
969 
970 	*cmd++ = MI_BATCH_BUFFER_END;
971 
972 	xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
973 			     xe_bo_ggtt_addr(lrc->bb_per_ctx_bo) | 1);
974 
975 }
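
/*
 * Rough sketch of the query side that consumes the state set up above (see
 * xe_lrc_update_timestamp() below for the real implementation):
 *
 *	ts = CTX_TIMESTAMP value saved in the LRC;
 *	if (ts == CONTEXT_ACTIVE) {
 *		engine_id = value the WA BB stored in the PPHWSP;
 *		ts = RING_CTX_TIMESTAMP MMIO of that engine;
 *		re-read the LRC value to confirm the context did not
 *		switch out in the meantime;
 *	}
 */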
976 
977 #define PVC_CTX_ASID		(0x2e + 1)
978 #define PVC_CTX_ACC_CTR_THOLD	(0x2a + 1)
979 
980 static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
981 		       struct xe_vm *vm, u32 ring_size, u16 msix_vec,
982 		       u32 init_flags)
983 {
984 	struct xe_gt *gt = hwe->gt;
985 	struct xe_tile *tile = gt_to_tile(gt);
986 	struct xe_device *xe = gt_to_xe(gt);
987 	struct iosys_map map;
988 	void *init_data = NULL;
989 	u32 arb_enable;
990 	u32 lrc_size;
991 	u32 bo_flags;
992 	int err;
993 
994 	kref_init(&lrc->refcount);
995 	lrc->gt = gt;
996 	lrc->flags = 0;
997 	lrc_size = ring_size + xe_gt_lrc_size(gt, hwe->class);
998 	if (xe_gt_has_indirect_ring_state(gt))
999 		lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;
1000 
1001 	bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
1002 		   XE_BO_FLAG_GGTT_INVALIDATE;
1003 	if (vm && vm->xef) /* userspace */
1004 		bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE;
1005 
1006 	/*
1007 	 * FIXME: Perma-pinning LRC as we don't yet support moving GGTT address
1008 	 * via VM bind calls.
1009 	 */
1010 	lrc->bo = xe_bo_create_pin_map(xe, tile, vm, lrc_size,
1011 				       ttm_bo_type_kernel,
1012 				       bo_flags);
1013 	if (IS_ERR(lrc->bo))
1014 		return PTR_ERR(lrc->bo);
1015 
1016 	lrc->bb_per_ctx_bo = xe_bo_create_pin_map(xe, tile, NULL, SZ_4K,
1017 						  ttm_bo_type_kernel,
1018 						  bo_flags);
1019 	if (IS_ERR(lrc->bb_per_ctx_bo)) {
1020 		err = PTR_ERR(lrc->bb_per_ctx_bo);
1021 		goto err_lrc_finish;
1022 	}
1023 
1024 	lrc->size = lrc_size;
1025 	lrc->ring.size = ring_size;
1026 	lrc->ring.tail = 0;
1027 
1028 	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
1029 			     hwe->fence_irq, hwe->name);
1030 
1031 	if (!gt->default_lrc[hwe->class]) {
1032 		init_data = empty_lrc_data(hwe);
1033 		if (!init_data) {
1034 			err = -ENOMEM;
1035 			goto err_lrc_finish;
1036 		}
1037 	}
1038 
1039 	/*
1040 	 * Init the Per-Process HW Status Page (PPHWSP) and the LRC / context
1041 	 * state to known values
1042 	 */
1043 	map = __xe_lrc_pphwsp_map(lrc);
1044 	if (!init_data) {
1045 		xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE);	/* PPHWSP */
1046 		xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
1047 				 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
1048 				 xe_gt_lrc_size(gt, hwe->class) - LRC_PPHWSP_SIZE);
1049 	} else {
1050 		xe_map_memcpy_to(xe, &map, 0, init_data,
1051 				 xe_gt_lrc_size(gt, hwe->class));
1052 		kfree(init_data);
1053 	}
1054 
1055 	if (vm) {
1056 		xe_lrc_set_ppgtt(lrc, vm);
1057 
1058 		if (vm->xef)
1059 			xe_drm_client_add_bo(vm->xef->client, lrc->bo);
1060 	}
1061 
1062 	if (xe_device_has_msix(xe)) {
1063 		xe_lrc_write_ctx_reg(lrc, CTX_INT_STATUS_REPORT_PTR,
1064 				     xe_memirq_status_ptr(&tile->memirq, hwe));
1065 		xe_lrc_write_ctx_reg(lrc, CTX_INT_SRC_REPORT_PTR,
1066 				     xe_memirq_source_ptr(&tile->memirq, hwe));
1067 		xe_lrc_write_ctx_reg(lrc, CTX_CS_INT_VEC_DATA, msix_vec << 16 | msix_vec);
1068 	}
1069 
1070 	if (xe_gt_has_indirect_ring_state(gt)) {
1071 		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
1072 				     __xe_lrc_indirect_ring_ggtt_addr(lrc));
1073 
1074 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
1075 					      __xe_lrc_ring_ggtt_addr(lrc));
1076 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START_UDW, 0);
1077 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, 0);
1078 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, lrc->ring.tail);
1079 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_CTL,
1080 					      RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
1081 	} else {
1082 		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
1083 		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, 0);
1084 		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);
1085 		xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
1086 				     RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
1087 	}
1088 
1089 	if (init_flags & XE_LRC_CREATE_RUNALONE)
1090 		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
1091 				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
1092 				     _MASKED_BIT_ENABLE(CTX_CTRL_RUN_ALONE));
1093 
1094 	if (init_flags & XE_LRC_CREATE_PXP)
1095 		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
1096 				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
1097 				     _MASKED_BIT_ENABLE(CTX_CTRL_PXP_ENABLE));
1098 
1099 	lrc->ctx_timestamp = 0;
1100 	xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0);
1101 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
1102 		xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP_UDW, 0);
1103 
1104 	if (xe->info.has_asid && vm)
1105 		xe_lrc_write_ctx_reg(lrc, PVC_CTX_ASID, vm->usm.asid);
1106 
1107 	lrc->desc = LRC_VALID;
1108 	lrc->desc |= FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT);
1109 	/* TODO: Priority */
1110 
1111 	/* While this appears to have something about privileged batches or
1112 	 * some such, it really just means PPGTT mode.
1113 	 */
1114 	if (vm)
1115 		lrc->desc |= LRC_PRIVILEGE;
1116 
1117 	if (GRAPHICS_VERx100(xe) < 1250) {
1118 		lrc->desc |= FIELD_PREP(LRC_ENGINE_INSTANCE, hwe->instance);
1119 		lrc->desc |= FIELD_PREP(LRC_ENGINE_CLASS, hwe->class);
1120 	}
1121 
1122 	arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1123 	xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));
1124 
1125 	map = __xe_lrc_seqno_map(lrc);
1126 	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
1127 
1128 	map = __xe_lrc_start_seqno_map(lrc);
1129 	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
1130 
1131 	xe_lrc_setup_utilization(lrc);
1132 
1133 	return 0;
1134 
1135 err_lrc_finish:
1136 	xe_lrc_finish(lrc);
1137 	return err;
1138 }
1139 
1140 /**
1141  * xe_lrc_create - Create a LRC
1142  * @hwe: Hardware Engine
1143  * @vm: The VM (address space)
1144  * @ring_size: LRC ring size
1145  * @msix_vec: MSI-X interrupt vector (for platforms that support it)
1146  * @flags: LRC initialization flags
1147  *
1148  * Allocate and initialize the Logical Ring Context (LRC).
1149  *
1150  * Return pointer to created LRC upon success and an error pointer
1151  * upon failure.
1152  */
1153 struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
1154 			     u32 ring_size, u16 msix_vec, u32 flags)
1155 {
1156 	struct xe_lrc *lrc;
1157 	int err;
1158 
1159 	lrc = kzalloc(sizeof(*lrc), GFP_KERNEL);
1160 	if (!lrc)
1161 		return ERR_PTR(-ENOMEM);
1162 
1163 	err = xe_lrc_init(lrc, hwe, vm, ring_size, msix_vec, flags);
1164 	if (err) {
1165 		kfree(lrc);
1166 		return ERR_PTR(err);
1167 	}
1168 
1169 	return lrc;
1170 }
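
/*
 * Typical submission-side usage (sketch, not lifted from a specific caller):
 * create the LRC against the queue's VM, emit commands with
 * xe_lrc_write_ring(), publish the new tail with xe_lrc_set_ring_tail() and
 * hand xe_lrc_descriptor() to the submission backend; when the last reference
 * is dropped, xe_lrc_destroy() below releases the backing BOs.
 */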
1171 
1172 /**
1173  * xe_lrc_destroy - Destroy the LRC
1174  * @ref: reference to LRC
1175  *
1176  * Called when ref == 0, release resources held by the Logical Ring Context
1177  * (LRC) and free the LRC memory.
1178  */
1179 void xe_lrc_destroy(struct kref *ref)
1180 {
1181 	struct xe_lrc *lrc = container_of(ref, struct xe_lrc, refcount);
1182 
1183 	xe_lrc_finish(lrc);
1184 	kfree(lrc);
1185 }
1186 
1187 void xe_lrc_set_ring_tail(struct xe_lrc *lrc, u32 tail)
1188 {
1189 	if (xe_lrc_has_indirect_ring_state(lrc))
1190 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, tail);
1191 	else
1192 		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, tail);
1193 }
1194 
1195 u32 xe_lrc_ring_tail(struct xe_lrc *lrc)
1196 {
1197 	if (xe_lrc_has_indirect_ring_state(lrc))
1198 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL) & TAIL_ADDR;
1199 	else
1200 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL) & TAIL_ADDR;
1201 }
1202 
1203 static u32 xe_lrc_ring_start(struct xe_lrc *lrc)
1204 {
1205 	if (xe_lrc_has_indirect_ring_state(lrc))
1206 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START);
1207 	else
1208 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_START);
1209 }
1210 
1211 void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
1212 {
1213 	if (xe_lrc_has_indirect_ring_state(lrc))
1214 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, head);
1215 	else
1216 		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
1217 }
1218 
1219 u32 xe_lrc_ring_head(struct xe_lrc *lrc)
1220 {
1221 	if (xe_lrc_has_indirect_ring_state(lrc))
1222 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD) & HEAD_ADDR;
1223 	else
1224 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
1225 }
1226 
1227 u32 xe_lrc_ring_space(struct xe_lrc *lrc)
1228 {
1229 	const u32 head = xe_lrc_ring_head(lrc);
1230 	const u32 tail = lrc->ring.tail;
1231 	const u32 size = lrc->ring.size;
1232 
1233 	return ((head - tail - 1) & (size - 1)) + 1;
1234 }
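
/*
 * Example of the arithmetic above (illustrative, assuming a 16K ring): with
 * head == tail the expression is ((0 - 1) & 0x3fff) + 1 == 16384, i.e. an
 * empty ring reports the full size as free; the -1/+1 pairing also keeps the
 * result non-zero, so head == tail always means "empty" rather than "full".
 */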
1235 
1236 static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
1237 				const void *data, size_t size)
1238 {
1239 	struct xe_device *xe = lrc_to_xe(lrc);
1240 
1241 	iosys_map_incr(&ring, lrc->ring.tail);
1242 	xe_map_memcpy_to(xe, &ring, 0, data, size);
1243 	lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
1244 }
1245 
1246 void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
1247 {
1248 	struct xe_device *xe = lrc_to_xe(lrc);
1249 	struct iosys_map ring;
1250 	u32 rhs;
1251 	size_t aligned_size;
1252 
1253 	xe_assert(xe, IS_ALIGNED(size, 4));
1254 	aligned_size = ALIGN(size, 8);
1255 
1256 	ring = __xe_lrc_ring_map(lrc);
1257 
1258 	xe_assert(xe, lrc->ring.tail < lrc->ring.size);
1259 	rhs = lrc->ring.size - lrc->ring.tail;
1260 	if (size > rhs) {
1261 		__xe_lrc_write_ring(lrc, ring, data, rhs);
1262 		__xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
1263 	} else {
1264 		__xe_lrc_write_ring(lrc, ring, data, size);
1265 	}
1266 
1267 	if (aligned_size > size) {
1268 		u32 noop = MI_NOOP;
1269 
1270 		__xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
1271 	}
1272 }
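
/*
 * Usage note (illustrative): a 12-byte write with only 8 bytes left before
 * the end of the ring is split by the code above into an 8-byte copy at the
 * current tail and a 4-byte copy from offset 0, after which the tail has
 * wrapped; a 4-byte write gets one MI_NOOP appended so the tail stays
 * 8-byte aligned.
 */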
1273 
1274 u64 xe_lrc_descriptor(struct xe_lrc *lrc)
1275 {
1276 	return lrc->desc | xe_lrc_ggtt_addr(lrc);
1277 }
1278 
1279 u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
1280 {
1281 	return __xe_lrc_seqno_ggtt_addr(lrc);
1282 }
1283 
1284 /**
1285  * xe_lrc_alloc_seqno_fence() - Allocate an lrc seqno fence.
1286  *
1287  * Allocate but don't initialize an lrc seqno fence.
1288  *
1289  * Return: Pointer to the allocated fence or
1290  * negative error pointer on error.
1291  */
1292 struct dma_fence *xe_lrc_alloc_seqno_fence(void)
1293 {
1294 	return xe_hw_fence_alloc();
1295 }
1296 
1297 /**
1298  * xe_lrc_free_seqno_fence() - Free an lrc seqno fence.
1299  * @fence: Pointer to the fence to free.
1300  *
1301  * Frees an lrc seqno fence that hasn't yet been
1302  * initialized.
1303  */
1304 void xe_lrc_free_seqno_fence(struct dma_fence *fence)
1305 {
1306 	xe_hw_fence_free(fence);
1307 }
1308 
1309 /**
1310  * xe_lrc_init_seqno_fence() - Initialize an lrc seqno fence.
1311  * @lrc: Pointer to the lrc.
1312  * @fence: Pointer to the fence to initialize.
1313  *
1314  * Initializes a pre-allocated lrc seqno fence.
1315  * After initialization, the fence is subject to normal
1316  * dma-fence refcounting.
1317  */
1318 void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence)
1319 {
1320 	xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc));
1321 }
1322 
1323 s32 xe_lrc_seqno(struct xe_lrc *lrc)
1324 {
1325 	struct iosys_map map = __xe_lrc_seqno_map(lrc);
1326 
1327 	return xe_map_read32(lrc_to_xe(lrc), &map);
1328 }
1329 
1330 s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
1331 {
1332 	struct iosys_map map = __xe_lrc_start_seqno_map(lrc);
1333 
1334 	return xe_map_read32(lrc_to_xe(lrc), &map);
1335 }
1336 
1337 u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
1338 {
1339 	return __xe_lrc_start_seqno_ggtt_addr(lrc);
1340 }
1341 
1342 u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
1343 {
1344 	return __xe_lrc_parallel_ggtt_addr(lrc);
1345 }
1346 
1347 struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
1348 {
1349 	return __xe_lrc_parallel_map(lrc);
1350 }
1351 
1352 /**
1353  * xe_lrc_engine_id() - Read engine id value
1354  * @lrc: Pointer to the lrc.
1355  *
1356  * Returns: engine id value
1357  */
1358 static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
1359 {
1360 	struct xe_device *xe = lrc_to_xe(lrc);
1361 	struct iosys_map map;
1362 
1363 	map = __xe_lrc_engine_id_map(lrc);
1364 	return xe_map_read32(xe, &map);
1365 }
1366 
1367 static int instr_dw(u32 cmd_header)
1368 {
1369 	/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
1370 	if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
1371 	    GFXPIPE_SINGLE_DW_CMD(0, 0))
1372 		return 1;
1373 
1374 	/* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
1375 	if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
1376 		return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;
1377 
1378 	/* Most instructions have the # of dwords (minus 2) in 7:0 */
1379 	return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
1380 }
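
/*
 * Example (illustrative): a header with 0x3 in bits 7:0 describes a 5-dword
 * instruction (the length field is "total dwords - 2"), while a
 * 3DSTATE_SO_DECL_LIST header with 0x1ff in its 9-bit length field would be
 * 0x1ff + 2 = 513 dwords.
 */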
1381 
1382 static int dump_mi_command(struct drm_printer *p,
1383 			   struct xe_gt *gt,
1384 			   u32 *dw,
1385 			   int remaining_dw)
1386 {
1387 	u32 inst_header = *dw;
1388 	u32 numdw = instr_dw(inst_header);
1389 	u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
1390 	int num_noop;
1391 
1392 	/* First check for commands that don't have/use a '# DW' field */
1393 	switch (inst_header & MI_OPCODE) {
1394 	case MI_NOOP:
1395 		num_noop = 1;
1396 		while (num_noop < remaining_dw &&
1397 		       (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
1398 			num_noop++;
1399 		drm_printf(p, "[%#010x] MI_NOOP (%d dwords)\n", inst_header, num_noop);
1400 		return num_noop;
1401 
1402 	case MI_TOPOLOGY_FILTER:
1403 		drm_printf(p, "[%#010x] MI_TOPOLOGY_FILTER\n", inst_header);
1404 		return 1;
1405 
1406 	case MI_BATCH_BUFFER_END:
1407 		drm_printf(p, "[%#010x] MI_BATCH_BUFFER_END\n", inst_header);
1408 		/* Return 'remaining_dw' to consume the rest of the LRC */
1409 		return remaining_dw;
1410 	}
1411 
1412 	/*
1413 	 * Any remaining commands include a # of dwords.  We should make sure
1414 	 * it doesn't exceed the remaining size of the LRC.
1415 	 */
1416 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
1417 		numdw = remaining_dw;
1418 
1419 	switch (inst_header & MI_OPCODE) {
1420 	case MI_LOAD_REGISTER_IMM:
1421 		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
1422 			   inst_header, (numdw - 1) / 2);
1423 		for (int i = 1; i < numdw; i += 2)
1424 			drm_printf(p, " - %#6x = %#010x\n", dw[i], dw[i + 1]);
1425 		return numdw;
1426 
1427 	case MI_LOAD_REGISTER_MEM & MI_OPCODE:
1428 		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_MEM: %s%s\n",
1429 			   inst_header,
1430 			   dw[0] & MI_LRI_LRM_CS_MMIO ? "CS_MMIO " : "",
1431 			   dw[0] & MI_LRM_USE_GGTT ? "USE_GGTT " : "");
1432 		if (numdw == 4)
1433 			drm_printf(p, " - %#6x = %#010llx\n",
1434 				   dw[1], ((u64)(dw[3]) << 32 | (u64)(dw[2])));
1435 		else
1436 			drm_printf(p, " - %*ph (%s)\n",
1437 				   (int)sizeof(u32) * (numdw - 1), dw + 1,
1438 				   numdw < 4 ? "truncated" : "malformed");
1439 		return numdw;
1440 
1441 	case MI_FORCE_WAKEUP:
1442 		drm_printf(p, "[%#010x] MI_FORCE_WAKEUP\n", inst_header);
1443 		return numdw;
1444 
1445 	default:
1446 		drm_printf(p, "[%#010x] unknown MI opcode %#x, likely %d dwords\n",
1447 			   inst_header, opcode, numdw);
1448 		return numdw;
1449 	}
1450 }
1451 
1452 static int dump_gfxpipe_command(struct drm_printer *p,
1453 				struct xe_gt *gt,
1454 				u32 *dw,
1455 				int remaining_dw)
1456 {
1457 	u32 numdw = instr_dw(*dw);
1458 	u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
1459 	u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
1460 	u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);
1461 
1462 	/*
1463 	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
1464 	 * remaining size of the LRC.
1465 	 */
1466 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
1467 		numdw = remaining_dw;
1468 
1469 	switch (*dw & GFXPIPE_MATCH_MASK) {
1470 #define MATCH(cmd) \
1471 	case cmd: \
1472 		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
1473 		return numdw
1474 #define MATCH3D(cmd) \
1475 	case CMD_##cmd: \
1476 		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
1477 		return numdw
1478 
1479 	MATCH(STATE_BASE_ADDRESS);
1480 	MATCH(STATE_SIP);
1481 	MATCH(GPGPU_CSR_BASE_ADDRESS);
1482 	MATCH(STATE_COMPUTE_MODE);
1483 	MATCH3D(3DSTATE_BTD);
1484 	MATCH(STATE_SYSTEM_MEM_FENCE_ADDRESS);
1485 	MATCH(STATE_CONTEXT_DATA_BASE_ADDRESS);
1486 
1487 	MATCH3D(3DSTATE_VF_STATISTICS);
1488 
1489 	MATCH(PIPELINE_SELECT);
1490 
1491 	MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST);
1492 	MATCH3D(3DSTATE_CLEAR_PARAMS);
1493 	MATCH3D(3DSTATE_DEPTH_BUFFER);
1494 	MATCH3D(3DSTATE_STENCIL_BUFFER);
1495 	MATCH3D(3DSTATE_HIER_DEPTH_BUFFER);
1496 	MATCH3D(3DSTATE_VERTEX_BUFFERS);
1497 	MATCH3D(3DSTATE_VERTEX_ELEMENTS);
1498 	MATCH3D(3DSTATE_INDEX_BUFFER);
1499 	MATCH3D(3DSTATE_VF);
1500 	MATCH3D(3DSTATE_MULTISAMPLE);
1501 	MATCH3D(3DSTATE_CC_STATE_POINTERS);
1502 	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS);
1503 	MATCH3D(3DSTATE_VS);
1504 	MATCH3D(3DSTATE_GS);
1505 	MATCH3D(3DSTATE_CLIP);
1506 	MATCH3D(3DSTATE_SF);
1507 	MATCH3D(3DSTATE_WM);
1508 	MATCH3D(3DSTATE_CONSTANT_VS);
1509 	MATCH3D(3DSTATE_CONSTANT_GS);
1510 	MATCH3D(3DSTATE_CONSTANT_PS);
1511 	MATCH3D(3DSTATE_SAMPLE_MASK);
1512 	MATCH3D(3DSTATE_CONSTANT_HS);
1513 	MATCH3D(3DSTATE_CONSTANT_DS);
1514 	MATCH3D(3DSTATE_HS);
1515 	MATCH3D(3DSTATE_TE);
1516 	MATCH3D(3DSTATE_DS);
1517 	MATCH3D(3DSTATE_STREAMOUT);
1518 	MATCH3D(3DSTATE_SBE);
1519 	MATCH3D(3DSTATE_PS);
1520 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
1521 	MATCH3D(3DSTATE_CPS_POINTERS);
1522 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC);
1523 	MATCH3D(3DSTATE_BLEND_STATE_POINTERS);
1524 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS);
1525 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS);
1526 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS);
1527 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS);
1528 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS);
1529 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS);
1530 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS);
1531 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS);
1532 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS);
1533 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS);
1534 	MATCH3D(3DSTATE_VF_INSTANCING);
1535 	MATCH3D(3DSTATE_VF_SGVS);
1536 	MATCH3D(3DSTATE_VF_TOPOLOGY);
1537 	MATCH3D(3DSTATE_WM_CHROMAKEY);
1538 	MATCH3D(3DSTATE_PS_BLEND);
1539 	MATCH3D(3DSTATE_WM_DEPTH_STENCIL);
1540 	MATCH3D(3DSTATE_PS_EXTRA);
1541 	MATCH3D(3DSTATE_RASTER);
1542 	MATCH3D(3DSTATE_SBE_SWIZ);
1543 	MATCH3D(3DSTATE_WM_HZ_OP);
1544 	MATCH3D(3DSTATE_VF_COMPONENT_PACKING);
1545 	MATCH3D(3DSTATE_VF_SGVS_2);
1546 	MATCH3D(3DSTATE_VFG);
1547 	MATCH3D(3DSTATE_URB_ALLOC_VS);
1548 	MATCH3D(3DSTATE_URB_ALLOC_HS);
1549 	MATCH3D(3DSTATE_URB_ALLOC_DS);
1550 	MATCH3D(3DSTATE_URB_ALLOC_GS);
1551 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_0);
1552 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_1);
1553 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_2);
1554 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_3);
1555 	MATCH3D(3DSTATE_PRIMITIVE_REPLICATION);
1556 	MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO);
1557 	MATCH3D(3DSTATE_AMFS);
1558 	MATCH3D(3DSTATE_DEPTH_BOUNDS);
1559 	MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS);
1560 	MATCH3D(3DSTATE_CONSTANT_TS_POINTER);
1561 	MATCH3D(3DSTATE_MESH_CONTROL);
1562 	MATCH3D(3DSTATE_MESH_DISTRIB);
1563 	MATCH3D(3DSTATE_TASK_REDISTRIB);
1564 	MATCH3D(3DSTATE_MESH_SHADER);
1565 	MATCH3D(3DSTATE_MESH_SHADER_DATA);
1566 	MATCH3D(3DSTATE_TASK_CONTROL);
1567 	MATCH3D(3DSTATE_TASK_SHADER);
1568 	MATCH3D(3DSTATE_TASK_SHADER_DATA);
1569 	MATCH3D(3DSTATE_URB_ALLOC_MESH);
1570 	MATCH3D(3DSTATE_URB_ALLOC_TASK);
1571 	MATCH3D(3DSTATE_CLIP_MESH);
1572 	MATCH3D(3DSTATE_SBE_MESH);
1573 	MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER);
1574 	MATCH3D(3DSTATE_COARSE_PIXEL);
1575 
1576 	MATCH3D(3DSTATE_DRAWING_RECTANGLE);
1577 	MATCH3D(3DSTATE_CHROMA_KEY);
1578 	MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET);
1579 	MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN);
1580 	MATCH3D(3DSTATE_LINE_STIPPLE);
1581 	MATCH3D(3DSTATE_AA_LINE_PARAMETERS);
1582 	MATCH3D(3DSTATE_MONOFILTER_SIZE);
1583 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS);
1584 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS);
1585 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS);
1586 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS);
1587 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS);
1588 	MATCH3D(3DSTATE_SO_DECL_LIST);
1589 	MATCH3D(3DSTATE_SO_BUFFER);
1590 	MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC);
1591 	MATCH3D(3DSTATE_SAMPLE_PATTERN);
1592 	MATCH3D(3DSTATE_3D_MODE);
1593 	MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE);
1594 	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS);
1595 	MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO);
1596 
1597 	default:
1598 		drm_printf(p, "[%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n",
1599 			   *dw, pipeline, opcode, subopcode, numdw);
1600 		return numdw;
1601 	}
1602 }
1603 
1604 static int dump_gfx_state_command(struct drm_printer *p,
1605 				  struct xe_gt *gt,
1606 				  u32 *dw,
1607 				  int remaining_dw)
1608 {
1609 	u32 numdw = instr_dw(*dw);
1610 	u32 opcode = REG_FIELD_GET(GFX_STATE_OPCODE, *dw);
1611 
1612 	/*
1613 	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
1614 	 * remaining size of the LRC.
1615 	 */
1616 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
1617 		numdw = remaining_dw;
1618 
1619 	switch (*dw & (XE_INSTR_GFX_STATE | GFX_STATE_OPCODE)) {
1620 	MATCH(STATE_WRITE_INLINE);
1621 
1622 	default:
1623 		drm_printf(p, "[%#010x] unknown GFX_STATE command (opcode=%#x), likely %d dwords\n",
1624 			   *dw, opcode, numdw);
1625 		return numdw;
1626 	}
1627 }
1628 
1629 void xe_lrc_dump_default(struct drm_printer *p,
1630 			 struct xe_gt *gt,
1631 			 enum xe_engine_class hwe_class)
1632 {
1633 	u32 *dw;
1634 	int remaining_dw, num_dw;
1635 
1636 	if (!gt->default_lrc[hwe_class]) {
1637 		drm_printf(p, "No default LRC for class %d\n", hwe_class);
1638 		return;
1639 	}
1640 
1641 	/*
1642 	 * Skip the beginning of the LRC since it contains the per-process
1643 	 * hardware status page.
1644 	 */
1645 	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
1646 	remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;
1647 
1648 	while (remaining_dw > 0) {
1649 		if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
1650 			num_dw = dump_mi_command(p, gt, dw, remaining_dw);
1651 		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
1652 			num_dw = dump_gfxpipe_command(p, gt, dw, remaining_dw);
1653 		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFX_STATE) {
1654 			num_dw = dump_gfx_state_command(p, gt, dw, remaining_dw);
1655 		} else {
1656 			num_dw = min(instr_dw(*dw), remaining_dw);
1657 			drm_printf(p, "[%#10x] Unknown instruction of type %#x, likely %d dwords\n",
1658 				   *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw),
1659 				   num_dw);
1660 		}
1661 
1662 		dw += num_dw;
1663 		remaining_dw -= num_dw;
1664 	}
1665 }
1666 
1667 struct instr_state {
1668 	u32 instr;
1669 	u16 num_dw;
1670 };
1671 
1672 static const struct instr_state xe_hpg_svg_state[] = {
1673 	{ .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 },
1674 	{ .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 },
1675 	{ .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 },
1676 	{ .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 },
1677 	{ .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 },
1678 	{ .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 },
1679 	{ .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 },
1680 	{ .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 },
1681 	{ .instr = CMD_3DSTATE_VS, .num_dw = 9 },
1682 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 },
1683 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 },
1684 	{ .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 },
1685 	{ .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 },
1686 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 },
1687 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 },
1688 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 },
1689 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 },
1690 	{ .instr = CMD_3DSTATE_CLIP, .num_dw = 4 },
1691 	{ .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 },
1692 	{ .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 },
1693 	{ .instr = CMD_3DSTATE_SF, .num_dw = 4 },
1694 	{ .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 },
1695 	{ .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 },
1696 	{ .instr = CMD_3DSTATE_RASTER, .num_dw = 5 },
1697 	{ .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 },
1698 	{ .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 },
1699 	{ .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 },
1700 	{ .instr = CMD_3DSTATE_HS, .num_dw = 9 },
1701 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 },
1702 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 },
1703 	{ .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 },
1704 	{ .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 },
1705 	{ .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 },
1706 	{ .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 },
1707 	{ .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 },
1708 	{ .instr = CMD_3DSTATE_TE, .num_dw = 5 },
1709 	{ .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 },
1710 	{ .instr = CMD_3DSTATE_DS, .num_dw = 11 },
1711 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
1712 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
1713 	{ .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
1714 	{ .instr = CMD_3DSTATE_GS, .num_dw = 10 },
1715 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
1716 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
1717 	{ .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
1718 	{ .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
1719 	{ .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
1720 	{ .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
1721 	{ .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
1722 	{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
1723 };
1724 
1725 void xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, struct xe_bb *bb)
1726 {
1727 	struct xe_gt *gt = q->hwe->gt;
1728 	struct xe_device *xe = gt_to_xe(gt);
1729 	const struct instr_state *state_table = NULL;
1730 	int state_table_size = 0;
1731 
1732 	/*
1733 	 * Wa_14019789679
1734 	 *
1735 	 * If the driver doesn't explicitly emit the SVG instructions while
1736 	 * setting up the default LRC, the context switch will write 0's
1737 	 * (noops) into the LRC memory rather than the expected instruction
1738 	 * headers.  Application contexts start out as a copy of the default
1739 	 * LRC, and if they also do not emit specific settings for some SVG
1740 	 * state, then on context restore they'll unintentionally inherit
1741 	 * whatever state setting the previous context had programmed into the
1742 	 * hardware (i.e., the lack of a 3DSTATE_* instruction in the LRC will
1743 	 * prevent the hardware from resetting that state back to any specific
1744 	 * value).
1745 	 *
1746 	 * The official workaround only requires emitting 3DSTATE_MESH_CONTROL
1747 	 * since that's a specific state setting that can easily cause GPU
1748 	 * hangs if unintentionally inherited.  However to be safe we'll
1749 	 * continue to emit all of the SVG state since it's best not to leak
1750 	 * any of the state between contexts, even if that leakage is harmless.
1751 	 */
1752 	if (XE_WA(gt, 14019789679) && q->hwe->class == XE_ENGINE_CLASS_RENDER) {
1753 		state_table = xe_hpg_svg_state;
1754 		state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
1755 	}
1756 
1757 	if (!state_table) {
1758 		xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
1759 			  GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
1760 		return;
1761 	}
1762 
1763 	for (int i = 0; i < state_table_size; i++) {
1764 		u32 instr = state_table[i].instr;
1765 		u16 num_dw = state_table[i].num_dw;
1766 		bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);
1767 
1768 		xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
1769 		xe_gt_assert(gt, num_dw != 0);
1770 		xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));
1771 
1772 		/*
1773 		 * Xe2's SVG context is the same as the one on DG2 / MTL
1774 		 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
1775 		 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
1776 		 * Just make the replacement here rather than defining a
1777 		 * whole separate table for the single trivial change.
1778 		 */
1779 		if (GRAPHICS_VER(xe) >= 20 &&
1780 		    instr == CMD_3DSTATE_DRAWING_RECTANGLE)
1781 			instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;
1782 
1783 		bb->cs[bb->len] = instr;
1784 		if (!is_single_dw)
1785 			bb->cs[bb->len] |= (num_dw - 2);
1786 
1787 		bb->len += num_dw;
1788 	}
1789 }
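
/*
 * For reference, each table entry above ends up in the batch as the bare
 * instruction header (with its length field set to num_dw - 2 for
 * multi-dword instructions) followed by num_dw - 1 payload dwords that the
 * loop leaves untouched; the BB is assumed to be zero-initialized, so the
 * payload programs all-zeroes state, which is enough for the workaround
 * since only the presence of the headers matters.
 */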
1790 
1791 struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
1792 {
1793 	struct xe_lrc_snapshot *snapshot = kmalloc(sizeof(*snapshot), GFP_NOWAIT);
1794 
1795 	if (!snapshot)
1796 		return NULL;
1797 
1798 	if (lrc->bo->vm)
1799 		xe_vm_get(lrc->bo->vm);
1800 
1801 	snapshot->context_desc = xe_lrc_ggtt_addr(lrc);
1802 	snapshot->ring_addr = __xe_lrc_ring_ggtt_addr(lrc);
1803 	snapshot->indirect_context_desc = xe_lrc_indirect_ring_ggtt_addr(lrc);
1804 	snapshot->head = xe_lrc_ring_head(lrc);
1805 	snapshot->tail.internal = lrc->ring.tail;
1806 	snapshot->tail.memory = xe_lrc_ring_tail(lrc);
1807 	snapshot->start = xe_lrc_ring_start(lrc);
1808 	snapshot->start_seqno = xe_lrc_start_seqno(lrc);
1809 	snapshot->seqno = xe_lrc_seqno(lrc);
1810 	snapshot->lrc_bo = xe_bo_get(lrc->bo);
1811 	snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
1812 	snapshot->lrc_size = lrc->bo->size - snapshot->lrc_offset;
1813 	snapshot->lrc_snapshot = NULL;
1814 	snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc));
1815 	snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc);
1816 	return snapshot;
1817 }
1818 
1819 void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot)
1820 {
1821 	struct xe_bo *bo;
1822 	struct xe_vm *vm;
1823 	struct iosys_map src;
1824 
1825 	if (!snapshot)
1826 		return;
1827 
1828 	bo = snapshot->lrc_bo;
1829 	vm = bo->vm;
1830 	snapshot->lrc_bo = NULL;
1831 
1832 	snapshot->lrc_snapshot = kvmalloc(snapshot->lrc_size, GFP_KERNEL);
1833 	if (!snapshot->lrc_snapshot)
1834 		goto put_bo;
1835 
1836 	xe_bo_lock(bo, false);
1837 	if (!ttm_bo_vmap(&bo->ttm, &src)) {
1838 		xe_map_memcpy_from(xe_bo_device(bo),
1839 				   snapshot->lrc_snapshot, &src, snapshot->lrc_offset,
1840 				   snapshot->lrc_size);
1841 		ttm_bo_vunmap(&bo->ttm, &src);
1842 	} else {
1843 		kvfree(snapshot->lrc_snapshot);
1844 		snapshot->lrc_snapshot = NULL;
1845 	}
1846 	xe_bo_unlock(bo);
1847 put_bo:
1848 	xe_bo_put(bo);
1849 	if (vm)
1850 		xe_vm_put(vm);
1851 }
1852 
1853 void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer *p)
1854 {
1855 	unsigned long i;
1856 
1857 	if (!snapshot)
1858 		return;
1859 
1860 	drm_printf(p, "\tHW Context Desc: 0x%08x\n", snapshot->context_desc);
1861 	drm_printf(p, "\tHW Ring address: 0x%08x\n",
1862 		   snapshot->ring_addr);
1863 	drm_printf(p, "\tHW Indirect Ring State: 0x%08x\n",
1864 		   snapshot->indirect_context_desc);
1865 	drm_printf(p, "\tLRC Head: (memory) %u\n", snapshot->head);
1866 	drm_printf(p, "\tLRC Tail: (internal) %u, (memory) %u\n",
1867 		   snapshot->tail.internal, snapshot->tail.memory);
1868 	drm_printf(p, "\tRing start: (memory) 0x%08x\n", snapshot->start);
1869 	drm_printf(p, "\tStart seqno: (memory) %d\n", snapshot->start_seqno);
1870 	drm_printf(p, "\tSeqno: (memory) %d\n", snapshot->seqno);
1871 	drm_printf(p, "\tTimestamp: 0x%08x\n", snapshot->ctx_timestamp);
1872 	drm_printf(p, "\tJob Timestamp: 0x%08x\n", snapshot->ctx_job_timestamp);
1873 
1874 	if (!snapshot->lrc_snapshot)
1875 		return;
1876 
1877 	drm_printf(p, "\t[HWSP].length: 0x%x\n", LRC_PPHWSP_SIZE);
1878 	drm_puts(p, "\t[HWSP].data: ");
1879 	for (i = 0; i < LRC_PPHWSP_SIZE; i += sizeof(u32)) {
1880 		u32 *val = snapshot->lrc_snapshot + i;
1881 		char dumped[ASCII85_BUFSZ];
1882 
1883 		drm_puts(p, ascii85_encode(*val, dumped));
1884 	}
1885 
1886 	drm_printf(p, "\n\t[HWCTX].length: 0x%lx\n", snapshot->lrc_size - LRC_PPHWSP_SIZE);
1887 	drm_puts(p, "\t[HWCTX].data: ");
1888 	for (; i < snapshot->lrc_size; i += sizeof(u32)) {
1889 		u32 *val = snapshot->lrc_snapshot + i;
1890 		char dumped[ASCII85_BUFSZ];
1891 
1892 		drm_puts(p, ascii85_encode(*val, dumped));
1893 	}
1894 	drm_puts(p, "\n");
1895 }
1896 
1897 void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
1898 {
1899 	if (!snapshot)
1900 		return;
1901 
1902 	kvfree(snapshot->lrc_snapshot);
1903 	if (snapshot->lrc_bo) {
1904 		struct xe_vm *vm;
1905 
1906 		vm = snapshot->lrc_bo->vm;
1907 		xe_bo_put(snapshot->lrc_bo);
1908 		if (vm)
1909 			xe_vm_put(vm);
1910 	}
1911 	kfree(snapshot);
1912 }
1913 
1914 static int get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id, u64 *reg_ctx_ts)
1915 {
1916 	u16 class = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id);
1917 	u16 instance = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id);
1918 	struct xe_hw_engine *hwe;
1919 	u64 val;
1920 
1921 	hwe = xe_gt_hw_engine(lrc->gt, class, instance, false);
1922 	if (xe_gt_WARN_ONCE(lrc->gt, !hwe || xe_hw_engine_is_reserved(hwe),
1923 			    "Unexpected engine class:instance %d:%d for context utilization\n",
1924 			    class, instance))
1925 		return -1;
1926 
1927 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
1928 		val = xe_mmio_read64_2x32(&hwe->gt->mmio,
1929 					  RING_CTX_TIMESTAMP(hwe->mmio_base));
1930 	else
1931 		val = xe_mmio_read32(&hwe->gt->mmio,
1932 				     RING_CTX_TIMESTAMP(hwe->mmio_base));
1933 
1934 	*reg_ctx_ts = val;
1935 
1936 	return 0;
1937 }
1938 
1939 /**
1940  * xe_lrc_update_timestamp() - Update ctx timestamp
1941  * @lrc: Pointer to the lrc.
1942  * @old_ts: Old timestamp value
1943  *
1944  * Populate @old_ts with the current saved ctx timestamp, read the new ctx
1945  * timestamp and update the saved value. With support for active contexts, the
1946  * calculation may be slightly racy, so follow a read-again logic to ensure
1947  * that the context is still active before returning the right timestamp.
1948  *
1949  * Returns: New ctx timestamp value
1950  */
1951 u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts)
1952 {
1953 	u64 lrc_ts, reg_ts;
1954 	u32 engine_id;
1955 
1956 	*old_ts = lrc->ctx_timestamp;
1957 
1958 	lrc_ts = xe_lrc_ctx_timestamp(lrc);
1959 	/* CTX_TIMESTAMP mmio read is invalid on VF, so return the LRC value */
1960 	if (IS_SRIOV_VF(lrc_to_xe(lrc))) {
1961 		lrc->ctx_timestamp = lrc_ts;
1962 		goto done;
1963 	}
1964 
1965 	if (lrc_ts == CONTEXT_ACTIVE) {
1966 		engine_id = xe_lrc_engine_id(lrc);
1967 		if (!get_ctx_timestamp(lrc, engine_id, &reg_ts))
1968 			lrc->ctx_timestamp = reg_ts;
1969 
1970 		/* read lrc again to ensure context is still active */
1971 		lrc_ts = xe_lrc_ctx_timestamp(lrc);
1972 	}
1973 
1974 	/*
1975 	 * If context switched out, just use the lrc_ts. Note that this needs to
1976 	 * be a separate if condition.
1977 	 */
1978 	if (lrc_ts != CONTEXT_ACTIVE)
1979 		lrc->ctx_timestamp = lrc_ts;
1980 
1981 done:
1982 	trace_xe_lrc_update_timestamp(lrc, *old_ts);
1983 
1984 	return lrc->ctx_timestamp;
1985 }
1986 
1987 /**
1988  * xe_lrc_ring_is_idle() - Check whether the LRC ring is idle
1989  * @lrc: Pointer to the lrc.
1990  *
1991  * Compare LRC ring head and tail to determine if idle.
1992  *
1993  * Return: True if the ring is idle, False otherwise
1994  */
1995 bool xe_lrc_ring_is_idle(struct xe_lrc *lrc)
1996 {
1997 	return xe_lrc_ring_head(lrc) == xe_lrc_ring_tail(lrc);
1998 }
1999