xref: /linux/drivers/gpu/drm/xe/xe_lrc.c (revision 1f30396de0c95d10859bce34f56d8b5b7a7d4bb8)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5 
6 #include "xe_lrc.h"
7 
8 #include <generated/xe_wa_oob.h>
9 
10 #include <linux/ascii85.h>
11 #include <linux/panic.h>
12 
13 #include "instructions/xe_mi_commands.h"
14 #include "instructions/xe_gfxpipe_commands.h"
15 #include "instructions/xe_gfx_state_commands.h"
16 #include "regs/xe_engine_regs.h"
17 #include "regs/xe_gt_regs.h"
18 #include "regs/xe_lrc_layout.h"
19 #include "xe_bb.h"
20 #include "xe_bo.h"
21 #include "xe_configfs.h"
22 #include "xe_device.h"
23 #include "xe_drm_client.h"
24 #include "xe_exec_queue.h"
25 #include "xe_exec_queue_types.h"
26 #include "xe_gt.h"
27 #include "xe_gt_clock.h"
28 #include "xe_gt_printk.h"
29 #include "xe_hw_fence.h"
30 #include "xe_map.h"
31 #include "xe_memirq.h"
32 #include "xe_mmio.h"
33 #include "xe_ring_ops.h"
34 #include "xe_sriov.h"
35 #include "xe_trace_lrc.h"
36 #include "xe_vm.h"
37 #include "xe_wa.h"
38 
/* 64-bit bit/field masks, listed by ascending bit position */
#define LRC_VALID				BIT_ULL(0)
#define LRC_ADDRESSING_MODE			GENMASK_ULL(4, 3)
#define LRC_PRIVILEGE				BIT_ULL(8)
#define LRC_PRIORITY				GENMASK_ULL(10, 9)
#define LRC_ENGINE_INSTANCE			GENMASK_ULL(53, 48)
#define LRC_ENGINE_CLASS			GENMASK_ULL(63, 61)

/* NOTE(review): presumably the value placed in LRC_ADDRESSING_MODE - confirm */
#define LRC_LEGACY_64B_CONTEXT			3

/* NOTE(review): presumably the values placed in LRC_PRIORITY - confirm */
#define LRC_PRIORITY_LOW			0
#define LRC_PRIORITY_NORMAL			1
#define LRC_PRIORITY_HIGH			2

/* Single-page (4K) regions of the LRC allocation, see layout below */
#define LRC_PPHWSP_SIZE				SZ_4K
#define LRC_INDIRECT_CTX_BO_SIZE		SZ_4K
#define LRC_INDIRECT_RING_STATE_SIZE		SZ_4K
55 
56 /*
57  * Layout of the LRC and associated data allocated as
58  * lrc->bo:
59  *
60  *   Region                       Size
61  *  +============================+=================================+ <- __xe_lrc_ring_offset()
62  *  | Ring                       | ring_size, see                  |
63  *  |                            | xe_lrc_init()                   |
64  *  +============================+=================================+ <- __xe_lrc_pphwsp_offset()
65  *  | PPHWSP (includes SW state) | 4K                              |
66  *  +----------------------------+---------------------------------+ <- __xe_lrc_regs_offset()
67  *  | Engine Context Image       | n * 4K, see                     |
68  *  |                            | xe_gt_lrc_size()                |
69  *  +----------------------------+---------------------------------+ <- __xe_lrc_indirect_ring_offset()
70  *  | Indirect Ring State Page   | 0 or 4k, see                    |
71  *  |                            | XE_LRC_FLAG_INDIRECT_RING_STATE |
72  *  +============================+=================================+ <- __xe_lrc_indirect_ctx_offset()
73  *  | Indirect Context Page      | 0 or 4k, see                    |
74  *  |                            | XE_LRC_FLAG_INDIRECT_CTX        |
75  *  +============================+=================================+ <- __xe_lrc_wa_bb_offset()
76  *  | WA BB Per Ctx              | 4k                              |
77  *  +============================+=================================+ <- xe_bo_size(lrc->bo)
78  */
79 
80 static struct xe_device *
81 lrc_to_xe(struct xe_lrc *lrc)
82 {
83 	return gt_to_xe(lrc->fence_ctx.gt);
84 }
85 
86 static bool
87 gt_engine_needs_indirect_ctx(struct xe_gt *gt, enum xe_engine_class class)
88 {
89 	struct xe_device *xe = gt_to_xe(gt);
90 
91 	if (XE_GT_WA(gt, 16010904313) &&
92 	    (class == XE_ENGINE_CLASS_RENDER ||
93 	     class == XE_ENGINE_CLASS_COMPUTE))
94 		return true;
95 
96 	if (xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
97 					       class, NULL))
98 		return true;
99 
100 	if (gt->ring_ops[class]->emit_aux_table_inv)
101 		return true;
102 
103 	return false;
104 }
105 
106 /**
107  * xe_gt_lrc_hang_replay_size() - Hang replay size
108  * @gt: The GT
109  * @class: Hardware engine class
110  *
111  * Determine size of GPU hang replay state for a GT and hardware engine class.
112  *
113  * Return: Size of GPU hang replay size
114  */
115 size_t xe_gt_lrc_hang_replay_size(struct xe_gt *gt, enum xe_engine_class class)
116 {
117 	struct xe_device *xe = gt_to_xe(gt);
118 	size_t size = 0;
119 
120 	/* Engine context image */
121 	switch (class) {
122 	case XE_ENGINE_CLASS_RENDER:
123 		if (GRAPHICS_VERx100(xe) >= 3510)
124 			size += 7 * SZ_4K;
125 		else if (GRAPHICS_VER(xe) >= 20)
126 			size += 3 * SZ_4K;
127 		else
128 			size += 13 * SZ_4K;
129 		break;
130 	case XE_ENGINE_CLASS_COMPUTE:
131 		if (GRAPHICS_VERx100(xe) >= 3510)
132 			size += 5 * SZ_4K;
133 		else if (GRAPHICS_VER(xe) >= 20)
134 			size += 2 * SZ_4K;
135 		else
136 			size += 13 * SZ_4K;
137 		break;
138 	default:
139 		WARN(1, "Unknown engine class: %d", class);
140 		fallthrough;
141 	case XE_ENGINE_CLASS_COPY:
142 	case XE_ENGINE_CLASS_VIDEO_DECODE:
143 	case XE_ENGINE_CLASS_VIDEO_ENHANCE:
144 	case XE_ENGINE_CLASS_OTHER:
145 		size += 1 * SZ_4K;
146 	}
147 
148 	return size;
149 }
150 
151 size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
152 {
153 	size_t size = xe_gt_lrc_hang_replay_size(gt, class);
154 
155 	/* Add indirect ring state page */
156 	if (xe_gt_has_indirect_ring_state(gt))
157 		size += LRC_INDIRECT_RING_STATE_SIZE;
158 
159 	return size + LRC_PPHWSP_SIZE;
160 }
161 
162 /*
163  * The per-platform tables are u8-encoded in @data. Decode @data and set the
164  * addresses' offset and commands in @regs. The following encoding is used
165  * for each byte. There are 2 steps: decoding commands and decoding addresses.
166  *
167  * Commands:
168  * [7]: create NOPs - number of NOPs are set in lower bits
169  * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set
170  *      MI_LRI_FORCE_POSTED
171  * [5:0]: Number of NOPs or registers to set values to in case of
172  *        MI_LOAD_REGISTER_IMM
173  *
174  * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
175  * number of registers. They are set by using the REG/REG16 macros: the former
176  * is used for offsets smaller than 0x200 while the latter is for values bigger
177  * than that. Those macros already set all the bits documented below correctly:
178  *
179  * [7]: When a register offset needs more than 6 bits, use additional bytes, to
180  *      follow, for the lower bits
181  * [6:0]: Register offset, without considering the engine base.
182  *
183  * This function only tweaks the commands and register offsets. Values are not
184  * filled out.
185  */
186 static void set_offsets(u32 *regs,
187 			const u8 *data,
188 			const struct xe_hw_engine *hwe)
189 #define NOP(x) (BIT(7) | (x))
190 #define LRI(count, flags) ((flags) << 6 | (count) | \
191 			   BUILD_BUG_ON_ZERO(count >= BIT(6)))
192 #define POSTED BIT(0)
193 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
194 #define REG16(x) \
195 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
196 	(((x) >> 2) & 0x7f)
197 {
198 	const u32 base = hwe->mmio_base;
199 
200 	while (*data) {
201 		u8 count, flags;
202 
203 		if (*data & BIT(7)) { /* skip */
204 			count = *data++ & ~BIT(7);
205 			regs += count;
206 			continue;
207 		}
208 
209 		count = *data & 0x3f;
210 		flags = *data >> 6;
211 		data++;
212 
213 		*regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
214 		if (flags & POSTED)
215 			*regs |= MI_LRI_FORCE_POSTED;
216 		*regs |= MI_LRI_LRM_CS_MMIO;
217 		regs++;
218 
219 		xe_gt_assert(hwe->gt, count);
220 		do {
221 			u32 offset = 0;
222 			u8 v;
223 
224 			do {
225 				v = *data++;
226 				offset <<= 7;
227 				offset |= v & ~BIT(7);
228 			} while (v & BIT(7));
229 
230 			regs[0] = base + (offset << 2);
231 			regs += 2;
232 		} while (--count);
233 	}
234 
235 	*regs = MI_BATCH_BUFFER_END | BIT(0);
236 }
237 
238 static const u8 gen12_xcs_offsets[] = {
239 	NOP(1),
240 	LRI(13, POSTED),
241 	REG16(0x244),
242 	REG(0x034),
243 	REG(0x030),
244 	REG(0x038),
245 	REG(0x03c),
246 	REG(0x168),
247 	REG(0x140),
248 	REG(0x110),
249 	REG(0x1c0),
250 	REG(0x1c4),
251 	REG(0x1c8),
252 	REG(0x180),
253 	REG16(0x2b4),
254 
255 	NOP(5),
256 	LRI(9, POSTED),
257 	REG16(0x3a8),
258 	REG16(0x28c),
259 	REG16(0x288),
260 	REG16(0x284),
261 	REG16(0x280),
262 	REG16(0x27c),
263 	REG16(0x278),
264 	REG16(0x274),
265 	REG16(0x270),
266 
267 	0
268 };
269 
270 static const u8 dg2_xcs_offsets[] = {
271 	NOP(1),
272 	LRI(15, POSTED),
273 	REG16(0x244),
274 	REG(0x034),
275 	REG(0x030),
276 	REG(0x038),
277 	REG(0x03c),
278 	REG(0x168),
279 	REG(0x140),
280 	REG(0x110),
281 	REG(0x1c0),
282 	REG(0x1c4),
283 	REG(0x1c8),
284 	REG(0x180),
285 	REG16(0x2b4),
286 	REG(0x120),
287 	REG(0x124),
288 
289 	NOP(1),
290 	LRI(9, POSTED),
291 	REG16(0x3a8),
292 	REG16(0x28c),
293 	REG16(0x288),
294 	REG16(0x284),
295 	REG16(0x280),
296 	REG16(0x27c),
297 	REG16(0x278),
298 	REG16(0x274),
299 	REG16(0x270),
300 
301 	0
302 };
303 
304 static const u8 gen12_rcs_offsets[] = {
305 	NOP(1),
306 	LRI(13, POSTED),
307 	REG16(0x244),
308 	REG(0x034),
309 	REG(0x030),
310 	REG(0x038),
311 	REG(0x03c),
312 	REG(0x168),
313 	REG(0x140),
314 	REG(0x110),
315 	REG(0x1c0),
316 	REG(0x1c4),
317 	REG(0x1c8),
318 	REG(0x180),
319 	REG16(0x2b4),
320 
321 	NOP(5),
322 	LRI(9, POSTED),
323 	REG16(0x3a8),
324 	REG16(0x28c),
325 	REG16(0x288),
326 	REG16(0x284),
327 	REG16(0x280),
328 	REG16(0x27c),
329 	REG16(0x278),
330 	REG16(0x274),
331 	REG16(0x270),
332 
333 	LRI(3, POSTED),
334 	REG(0x1b0),
335 	REG16(0x5a8),
336 	REG16(0x5ac),
337 
338 	NOP(6),
339 	LRI(1, 0),
340 	REG(0x0c8),
341 	NOP(3 + 9 + 1),
342 
343 	LRI(51, POSTED),
344 	REG16(0x588),
345 	REG16(0x588),
346 	REG16(0x588),
347 	REG16(0x588),
348 	REG16(0x588),
349 	REG16(0x588),
350 	REG(0x028),
351 	REG(0x09c),
352 	REG(0x0c0),
353 	REG(0x178),
354 	REG(0x17c),
355 	REG16(0x358),
356 	REG(0x170),
357 	REG(0x150),
358 	REG(0x154),
359 	REG(0x158),
360 	REG16(0x41c),
361 	REG16(0x600),
362 	REG16(0x604),
363 	REG16(0x608),
364 	REG16(0x60c),
365 	REG16(0x610),
366 	REG16(0x614),
367 	REG16(0x618),
368 	REG16(0x61c),
369 	REG16(0x620),
370 	REG16(0x624),
371 	REG16(0x628),
372 	REG16(0x62c),
373 	REG16(0x630),
374 	REG16(0x634),
375 	REG16(0x638),
376 	REG16(0x63c),
377 	REG16(0x640),
378 	REG16(0x644),
379 	REG16(0x648),
380 	REG16(0x64c),
381 	REG16(0x650),
382 	REG16(0x654),
383 	REG16(0x658),
384 	REG16(0x65c),
385 	REG16(0x660),
386 	REG16(0x664),
387 	REG16(0x668),
388 	REG16(0x66c),
389 	REG16(0x670),
390 	REG16(0x674),
391 	REG16(0x678),
392 	REG16(0x67c),
393 	REG(0x068),
394 	REG(0x084),
395 	NOP(1),
396 
397 	0
398 };
399 
400 static const u8 xehp_rcs_offsets[] = {
401 	NOP(1),
402 	LRI(13, POSTED),
403 	REG16(0x244),
404 	REG(0x034),
405 	REG(0x030),
406 	REG(0x038),
407 	REG(0x03c),
408 	REG(0x168),
409 	REG(0x140),
410 	REG(0x110),
411 	REG(0x1c0),
412 	REG(0x1c4),
413 	REG(0x1c8),
414 	REG(0x180),
415 	REG16(0x2b4),
416 
417 	NOP(5),
418 	LRI(9, POSTED),
419 	REG16(0x3a8),
420 	REG16(0x28c),
421 	REG16(0x288),
422 	REG16(0x284),
423 	REG16(0x280),
424 	REG16(0x27c),
425 	REG16(0x278),
426 	REG16(0x274),
427 	REG16(0x270),
428 
429 	LRI(3, POSTED),
430 	REG(0x1b0),
431 	REG16(0x5a8),
432 	REG16(0x5ac),
433 
434 	NOP(6),
435 	LRI(1, 0),
436 	REG(0x0c8),
437 
438 	0
439 };
440 
441 static const u8 dg2_rcs_offsets[] = {
442 	NOP(1),
443 	LRI(15, POSTED),
444 	REG16(0x244),
445 	REG(0x034),
446 	REG(0x030),
447 	REG(0x038),
448 	REG(0x03c),
449 	REG(0x168),
450 	REG(0x140),
451 	REG(0x110),
452 	REG(0x1c0),
453 	REG(0x1c4),
454 	REG(0x1c8),
455 	REG(0x180),
456 	REG16(0x2b4),
457 	REG(0x120),
458 	REG(0x124),
459 
460 	NOP(1),
461 	LRI(9, POSTED),
462 	REG16(0x3a8),
463 	REG16(0x28c),
464 	REG16(0x288),
465 	REG16(0x284),
466 	REG16(0x280),
467 	REG16(0x27c),
468 	REG16(0x278),
469 	REG16(0x274),
470 	REG16(0x270),
471 
472 	LRI(3, POSTED),
473 	REG(0x1b0),
474 	REG16(0x5a8),
475 	REG16(0x5ac),
476 
477 	NOP(6),
478 	LRI(1, 0),
479 	REG(0x0c8),
480 
481 	0
482 };
483 
484 static const u8 mtl_rcs_offsets[] = {
485 	NOP(1),
486 	LRI(15, POSTED),
487 	REG16(0x244),
488 	REG(0x034),
489 	REG(0x030),
490 	REG(0x038),
491 	REG(0x03c),
492 	REG(0x168),
493 	REG(0x140),
494 	REG(0x110),
495 	REG(0x1c0),
496 	REG(0x1c4),
497 	REG(0x1c8),
498 	REG(0x180),
499 	REG16(0x2b4),
500 	REG(0x120),
501 	REG(0x124),
502 
503 	NOP(1),
504 	LRI(9, POSTED),
505 	REG16(0x3a8),
506 	REG16(0x28c),
507 	REG16(0x288),
508 	REG16(0x284),
509 	REG16(0x280),
510 	REG16(0x27c),
511 	REG16(0x278),
512 	REG16(0x274),
513 	REG16(0x270),
514 
515 	NOP(2),
516 	LRI(2, POSTED),
517 	REG16(0x5a8),
518 	REG16(0x5ac),
519 
520 	NOP(6),
521 	LRI(1, 0),
522 	REG(0x0c8),
523 
524 	0
525 };
526 
/*
 * Leading entries shared by the xe2 offset tables below. Each line is
 * annotated with the dword index it decodes to and, for registers, the
 * register name. The expansion has no trailing comma.
 */
#define XE2_CTX_COMMON \
	/* 0x00 */				NOP(1), \
	/* 0x01 */				LRI(15, POSTED), \
	/* 0x02: CTXT_SR_CTL */			REG16(0x244), \
	/* 0x04: RING_BUFFER_HEAD */		REG(0x034), \
	/* 0x06: RING_BUFFER_TAIL */		REG(0x030), \
	/* 0x08: RING_BUFFER_START */		REG(0x038), \
	/* 0x0a: RING_BUFFER_CONTROL */		REG(0x03c), \
	/* 0x0c: BB_ADDR_UDW */			REG(0x168), \
	/* 0x0e: BB_ADDR */			REG(0x140), \
	/* 0x10: BB_STATE */			REG(0x110), \
	/* 0x12: BB_PER_CTX_PTR */		REG(0x1c0), \
	/* 0x14: RCS_INDIRECT_CTX */		REG(0x1c4), \
	/* 0x16: RCS_INDIRECT_CTX_OFFSET */	REG(0x1c8), \
	/* 0x18: CCID */			REG(0x180), \
	/* 0x1a: SEMAPHORE_TOKEN */		REG16(0x2b4), \
	/* 0x1c: PRT_BB_STATE */		REG(0x120), \
	/* 0x1e: PRT_BB_STATE_UDW */		REG(0x124), \
	\
	/* 0x20 */				NOP(1), \
	/* 0x21 */				LRI(9, POSTED), \
	/* 0x22: CTX_TIMESTAMP */		REG16(0x3a8), \
	/* 0x24: CTX_TIMESTAMP_UDW */		REG16(0x3ac), \
	/* 0x26: INDIRECT_RING_STATE */		REG(0x108), \
	/* 0x28: dummy reg */			REG16(0x284), \
	/* 0x2a: CS_ACC_CTR_THOLD */		REG16(0x280), \
	/* 0x2c: CS_CTX_SYS_PASID */		REG16(0x27c), \
	/* 0x2e: CS_CTX_ASID */			REG16(0x278), \
	/* 0x30: PTBP_UDW */			REG16(0x274), \
	/* 0x32: PTBP_LDW */			REG16(0x270)
557 
558 static const u8 xe2_rcs_offsets[] = {
559 	XE2_CTX_COMMON,
560 
561 	NOP(2),                 /* [0x34] */
562 	LRI(2, POSTED),         /* [0x36] */
563 	REG16(0x5a8),           /* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
564 	REG16(0x5ac),           /* [0x39] PREEMPTION_STATUS */
565 
566 	NOP(6),                 /* [0x41] */
567 	LRI(1, 0),              /* [0x47] */
568 	REG(0x0c8),             /* [0x48] R_PWR_CLK_STATE */
569 
570 	0
571 };
572 
573 static const u8 xe2_bcs_offsets[] = {
574 	XE2_CTX_COMMON,
575 
576 	NOP(4 + 8 + 1),         /* [0x34] */
577 	LRI(2, POSTED),         /* [0x41] */
578 	REG16(0x200),           /* [0x42] BCS_SWCTRL */
579 	REG16(0x204),           /* [0x44] BLIT_CCTL */
580 
581 	0
582 };
583 
584 static const u8 xe2_xcs_offsets[] = {
585 	XE2_CTX_COMMON,
586 
587 	0
588 };
589 
590 static const u8 xe2_indirect_ring_state_offsets[] = {
591 	NOP(1),                 /* [0x00] */
592 	LRI(5, POSTED),         /* [0x01] */
593 	REG(0x034),             /* [0x02] RING_BUFFER_HEAD */
594 	REG(0x030),             /* [0x04] RING_BUFFER_TAIL */
595 	REG(0x038),             /* [0x06] RING_BUFFER_START */
596 	REG(0x048),             /* [0x08] RING_BUFFER_START_UDW */
597 	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */
598 
599 	NOP(5),                 /* [0x0c] */
600 	LRI(9, POSTED),         /* [0x11] */
601 	REG(0x168),             /* [0x12] BB_ADDR_UDW */
602 	REG(0x140),             /* [0x14] BB_ADDR */
603 	REG(0x110),             /* [0x16] BB_STATE */
604 	REG16(0x588),           /* [0x18] BB_STACK_WRITE_PORT */
605 	REG16(0x588),           /* [0x20] BB_STACK_WRITE_PORT */
606 	REG16(0x588),           /* [0x22] BB_STACK_WRITE_PORT */
607 	REG16(0x588),           /* [0x24] BB_STACK_WRITE_PORT */
608 	REG16(0x588),           /* [0x26] BB_STACK_WRITE_PORT */
609 	REG16(0x588),           /* [0x28] BB_STACK_WRITE_PORT */
610 
611 	NOP(12),                 /* [0x00] */
612 
613 	0
614 };
615 
/* The table-encoding helpers are not needed past this point */
#undef NOP
#undef LRI
#undef REG
#undef REG16
620 
621 static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
622 {
623 	if (class == XE_ENGINE_CLASS_RENDER) {
624 		if (GRAPHICS_VER(xe) >= 20)
625 			return xe2_rcs_offsets;
626 		else if (GRAPHICS_VERx100(xe) >= 1270)
627 			return mtl_rcs_offsets;
628 		else if (GRAPHICS_VERx100(xe) >= 1255)
629 			return dg2_rcs_offsets;
630 		else if (GRAPHICS_VERx100(xe) >= 1250)
631 			return xehp_rcs_offsets;
632 		else
633 			return gen12_rcs_offsets;
634 	} else if (class == XE_ENGINE_CLASS_COPY) {
635 		if (GRAPHICS_VER(xe) >= 20)
636 			return xe2_bcs_offsets;
637 		else
638 			return gen12_xcs_offsets;
639 	} else {
640 		if (GRAPHICS_VER(xe) >= 20)
641 			return xe2_xcs_offsets;
642 		else if (GRAPHICS_VERx100(xe) >= 1255)
643 			return dg2_xcs_offsets;
644 		else
645 			return gen12_xcs_offsets;
646 	}
647 }
648 
649 static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
650 {
651 	regs[CTX_CONTEXT_CONTROL] = REG_MASKED_FIELD_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
652 							    CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
653 
654 	if (xe_gt_has_indirect_ring_state(hwe->gt))
655 		regs[CTX_CONTEXT_CONTROL] |=
656 			REG_MASKED_FIELD_ENABLE(CTX_CTRL_INDIRECT_RING_STATE_ENABLE);
657 }
658 
659 static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe)
660 {
661 	struct xe_memirq *memirq = &gt_to_tile(hwe->gt)->memirq;
662 	struct xe_device *xe = gt_to_xe(hwe->gt);
663 	u8 num_regs;
664 
665 	if (!xe_device_uses_memirq(xe))
666 		return;
667 
668 	regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM |
669 					MI_LRI_LRM_CS_MMIO | MI_LRM_USE_GGTT;
670 	regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr;
671 	regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq);
672 
673 	num_regs = xe_device_has_msix(xe) ? 3 : 2;
674 	regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(num_regs) |
675 				       MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED;
676 	regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr;
677 	regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq, hwe);
678 	regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr;
679 	regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq, hwe);
680 
681 	if (xe_device_has_msix(xe)) {
682 		regs[CTX_CS_INT_VEC_REG] = CS_INT_VEC(0).addr;
683 		/* CTX_CS_INT_VEC_DATA will be set in xe_lrc_init */
684 	}
685 }
686 
687 static inline bool xe_lrc_has_indirect_ring_state(struct xe_lrc *lrc)
688 {
689 	return lrc->flags & XE_LRC_FLAG_INDIRECT_RING_STATE;
690 }
691 
692 static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
693 {
694 	return 0;
695 }
696 
697 u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
698 {
699 	return lrc->ring.size;
700 }
701 
/*
 * Make the magic macros work: DECL_MAP_ADDR_HELPERS() pastes
 * __xe_lrc_<elem>_offset, so alias the two public helpers to that pattern.
 */
#define __xe_lrc_pphwsp_offset	xe_lrc_pphwsp_offset
#define __xe_lrc_regs_offset	xe_lrc_regs_offset

/* Byte offsets added to the PPHWSP offset by the helpers below */
#define LRC_CTX_JOB_TIMESTAMP_OFFSET	512
#define LRC_ENGINE_ID_PPHWSP_OFFSET	1024
#define LRC_PARALLEL_PPHWSP_OFFSET	2048

/* Byte offsets of the seqno and start seqno (mapped via lrc->seqno_bo) */
#define LRC_SEQNO_OFFSET		0
#define LRC_START_SEQNO_OFFSET		(LRC_SEQNO_OFFSET + 8)
712 
713 u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
714 {
715 	return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
716 }
717 
718 /**
719  * xe_lrc_reg_size() - Get size of the LRC registers area within queues
720  * @xe: the &xe_device struct instance
721  *
722  * Returns: Size of the LRC registers area for current platform
723  */
724 size_t xe_lrc_reg_size(struct xe_device *xe)
725 {
726 	if (GRAPHICS_VERx100(xe) >= 1250)
727 		return 96 * sizeof(u32);
728 	else
729 		return 80 * sizeof(u32);
730 }
731 
732 /**
733  * xe_lrc_engine_state_size() - Get size of the engine state within LRC
734  * @gt: the &xe_gt struct instance
735  * @class: Hardware engine class
736  *
737  * Returns: Size of the engine state
738  */
739 size_t xe_lrc_engine_state_size(struct xe_gt *gt, enum xe_engine_class class)
740 {
741 	return xe_gt_lrc_hang_replay_size(gt, class) - xe_lrc_reg_size(gt_to_xe(gt));
742 }
743 
744 static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
745 {
746 	return LRC_SEQNO_OFFSET;
747 }
748 
749 static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
750 {
751 	return LRC_START_SEQNO_OFFSET;
752 }
753 
754 static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc)
755 {
756 	/* This is stored in the driver-defined portion of PPHWSP */
757 	return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET;
758 }
759 
760 static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
761 {
762 	/* The parallel is stored in the driver-defined portion of PPHWSP */
763 	return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
764 }
765 
766 static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc)
767 {
768 	return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET;
769 }
770 
771 static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
772 {
773 	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
774 }
775 
776 static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc)
777 {
778 	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32);
779 }
780 
781 static u32 __xe_lrc_queue_timestamp_offset(struct xe_lrc *lrc)
782 {
783 	return __xe_lrc_regs_offset(lrc) + CTX_QUEUE_TIMESTAMP * sizeof(u32);
784 }
785 
786 static u32 __xe_lrc_queue_timestamp_udw_offset(struct xe_lrc *lrc)
787 {
788 	return __xe_lrc_regs_offset(lrc) + CTX_QUEUE_TIMESTAMP_UDW * sizeof(u32);
789 }
790 
791 static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
792 {
793 	u32 offset = xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE -
794 		     LRC_INDIRECT_RING_STATE_SIZE;
795 
796 	if (lrc->flags & XE_LRC_FLAG_INDIRECT_CTX)
797 		offset -= LRC_INDIRECT_CTX_BO_SIZE;
798 
799 	return offset;
800 }
801 
802 static inline u32 __xe_lrc_indirect_ctx_offset(struct xe_lrc *lrc)
803 {
804 	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE - LRC_INDIRECT_CTX_BO_SIZE;
805 }
806 
807 static inline u32 __xe_lrc_wa_bb_offset(struct xe_lrc *lrc)
808 {
809 	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE;
810 }
811 
812 #define DECL_MAP_ADDR_HELPERS(elem, bo_expr) \
813 static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \
814 { \
815 	struct xe_bo *bo = (bo_expr); \
816 	struct iosys_map map = bo->vmap; \
817 \
818 	xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map));  \
819 	iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \
820 	return map; \
821 } \
822 static inline u32 __maybe_unused __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \
823 { \
824 	struct xe_bo *bo = (bo_expr); \
825 \
826 	return xe_bo_ggtt_addr(bo) + __xe_lrc_##elem##_offset(lrc); \
827 } \
828 
829 DECL_MAP_ADDR_HELPERS(ring, lrc->bo)
830 DECL_MAP_ADDR_HELPERS(pphwsp, lrc->bo)
831 DECL_MAP_ADDR_HELPERS(seqno, lrc->seqno_bo)
832 DECL_MAP_ADDR_HELPERS(regs, lrc->bo)
833 DECL_MAP_ADDR_HELPERS(start_seqno, lrc->seqno_bo)
834 DECL_MAP_ADDR_HELPERS(ctx_job_timestamp, lrc->bo)
835 DECL_MAP_ADDR_HELPERS(ctx_timestamp, lrc->bo)
836 DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw, lrc->bo)
837 DECL_MAP_ADDR_HELPERS(parallel, lrc->bo)
838 DECL_MAP_ADDR_HELPERS(indirect_ring, lrc->bo)
839 DECL_MAP_ADDR_HELPERS(engine_id, lrc->bo)
840 DECL_MAP_ADDR_HELPERS(queue_timestamp, lrc->bo)
841 DECL_MAP_ADDR_HELPERS(queue_timestamp_udw, lrc->bo)
842 
843 #undef DECL_MAP_ADDR_HELPERS
844 
845 /**
846  * xe_lrc_ctx_timestamp_ggtt_addr() - Get ctx timestamp GGTT address
847  * @lrc: Pointer to the lrc.
848  *
849  * Returns: ctx timestamp GGTT address
850  */
851 u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc)
852 {
853 	return __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
854 }
855 
856 /**
857  * xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address
858  * @lrc: Pointer to the lrc.
859  *
860  * Returns: ctx timestamp udw GGTT address
861  */
862 u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc)
863 {
864 	return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
865 }
866 
867 /**
868  * xe_lrc_ctx_timestamp() - Read ctx timestamp value
869  * @lrc: Pointer to the lrc.
870  *
871  * Returns: ctx timestamp value
872  */
873 static u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
874 {
875 	struct xe_device *xe = lrc_to_xe(lrc);
876 	struct iosys_map map;
877 	u32 ldw, udw = 0;
878 
879 	map = __xe_lrc_ctx_timestamp_map(lrc);
880 	ldw = xe_map_read32(xe, &map);
881 
882 	if (xe->info.has_64bit_timestamp) {
883 		map = __xe_lrc_ctx_timestamp_udw_map(lrc);
884 		udw = xe_map_read32(xe, &map);
885 	}
886 
887 	return (u64)udw << 32 | ldw;
888 }
889 
890 /**
891  * xe_lrc_queue_timestamp() - Read queue timestamp value
892  * @lrc: Pointer to the lrc.
893  *
894  * Returns: queue timestamp value
895  */
896 static u64 xe_lrc_queue_timestamp(struct xe_lrc *lrc)
897 {
898 	struct xe_device *xe = lrc_to_xe(lrc);
899 	struct iosys_map map;
900 	u32 ldw, udw = 0;
901 
902 	xe_assert(xe, xe_lrc_is_multi_queue(lrc));
903 
904 	map = __xe_lrc_queue_timestamp_map(lrc);
905 	ldw = xe_map_read32(xe, &map);
906 
907 	map = __xe_lrc_queue_timestamp_udw_map(lrc);
908 	udw = xe_map_read32(xe, &map);
909 
910 	return (u64)udw << 32 | ldw;
911 }
912 
913 /**
914  * xe_lrc_ctx_job_timestamp_ggtt_addr() - Get ctx job timestamp GGTT address
915  * @lrc: Pointer to the lrc.
916  *
917  * Returns: ctx timestamp job GGTT address
918  */
919 u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc)
920 {
921 	return __xe_lrc_ctx_job_timestamp_ggtt_addr(lrc);
922 }
923 
924 /**
925  * xe_lrc_ctx_job_timestamp() - Read ctx job timestamp value
926  * @lrc: Pointer to the lrc.
927  *
928  * Returns: ctx timestamp job value
929  */
930 u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc)
931 {
932 	struct xe_device *xe = lrc_to_xe(lrc);
933 	struct iosys_map map;
934 
935 	map = __xe_lrc_ctx_job_timestamp_map(lrc);
936 	return xe_map_read32(xe, &map);
937 }
938 
939 u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
940 {
941 	return __xe_lrc_pphwsp_ggtt_addr(lrc);
942 }
943 
944 u32 xe_lrc_indirect_ring_ggtt_addr(struct xe_lrc *lrc)
945 {
946 	if (!xe_lrc_has_indirect_ring_state(lrc))
947 		return 0;
948 
949 	return __xe_lrc_indirect_ring_ggtt_addr(lrc);
950 }
951 
952 static u32 xe_lrc_read_indirect_ctx_reg(struct xe_lrc *lrc, int reg_nr)
953 {
954 	struct xe_device *xe = lrc_to_xe(lrc);
955 	struct iosys_map map;
956 
957 	map = __xe_lrc_indirect_ring_map(lrc);
958 	iosys_map_incr(&map, reg_nr * sizeof(u32));
959 	return xe_map_read32(xe, &map);
960 }
961 
962 static void xe_lrc_write_indirect_ctx_reg(struct xe_lrc *lrc,
963 					  int reg_nr, u32 val)
964 {
965 	struct xe_device *xe = lrc_to_xe(lrc);
966 	struct iosys_map map;
967 
968 	map = __xe_lrc_indirect_ring_map(lrc);
969 	iosys_map_incr(&map, reg_nr * sizeof(u32));
970 	xe_map_write32(xe, &map, val);
971 }
972 
973 u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
974 {
975 	struct xe_device *xe = lrc_to_xe(lrc);
976 	struct iosys_map map;
977 
978 	map = __xe_lrc_regs_map(lrc);
979 	iosys_map_incr(&map, reg_nr * sizeof(u32));
980 	return xe_map_read32(xe, &map);
981 }
982 
983 void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
984 {
985 	struct xe_device *xe = lrc_to_xe(lrc);
986 	struct iosys_map map;
987 
988 	map = __xe_lrc_regs_map(lrc);
989 	iosys_map_incr(&map, reg_nr * sizeof(u32));
990 	xe_map_write32(xe, &map, val);
991 }
992 
993 static void *empty_lrc_data(struct xe_hw_engine *hwe)
994 {
995 	struct xe_gt *gt = hwe->gt;
996 	void *data;
997 	u32 *regs;
998 
999 	data = kzalloc(xe_gt_lrc_size(gt, hwe->class), GFP_KERNEL);
1000 	if (!data)
1001 		return NULL;
1002 
1003 	/* 1st page: Per-Process of HW status Page */
1004 	regs = data + LRC_PPHWSP_SIZE;
1005 	set_offsets(regs, reg_offsets(gt_to_xe(gt), hwe->class), hwe);
1006 	set_context_control(regs, hwe);
1007 	set_memory_based_intr(regs, hwe);
1008 	if (xe_gt_has_indirect_ring_state(gt)) {
1009 		regs = data + xe_gt_lrc_size(gt, hwe->class) -
1010 		       LRC_INDIRECT_RING_STATE_SIZE;
1011 		set_offsets(regs, xe2_indirect_ring_state_offsets, hwe);
1012 	}
1013 
1014 	return data;
1015 }
1016 
1017 /**
1018  * xe_default_lrc_update_memirq_regs_with_address - Re-compute GGTT references in default LRC
1019  * of given engine.
1020  * @hwe: the &xe_hw_engine struct instance
1021  */
1022 void xe_default_lrc_update_memirq_regs_with_address(struct xe_hw_engine *hwe)
1023 {
1024 	struct xe_gt *gt = hwe->gt;
1025 	u32 *regs;
1026 
1027 	if (!gt->default_lrc[hwe->class])
1028 		return;
1029 
1030 	regs = gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE;
1031 	set_memory_based_intr(regs, hwe);
1032 }
1033 
1034 /**
1035  * xe_lrc_update_memirq_regs_with_address - Re-compute GGTT references in mem interrupt data
1036  * for given LRC.
1037  * @lrc: the &xe_lrc struct instance
1038  * @hwe: the &xe_hw_engine struct instance
1039  * @regs: scratch buffer to be used as temporary storage
1040  */
1041 void xe_lrc_update_memirq_regs_with_address(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
1042 					    u32 *regs)
1043 {
1044 	struct xe_gt *gt = hwe->gt;
1045 	struct iosys_map map;
1046 	size_t regs_len;
1047 
1048 	if (!xe_device_uses_memirq(gt_to_xe(gt)))
1049 		return;
1050 
1051 	map = __xe_lrc_regs_map(lrc);
1052 	regs_len = xe_lrc_reg_size(gt_to_xe(gt));
1053 	xe_map_memcpy_from(gt_to_xe(gt), regs, &map, 0, regs_len);
1054 	set_memory_based_intr(regs, hwe);
1055 	xe_map_memcpy_to(gt_to_xe(gt), &map, 0, regs, regs_len);
1056 }
1057 
1058 static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
1059 {
1060 	u64 desc = xe_vm_pdp4_descriptor(vm, gt_to_tile(lrc->gt));
1061 
1062 	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
1063 	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
1064 }
1065 
1066 static void xe_lrc_finish(struct xe_lrc *lrc)
1067 {
1068 	xe_hw_fence_ctx_finish(&lrc->fence_ctx);
1069 	xe_bo_unpin_map_no_vm(lrc->bo);
1070 	xe_bo_unpin_map_no_vm(lrc->seqno_bo);
1071 }
1072 
1073 /*
1074  * wa_bb_setup_utilization() - Write commands to wa bb to assist
1075  * in calculating active context run ticks.
1076  *
1077  * Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the
1078  * context, but only gets updated when the context switches out. In order to
1079  * check how long a context has been active before it switches out, two things
1080  * are required:
1081  *
1082  * (1) Determine if the context is running:
1083  * To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in
1084  * the LRC. The value chosen is 1 since 0 is the initial value when the LRC is
1085  * initialized. During a query, we just check for this value to determine if the
1086  * context is active. If the context switched out, it would overwrite this
1087  * location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as
1088  * the last part of context restore, so reusing this LRC location will not
1089  * clobber anything.
1090  *
1091  * (2) Calculate the time that the context has been active for:
1092  * The CTX_TIMESTAMP ticks only when the context is active. If a context is
1093  * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization.
1094  * While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific
1095  * engine instance. Since we do not know which instance the context is running
1096  * on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and
1097  * store it in the PPHSWP.
1098  */
1099 #define CONTEXT_ACTIVE 1ULL
1100 static ssize_t setup_utilization_wa(struct xe_lrc *lrc,
1101 				    struct xe_hw_engine *hwe,
1102 				    u32 *batch,
1103 				    size_t max_len)
1104 {
1105 	u32 *cmd = batch;
1106 
1107 	if (IS_SRIOV_VF(gt_to_xe(lrc->gt)))
1108 		return 0;
1109 
1110 	if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
1111 		return -ENOSPC;
1112 
1113 	*cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET;
1114 	*cmd++ = ENGINE_ID(0).addr;
1115 	*cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc);
1116 	*cmd++ = 0;
1117 
1118 	*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
1119 	*cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
1120 	*cmd++ = 0;
1121 	*cmd++ = lower_32_bits(CONTEXT_ACTIVE);
1122 
1123 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp) {
1124 		*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
1125 		*cmd++ = __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
1126 		*cmd++ = 0;
1127 		*cmd++ = upper_32_bits(CONTEXT_ACTIVE);
1128 	}
1129 
1130 	return cmd - batch;
1131 }
1132 
1133 static ssize_t setup_timestamp_wa(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
1134 				  u32 *batch, size_t max_len)
1135 {
1136 	const u32 ts_addr = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
1137 	u32 *cmd = batch;
1138 
1139 	if (!XE_GT_WA(lrc->gt, 16010904313) ||
1140 	    !(hwe->class == XE_ENGINE_CLASS_RENDER ||
1141 	      hwe->class == XE_ENGINE_CLASS_COMPUTE ||
1142 	      hwe->class == XE_ENGINE_CLASS_COPY ||
1143 	      hwe->class == XE_ENGINE_CLASS_VIDEO_DECODE ||
1144 	      hwe->class == XE_ENGINE_CLASS_VIDEO_ENHANCE))
1145 		return 0;
1146 
1147 	if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
1148 		return -ENOSPC;
1149 
1150 	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
1151 		 MI_LRM_ASYNC;
1152 	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
1153 	*cmd++ = ts_addr;
1154 	*cmd++ = 0;
1155 
1156 	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
1157 		 MI_LRM_ASYNC;
1158 	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
1159 	*cmd++ = ts_addr;
1160 	*cmd++ = 0;
1161 
1162 	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO;
1163 	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
1164 	*cmd++ = ts_addr;
1165 	*cmd++ = 0;
1166 
1167 	return cmd - batch;
1168 }
1169 
1170 static ssize_t setup_configfs_post_ctx_restore_bb(struct xe_lrc *lrc,
1171 						  struct xe_hw_engine *hwe,
1172 						  u32 *batch, size_t max_len)
1173 {
1174 	struct xe_device *xe = gt_to_xe(lrc->gt);
1175 	const u32 *user_batch;
1176 	u32 *cmd = batch;
1177 	u32 count;
1178 
1179 	count = xe_configfs_get_ctx_restore_post_bb(to_pci_dev(xe->drm.dev),
1180 						    hwe->class, &user_batch);
1181 	if (!count)
1182 		return 0;
1183 
1184 	if (count > max_len)
1185 		return -ENOSPC;
1186 
1187 	/*
1188 	 * This should be used only for tests and validation. Taint the kernel
1189 	 * as anything could be submitted directly in context switches
1190 	 */
1191 	add_taint(TAINT_TEST, LOCKDEP_STILL_OK);
1192 
1193 	memcpy(cmd, user_batch, count * sizeof(u32));
1194 	cmd += count;
1195 
1196 	return cmd - batch;
1197 }
1198 
1199 static ssize_t setup_configfs_mid_ctx_restore_bb(struct xe_lrc *lrc,
1200 						 struct xe_hw_engine *hwe,
1201 						 u32 *batch, size_t max_len)
1202 {
1203 	struct xe_device *xe = gt_to_xe(lrc->gt);
1204 	const u32 *user_batch;
1205 	u32 *cmd = batch;
1206 	u32 count;
1207 
1208 	count = xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
1209 						   hwe->class, &user_batch);
1210 	if (!count)
1211 		return 0;
1212 
1213 	if (count > max_len)
1214 		return -ENOSPC;
1215 
1216 	/*
1217 	 * This should be used only for tests and validation. Taint the kernel
1218 	 * as anything could be submitted directly in context switches
1219 	 */
1220 	add_taint(TAINT_TEST, LOCKDEP_STILL_OK);
1221 
1222 	memcpy(cmd, user_batch, count * sizeof(u32));
1223 	cmd += count;
1224 
1225 	return cmd - batch;
1226 }
1227 
1228 static ssize_t setup_invalidate_state_cache_wa(struct xe_lrc *lrc,
1229 					       struct xe_hw_engine *hwe,
1230 					       u32 *batch, size_t max_len)
1231 {
1232 	u32 *cmd = batch;
1233 
1234 	if (!XE_GT_WA(lrc->gt, 18022495364) ||
1235 	    hwe->class != XE_ENGINE_CLASS_RENDER)
1236 		return 0;
1237 
1238 	if (xe_gt_WARN_ON(lrc->gt, max_len < 3))
1239 		return -ENOSPC;
1240 
1241 	*cmd++ = MI_LOAD_REGISTER_IMM | MI_LRI_LRM_CS_MMIO | MI_LRI_NUM_REGS(1);
1242 	*cmd++ = CS_DEBUG_MODE2(0).addr;
1243 	*cmd++ = REG_MASKED_FIELD_ENABLE(INSTRUCTION_STATE_CACHE_INVALIDATE);
1244 
1245 	return cmd - batch;
1246 }
1247 
1248 static ssize_t setup_invalidate_auxccs_wa(struct xe_lrc *lrc,
1249 					  struct xe_hw_engine *hwe,
1250 					  u32 *batch, size_t max_len)
1251 {
1252 	struct xe_gt *gt = lrc->gt;
1253 	u32 *(*emit)(struct xe_gt *gt, u32 *cmd) =
1254 		gt->ring_ops[hwe->class]->emit_aux_table_inv;
1255 
1256 	if (!emit)
1257 		return 0;
1258 
1259 	if (xe_gt_WARN_ON(gt, max_len < 8))
1260 		return -ENOSPC;
1261 
1262 	return emit(gt, batch) - batch;
1263 }
1264 
1265 struct bo_setup {
1266 	ssize_t (*setup)(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
1267 			 u32 *batch, size_t max_size);
1268 };
1269 
1270 struct bo_setup_state {
1271 	/* Input: */
1272 	struct xe_lrc		*lrc;
1273 	struct xe_hw_engine	*hwe;
1274 	size_t			max_size;
1275 	size_t                  reserve_dw;
1276 	unsigned int		offset;
1277 	const struct bo_setup	*funcs;
1278 	unsigned int		num_funcs;
1279 
1280 	/* State: */
1281 	u32			*buffer;
1282 	u32			*ptr;
1283 	unsigned int		written;
1284 };
1285 
1286 static int setup_bo(struct bo_setup_state *state)
1287 {
1288 	ssize_t remain;
1289 
1290 	if (state->lrc->bo->vmap.is_iomem) {
1291 		xe_gt_assert(state->hwe->gt, state->buffer);
1292 		state->ptr = state->buffer;
1293 	} else {
1294 		state->ptr = state->lrc->bo->vmap.vaddr + state->offset;
1295 	}
1296 
1297 	remain = state->max_size / sizeof(u32);
1298 
1299 	for (size_t i = 0; i < state->num_funcs; i++) {
1300 		ssize_t len = state->funcs[i].setup(state->lrc, state->hwe,
1301 						    state->ptr, remain);
1302 
1303 		remain -= len;
1304 
1305 		/*
1306 		 * Caller has asked for at least reserve_dw to remain unused.
1307 		 */
1308 		if (len < 0 ||
1309 		    xe_gt_WARN_ON(state->lrc->gt, remain < state->reserve_dw))
1310 			goto fail;
1311 
1312 		state->ptr += len;
1313 		state->written += len;
1314 	}
1315 
1316 	return 0;
1317 
1318 fail:
1319 	return -ENOSPC;
1320 }
1321 
1322 static void finish_bo(struct bo_setup_state *state)
1323 {
1324 	if (!state->lrc->bo->vmap.is_iomem)
1325 		return;
1326 
1327 	xe_map_memcpy_to(gt_to_xe(state->lrc->gt), &state->lrc->bo->vmap,
1328 			 state->offset, state->buffer,
1329 			 state->written * sizeof(u32));
1330 }
1331 
1332 /**
1333  * xe_lrc_setup_wa_bb_with_scratch - Execute all wa bb setup callbacks.
1334  * @lrc: the &xe_lrc struct instance
1335  * @hwe: the &xe_hw_engine struct instance
1336  * @scratch: preallocated scratch buffer for temporary storage
1337  * Return: 0 on success, negative error code on failure
1338  */
1339 int xe_lrc_setup_wa_bb_with_scratch(struct xe_lrc *lrc, struct xe_hw_engine *hwe, u32 *scratch)
1340 {
1341 	static const struct bo_setup funcs[] = {
1342 		{ .setup = setup_timestamp_wa },
1343 		{ .setup = setup_invalidate_state_cache_wa },
1344 		{ .setup = setup_utilization_wa },
1345 		{ .setup = setup_configfs_post_ctx_restore_bb },
1346 	};
1347 	struct bo_setup_state state = {
1348 		.lrc = lrc,
1349 		.hwe = hwe,
1350 		.max_size = LRC_WA_BB_SIZE,
1351 		.buffer = scratch,
1352 		.reserve_dw = 1,
1353 		.offset = __xe_lrc_wa_bb_offset(lrc),
1354 		.funcs = funcs,
1355 		.num_funcs = ARRAY_SIZE(funcs),
1356 	};
1357 	int ret;
1358 
1359 	ret = setup_bo(&state);
1360 	if (ret)
1361 		return ret;
1362 
1363 	*state.ptr++ = MI_BATCH_BUFFER_END;
1364 	state.written++;
1365 
1366 	finish_bo(&state);
1367 
1368 	xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
1369 			     xe_bo_ggtt_addr(lrc->bo) + state.offset + 1);
1370 
1371 	return 0;
1372 }
1373 
1374 static int setup_wa_bb(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
1375 {
1376 	u32 *buf = NULL;
1377 	int ret;
1378 
1379 	if (lrc->bo->vmap.is_iomem) {
1380 		buf = kmalloc(LRC_WA_BB_SIZE, GFP_KERNEL);
1381 		if (!buf)
1382 			return -ENOMEM;
1383 	}
1384 
1385 	ret = xe_lrc_setup_wa_bb_with_scratch(lrc, hwe, buf);
1386 
1387 	kfree(buf);
1388 
1389 	return ret;
1390 }
1391 
1392 static int
1393 setup_indirect_ctx(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
1394 {
1395 	static const struct bo_setup rcs_funcs[] = {
1396 		{ .setup = setup_timestamp_wa },
1397 		{ .setup = setup_invalidate_auxccs_wa },
1398 		{ .setup = setup_configfs_mid_ctx_restore_bb },
1399 	};
1400 	static const struct bo_setup xcs_funcs[] = {
1401 		{ .setup = setup_invalidate_auxccs_wa },
1402 		{ .setup = setup_configfs_mid_ctx_restore_bb },
1403 	};
1404 	struct bo_setup_state state = {
1405 		.lrc = lrc,
1406 		.hwe = hwe,
1407 		.max_size = (63 * 64) /* max 63 cachelines */,
1408 		.buffer = NULL,
1409 		.offset = __xe_lrc_indirect_ctx_offset(lrc),
1410 	};
1411 	int ret;
1412 
1413 	if (!(lrc->flags & XE_LRC_FLAG_INDIRECT_CTX))
1414 		return 0;
1415 
1416 	if (hwe->class == XE_ENGINE_CLASS_RENDER ||
1417 	    hwe->class == XE_ENGINE_CLASS_COMPUTE) {
1418 		state.funcs = rcs_funcs;
1419 		state.num_funcs = ARRAY_SIZE(rcs_funcs);
1420 	} else {
1421 		state.funcs = xcs_funcs;
1422 		state.num_funcs = ARRAY_SIZE(xcs_funcs);
1423 	}
1424 
1425 	if (xe_gt_WARN_ON(lrc->gt, !state.funcs))
1426 		return 0;
1427 
1428 	if (lrc->bo->vmap.is_iomem) {
1429 		state.buffer = kmalloc(state.max_size, GFP_KERNEL);
1430 		if (!state.buffer)
1431 			return -ENOMEM;
1432 	}
1433 
1434 	ret = setup_bo(&state);
1435 	if (ret) {
1436 		kfree(state.buffer);
1437 		return ret;
1438 	}
1439 
1440 	/*
1441 	 * Align to 64B cacheline so there's no garbage at the end for CS to
1442 	 * execute: size for indirect ctx must be a multiple of 64.
1443 	 */
1444 	while (state.written & 0xf) {
1445 		*state.ptr++ = MI_NOOP;
1446 		state.written++;
1447 	}
1448 
1449 	finish_bo(&state);
1450 	kfree(state.buffer);
1451 
1452 	/*
1453 	 * Enable INDIRECT_CTX leaving INDIRECT_CTX_OFFSET at its default: it
1454 	 * varies per engine class, but the default is good enough
1455 	 */
1456 	xe_lrc_write_ctx_reg(lrc,
1457 			     CTX_CS_INDIRECT_CTX,
1458 			     (xe_bo_ggtt_addr(lrc->bo) + state.offset) |
1459 			     /* Size in CLs. */
1460 			     (state.written * sizeof(u32) / 64));
1461 
1462 	return 0;
1463 }
1464 
1465 static u8 xe_multi_queue_prio_to_lrc(struct xe_lrc *lrc, enum xe_multi_queue_priority priority)
1466 {
1467 	struct xe_device *xe = gt_to_xe(lrc->gt);
1468 
1469 	xe_assert(xe, (priority >= XE_MULTI_QUEUE_PRIORITY_LOW &&
1470 		       priority <= XE_MULTI_QUEUE_PRIORITY_HIGH));
1471 
1472 	/* xe_multi_queue_priority is directly mapped to LRC priority values */
1473 	return priority;
1474 }
1475 
1476 /**
1477  * xe_lrc_set_multi_queue_priority() - Set multi queue priority in LRC
1478  * @lrc: Logical Ring Context
1479  * @priority: Multi queue priority of the exec queue
1480  *
1481  * Convert @priority to LRC multi queue priority and update the @lrc descriptor
1482  */
1483 void xe_lrc_set_multi_queue_priority(struct xe_lrc *lrc, enum xe_multi_queue_priority priority)
1484 {
1485 	lrc->desc &= ~LRC_PRIORITY;
1486 	lrc->desc |= FIELD_PREP(LRC_PRIORITY, xe_multi_queue_prio_to_lrc(lrc, priority));
1487 }
1488 
1489 static int xe_lrc_ctx_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
1490 			   void *replay_state, u16 msix_vec, u32 init_flags)
1491 {
1492 	struct xe_gt *gt = hwe->gt;
1493 	struct xe_tile *tile = gt_to_tile(gt);
1494 	struct xe_device *xe = gt_to_xe(gt);
1495 	struct iosys_map map;
1496 	u32 arb_enable;
1497 	u32 state_cache_perf_fix[3];
1498 	int err;
1499 
1500 	/*
1501 	 * Init Per-Process of HW status Page, LRC / context state to known
1502 	 * values. If there's already a primed default_lrc, just copy it, otherwise
1503 	 * it's the early submission to record the lrc: build a new empty one from
1504 	 * scratch.
1505 	 */
1506 	map = __xe_lrc_pphwsp_map(lrc);
1507 	if (gt->default_lrc[hwe->class] || replay_state) {
1508 		xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE);	/* PPHWSP */
1509 		xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
1510 				 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
1511 				 lrc->size - LRC_PPHWSP_SIZE);
1512 		if (replay_state)
1513 			xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
1514 					 replay_state, lrc->replay_size);
1515 	} else {
1516 		void *init_data = empty_lrc_data(hwe);
1517 
1518 		if (!init_data) {
1519 			return -ENOMEM;
1520 		}
1521 
1522 		xe_map_memcpy_to(xe, &map, 0, init_data, lrc->size);
1523 		kfree(init_data);
1524 	}
1525 
1526 	if (vm)
1527 		xe_lrc_set_ppgtt(lrc, vm);
1528 
1529 	if (xe_device_has_msix(xe)) {
1530 		xe_lrc_write_ctx_reg(lrc, CTX_INT_STATUS_REPORT_PTR,
1531 				     xe_memirq_status_ptr(&tile->memirq, hwe));
1532 		xe_lrc_write_ctx_reg(lrc, CTX_INT_SRC_REPORT_PTR,
1533 				     xe_memirq_source_ptr(&tile->memirq, hwe));
1534 		xe_lrc_write_ctx_reg(lrc, CTX_CS_INT_VEC_DATA, msix_vec << 16 | msix_vec);
1535 	}
1536 
1537 	if (xe_gt_has_indirect_ring_state(gt)) {
1538 		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
1539 				     __xe_lrc_indirect_ring_ggtt_addr(lrc));
1540 
1541 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
1542 					      __xe_lrc_ring_ggtt_addr(lrc));
1543 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START_UDW, 0);
1544 
1545 		/* Match head and tail pointers */
1546 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, lrc->ring.tail);
1547 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, lrc->ring.tail);
1548 
1549 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_CTL,
1550 					      RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
1551 	} else {
1552 		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
1553 
1554 		/* Match head and tail pointers */
1555 		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, lrc->ring.tail);
1556 		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);
1557 
1558 		xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
1559 				     RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
1560 	}
1561 
1562 	if (init_flags & XE_LRC_CREATE_RUNALONE)
1563 		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
1564 				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
1565 				     REG_MASKED_FIELD_ENABLE(CTX_CTRL_RUN_ALONE));
1566 
1567 	if (init_flags & XE_LRC_CREATE_PXP)
1568 		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
1569 				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
1570 				     REG_MASKED_FIELD_ENABLE(CTX_CTRL_PXP_ENABLE));
1571 
1572 	lrc->ctx_timestamp = 0;
1573 	xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0);
1574 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
1575 		xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP_UDW, 0);
1576 
1577 	/*
1578 	 * Note: It's possible that this LRC may belong to an exec_queue that is
1579 	 * not part of a multi-queue group. That said, it doesn't hurt to set
1580 	 * this field anyways since any class that supports multi-queue will
1581 	 * have these LRC fields defined.
1582 	 */
1583 	if (xe_gt_supports_multi_queue(gt, hwe->class)) {
1584 		lrc->queue_timestamp = 0;
1585 		xe_lrc_write_ctx_reg(lrc, CTX_QUEUE_TIMESTAMP, 0);
1586 		xe_lrc_write_ctx_reg(lrc, CTX_QUEUE_TIMESTAMP_UDW, 0);
1587 	}
1588 
1589 	if (xe->info.has_asid && vm)
1590 		xe_lrc_write_ctx_reg(lrc, CTX_ASID, vm->usm.asid);
1591 
1592 	lrc->desc = LRC_VALID;
1593 	lrc->desc |= FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT);
1594 	/* TODO: Priority */
1595 
1596 	/* While this appears to have something about privileged batches or
1597 	 * some such, it really just means PPGTT mode.
1598 	 */
1599 	if (vm)
1600 		lrc->desc |= LRC_PRIVILEGE;
1601 
1602 	if (GRAPHICS_VERx100(xe) < 1250) {
1603 		lrc->desc |= FIELD_PREP(LRC_ENGINE_INSTANCE, hwe->instance);
1604 		lrc->desc |= FIELD_PREP(LRC_ENGINE_CLASS, hwe->class);
1605 	}
1606 
1607 	arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1608 	xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));
1609 
1610 	if (init_flags & XE_LRC_DISABLE_STATE_CACHE_PERF_FIX) {
1611 		state_cache_perf_fix[0] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1);
1612 		state_cache_perf_fix[1] = COMMON_SLICE_CHICKEN3.addr;
1613 		state_cache_perf_fix[2] = REG_MASKED_FIELD_ENABLE(DISABLE_STATE_CACHE_PERF_FIX);
1614 		xe_lrc_write_ring(lrc, state_cache_perf_fix, sizeof(state_cache_perf_fix));
1615 	}
1616 
1617 	map = __xe_lrc_seqno_map(lrc);
1618 	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
1619 
1620 	map = __xe_lrc_start_seqno_map(lrc);
1621 	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
1622 
1623 	err = setup_wa_bb(lrc, hwe);
1624 	if (err)
1625 		return err;
1626 
1627 	err = setup_indirect_ctx(lrc, hwe);
1628 
1629 	return err;
1630 }
1631 
1632 static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
1633 		       void *replay_state, u32 ring_size, u16 msix_vec, u32 init_flags)
1634 {
1635 	struct xe_gt *gt = hwe->gt;
1636 	const u32 lrc_size = xe_gt_lrc_size(gt, hwe->class);
1637 	u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
1638 	struct xe_tile *tile = gt_to_tile(gt);
1639 	struct xe_device *xe = gt_to_xe(gt);
1640 	struct xe_bo *bo;
1641 	u32 bo_flags;
1642 	int err;
1643 
1644 	kref_init(&lrc->refcount);
1645 	lrc->gt = gt;
1646 	lrc->replay_size = xe_gt_lrc_hang_replay_size(gt, hwe->class);
1647 	lrc->size = lrc_size;
1648 	lrc->flags = 0;
1649 	lrc->ring.size = ring_size;
1650 	lrc->ring.tail = 0;
1651 
1652 	if (gt_engine_needs_indirect_ctx(gt, hwe->class)) {
1653 		lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
1654 		bo_size += LRC_INDIRECT_CTX_BO_SIZE;
1655 	}
1656 
1657 	if (xe_gt_has_indirect_ring_state(gt))
1658 		lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;
1659 
1660 	bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
1661 		   XE_BO_FLAG_GGTT_INVALIDATE;
1662 
1663 	if ((vm && vm->xef) || init_flags & XE_LRC_CREATE_USER_CTX) /* userspace */
1664 		bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE | XE_BO_FLAG_FORCE_USER_VRAM;
1665 
1666 	bo = xe_bo_create_pin_map_novm(xe, tile, bo_size,
1667 				       ttm_bo_type_kernel,
1668 				       bo_flags, false);
1669 	if (IS_ERR(bo))
1670 		return PTR_ERR(bo);
1671 
1672 	lrc->bo = bo;
1673 
1674 	bo = xe_bo_create_pin_map_novm(xe, tile, PAGE_SIZE,
1675 				       ttm_bo_type_kernel,
1676 				       XE_BO_FLAG_GGTT |
1677 				       XE_BO_FLAG_GGTT_INVALIDATE |
1678 				       XE_BO_FLAG_SYSTEM, false);
1679 	if (IS_ERR(bo)) {
1680 		err = PTR_ERR(bo);
1681 		goto err_lrc_finish;
1682 	}
1683 	lrc->seqno_bo = bo;
1684 
1685 	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
1686 			     hwe->fence_irq, hwe->name);
1687 
1688 	err = xe_lrc_ctx_init(lrc, hwe, vm, replay_state, msix_vec, init_flags);
1689 	if (err)
1690 		goto err_lrc_finish;
1691 
1692 	if (vm && vm->xef)
1693 		xe_drm_client_add_bo(vm->xef->client, lrc->bo);
1694 
1695 	return 0;
1696 
1697 err_lrc_finish:
1698 	xe_lrc_finish(lrc);
1699 	return err;
1700 }
1701 
1702 /**
1703  * xe_lrc_create - Create a LRC
1704  * @hwe: Hardware Engine
1705  * @vm: The VM (address space)
1706  * @replay_state: GPU hang replay state
1707  * @ring_size: LRC ring size
1708  * @msix_vec: MSI-X interrupt vector (for platforms that support it)
1709  * @flags: LRC initialization flags
1710  *
1711  * Allocate and initialize the Logical Ring Context (LRC).
1712  *
1713  * Return pointer to created LRC upon success and an error pointer
1714  * upon failure.
1715  */
1716 struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
1717 			     void *replay_state, u32 ring_size, u16 msix_vec, u32 flags)
1718 {
1719 	struct xe_lrc *lrc;
1720 	int err;
1721 
1722 	lrc = kzalloc_obj(*lrc);
1723 	if (!lrc)
1724 		return ERR_PTR(-ENOMEM);
1725 
1726 	err = xe_lrc_init(lrc, hwe, vm, replay_state, ring_size, msix_vec, flags);
1727 	if (err) {
1728 		kfree(lrc);
1729 		return ERR_PTR(err);
1730 	}
1731 
1732 	return lrc;
1733 }
1734 
1735 /**
1736  * xe_lrc_destroy - Destroy the LRC
1737  * @ref: reference to LRC
1738  *
1739  * Called when ref == 0, release resources held by the Logical Ring Context
1740  * (LRC) and free the LRC memory.
1741  */
1742 void xe_lrc_destroy(struct kref *ref)
1743 {
1744 	struct xe_lrc *lrc = container_of(ref, struct xe_lrc, refcount);
1745 
1746 	xe_lrc_finish(lrc);
1747 	kfree(lrc);
1748 }
1749 
1750 /**
1751  * xe_lrc_update_hwctx_regs_with_address - Re-compute GGTT references within given LRC.
1752  * @lrc: the &xe_lrc struct instance
1753  */
1754 void xe_lrc_update_hwctx_regs_with_address(struct xe_lrc *lrc)
1755 {
1756 	if (xe_lrc_has_indirect_ring_state(lrc)) {
1757 		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
1758 				     __xe_lrc_indirect_ring_ggtt_addr(lrc));
1759 
1760 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
1761 					      __xe_lrc_ring_ggtt_addr(lrc));
1762 	} else {
1763 		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
1764 	}
1765 }
1766 
1767 void xe_lrc_set_ring_tail(struct xe_lrc *lrc, u32 tail)
1768 {
1769 	if (xe_lrc_has_indirect_ring_state(lrc))
1770 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, tail);
1771 	else
1772 		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, tail);
1773 }
1774 
1775 u32 xe_lrc_ring_tail(struct xe_lrc *lrc)
1776 {
1777 	if (xe_lrc_has_indirect_ring_state(lrc))
1778 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL) & TAIL_ADDR;
1779 	else
1780 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL) & TAIL_ADDR;
1781 }
1782 
1783 static u32 xe_lrc_ring_start(struct xe_lrc *lrc)
1784 {
1785 	if (xe_lrc_has_indirect_ring_state(lrc))
1786 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START);
1787 	else
1788 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_START);
1789 }
1790 
1791 void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
1792 {
1793 	if (xe_lrc_has_indirect_ring_state(lrc))
1794 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, head);
1795 	else
1796 		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
1797 }
1798 
1799 u32 xe_lrc_ring_head(struct xe_lrc *lrc)
1800 {
1801 	if (xe_lrc_has_indirect_ring_state(lrc))
1802 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD) & HEAD_ADDR;
1803 	else
1804 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
1805 }
1806 
1807 u32 xe_lrc_ring_space(struct xe_lrc *lrc)
1808 {
1809 	const u32 head = xe_lrc_ring_head(lrc);
1810 	const u32 tail = lrc->ring.tail;
1811 	const u32 size = lrc->ring.size;
1812 
1813 	return ((head - tail - 1) & (size - 1)) + 1;
1814 }
1815 
1816 static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
1817 				const void *data, size_t size)
1818 {
1819 	struct xe_device *xe = lrc_to_xe(lrc);
1820 
1821 	iosys_map_incr(&ring, lrc->ring.tail);
1822 	xe_map_memcpy_to(xe, &ring, 0, data, size);
1823 	lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
1824 }
1825 
1826 void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
1827 {
1828 	struct xe_device *xe = lrc_to_xe(lrc);
1829 	struct iosys_map ring;
1830 	u32 rhs;
1831 	size_t aligned_size;
1832 
1833 	xe_assert(xe, IS_ALIGNED(size, 4));
1834 	aligned_size = ALIGN(size, 8);
1835 
1836 	ring = __xe_lrc_ring_map(lrc);
1837 
1838 	xe_assert(xe, lrc->ring.tail < lrc->ring.size);
1839 	rhs = lrc->ring.size - lrc->ring.tail;
1840 	if (size > rhs) {
1841 		__xe_lrc_write_ring(lrc, ring, data, rhs);
1842 		__xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
1843 	} else {
1844 		__xe_lrc_write_ring(lrc, ring, data, size);
1845 	}
1846 
1847 	if (aligned_size > size) {
1848 		u32 noop = MI_NOOP;
1849 
1850 		__xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
1851 	}
1852 }
1853 
1854 u64 xe_lrc_descriptor(struct xe_lrc *lrc)
1855 {
1856 	return lrc->desc | xe_lrc_ggtt_addr(lrc);
1857 }
1858 
1859 u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
1860 {
1861 	return __xe_lrc_seqno_ggtt_addr(lrc);
1862 }
1863 
/**
 * xe_lrc_alloc_seqno_fence() - Allocate an lrc seqno fence.
 *
 * Allocate but don't initialize an lrc seqno fence. The allocation is
 * delegated to xe_hw_fence_alloc().
 *
 * Return: Pointer to the allocated fence or
 * negative error pointer on error.
 */
struct dma_fence *xe_lrc_alloc_seqno_fence(void)
{
	struct dma_fence *fence = xe_hw_fence_alloc();

	return fence;
}
1876 
/**
 * xe_lrc_free_seqno_fence() - Free an lrc seqno fence.
 * @fence: Pointer to the fence to free.
 *
 * Frees an lrc seqno fence that hasn't yet been
 * initialized, by handing it to xe_hw_fence_free().
 */
void xe_lrc_free_seqno_fence(struct dma_fence *fence)
{
	struct dma_fence *unused_fence = fence;

	xe_hw_fence_free(unused_fence);
}
1888 
1889 /**
1890  * xe_lrc_init_seqno_fence() - Initialize an lrc seqno fence.
1891  * @lrc: Pointer to the lrc.
1892  * @fence: Pointer to the fence to initialize.
1893  *
1894  * Initializes a pre-allocated lrc seqno fence.
1895  * After initialization, the fence is subject to normal
1896  * dma-fence refcounting.
1897  */
1898 void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence)
1899 {
1900 	xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc));
1901 }
1902 
1903 s32 xe_lrc_seqno(struct xe_lrc *lrc)
1904 {
1905 	struct iosys_map map = __xe_lrc_seqno_map(lrc);
1906 
1907 	return xe_map_read32(lrc_to_xe(lrc), &map);
1908 }
1909 
1910 s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
1911 {
1912 	struct iosys_map map = __xe_lrc_start_seqno_map(lrc);
1913 
1914 	return xe_map_read32(lrc_to_xe(lrc), &map);
1915 }
1916 
1917 u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
1918 {
1919 	return __xe_lrc_start_seqno_ggtt_addr(lrc);
1920 }
1921 
1922 u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
1923 {
1924 	return __xe_lrc_parallel_ggtt_addr(lrc);
1925 }
1926 
1927 struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
1928 {
1929 	return __xe_lrc_parallel_map(lrc);
1930 }
1931 
1932 /**
1933  * xe_lrc_engine_id() - Read engine id value
1934  * @lrc: Pointer to the lrc.
1935  *
1936  * Returns: context id value
1937  */
1938 static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
1939 {
1940 	struct xe_device *xe = lrc_to_xe(lrc);
1941 	struct iosys_map map;
1942 
1943 	map = __xe_lrc_engine_id_map(lrc);
1944 	return xe_map_read32(xe, &map);
1945 }
1946 
1947 static int instr_dw(u32 cmd_header)
1948 {
1949 	/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
1950 	if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
1951 	    GFXPIPE_SINGLE_DW_CMD(0, 0))
1952 		return 1;
1953 
1954 	/* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
1955 	if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
1956 		return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;
1957 
1958 	/* Most instructions have the # of dwords (minus 2) in 7:0 */
1959 	return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
1960 }
1961 
1962 static int dump_mi_command(struct drm_printer *p,
1963 			   struct xe_gt *gt,
1964 			   u32 *start,
1965 			   u32 *dw,
1966 			   int remaining_dw)
1967 {
1968 	u32 inst_header = *dw;
1969 	u32 numdw = instr_dw(inst_header);
1970 	u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
1971 	int num_noop;
1972 
1973 	/* First check for commands that don't have/use a '# DW' field */
1974 	switch (inst_header & MI_OPCODE) {
1975 	case MI_NOOP:
1976 		num_noop = 1;
1977 		while (num_noop < remaining_dw &&
1978 		       (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
1979 			num_noop++;
1980 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] MI_NOOP (%d dwords)\n",
1981 			   dw - num_noop - start, inst_header, num_noop);
1982 		return num_noop;
1983 
1984 	case MI_TOPOLOGY_FILTER:
1985 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] MI_TOPOLOGY_FILTER\n",
1986 			   dw - start, inst_header);
1987 		return 1;
1988 
1989 	case MI_BATCH_BUFFER_END:
1990 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] MI_BATCH_BUFFER_END\n",
1991 			   dw - start, inst_header);
1992 		/* Return 'remaining_dw' to consume the rest of the LRC */
1993 		return remaining_dw;
1994 	}
1995 
1996 	/*
1997 	 * Any remaining commands include a # of dwords.  We should make sure
1998 	 * it doesn't exceed the remaining size of the LRC.
1999 	 */
2000 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
2001 		numdw = remaining_dw;
2002 
2003 	switch (inst_header & MI_OPCODE) {
2004 	case MI_LOAD_REGISTER_IMM:
2005 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
2006 			   dw - start, inst_header, (numdw - 1) / 2);
2007 		for (int i = 1; i < numdw; i += 2)
2008 			drm_printf(p, "LRC[%#5tx]  =  - %#6x = %#010x\n",
2009 				   &dw[i] - start, dw[i], dw[i + 1]);
2010 		return numdw;
2011 
2012 	case MI_LOAD_REGISTER_MEM & MI_OPCODE:
2013 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] MI_LOAD_REGISTER_MEM: %s%s\n",
2014 			   dw - start, inst_header,
2015 			   dw[0] & MI_LRI_LRM_CS_MMIO ? "CS_MMIO " : "",
2016 			   dw[0] & MI_LRM_USE_GGTT ? "USE_GGTT " : "");
2017 		if (numdw == 4)
2018 			drm_printf(p, "LRC[%#5tx]  =  - %#6x = %#010llx\n",
2019 				   dw - start,
2020 				   dw[1], ((u64)(dw[3]) << 32 | (u64)(dw[2])));
2021 		else
2022 			drm_printf(p, "LRC[%#5tx]  =  - %*ph (%s)\n",
2023 				   dw - start, (int)sizeof(u32) * (numdw - 1),
2024 				   dw + 1, numdw < 4 ? "truncated" : "malformed");
2025 		return numdw;
2026 
2027 	case MI_FORCE_WAKEUP:
2028 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] MI_FORCE_WAKEUP\n",
2029 			   dw - start, inst_header);
2030 		return numdw;
2031 
2032 	default:
2033 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] unknown MI opcode %#x, likely %d dwords\n",
2034 			   dw - start, inst_header, opcode, numdw);
2035 		return numdw;
2036 	}
2037 }
2038 
2039 static int dump_gfxpipe_command(struct drm_printer *p,
2040 				struct xe_gt *gt,
2041 				u32 *start,
2042 				u32 *dw,
2043 				int remaining_dw)
2044 {
2045 	u32 numdw = instr_dw(*dw);
2046 	u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
2047 	u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
2048 	u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);
2049 
2050 	/*
2051 	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
2052 	 * remaining size of the LRC.
2053 	 */
2054 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
2055 		numdw = remaining_dw;
2056 
2057 	switch (*dw & GFXPIPE_MATCH_MASK) {
2058 #define MATCH(cmd) \
2059 	case cmd: \
2060 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] " #cmd " (%d dwords)\n", \
2061 			   dw - start, *dw, numdw); \
2062 		return numdw
2063 #define MATCH3D(cmd) \
2064 	case CMD_##cmd: \
2065 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] " #cmd " (%d dwords)\n", \
2066 			   dw - start, *dw, numdw); \
2067 		return numdw
2068 
2069 	MATCH(STATE_BASE_ADDRESS);
2070 	MATCH(STATE_SIP);
2071 	MATCH(GPGPU_CSR_BASE_ADDRESS);
2072 	MATCH(STATE_COMPUTE_MODE);
2073 	MATCH3D(3DSTATE_BTD);
2074 	MATCH(STATE_SYSTEM_MEM_FENCE_ADDRESS);
2075 	MATCH(STATE_CONTEXT_DATA_BASE_ADDRESS);
2076 
2077 	MATCH3D(3DSTATE_VF_STATISTICS);
2078 
2079 	MATCH(PIPELINE_SELECT);
2080 
2081 	MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST);
2082 	MATCH3D(3DSTATE_CUSTOM_SAMPLE_PATTERN);
2083 	MATCH3D(3DSTATE_CLEAR_PARAMS);
2084 	MATCH3D(3DSTATE_DEPTH_BUFFER);
2085 	MATCH3D(3DSTATE_STENCIL_BUFFER);
2086 	MATCH3D(3DSTATE_HIER_DEPTH_BUFFER);
2087 	MATCH3D(3DSTATE_VERTEX_BUFFERS);
2088 	MATCH3D(3DSTATE_VERTEX_ELEMENTS);
2089 	MATCH3D(3DSTATE_INDEX_BUFFER);
2090 	MATCH3D(3DSTATE_VF);
2091 	MATCH3D(3DSTATE_MULTISAMPLE);
2092 	MATCH3D(3DSTATE_CC_STATE_POINTERS);
2093 	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS);
2094 	MATCH3D(3DSTATE_VS);
2095 	MATCH3D(3DSTATE_GS);
2096 	MATCH3D(3DSTATE_CLIP);
2097 	MATCH3D(3DSTATE_SF);
2098 	MATCH3D(3DSTATE_WM);
2099 	MATCH3D(3DSTATE_CONSTANT_VS);
2100 	MATCH3D(3DSTATE_CONSTANT_GS);
2101 	MATCH3D(3DSTATE_CONSTANT_PS);
2102 	MATCH3D(3DSTATE_SAMPLE_MASK);
2103 	MATCH3D(3DSTATE_CONSTANT_HS);
2104 	MATCH3D(3DSTATE_CONSTANT_DS);
2105 	MATCH3D(3DSTATE_HS);
2106 	MATCH3D(3DSTATE_TE);
2107 	MATCH3D(3DSTATE_DS);
2108 	MATCH3D(3DSTATE_STREAMOUT);
2109 	MATCH3D(3DSTATE_SBE);
2110 	MATCH3D(3DSTATE_PS);
2111 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
2112 	MATCH3D(3DSTATE_CPS_POINTERS);
2113 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC);
2114 	MATCH3D(3DSTATE_BLEND_STATE_POINTERS);
2115 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS);
2116 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS);
2117 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS);
2118 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS);
2119 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS);
2120 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS);
2121 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS);
2122 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS);
2123 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS);
2124 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS);
2125 	MATCH3D(3DSTATE_VF_INSTANCING);
2126 	MATCH3D(3DSTATE_VF_SGVS);
2127 	MATCH3D(3DSTATE_VF_TOPOLOGY);
2128 	MATCH3D(3DSTATE_WM_CHROMAKEY);
2129 	MATCH3D(3DSTATE_PS_BLEND);
2130 	MATCH3D(3DSTATE_WM_DEPTH_STENCIL);
2131 	MATCH3D(3DSTATE_PS_EXTRA);
2132 	MATCH3D(3DSTATE_RASTER);
2133 	MATCH3D(3DSTATE_SBE_SWIZ);
2134 	MATCH3D(3DSTATE_WM_HZ_OP);
2135 	MATCH3D(3DSTATE_VF_COMPONENT_PACKING);
2136 	MATCH3D(3DSTATE_VF_SGVS_2);
2137 	MATCH3D(3DSTATE_VFG);
2138 	MATCH3D(3DSTATE_URB_ALLOC_VS);
2139 	MATCH3D(3DSTATE_URB_ALLOC_HS);
2140 	MATCH3D(3DSTATE_URB_ALLOC_DS);
2141 	MATCH3D(3DSTATE_URB_ALLOC_GS);
2142 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_0);
2143 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_1);
2144 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_2);
2145 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_3);
2146 	MATCH3D(3DSTATE_PRIMITIVE_REPLICATION);
2147 	MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO);
2148 	MATCH3D(3DSTATE_AMFS);
2149 	MATCH3D(3DSTATE_DEPTH_BOUNDS);
2150 	MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS);
2151 	MATCH3D(3DSTATE_CONSTANT_TS_POINTER);
2152 	MATCH3D(3DSTATE_MESH_CONTROL);
2153 	MATCH3D(3DSTATE_MESH_DISTRIB);
2154 	MATCH3D(3DSTATE_TASK_REDISTRIB);
2155 	MATCH3D(3DSTATE_MESH_SHADER);
2156 	MATCH3D(3DSTATE_MESH_SHADER_DATA);
2157 	MATCH3D(3DSTATE_TASK_CONTROL);
2158 	MATCH3D(3DSTATE_TASK_SHADER);
2159 	MATCH3D(3DSTATE_TASK_SHADER_DATA);
2160 	MATCH3D(3DSTATE_URB_ALLOC_MESH);
2161 	MATCH3D(3DSTATE_URB_ALLOC_TASK);
2162 	MATCH3D(3DSTATE_CLIP_MESH);
2163 	MATCH3D(3DSTATE_SBE_MESH);
2164 	MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER);
2165 	MATCH3D(3DSTATE_COARSE_PIXEL);
2166 	MATCH3D(3DSTATE_MESH_SHADER_DATA_EXT);
2167 	MATCH3D(3DSTATE_TASK_SHADER_DATA_EXT);
2168 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC_2);
2169 	MATCH3D(3DSTATE_CC_STATE_POINTERS_2);
2170 	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS_2);
2171 	MATCH3D(3DSTATE_BLEND_STATE_POINTERS_2);
2172 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP_2);
2173 
2174 	MATCH3D(3DSTATE_DRAWING_RECTANGLE);
2175 	MATCH3D(3DSTATE_URB_MEMORY);
2176 	MATCH3D(3DSTATE_CHROMA_KEY);
2177 	MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET);
2178 	MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN);
2179 	MATCH3D(3DSTATE_LINE_STIPPLE);
2180 	MATCH3D(3DSTATE_AA_LINE_PARAMETERS);
2181 	MATCH3D(3DSTATE_MONOFILTER_SIZE);
2182 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS);
2183 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS);
2184 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS);
2185 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS);
2186 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS);
2187 	MATCH3D(3DSTATE_SO_DECL_LIST);
2188 	MATCH3D(3DSTATE_SO_BUFFER);
2189 	MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC);
2190 	MATCH3D(3DSTATE_SAMPLE_PATTERN);
2191 	MATCH3D(3DSTATE_3D_MODE);
2192 	MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE);
2193 	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS);
2194 	MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO);
2195 	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTER_2);
2196 
2197 	default:
2198 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n",
2199 			   dw - start, *dw, pipeline, opcode, subopcode, numdw);
2200 		return numdw;
2201 	}
2202 }
2203 
2204 static int dump_gfx_state_command(struct drm_printer *p,
2205 				  struct xe_gt *gt,
2206 				  u32 *start,
2207 				  u32 *dw,
2208 				  int remaining_dw)
2209 {
2210 	u32 numdw = instr_dw(*dw);
2211 	u32 opcode = REG_FIELD_GET(GFX_STATE_OPCODE, *dw);
2212 
2213 	/*
2214 	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
2215 	 * remaining size of the LRC.
2216 	 */
2217 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
2218 		numdw = remaining_dw;
2219 
2220 	switch (*dw & (XE_INSTR_GFX_STATE | GFX_STATE_OPCODE)) {
2221 	MATCH(STATE_WRITE_INLINE);
2222 
2223 	default:
2224 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] unknown GFX_STATE command (opcode=%#x), likely %d dwords\n",
2225 			   dw - start, *dw, opcode, numdw);
2226 		return numdw;
2227 	}
2228 }
2229 
2230 void xe_lrc_dump_default(struct drm_printer *p,
2231 			 struct xe_gt *gt,
2232 			 enum xe_engine_class hwe_class)
2233 {
2234 	u32 *dw, *start;
2235 	int remaining_dw, num_dw;
2236 
2237 	if (!gt->default_lrc[hwe_class]) {
2238 		drm_printf(p, "No default LRC for class %d\n", hwe_class);
2239 		return;
2240 	}
2241 
2242 	/*
2243 	 * Skip the beginning of the LRC since it contains the per-process
2244 	 * hardware status page.
2245 	 */
2246 	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
2247 	start = dw;
2248 	remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;
2249 
2250 	while (remaining_dw > 0) {
2251 		if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
2252 			num_dw = dump_mi_command(p, gt, start, dw, remaining_dw);
2253 		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
2254 			num_dw = dump_gfxpipe_command(p, gt, start, dw, remaining_dw);
2255 		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFX_STATE) {
2256 			num_dw = dump_gfx_state_command(p, gt, start, dw, remaining_dw);
2257 		} else {
2258 			num_dw = min(instr_dw(*dw), remaining_dw);
2259 			drm_printf(p, "LRC[%#5tx]  =  [%#10x] Unknown instruction of type %#x, likely %d dwords\n",
2260 				   dw - start,
2261 				   *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw),
2262 				   num_dw);
2263 		}
2264 
2265 		dw += num_dw;
2266 		remaining_dw -= num_dw;
2267 	}
2268 }
2269 
2270 /*
2271  * Lookup the value of a register within the offset/value pairs of an
2272  * MI_LOAD_REGISTER_IMM instruction.
2273  *
2274  * Return -ENOENT if the register is not present in the MI_LRI instruction.
2275  */
2276 static int lookup_reg_in_mi_lri(u32 offset, u32 *value,
2277 				const u32 *dword_pair, int num_regs)
2278 {
2279 	for (int i = 0; i < num_regs; i++) {
2280 		if (dword_pair[2 * i] == offset) {
2281 			*value = dword_pair[2 * i + 1];
2282 			return 0;
2283 		}
2284 	}
2285 
2286 	return -ENOENT;
2287 }
2288 
2289 /*
2290  * Lookup the value of a register in a specific engine type's default LRC.
2291  *
2292  * Return -EINVAL if the default LRC doesn't exist, or ENOENT if the register
2293  * cannot be found in the default LRC.
2294  */
2295 int xe_lrc_lookup_default_reg_value(struct xe_gt *gt,
2296 				    enum xe_engine_class hwe_class,
2297 				    u32 offset,
2298 				    u32 *value)
2299 {
2300 	u32 *dw;
2301 	int remaining_dw, ret;
2302 
2303 	if (!gt->default_lrc[hwe_class])
2304 		return -EINVAL;
2305 
2306 	/*
2307 	 * Skip the beginning of the LRC since it contains the per-process
2308 	 * hardware status page.
2309 	 */
2310 	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
2311 	remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;
2312 
2313 	while (remaining_dw > 0) {
2314 		u32 num_dw = instr_dw(*dw);
2315 
2316 		if (num_dw > remaining_dw)
2317 			num_dw = remaining_dw;
2318 
2319 		switch (*dw & XE_INSTR_CMD_TYPE) {
2320 		case XE_INSTR_MI:
2321 			switch (*dw & MI_OPCODE) {
2322 			case MI_BATCH_BUFFER_END:
2323 				/* End of LRC; register not found */
2324 				return -ENOENT;
2325 
2326 			case MI_NOOP:
2327 			case MI_TOPOLOGY_FILTER:
2328 				/*
2329 				 * MI_NOOP and MI_TOPOLOGY_FILTER don't have
2330 				 * a length field and are always 1-dword
2331 				 * instructions.
2332 				 */
2333 				remaining_dw--;
2334 				dw++;
2335 				break;
2336 
2337 			case MI_LOAD_REGISTER_IMM:
2338 				ret = lookup_reg_in_mi_lri(offset, value,
2339 							   dw + 1, (num_dw - 1) / 2);
2340 				if (ret == 0)
2341 					return 0;
2342 
2343 				fallthrough;
2344 
2345 			default:
2346 				/*
2347 				 * Jump to next instruction based on length
2348 				 * field.
2349 				 */
2350 				remaining_dw -= num_dw;
2351 				dw += num_dw;
2352 				break;
2353 			}
2354 			break;
2355 
2356 		default:
2357 			/* Jump to next instruction based on length field. */
2358 			remaining_dw -= num_dw;
2359 			dw += num_dw;
2360 		}
2361 	}
2362 
2363 	return -ENOENT;
2364 }
2365 
/*
 * One entry of a table of state instructions to place in an LRC.
 *
 * @instr is the instruction header dword and @num_dw is the total size of
 * the instruction in dwords, header included.
 */
struct instr_state {
	u32 instr;
	u16 num_dw;
};
2370 
/*
 * State instructions emitted by xe_lrc_emit_hwe_state_instructions() for
 * render engines when Wa_14019789679 applies.  Each entry pairs an
 * instruction header with the number of dwords reserved for it.
 */
static const struct instr_state xe_hpg_svg_state[] = {
	{ .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 },
	{ .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_VS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_CLIP, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SF, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_RASTER, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_HS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 },
	{ .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TE, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_GS, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
};
2423 
2424 u32 *xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, u32 *cs)
2425 {
2426 	struct xe_gt *gt = q->hwe->gt;
2427 	struct xe_device *xe = gt_to_xe(gt);
2428 	const struct instr_state *state_table = NULL;
2429 	int state_table_size = 0;
2430 
2431 	/*
2432 	 * Wa_14019789679
2433 	 *
2434 	 * If the driver doesn't explicitly emit the SVG instructions while
2435 	 * setting up the default LRC, the context switch will write 0's
2436 	 * (noops) into the LRC memory rather than the expected instruction
2437 	 * headers.  Application contexts start out as a copy of the default
2438 	 * LRC, and if they also do not emit specific settings for some SVG
2439 	 * state, then on context restore they'll unintentionally inherit
2440 	 * whatever state setting the previous context had programmed into the
2441 	 * hardware (i.e., the lack of a 3DSTATE_* instruction in the LRC will
2442 	 * prevent the hardware from resetting that state back to any specific
2443 	 * value).
2444 	 *
2445 	 * The official workaround only requires emitting 3DSTATE_MESH_CONTROL
2446 	 * since that's a specific state setting that can easily cause GPU
2447 	 * hangs if unintentionally inherited.  However to be safe we'll
2448 	 * continue to emit all of the SVG state since it's best not to leak
2449 	 * any of the state between contexts, even if that leakage is harmless.
2450 	 */
2451 	if (XE_GT_WA(gt, 14019789679) && q->hwe->class == XE_ENGINE_CLASS_RENDER) {
2452 		state_table = xe_hpg_svg_state;
2453 		state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
2454 	}
2455 
2456 	if (!state_table) {
2457 		xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
2458 			  GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
2459 		return cs;
2460 	}
2461 
2462 	for (int i = 0; i < state_table_size; i++) {
2463 		u32 instr = state_table[i].instr;
2464 		u16 num_dw = state_table[i].num_dw;
2465 		bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);
2466 
2467 		xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
2468 		xe_gt_assert(gt, num_dw != 0);
2469 		xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));
2470 
2471 		/*
2472 		 * Xe2's SVG context is the same as the one on DG2 / MTL
2473 		 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
2474 		 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
2475 		 * Just make the replacement here rather than defining a
2476 		 * whole separate table for the single trivial change.
2477 		 */
2478 		if (GRAPHICS_VER(xe) >= 20 &&
2479 		    instr == CMD_3DSTATE_DRAWING_RECTANGLE)
2480 			instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;
2481 
2482 		*cs = instr;
2483 		if (!is_single_dw)
2484 			*cs |= (num_dw - 2);
2485 
2486 		cs += num_dw;
2487 	}
2488 
2489 	return cs;
2490 }
2491 
2492 struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
2493 {
2494 	struct xe_lrc_snapshot *snapshot = kmalloc_obj(*snapshot, GFP_NOWAIT);
2495 
2496 	if (!snapshot)
2497 		return NULL;
2498 
2499 	snapshot->context_desc = xe_lrc_ggtt_addr(lrc);
2500 	snapshot->ring_addr = __xe_lrc_ring_ggtt_addr(lrc);
2501 	snapshot->indirect_context_desc = xe_lrc_indirect_ring_ggtt_addr(lrc);
2502 	snapshot->head = xe_lrc_ring_head(lrc);
2503 	snapshot->tail.internal = lrc->ring.tail;
2504 	snapshot->tail.memory = xe_lrc_ring_tail(lrc);
2505 	snapshot->start = xe_lrc_ring_start(lrc);
2506 	snapshot->start_seqno = xe_lrc_start_seqno(lrc);
2507 	snapshot->seqno = xe_lrc_seqno(lrc);
2508 	snapshot->lrc_bo = xe_bo_get(lrc->bo);
2509 	snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
2510 	snapshot->lrc_size = lrc->size;
2511 	snapshot->replay_offset = 0;
2512 	snapshot->replay_size = lrc->replay_size;
2513 	snapshot->lrc_snapshot = NULL;
2514 	snapshot->ctx_timestamp = xe_lrc_ctx_timestamp(lrc);
2515 	snapshot->ctx_timestamp_ms =
2516 		xe_gt_clock_interval_to_ms(lrc->gt, xe_lrc_ctx_timestamp(lrc));
2517 	if (xe_lrc_is_multi_queue(lrc)) {
2518 		snapshot->queue_timestamp = xe_lrc_queue_timestamp(lrc);
2519 		snapshot->queue_timestamp_ms =
2520 			xe_gt_clock_interval_to_ms(lrc->gt, snapshot->queue_timestamp);
2521 	} else {
2522 		snapshot->queue_timestamp = 0;
2523 		snapshot->queue_timestamp_ms = 0;
2524 	}
2525 	snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc);
2526 	return snapshot;
2527 }
2528 
2529 void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot)
2530 {
2531 	struct xe_bo *bo;
2532 	struct iosys_map src;
2533 
2534 	if (!snapshot)
2535 		return;
2536 
2537 	bo = snapshot->lrc_bo;
2538 	snapshot->lrc_bo = NULL;
2539 
2540 	snapshot->lrc_snapshot = kvmalloc(snapshot->lrc_size, GFP_KERNEL);
2541 	if (!snapshot->lrc_snapshot)
2542 		goto put_bo;
2543 
2544 	xe_bo_lock(bo, false);
2545 	if (!ttm_bo_vmap(&bo->ttm, &src)) {
2546 		xe_map_memcpy_from(xe_bo_device(bo),
2547 				   snapshot->lrc_snapshot, &src, snapshot->lrc_offset,
2548 				   snapshot->lrc_size);
2549 		ttm_bo_vunmap(&bo->ttm, &src);
2550 	} else {
2551 		kvfree(snapshot->lrc_snapshot);
2552 		snapshot->lrc_snapshot = NULL;
2553 	}
2554 	xe_bo_unlock(bo);
2555 put_bo:
2556 	xe_bo_put(bo);
2557 }
2558 
2559 void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer *p)
2560 {
2561 	unsigned long i;
2562 
2563 	if (!snapshot)
2564 		return;
2565 
2566 	drm_printf(p, "\tHW Context Desc: 0x%08x\n", snapshot->context_desc);
2567 	drm_printf(p, "\tHW Ring address: 0x%08x\n",
2568 		   snapshot->ring_addr);
2569 	drm_printf(p, "\tHW Indirect Ring State: 0x%08x\n",
2570 		   snapshot->indirect_context_desc);
2571 	drm_printf(p, "\tLRC Head: (memory) %u\n", snapshot->head);
2572 	drm_printf(p, "\tLRC Tail: (internal) %u, (memory) %u\n",
2573 		   snapshot->tail.internal, snapshot->tail.memory);
2574 	drm_printf(p, "\tRing start: (memory) 0x%08x\n", snapshot->start);
2575 	drm_printf(p, "\tStart seqno: (memory) %d\n", snapshot->start_seqno);
2576 	drm_printf(p, "\tSeqno: (memory) %d\n", snapshot->seqno);
2577 	drm_printf(p, "\tTimestamp: 0x%016llx\n", snapshot->ctx_timestamp);
2578 	drm_printf(p, "\tTimestamp ms: %llu\n", snapshot->ctx_timestamp_ms);
2579 	drm_printf(p, "\tQueue Timestamp: 0x%016llx\n", snapshot->queue_timestamp);
2580 	drm_printf(p, "\tQueue Timestamp ms: %llu\n", snapshot->queue_timestamp_ms);
2581 	drm_printf(p, "\tJob Timestamp: 0x%08x\n", snapshot->ctx_job_timestamp);
2582 
2583 	if (!snapshot->lrc_snapshot)
2584 		return;
2585 
2586 	drm_printf(p, "\t[HWSP].length: 0x%x\n", LRC_PPHWSP_SIZE);
2587 	drm_puts(p, "\t[HWSP].data: ");
2588 	for (i = 0; i < LRC_PPHWSP_SIZE; i += sizeof(u32)) {
2589 		u32 *val = snapshot->lrc_snapshot + i;
2590 		char dumped[ASCII85_BUFSZ];
2591 
2592 		drm_puts(p, ascii85_encode(*val, dumped));
2593 	}
2594 
2595 	drm_printf(p, "\n\t[HWCTX].length: 0x%lx\n", snapshot->lrc_size - LRC_PPHWSP_SIZE);
2596 	drm_printf(p, "\n\t[HWCTX].replay_offset: 0x%lx\n", snapshot->replay_offset);
2597 	drm_printf(p, "\n\t[HWCTX].replay_length: 0x%lx\n", snapshot->replay_size);
2598 
2599 	drm_puts(p, "\t[HWCTX].data: ");
2600 	for (; i < snapshot->lrc_size; i += sizeof(u32)) {
2601 		u32 *val = snapshot->lrc_snapshot + i;
2602 		char dumped[ASCII85_BUFSZ];
2603 
2604 		drm_puts(p, ascii85_encode(*val, dumped));
2605 	}
2606 	drm_puts(p, "\n");
2607 }
2608 
2609 void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
2610 {
2611 	if (!snapshot)
2612 		return;
2613 
2614 	kvfree(snapshot->lrc_snapshot);
2615 	if (snapshot->lrc_bo)
2616 		xe_bo_put(snapshot->lrc_bo);
2617 
2618 	kfree(snapshot);
2619 }
2620 
2621 static struct xe_hw_engine *engine_id_to_hwe(struct xe_gt *gt, u32 engine_id)
2622 {
2623 	u16 class = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id);
2624 	u16 instance = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id);
2625 	struct xe_hw_engine *hwe = xe_gt_hw_engine(gt, class, instance, false);
2626 
2627 	if (xe_gt_WARN_ONCE(gt, !hwe || xe_hw_engine_is_reserved(hwe),
2628 			    "Unexpected engine class:instance %d:%d for utilization\n",
2629 			    class, instance))
2630 		return NULL;
2631 
2632 	return hwe;
2633 }
2634 
2635 static int get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id, u64 *reg_ctx_ts)
2636 {
2637 	struct xe_hw_engine *hwe;
2638 	u64 val;
2639 
2640 	hwe = engine_id_to_hwe(lrc->gt, engine_id);
2641 	if (!hwe)
2642 		return -1;
2643 
2644 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
2645 		val = xe_mmio_read64_2x32(&hwe->gt->mmio,
2646 					  RING_CTX_TIMESTAMP(hwe->mmio_base));
2647 	else
2648 		val = xe_mmio_read32(&hwe->gt->mmio,
2649 				     RING_CTX_TIMESTAMP(hwe->mmio_base));
2650 
2651 	*reg_ctx_ts = val;
2652 
2653 	return 0;
2654 }
2655 
2656 static u64 get_queue_timestamp(struct xe_hw_engine *hwe)
2657 {
2658 	return xe_mmio_read64_2x32(&hwe->gt->mmio,
2659 				   RING_QUEUE_TIMESTAMP(hwe->mmio_base));
2660 }
2661 
2662 static u32 get_multi_queue_active_queue_id(struct xe_hw_engine *hwe)
2663 {
2664 	u32 val = xe_mmio_read32(&hwe->gt->mmio,
2665 				 RING_CSMQDEBUG(hwe->mmio_base));
2666 
2667 	return REG_FIELD_GET(CURRENT_ACTIVE_QUEUE_ID_MASK, val);
2668 }
2669 
2670 static bool context_active(struct xe_lrc *lrc)
2671 {
2672 	return xe_lrc_ctx_timestamp(lrc) == CONTEXT_ACTIVE;
2673 }
2674 
2675 static u64 xe_lrc_multi_queue_timestamp(struct xe_lrc *lrc)
2676 {
2677 	struct xe_device *xe = lrc_to_xe(lrc);
2678 	struct xe_lrc *primary_lrc = lrc->multi_queue.primary_lrc;
2679 	struct xe_hw_engine *hwe;
2680 	u64 reg_queue_ts = lrc->queue_timestamp;
2681 
2682 	if (IS_SRIOV_VF(xe))
2683 		return xe_lrc_queue_timestamp(lrc);
2684 
2685 	xe_assert(xe, primary_lrc);
2686 
2687 	/* WA BB populates CONTEXT_ACTIVE cookie for primary context only */
2688 	if (!context_active(primary_lrc))
2689 		return xe_lrc_queue_timestamp(lrc);
2690 
2691 	/* WA BB populates engine id in PPHWSP of primary context only */
2692 	hwe = engine_id_to_hwe(primary_lrc->gt, xe_lrc_engine_id(primary_lrc));
2693 	if (!hwe)
2694 		return xe_lrc_queue_timestamp(lrc);
2695 
2696 	if (get_multi_queue_active_queue_id(hwe) != lrc->multi_queue.pos)
2697 		return xe_lrc_queue_timestamp(lrc);
2698 
2699 	/* queue is active, so store the queue timestamp register */
2700 	reg_queue_ts = get_queue_timestamp(hwe);
2701 
2702 	/* double check queue and primary queue are both still active */
2703 	if (get_multi_queue_active_queue_id(hwe) != lrc->multi_queue.pos ||
2704 	    !context_active(primary_lrc))
2705 		return xe_lrc_queue_timestamp(lrc);
2706 
2707 	return reg_queue_ts;
2708 }
2709 
2710 static u64 xe_lrc_update_multi_queue_timestamp(struct xe_lrc *lrc, u64 *old_ts)
2711 {
2712 	*old_ts = lrc->queue_timestamp;
2713 	lrc->queue_timestamp = xe_lrc_multi_queue_timestamp(lrc);
2714 
2715 	trace_xe_lrc_update_queue_timestamp(lrc, *old_ts);
2716 
2717 	return lrc->queue_timestamp;
2718 }
2719 
2720 static u64 xe_lrc_context_timestamp(struct xe_lrc *lrc)
2721 {
2722 	u64 reg_ts, new_ts = lrc->ctx_timestamp;
2723 
2724 	/* CTX_TIMESTAMP mmio read is invalid on VF, so return the LRC value */
2725 	if (IS_SRIOV_VF(lrc_to_xe(lrc)))
2726 		return xe_lrc_ctx_timestamp(lrc);
2727 
2728 	if (context_active(lrc) &&
2729 	    !get_ctx_timestamp(lrc, xe_lrc_engine_id(lrc), &reg_ts))
2730 		new_ts = reg_ts;
2731 
2732 	/*
2733 	 * If context swicthed out while we were here, just return the latest
2734 	 * LRC CTX TIMESTAMP value.
2735 	 */
2736 	if (!context_active(lrc))
2737 		return xe_lrc_ctx_timestamp(lrc);
2738 
2739 	return new_ts;
2740 }
2741 
2742 static u64 xe_lrc_update_context_timestamp(struct xe_lrc *lrc, u64 *old_ts)
2743 {
2744 	*old_ts = lrc->ctx_timestamp;
2745 	lrc->ctx_timestamp = xe_lrc_context_timestamp(lrc);
2746 
2747 	trace_xe_lrc_update_timestamp(lrc, *old_ts);
2748 
2749 	return lrc->ctx_timestamp;
2750 }
2751 
2752 /**
2753  * xe_lrc_timestamp() - Current lrc timestamp
2754  * @lrc: Pointer to the lrc.
2755  *
2756  * Return latest lrc timestamp. With support for active contexts/queues, the
2757  * calculation may be slightly racy, so follow a read-again logic to ensure that
2758  * the context/queue is still active before returning the right timestamp.
2759  *
2760  * Returns: New lrc timestamp value
2761  */
2762 u64 xe_lrc_timestamp(struct xe_lrc *lrc)
2763 {
2764 	if (xe_lrc_is_multi_queue(lrc))
2765 		return xe_lrc_multi_queue_timestamp(lrc);
2766 	else
2767 		return xe_lrc_context_timestamp(lrc);
2768 }
2769 
2770 /**
2771  * xe_lrc_update_timestamp() - Update lrc timestamp
2772  * @lrc: Pointer to the lrc.
2773  * @old_ts: Old timestamp value
2774  *
2775  * Populate @old_ts with current saved lrc timestamp, read new lrc timestamp and
2776  * update saved value.
2777  *
2778  * Returns: New lrc timestamp value
2779  */
2780 u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts)
2781 {
2782 	if (xe_lrc_is_multi_queue(lrc))
2783 		return xe_lrc_update_multi_queue_timestamp(lrc, old_ts);
2784 	else
2785 		return xe_lrc_update_context_timestamp(lrc, old_ts);
2786 }
2787 
2788 /**
2789  * xe_lrc_ring_is_idle() - LRC is idle
2790  * @lrc: Pointer to the lrc.
2791  *
2792  * Compare LRC ring head and tail to determine if idle.
2793  *
2794  * Return: True is ring is idle, False otherwise
2795  */
2796 bool xe_lrc_ring_is_idle(struct xe_lrc *lrc)
2797 {
2798 	return xe_lrc_ring_head(lrc) == xe_lrc_ring_tail(lrc);
2799 }
2800