1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5 
6 #include "xe_lrc.h"
7 
8 #include <generated/xe_wa_oob.h>
9 
10 #include <linux/ascii85.h>
11 
12 #include "instructions/xe_mi_commands.h"
13 #include "instructions/xe_gfxpipe_commands.h"
14 #include "instructions/xe_gfx_state_commands.h"
15 #include "regs/xe_engine_regs.h"
16 #include "regs/xe_lrc_layout.h"
17 #include "xe_bb.h"
18 #include "xe_bo.h"
19 #include "xe_device.h"
20 #include "xe_drm_client.h"
21 #include "xe_exec_queue_types.h"
22 #include "xe_gt.h"
23 #include "xe_gt_printk.h"
24 #include "xe_hw_fence.h"
25 #include "xe_map.h"
26 #include "xe_memirq.h"
27 #include "xe_mmio.h"
28 #include "xe_sriov.h"
29 #include "xe_trace_lrc.h"
30 #include "xe_vm.h"
31 #include "xe_wa.h"
32 
33 #define LRC_VALID				BIT_ULL(0)
34 #define LRC_PRIVILEGE				BIT_ULL(8)
35 #define LRC_ADDRESSING_MODE			GENMASK_ULL(4, 3)
36 #define LRC_LEGACY_64B_CONTEXT			3
37 
38 #define LRC_ENGINE_CLASS			GENMASK_ULL(63, 61)
39 #define LRC_ENGINE_INSTANCE			GENMASK_ULL(53, 48)
40 
41 #define LRC_PPHWSP_SIZE				SZ_4K
42 #define LRC_INDIRECT_CTX_BO_SIZE		SZ_4K
43 #define LRC_INDIRECT_RING_STATE_SIZE		SZ_4K
44 
45 /*
46  * Layout of the LRC and associated data allocated as
47  * lrc->bo:
48  *
49  *   Region                       Size
50  *  +============================+=================================+ <- __xe_lrc_ring_offset()
51  *  | Ring                       | ring_size, see                  |
52  *  |                            | xe_lrc_init()                   |
53  *  +============================+=================================+ <- __xe_lrc_pphwsp_offset()
54  *  | PPHWSP (includes SW state) | 4K                              |
55  *  +----------------------------+---------------------------------+ <- __xe_lrc_regs_offset()
56  *  | Engine Context Image       | n * 4K, see                     |
57  *  |                            | xe_gt_lrc_size()                |
58  *  +----------------------------+---------------------------------+ <- __xe_lrc_indirect_ring_offset()
59  *  | Indirect Ring State Page   | 0 or 4k, see                    |
60  *  |                            | XE_LRC_FLAG_INDIRECT_RING_STATE |
61  *  +============================+=================================+ <- __xe_lrc_indirect_ctx_offset()
62  *  | Indirect Context Page      | 0 or 4k, see                    |
63  *  |                            | XE_LRC_FLAG_INDIRECT_CTX        |
64  *  +============================+=================================+ <- __xe_lrc_wa_bb_offset()
65  *  | WA BB Per Ctx              | 4k                              |
66  *  +============================+=================================+ <- xe_bo_size(lrc->bo)
67  */
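
/*
 * Illustrative example of the layout above (hypothetical sizes; the actual
 * values depend on the ring_size passed to xe_lrc_init() and on
 * xe_gt_lrc_size()): with a 16K ring, a 4K PPHWSP plus a 13 * 4K context
 * image, indirect ring state present and no indirect context page, the
 * helpers below resolve to
 *
 *   __xe_lrc_ring_offset()          = 0
 *   xe_lrc_pphwsp_offset()          = 16K            (= ring_size)
 *   xe_lrc_regs_offset()            = 16K + 4K       (+ LRC_PPHWSP_SIZE)
 *   __xe_lrc_indirect_ring_offset() = bo_size - LRC_WA_BB_SIZE -
 *                                     LRC_INDIRECT_RING_STATE_SIZE
 *   __xe_lrc_wa_bb_offset()         = bo_size - LRC_WA_BB_SIZE
 *
 * where bo_size = xe_bo_size(lrc->bo) = ring_size + xe_gt_lrc_size() +
 * LRC_WA_BB_SIZE (plus LRC_INDIRECT_CTX_BO_SIZE when XE_LRC_FLAG_INDIRECT_CTX
 * is set).
 */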
68 
69 static struct xe_device *
70 lrc_to_xe(struct xe_lrc *lrc)
71 {
72 	return gt_to_xe(lrc->fence_ctx.gt);
73 }
74 
75 static bool
76 gt_engine_needs_indirect_ctx(struct xe_gt *gt, enum xe_engine_class class)
77 {
78 	if (XE_GT_WA(gt, 16010904313) &&
79 	    (class == XE_ENGINE_CLASS_RENDER ||
80 	     class == XE_ENGINE_CLASS_COMPUTE))
81 		return true;
82 
83 	return false;
84 }
85 
86 size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
87 {
88 	struct xe_device *xe = gt_to_xe(gt);
89 	size_t size;
90 
91 	/* Per-process HW status page (PPHWSP) */
92 	size = LRC_PPHWSP_SIZE;
93 
94 	/* Engine context image */
95 	switch (class) {
96 	case XE_ENGINE_CLASS_RENDER:
97 		if (GRAPHICS_VER(xe) >= 20)
98 			size += 3 * SZ_4K;
99 		else
100 			size += 13 * SZ_4K;
101 		break;
102 	case XE_ENGINE_CLASS_COMPUTE:
103 		if (GRAPHICS_VER(xe) >= 20)
104 			size += 2 * SZ_4K;
105 		else
106 			size += 13 * SZ_4K;
107 		break;
108 	default:
109 		WARN(1, "Unknown engine class: %d", class);
110 		fallthrough;
111 	case XE_ENGINE_CLASS_COPY:
112 	case XE_ENGINE_CLASS_VIDEO_DECODE:
113 	case XE_ENGINE_CLASS_VIDEO_ENHANCE:
114 	case XE_ENGINE_CLASS_OTHER:
115 		size += 1 * SZ_4K;
116 	}
117 
118 	/* Add indirect ring state page */
119 	if (xe_gt_has_indirect_ring_state(gt))
120 		size += LRC_INDIRECT_RING_STATE_SIZE;
121 
122 	return size;
123 }
124 
125 /*
126  * The per-platform tables are u8-encoded in @data. Decode @data and set the
127  * register offsets and commands in @regs. The following encoding is used
128  * for each byte. There are 2 steps: decoding commands and decoding addresses.
129  *
130  * Commands:
131  * [7]: create NOPs - the number of NOPs is set in the lower bits
132  * [6]: when creating an MI_LOAD_REGISTER_IMM command, allows setting
133  *      MI_LRI_FORCE_POSTED
134  * [5:0]: number of NOPs, or number of registers to set values to in the case
135  *        of MI_LOAD_REGISTER_IMM
136  *
137  * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
138  * number of registers. They are set by using the REG/REG16 macros: the former
139  * is used for offsets smaller than 0x200 while the latter is for values bigger
140  * than that. Those macros already set all the bits documented below correctly:
141  *
142  * [7]: When a register offset needs more than 6 bits, use additional bytes, to
143  *      follow, for the lower bits
144  * [6:0]: Register offset, without considering the engine base.
145  *
146  * This function only tweaks the commands and register offsets. Values are not
147  * filled out.
148  */
149 static void set_offsets(u32 *regs,
150 			const u8 *data,
151 			const struct xe_hw_engine *hwe)
152 #define NOP(x) (BIT(7) | (x))
153 #define LRI(count, flags) ((flags) << 6 | (count) | \
154 			   BUILD_BUG_ON_ZERO(count >= BIT(6)))
155 #define POSTED BIT(0)
156 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
157 #define REG16(x) \
158 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
159 	(((x) >> 2) & 0x7f)
160 {
161 	const u32 base = hwe->mmio_base;
162 
163 	while (*data) {
164 		u8 count, flags;
165 
166 		if (*data & BIT(7)) { /* skip */
167 			count = *data++ & ~BIT(7);
168 			regs += count;
169 			continue;
170 		}
171 
172 		count = *data & 0x3f;
173 		flags = *data >> 6;
174 		data++;
175 
176 		*regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
177 		if (flags & POSTED)
178 			*regs |= MI_LRI_FORCE_POSTED;
179 		*regs |= MI_LRI_LRM_CS_MMIO;
180 		regs++;
181 
182 		xe_gt_assert(hwe->gt, count);
183 		do {
184 			u32 offset = 0;
185 			u8 v;
186 
187 			do {
188 				v = *data++;
189 				offset <<= 7;
190 				offset |= v & ~BIT(7);
191 			} while (v & BIT(7));
192 
193 			regs[0] = base + (offset << 2);
194 			regs += 2;
195 		} while (--count);
196 	}
197 
198 	*regs = MI_BATCH_BUFFER_END | BIT(0);
199 }
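
/*
 * Worked example (illustrative only; this exact sequence does not appear in
 * the tables below): the encoded bytes
 *
 *   LRI(2, POSTED), REG(0x034), REG16(0x244), 0
 *
 * are expanded by set_offsets() into
 *
 *   regs[0] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(2) |
 *             MI_LRI_FORCE_POSTED | MI_LRI_LRM_CS_MMIO;
 *   regs[1] = hwe->mmio_base + 0x034;   (value dword regs[2] left untouched)
 *   regs[3] = hwe->mmio_base + 0x244;   (value dword regs[4] left untouched)
 *
 * REG16(0x244) encodes as two bytes, (0x244 >> 9) | BIT(7) followed by
 * (0x244 >> 2) & 0x7f; the inner loop above reassembles them into 0x91 and
 * shifts the result back left by 2 to recover the 0x244 offset.
 */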
200 
201 static const u8 gen12_xcs_offsets[] = {
202 	NOP(1),
203 	LRI(13, POSTED),
204 	REG16(0x244),
205 	REG(0x034),
206 	REG(0x030),
207 	REG(0x038),
208 	REG(0x03c),
209 	REG(0x168),
210 	REG(0x140),
211 	REG(0x110),
212 	REG(0x1c0),
213 	REG(0x1c4),
214 	REG(0x1c8),
215 	REG(0x180),
216 	REG16(0x2b4),
217 
218 	NOP(5),
219 	LRI(9, POSTED),
220 	REG16(0x3a8),
221 	REG16(0x28c),
222 	REG16(0x288),
223 	REG16(0x284),
224 	REG16(0x280),
225 	REG16(0x27c),
226 	REG16(0x278),
227 	REG16(0x274),
228 	REG16(0x270),
229 
230 	0
231 };
232 
233 static const u8 dg2_xcs_offsets[] = {
234 	NOP(1),
235 	LRI(15, POSTED),
236 	REG16(0x244),
237 	REG(0x034),
238 	REG(0x030),
239 	REG(0x038),
240 	REG(0x03c),
241 	REG(0x168),
242 	REG(0x140),
243 	REG(0x110),
244 	REG(0x1c0),
245 	REG(0x1c4),
246 	REG(0x1c8),
247 	REG(0x180),
248 	REG16(0x2b4),
249 	REG(0x120),
250 	REG(0x124),
251 
252 	NOP(1),
253 	LRI(9, POSTED),
254 	REG16(0x3a8),
255 	REG16(0x28c),
256 	REG16(0x288),
257 	REG16(0x284),
258 	REG16(0x280),
259 	REG16(0x27c),
260 	REG16(0x278),
261 	REG16(0x274),
262 	REG16(0x270),
263 
264 	0
265 };
266 
267 static const u8 gen12_rcs_offsets[] = {
268 	NOP(1),
269 	LRI(13, POSTED),
270 	REG16(0x244),
271 	REG(0x034),
272 	REG(0x030),
273 	REG(0x038),
274 	REG(0x03c),
275 	REG(0x168),
276 	REG(0x140),
277 	REG(0x110),
278 	REG(0x1c0),
279 	REG(0x1c4),
280 	REG(0x1c8),
281 	REG(0x180),
282 	REG16(0x2b4),
283 
284 	NOP(5),
285 	LRI(9, POSTED),
286 	REG16(0x3a8),
287 	REG16(0x28c),
288 	REG16(0x288),
289 	REG16(0x284),
290 	REG16(0x280),
291 	REG16(0x27c),
292 	REG16(0x278),
293 	REG16(0x274),
294 	REG16(0x270),
295 
296 	LRI(3, POSTED),
297 	REG(0x1b0),
298 	REG16(0x5a8),
299 	REG16(0x5ac),
300 
301 	NOP(6),
302 	LRI(1, 0),
303 	REG(0x0c8),
304 	NOP(3 + 9 + 1),
305 
306 	LRI(51, POSTED),
307 	REG16(0x588),
308 	REG16(0x588),
309 	REG16(0x588),
310 	REG16(0x588),
311 	REG16(0x588),
312 	REG16(0x588),
313 	REG(0x028),
314 	REG(0x09c),
315 	REG(0x0c0),
316 	REG(0x178),
317 	REG(0x17c),
318 	REG16(0x358),
319 	REG(0x170),
320 	REG(0x150),
321 	REG(0x154),
322 	REG(0x158),
323 	REG16(0x41c),
324 	REG16(0x600),
325 	REG16(0x604),
326 	REG16(0x608),
327 	REG16(0x60c),
328 	REG16(0x610),
329 	REG16(0x614),
330 	REG16(0x618),
331 	REG16(0x61c),
332 	REG16(0x620),
333 	REG16(0x624),
334 	REG16(0x628),
335 	REG16(0x62c),
336 	REG16(0x630),
337 	REG16(0x634),
338 	REG16(0x638),
339 	REG16(0x63c),
340 	REG16(0x640),
341 	REG16(0x644),
342 	REG16(0x648),
343 	REG16(0x64c),
344 	REG16(0x650),
345 	REG16(0x654),
346 	REG16(0x658),
347 	REG16(0x65c),
348 	REG16(0x660),
349 	REG16(0x664),
350 	REG16(0x668),
351 	REG16(0x66c),
352 	REG16(0x670),
353 	REG16(0x674),
354 	REG16(0x678),
355 	REG16(0x67c),
356 	REG(0x068),
357 	REG(0x084),
358 	NOP(1),
359 
360 	0
361 };
362 
363 static const u8 xehp_rcs_offsets[] = {
364 	NOP(1),
365 	LRI(13, POSTED),
366 	REG16(0x244),
367 	REG(0x034),
368 	REG(0x030),
369 	REG(0x038),
370 	REG(0x03c),
371 	REG(0x168),
372 	REG(0x140),
373 	REG(0x110),
374 	REG(0x1c0),
375 	REG(0x1c4),
376 	REG(0x1c8),
377 	REG(0x180),
378 	REG16(0x2b4),
379 
380 	NOP(5),
381 	LRI(9, POSTED),
382 	REG16(0x3a8),
383 	REG16(0x28c),
384 	REG16(0x288),
385 	REG16(0x284),
386 	REG16(0x280),
387 	REG16(0x27c),
388 	REG16(0x278),
389 	REG16(0x274),
390 	REG16(0x270),
391 
392 	LRI(3, POSTED),
393 	REG(0x1b0),
394 	REG16(0x5a8),
395 	REG16(0x5ac),
396 
397 	NOP(6),
398 	LRI(1, 0),
399 	REG(0x0c8),
400 
401 	0
402 };
403 
404 static const u8 dg2_rcs_offsets[] = {
405 	NOP(1),
406 	LRI(15, POSTED),
407 	REG16(0x244),
408 	REG(0x034),
409 	REG(0x030),
410 	REG(0x038),
411 	REG(0x03c),
412 	REG(0x168),
413 	REG(0x140),
414 	REG(0x110),
415 	REG(0x1c0),
416 	REG(0x1c4),
417 	REG(0x1c8),
418 	REG(0x180),
419 	REG16(0x2b4),
420 	REG(0x120),
421 	REG(0x124),
422 
423 	NOP(1),
424 	LRI(9, POSTED),
425 	REG16(0x3a8),
426 	REG16(0x28c),
427 	REG16(0x288),
428 	REG16(0x284),
429 	REG16(0x280),
430 	REG16(0x27c),
431 	REG16(0x278),
432 	REG16(0x274),
433 	REG16(0x270),
434 
435 	LRI(3, POSTED),
436 	REG(0x1b0),
437 	REG16(0x5a8),
438 	REG16(0x5ac),
439 
440 	NOP(6),
441 	LRI(1, 0),
442 	REG(0x0c8),
443 
444 	0
445 };
446 
447 static const u8 mtl_rcs_offsets[] = {
448 	NOP(1),
449 	LRI(15, POSTED),
450 	REG16(0x244),
451 	REG(0x034),
452 	REG(0x030),
453 	REG(0x038),
454 	REG(0x03c),
455 	REG(0x168),
456 	REG(0x140),
457 	REG(0x110),
458 	REG(0x1c0),
459 	REG(0x1c4),
460 	REG(0x1c8),
461 	REG(0x180),
462 	REG16(0x2b4),
463 	REG(0x120),
464 	REG(0x124),
465 
466 	NOP(1),
467 	LRI(9, POSTED),
468 	REG16(0x3a8),
469 	REG16(0x28c),
470 	REG16(0x288),
471 	REG16(0x284),
472 	REG16(0x280),
473 	REG16(0x27c),
474 	REG16(0x278),
475 	REG16(0x274),
476 	REG16(0x270),
477 
478 	NOP(2),
479 	LRI(2, POSTED),
480 	REG16(0x5a8),
481 	REG16(0x5ac),
482 
483 	NOP(6),
484 	LRI(1, 0),
485 	REG(0x0c8),
486 
487 	0
488 };
489 
490 #define XE2_CTX_COMMON \
491 	NOP(1),                 /* [0x00] */ \
492 	LRI(15, POSTED),        /* [0x01] */ \
493 	REG16(0x244),           /* [0x02] CTXT_SR_CTL */ \
494 	REG(0x034),             /* [0x04] RING_BUFFER_HEAD */ \
495 	REG(0x030),             /* [0x06] RING_BUFFER_TAIL */ \
496 	REG(0x038),             /* [0x08] RING_BUFFER_START */ \
497 	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */ \
498 	REG(0x168),             /* [0x0c] BB_ADDR_UDW */ \
499 	REG(0x140),             /* [0x0e] BB_ADDR */ \
500 	REG(0x110),             /* [0x10] BB_STATE */ \
501 	REG(0x1c0),             /* [0x12] BB_PER_CTX_PTR */ \
502 	REG(0x1c4),             /* [0x14] RCS_INDIRECT_CTX */ \
503 	REG(0x1c8),             /* [0x16] RCS_INDIRECT_CTX_OFFSET */ \
504 	REG(0x180),             /* [0x18] CCID */ \
505 	REG16(0x2b4),           /* [0x1a] SEMAPHORE_TOKEN */ \
506 	REG(0x120),             /* [0x1c] PRT_BB_STATE */ \
507 	REG(0x124),             /* [0x1e] PRT_BB_STATE_UDW */ \
508 	\
509 	NOP(1),                 /* [0x20] */ \
510 	LRI(9, POSTED),         /* [0x21] */ \
511 	REG16(0x3a8),           /* [0x22] CTX_TIMESTAMP */ \
512 	REG16(0x3ac),           /* [0x24] CTX_TIMESTAMP_UDW */ \
513 	REG(0x108),             /* [0x26] INDIRECT_RING_STATE */ \
514 	REG16(0x284),           /* [0x28] dummy reg */ \
515 	REG16(0x280),           /* [0x2a] CS_ACC_CTR_THOLD */ \
516 	REG16(0x27c),           /* [0x2c] CS_CTX_SYS_PASID */ \
517 	REG16(0x278),           /* [0x2e] CS_CTX_ASID */ \
518 	REG16(0x274),           /* [0x30] PTBP_UDW */ \
519 	REG16(0x270)            /* [0x32] PTBP_LDW */
520 
521 static const u8 xe2_rcs_offsets[] = {
522 	XE2_CTX_COMMON,
523 
524 	NOP(2),                 /* [0x34] */
525 	LRI(2, POSTED),         /* [0x36] */
526 	REG16(0x5a8),           /* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
527 	REG16(0x5ac),           /* [0x39] PREEMPTION_STATUS */
528 
529 	NOP(6),                 /* [0x41] */
530 	LRI(1, 0),              /* [0x47] */
531 	REG(0x0c8),             /* [0x48] R_PWR_CLK_STATE */
532 
533 	0
534 };
535 
536 static const u8 xe2_bcs_offsets[] = {
537 	XE2_CTX_COMMON,
538 
539 	NOP(4 + 8 + 1),         /* [0x34] */
540 	LRI(2, POSTED),         /* [0x41] */
541 	REG16(0x200),           /* [0x42] BCS_SWCTRL */
542 	REG16(0x204),           /* [0x44] BLIT_CCTL */
543 
544 	0
545 };
546 
547 static const u8 xe2_xcs_offsets[] = {
548 	XE2_CTX_COMMON,
549 
550 	0
551 };
552 
553 static const u8 xe2_indirect_ring_state_offsets[] = {
554 	NOP(1),                 /* [0x00] */
555 	LRI(5, POSTED),         /* [0x01] */
556 	REG(0x034),             /* [0x02] RING_BUFFER_HEAD */
557 	REG(0x030),             /* [0x04] RING_BUFFER_TAIL */
558 	REG(0x038),             /* [0x06] RING_BUFFER_START */
559 	REG(0x048),             /* [0x08] RING_BUFFER_START_UDW */
560 	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */
561 
562 	NOP(5),                 /* [0x0c] */
563 	LRI(9, POSTED),         /* [0x11] */
564 	REG(0x168),             /* [0x12] BB_ADDR_UDW */
565 	REG(0x140),             /* [0x14] BB_ADDR */
566 	REG(0x110),             /* [0x16] BB_STATE */
567 	REG16(0x588),           /* [0x18] BB_STACK_WRITE_PORT */
568 	REG16(0x588),           /* [0x20] BB_STACK_WRITE_PORT */
569 	REG16(0x588),           /* [0x22] BB_STACK_WRITE_PORT */
570 	REG16(0x588),           /* [0x24] BB_STACK_WRITE_PORT */
571 	REG16(0x588),           /* [0x26] BB_STACK_WRITE_PORT */
572 	REG16(0x588),           /* [0x28] BB_STACK_WRITE_PORT */
573 
574 	NOP(12),                 /* [0x00] */
575 
576 	0
577 };
578 
579 #undef REG16
580 #undef REG
581 #undef LRI
582 #undef NOP
583 
584 static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
585 {
586 	if (class == XE_ENGINE_CLASS_RENDER) {
587 		if (GRAPHICS_VER(xe) >= 20)
588 			return xe2_rcs_offsets;
589 		else if (GRAPHICS_VERx100(xe) >= 1270)
590 			return mtl_rcs_offsets;
591 		else if (GRAPHICS_VERx100(xe) >= 1255)
592 			return dg2_rcs_offsets;
593 		else if (GRAPHICS_VERx100(xe) >= 1250)
594 			return xehp_rcs_offsets;
595 		else
596 			return gen12_rcs_offsets;
597 	} else if (class == XE_ENGINE_CLASS_COPY) {
598 		if (GRAPHICS_VER(xe) >= 20)
599 			return xe2_bcs_offsets;
600 		else
601 			return gen12_xcs_offsets;
602 	} else {
603 		if (GRAPHICS_VER(xe) >= 20)
604 			return xe2_xcs_offsets;
605 		else if (GRAPHICS_VERx100(xe) >= 1255)
606 			return dg2_xcs_offsets;
607 		else
608 			return gen12_xcs_offsets;
609 	}
610 }
611 
612 static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
613 {
614 	regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
615 						       CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
616 
617 	if (xe_gt_has_indirect_ring_state(hwe->gt))
618 		regs[CTX_CONTEXT_CONTROL] |=
619 			_MASKED_BIT_ENABLE(CTX_CTRL_INDIRECT_RING_STATE_ENABLE);
620 }
621 
622 static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe)
623 {
624 	struct xe_memirq *memirq = &gt_to_tile(hwe->gt)->memirq;
625 	struct xe_device *xe = gt_to_xe(hwe->gt);
626 	u8 num_regs;
627 
628 	if (!xe_device_uses_memirq(xe))
629 		return;
630 
631 	regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM |
632 					MI_LRI_LRM_CS_MMIO | MI_LRM_USE_GGTT;
633 	regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr;
634 	regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq);
635 
636 	num_regs = xe_device_has_msix(xe) ? 3 : 2;
637 	regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(num_regs) |
638 				       MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED;
639 	regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr;
640 	regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq, hwe);
641 	regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr;
642 	regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq, hwe);
643 
644 	if (xe_device_has_msix(xe)) {
645 		regs[CTX_CS_INT_VEC_REG] = CS_INT_VEC(0).addr;
646 		/* CTX_CS_INT_VEC_DATA will be set in xe_lrc_init */
647 	}
648 }
649 
650 static int lrc_ring_mi_mode(struct xe_hw_engine *hwe)
651 {
652 	struct xe_device *xe = gt_to_xe(hwe->gt);
653 
654 	if (GRAPHICS_VERx100(xe) >= 1250)
655 		return 0x70;
656 	else
657 		return 0x60;
658 }
659 
660 static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe)
661 {
662 	int x;
663 
664 	x = lrc_ring_mi_mode(hwe);
665 	regs[x + 1] &= ~STOP_RING;
666 	regs[x + 1] |= STOP_RING << 16;
667 }
668 
669 static inline bool xe_lrc_has_indirect_ring_state(struct xe_lrc *lrc)
670 {
671 	return lrc->flags & XE_LRC_FLAG_INDIRECT_RING_STATE;
672 }
673 
674 static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
675 {
676 	return 0;
677 }
678 
679 u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
680 {
681 	return lrc->ring.size;
682 }
683 
684 /* Make the magic macros work */
685 #define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset
686 #define __xe_lrc_regs_offset xe_lrc_regs_offset
687 
688 #define LRC_SEQNO_PPHWSP_OFFSET 512
689 #define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
690 #define LRC_CTX_JOB_TIMESTAMP_OFFSET (LRC_START_SEQNO_PPHWSP_OFFSET + 8)
691 #define LRC_ENGINE_ID_PPHWSP_OFFSET 1024
692 #define LRC_PARALLEL_PPHWSP_OFFSET 2048
693 
694 u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
695 {
696 	return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
697 }
698 
699 /**
700  * xe_lrc_reg_size() - Get size of the LRC registers area within queues
701  * @xe: the &xe_device struct instance
702  *
703  * Returns: Size of the LRC registers area for current platform
704  */
705 size_t xe_lrc_reg_size(struct xe_device *xe)
706 {
707 	if (GRAPHICS_VERx100(xe) >= 1250)
708 		return 96 * sizeof(u32);
709 	else
710 		return 80 * sizeof(u32);
711 }
712 
713 size_t xe_lrc_skip_size(struct xe_device *xe)
714 {
715 	return LRC_PPHWSP_SIZE + xe_lrc_reg_size(xe);
716 }
717 
718 static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
719 {
720 	/* The seqno is stored in the driver-defined portion of PPHWSP */
721 	return xe_lrc_pphwsp_offset(lrc) + LRC_SEQNO_PPHWSP_OFFSET;
722 }
723 
724 static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
725 {
726 	/* The start seqno is stored in the driver-defined portion of PPHWSP */
727 	return xe_lrc_pphwsp_offset(lrc) + LRC_START_SEQNO_PPHWSP_OFFSET;
728 }
729 
730 static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc)
731 {
732 	/* This is stored in the driver-defined portion of PPHWSP */
733 	return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET;
734 }
735 
736 static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
737 {
738 	/* The parallel is stored in the driver-defined portion of PPHWSP */
739 	return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
740 }
741 
742 static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc)
743 {
744 	return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET;
745 }
746 
747 static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
748 {
749 	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
750 }
751 
752 static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc)
753 {
754 	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32);
755 }
756 
757 static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
758 {
759 	u32 offset = xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE -
760 		     LRC_INDIRECT_RING_STATE_SIZE;
761 
762 	if (lrc->flags & XE_LRC_FLAG_INDIRECT_CTX)
763 		offset -= LRC_INDIRECT_CTX_BO_SIZE;
764 
765 	return offset;
766 }
767 
768 static inline u32 __xe_lrc_indirect_ctx_offset(struct xe_lrc *lrc)
769 {
770 	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE - LRC_INDIRECT_CTX_BO_SIZE;
771 }
772 
773 static inline u32 __xe_lrc_wa_bb_offset(struct xe_lrc *lrc)
774 {
775 	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE;
776 }
777 
778 #define DECL_MAP_ADDR_HELPERS(elem) \
779 static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \
780 { \
781 	struct iosys_map map = lrc->bo->vmap; \
782 \
783 	xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map));  \
784 	iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \
785 	return map; \
786 } \
787 static inline u32 __maybe_unused __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \
788 { \
789 	return xe_bo_ggtt_addr(lrc->bo) + __xe_lrc_##elem##_offset(lrc); \
790 } \
791 
792 DECL_MAP_ADDR_HELPERS(ring)
793 DECL_MAP_ADDR_HELPERS(pphwsp)
794 DECL_MAP_ADDR_HELPERS(seqno)
795 DECL_MAP_ADDR_HELPERS(regs)
796 DECL_MAP_ADDR_HELPERS(start_seqno)
797 DECL_MAP_ADDR_HELPERS(ctx_job_timestamp)
798 DECL_MAP_ADDR_HELPERS(ctx_timestamp)
799 DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw)
800 DECL_MAP_ADDR_HELPERS(parallel)
801 DECL_MAP_ADDR_HELPERS(indirect_ring)
802 DECL_MAP_ADDR_HELPERS(engine_id)
803 
804 #undef DECL_MAP_ADDR_HELPERS
805 
806 /**
807  * xe_lrc_ctx_timestamp_ggtt_addr() - Get ctx timestamp GGTT address
808  * @lrc: Pointer to the lrc.
809  *
810  * Returns: ctx timestamp GGTT address
811  */
812 u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc)
813 {
814 	return __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
815 }
816 
817 /**
818  * xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address
819  * @lrc: Pointer to the lrc.
820  *
821  * Returns: ctx timestamp udw GGTT address
822  */
823 u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc)
824 {
825 	return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
826 }
827 
828 /**
829  * xe_lrc_ctx_timestamp() - Read ctx timestamp value
830  * @lrc: Pointer to the lrc.
831  *
832  * Returns: ctx timestamp value
833  */
834 u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
835 {
836 	struct xe_device *xe = lrc_to_xe(lrc);
837 	struct iosys_map map;
838 	u32 ldw, udw = 0;
839 
840 	map = __xe_lrc_ctx_timestamp_map(lrc);
841 	ldw = xe_map_read32(xe, &map);
842 
843 	if (xe->info.has_64bit_timestamp) {
844 		map = __xe_lrc_ctx_timestamp_udw_map(lrc);
845 		udw = xe_map_read32(xe, &map);
846 	}
847 
848 	return (u64)udw << 32 | ldw;
849 }
850 
851 /**
852  * xe_lrc_ctx_job_timestamp_ggtt_addr() - Get ctx job timestamp GGTT address
853  * @lrc: Pointer to the lrc.
854  *
855  * Returns: ctx job timestamp GGTT address
856  */
857 u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc)
858 {
859 	return __xe_lrc_ctx_job_timestamp_ggtt_addr(lrc);
860 }
861 
862 /**
863  * xe_lrc_ctx_job_timestamp() - Read ctx job timestamp value
864  * @lrc: Pointer to the lrc.
865  *
866  * Returns: ctx job timestamp value
867  */
868 u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc)
869 {
870 	struct xe_device *xe = lrc_to_xe(lrc);
871 	struct iosys_map map;
872 
873 	map = __xe_lrc_ctx_job_timestamp_map(lrc);
874 	return xe_map_read32(xe, &map);
875 }
876 
877 u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
878 {
879 	return __xe_lrc_pphwsp_ggtt_addr(lrc);
880 }
881 
882 u32 xe_lrc_indirect_ring_ggtt_addr(struct xe_lrc *lrc)
883 {
884 	if (!xe_lrc_has_indirect_ring_state(lrc))
885 		return 0;
886 
887 	return __xe_lrc_indirect_ring_ggtt_addr(lrc);
888 }
889 
890 static u32 xe_lrc_read_indirect_ctx_reg(struct xe_lrc *lrc, int reg_nr)
891 {
892 	struct xe_device *xe = lrc_to_xe(lrc);
893 	struct iosys_map map;
894 
895 	map = __xe_lrc_indirect_ring_map(lrc);
896 	iosys_map_incr(&map, reg_nr * sizeof(u32));
897 	return xe_map_read32(xe, &map);
898 }
899 
900 static void xe_lrc_write_indirect_ctx_reg(struct xe_lrc *lrc,
901 					  int reg_nr, u32 val)
902 {
903 	struct xe_device *xe = lrc_to_xe(lrc);
904 	struct iosys_map map;
905 
906 	map = __xe_lrc_indirect_ring_map(lrc);
907 	iosys_map_incr(&map, reg_nr * sizeof(u32));
908 	xe_map_write32(xe, &map, val);
909 }
910 
911 u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
912 {
913 	struct xe_device *xe = lrc_to_xe(lrc);
914 	struct iosys_map map;
915 
916 	map = __xe_lrc_regs_map(lrc);
917 	iosys_map_incr(&map, reg_nr * sizeof(u32));
918 	return xe_map_read32(xe, &map);
919 }
920 
921 void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
922 {
923 	struct xe_device *xe = lrc_to_xe(lrc);
924 	struct iosys_map map;
925 
926 	map = __xe_lrc_regs_map(lrc);
927 	iosys_map_incr(&map, reg_nr * sizeof(u32));
928 	xe_map_write32(xe, &map, val);
929 }
930 
931 static void *empty_lrc_data(struct xe_hw_engine *hwe)
932 {
933 	struct xe_gt *gt = hwe->gt;
934 	void *data;
935 	u32 *regs;
936 
937 	data = kzalloc(xe_gt_lrc_size(gt, hwe->class), GFP_KERNEL);
938 	if (!data)
939 		return NULL;
940 
941 	/* 1st page: Per-Process HW Status Page (PPHWSP) */
942 	regs = data + LRC_PPHWSP_SIZE;
943 	set_offsets(regs, reg_offsets(gt_to_xe(gt), hwe->class), hwe);
944 	set_context_control(regs, hwe);
945 	set_memory_based_intr(regs, hwe);
946 	reset_stop_ring(regs, hwe);
947 	if (xe_gt_has_indirect_ring_state(gt)) {
948 		regs = data + xe_gt_lrc_size(gt, hwe->class) -
949 		       LRC_INDIRECT_RING_STATE_SIZE;
950 		set_offsets(regs, xe2_indirect_ring_state_offsets, hwe);
951 	}
952 
953 	return data;
954 }
955 
956 /**
957  * xe_default_lrc_update_memirq_regs_with_address - Re-compute GGTT references in default LRC
958  * of given engine.
959  * @hwe: the &xe_hw_engine struct instance
960  */
961 void xe_default_lrc_update_memirq_regs_with_address(struct xe_hw_engine *hwe)
962 {
963 	struct xe_gt *gt = hwe->gt;
964 	u32 *regs;
965 
966 	if (!gt->default_lrc[hwe->class])
967 		return;
968 
969 	regs = gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE;
970 	set_memory_based_intr(regs, hwe);
971 }
972 
973 /**
974  * xe_lrc_update_memirq_regs_with_address - Re-compute GGTT references in mem interrupt data
975  * for given LRC.
976  * @lrc: the &xe_lrc struct instance
977  * @hwe: the &xe_hw_engine struct instance
978  * @regs: scratch buffer to be used as temporary storage
979  */
980 void xe_lrc_update_memirq_regs_with_address(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
981 					    u32 *regs)
982 {
983 	struct xe_gt *gt = hwe->gt;
984 	struct iosys_map map;
985 	size_t regs_len;
986 
987 	if (!xe_device_uses_memirq(gt_to_xe(gt)))
988 		return;
989 
990 	map = __xe_lrc_regs_map(lrc);
991 	regs_len = xe_lrc_reg_size(gt_to_xe(gt));
992 	xe_map_memcpy_from(gt_to_xe(gt), regs, &map, 0, regs_len);
993 	set_memory_based_intr(regs, hwe);
994 	xe_map_memcpy_to(gt_to_xe(gt), &map, 0, regs, regs_len);
995 }
996 
997 static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
998 {
999 	u64 desc = xe_vm_pdp4_descriptor(vm, gt_to_tile(lrc->gt));
1000 
1001 	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
1002 	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
1003 }
1004 
1005 static void xe_lrc_finish(struct xe_lrc *lrc)
1006 {
1007 	xe_hw_fence_ctx_finish(&lrc->fence_ctx);
1008 	xe_bo_unpin_map_no_vm(lrc->bo);
1009 }
1010 
1011 /*
1012  * setup_utilization_wa() - Write commands to the WA BB to assist
1013  * in calculating active context run ticks.
1014  *
1015  * Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the
1016  * context, but only gets updated when the context switches out. In order to
1017  * check how long a context has been active before it switches out, two things
1018  * are required:
1019  *
1020  * (1) Determine if the context is running:
1021  * To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in
1022  * the LRC. The value chosen is 1 since 0 is the initial value when the LRC is
1023  * initialized. During a query, we just check for this value to determine if the
1024  * context is active. If the context switched out, it would overwrite this
1025  * location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as
1026  * the last part of context restore, so reusing this LRC location will not
1027  * clobber anything.
1028  *
1029  * (2) Calculate the time that the context has been active for:
1030  * The CTX_TIMESTAMP ticks only when the context is active. If a context is
1031  * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization.
1032  * While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific
1033  * engine instance. Since we do not know which instance the context is running
1034  * on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and
1035  * store it in the PPHSWP.
1036  */
1037 #define CONTEXT_ACTIVE 1ULL
1038 static ssize_t setup_utilization_wa(struct xe_lrc *lrc,
1039 				    struct xe_hw_engine *hwe,
1040 				    u32 *batch,
1041 				    size_t max_len)
1042 {
1043 	u32 *cmd = batch;
1044 
1045 	if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
1046 		return -ENOSPC;
1047 
1048 	*cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET;
1049 	*cmd++ = ENGINE_ID(0).addr;
1050 	*cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc);
1051 	*cmd++ = 0;
1052 
1053 	*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
1054 	*cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
1055 	*cmd++ = 0;
1056 	*cmd++ = lower_32_bits(CONTEXT_ACTIVE);
1057 
1058 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp) {
1059 		*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
1060 		*cmd++ = __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
1061 		*cmd++ = 0;
1062 		*cmd++ = upper_32_bits(CONTEXT_ACTIVE);
1063 	}
1064 
1065 	return cmd - batch;
1066 }
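
/*
 * Sketch of how a consumer could use the values written above (simplified and
 * illustrative; the actual accounting lives outside this file):
 *
 *	ts = xe_lrc_ctx_timestamp(lrc);
 *	if (ts == CONTEXT_ACTIVE)
 *		the context has not switched out since its last restore, so
 *		sample the RING_CTX_TIMESTAMP MMIO of the engine recorded in
 *		the PPHWSP engine-id slot for an up-to-date value;
 *	else
 *		use ts directly, as it already holds the run ticks written
 *		back by the hardware on context save.
 */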
1067 
1068 static ssize_t setup_timestamp_wa(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
1069 				  u32 *batch, size_t max_len)
1070 {
1071 	const u32 ts_addr = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
1072 	u32 *cmd = batch;
1073 
1074 	if (!XE_GT_WA(lrc->gt, 16010904313) ||
1075 	    !(hwe->class == XE_ENGINE_CLASS_RENDER ||
1076 	      hwe->class == XE_ENGINE_CLASS_COMPUTE ||
1077 	      hwe->class == XE_ENGINE_CLASS_COPY ||
1078 	      hwe->class == XE_ENGINE_CLASS_VIDEO_DECODE ||
1079 	      hwe->class == XE_ENGINE_CLASS_VIDEO_ENHANCE))
1080 		return 0;
1081 
1082 	if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
1083 		return -ENOSPC;
1084 
1085 	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
1086 		 MI_LRM_ASYNC;
1087 	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
1088 	*cmd++ = ts_addr;
1089 	*cmd++ = 0;
1090 
1091 	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
1092 		 MI_LRM_ASYNC;
1093 	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
1094 	*cmd++ = ts_addr;
1095 	*cmd++ = 0;
1096 
1097 	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO;
1098 	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
1099 	*cmd++ = ts_addr;
1100 	*cmd++ = 0;
1101 
1102 	return cmd - batch;
1103 }
1104 
1105 static ssize_t setup_invalidate_state_cache_wa(struct xe_lrc *lrc,
1106 					       struct xe_hw_engine *hwe,
1107 					       u32 *batch, size_t max_len)
1108 {
1109 	u32 *cmd = batch;
1110 
1111 	if (!XE_GT_WA(lrc->gt, 18022495364) ||
1112 	    hwe->class != XE_ENGINE_CLASS_RENDER)
1113 		return 0;
1114 
1115 	if (xe_gt_WARN_ON(lrc->gt, max_len < 3))
1116 		return -ENOSPC;
1117 
1118 	*cmd++ = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1);
1119 	*cmd++ = CS_DEBUG_MODE1(0).addr;
1120 	*cmd++ = _MASKED_BIT_ENABLE(INSTRUCTION_STATE_CACHE_INVALIDATE);
1121 
1122 	return cmd - batch;
1123 }
1124 
1125 struct bo_setup {
1126 	ssize_t (*setup)(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
1127 			 u32 *batch, size_t max_size);
1128 };
1129 
1130 struct bo_setup_state {
1131 	/* Input: */
1132 	struct xe_lrc		*lrc;
1133 	struct xe_hw_engine	*hwe;
1134 	size_t			max_size;
1135 	size_t                  reserve_dw;
1136 	unsigned int		offset;
1137 	const struct bo_setup	*funcs;
1138 	unsigned int		num_funcs;
1139 
1140 	/* State: */
1141 	u32			*buffer;
1142 	u32			*ptr;
1143 	unsigned int		written;
1144 };
1145 
1146 static int setup_bo(struct bo_setup_state *state)
1147 {
1148 	ssize_t remain;
1149 
1150 	if (state->lrc->bo->vmap.is_iomem) {
1151 		if (!state->buffer)
1152 			return -ENOMEM;
1153 		state->ptr = state->buffer;
1154 	} else {
1155 		state->ptr = state->lrc->bo->vmap.vaddr + state->offset;
1156 	}
1157 
1158 	remain = state->max_size / sizeof(u32);
1159 
1160 	for (size_t i = 0; i < state->num_funcs; i++) {
1161 		ssize_t len = state->funcs[i].setup(state->lrc, state->hwe,
1162 						    state->ptr, remain);
1163 
1164 		remain -= len;
1165 
1166 		/*
1167 		 * Caller has asked for at least reserve_dw to remain unused.
1168 		 */
1169 		if (len < 0 ||
1170 		    xe_gt_WARN_ON(state->lrc->gt, remain < state->reserve_dw))
1171 			goto fail;
1172 
1173 		state->ptr += len;
1174 		state->written += len;
1175 	}
1176 
1177 	return 0;
1178 
1179 fail:
1180 	return -ENOSPC;
1181 }
1182 
1183 static void finish_bo(struct bo_setup_state *state)
1184 {
1185 	if (!state->buffer)
1186 		return;
1187 
1188 	xe_map_memcpy_to(gt_to_xe(state->lrc->gt), &state->lrc->bo->vmap,
1189 			 state->offset, state->buffer,
1190 			 state->written * sizeof(u32));
1191 }
1192 
1193 /**
1194  * xe_lrc_setup_wa_bb_with_scratch - Execute all wa bb setup callbacks.
1195  * @lrc: the &xe_lrc struct instance
1196  * @hwe: the &xe_hw_engine struct instance
1197  * @scratch: preallocated scratch buffer for temporary storage
1198  * Return: 0 on success, negative error code on failure
1199  */
1200 int xe_lrc_setup_wa_bb_with_scratch(struct xe_lrc *lrc, struct xe_hw_engine *hwe, u32 *scratch)
1201 {
1202 	static const struct bo_setup funcs[] = {
1203 		{ .setup = setup_timestamp_wa },
1204 		{ .setup = setup_invalidate_state_cache_wa },
1205 		{ .setup = setup_utilization_wa },
1206 	};
1207 	struct bo_setup_state state = {
1208 		.lrc = lrc,
1209 		.hwe = hwe,
1210 		.max_size = LRC_WA_BB_SIZE,
1211 		.buffer = scratch,
1212 		.reserve_dw = 1,
1213 		.offset = __xe_lrc_wa_bb_offset(lrc),
1214 		.funcs = funcs,
1215 		.num_funcs = ARRAY_SIZE(funcs),
1216 	};
1217 	int ret;
1218 
1219 	ret = setup_bo(&state);
1220 	if (ret)
1221 		return ret;
1222 
1223 	*state.ptr++ = MI_BATCH_BUFFER_END;
1224 	state.written++;
1225 
1226 	finish_bo(&state);
1227 
1228 	xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
1229 			     xe_bo_ggtt_addr(lrc->bo) + state.offset + 1);
1230 
1231 	return 0;
1232 }
1233 
1234 static int setup_wa_bb(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
1235 {
1236 	u32 *buf = NULL;
1237 	int ret;
1238 
1239 	if (lrc->bo->vmap.is_iomem)
1240 		buf = kmalloc(LRC_WA_BB_SIZE, GFP_KERNEL);
1241 
1242 	ret = xe_lrc_setup_wa_bb_with_scratch(lrc, hwe, buf);
1243 
1244 	kfree(buf);
1245 
1246 	return ret;
1247 }
1248 
1249 static int
1250 setup_indirect_ctx(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
1251 {
1252 	static struct bo_setup rcs_funcs[] = {
1253 		{ .setup = setup_timestamp_wa },
1254 	};
1255 	struct bo_setup_state state = {
1256 		.lrc = lrc,
1257 		.hwe = hwe,
1258 		.max_size = (63 * 64) /* max 63 cachelines */,
1259 		.buffer = NULL,
1260 		.offset = __xe_lrc_indirect_ctx_offset(lrc),
1261 	};
1262 	int ret;
1263 
1264 	if (!(lrc->flags & XE_LRC_FLAG_INDIRECT_CTX))
1265 		return 0;
1266 
1267 	if (hwe->class == XE_ENGINE_CLASS_RENDER ||
1268 	    hwe->class == XE_ENGINE_CLASS_COMPUTE) {
1269 		state.funcs = rcs_funcs;
1270 		state.num_funcs = ARRAY_SIZE(rcs_funcs);
1271 	}
1272 
1273 	if (xe_gt_WARN_ON(lrc->gt, !state.funcs))
1274 		return 0;
1275 
1276 	if (lrc->bo->vmap.is_iomem)
1277 		state.buffer = kmalloc(state.max_size, GFP_KERNEL);
1278 
1279 	ret = setup_bo(&state);
1280 	if (ret) {
1281 		kfree(state.buffer);
1282 		return ret;
1283 	}
1284 
1285 	/*
1286 	 * Align to 64B cacheline so there's no garbage at the end for CS to
1287 	 * execute: size for indirect ctx must be a multiple of 64.
1288 	 */
1289 	while (state.written & 0xf) {
1290 		*state.ptr++ = MI_NOOP;
1291 		state.written++;
1292 	}
1293 
1294 	finish_bo(&state);
1295 	kfree(state.buffer);
1296 
1297 	xe_lrc_write_ctx_reg(lrc,
1298 			     CTX_CS_INDIRECT_CTX,
1299 			     (xe_bo_ggtt_addr(lrc->bo) + state.offset) |
1300 			     /* Size in CLs. */
1301 			     (state.written * sizeof(u32) / 64));
1302 	xe_lrc_write_ctx_reg(lrc,
1303 			     CTX_CS_INDIRECT_CTX_OFFSET,
1304 			     CTX_INDIRECT_CTX_OFFSET_DEFAULT);
1305 
1306 	return 0;
1307 }
1308 
1309 static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
1310 		       struct xe_vm *vm, u32 ring_size, u16 msix_vec,
1311 		       u32 init_flags)
1312 {
1313 	struct xe_gt *gt = hwe->gt;
1314 	const u32 lrc_size = xe_gt_lrc_size(gt, hwe->class);
1315 	u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
1316 	struct xe_tile *tile = gt_to_tile(gt);
1317 	struct xe_device *xe = gt_to_xe(gt);
1318 	struct iosys_map map;
1319 	u32 arb_enable;
1320 	u32 bo_flags;
1321 	int err;
1322 
1323 	kref_init(&lrc->refcount);
1324 	lrc->gt = gt;
1325 	lrc->size = lrc_size;
1326 	lrc->flags = 0;
1327 	lrc->ring.size = ring_size;
1328 	lrc->ring.tail = 0;
1329 
1330 	if (gt_engine_needs_indirect_ctx(gt, hwe->class)) {
1331 		lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
1332 		bo_size += LRC_INDIRECT_CTX_BO_SIZE;
1333 	}
1334 
1335 	if (xe_gt_has_indirect_ring_state(gt))
1336 		lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;
1337 
1338 	bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
1339 		   XE_BO_FLAG_GGTT_INVALIDATE;
1340 	if (vm && vm->xef) /* userspace */
1341 		bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE;
1342 
1343 	lrc->bo = xe_bo_create_pin_map(xe, tile, NULL, bo_size,
1344 				       ttm_bo_type_kernel,
1345 				       bo_flags);
1346 	if (IS_ERR(lrc->bo))
1347 		return PTR_ERR(lrc->bo);
1348 
1349 	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
1350 			     hwe->fence_irq, hwe->name);
1351 
1352 	/*
1353 	 * Init the Per-Process HW Status Page (PPHWSP) and the LRC / context
1354 	 * state to known values. If there's already a primed default_lrc, just
1355 	 * copy it; otherwise this is the early submission used to record the
1356 	 * default LRC: build a new empty one from scratch.
1357 	 */
1358 	map = __xe_lrc_pphwsp_map(lrc);
1359 	if (gt->default_lrc[hwe->class]) {
1360 		xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE);	/* PPHWSP */
1361 		xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
1362 				 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
1363 				 lrc_size - LRC_PPHWSP_SIZE);
1364 	} else {
1365 		void *init_data = empty_lrc_data(hwe);
1366 
1367 		if (!init_data) {
1368 			err = -ENOMEM;
1369 			goto err_lrc_finish;
1370 		}
1371 
1372 		xe_map_memcpy_to(xe, &map, 0, init_data, lrc_size);
1373 		kfree(init_data);
1374 	}
1375 
1376 	if (vm) {
1377 		xe_lrc_set_ppgtt(lrc, vm);
1378 
1379 		if (vm->xef)
1380 			xe_drm_client_add_bo(vm->xef->client, lrc->bo);
1381 	}
1382 
1383 	if (xe_device_has_msix(xe)) {
1384 		xe_lrc_write_ctx_reg(lrc, CTX_INT_STATUS_REPORT_PTR,
1385 				     xe_memirq_status_ptr(&tile->memirq, hwe));
1386 		xe_lrc_write_ctx_reg(lrc, CTX_INT_SRC_REPORT_PTR,
1387 				     xe_memirq_source_ptr(&tile->memirq, hwe));
1388 		xe_lrc_write_ctx_reg(lrc, CTX_CS_INT_VEC_DATA, msix_vec << 16 | msix_vec);
1389 	}
1390 
1391 	if (xe_gt_has_indirect_ring_state(gt)) {
1392 		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
1393 				     __xe_lrc_indirect_ring_ggtt_addr(lrc));
1394 
1395 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
1396 					      __xe_lrc_ring_ggtt_addr(lrc));
1397 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START_UDW, 0);
1398 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, 0);
1399 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, lrc->ring.tail);
1400 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_CTL,
1401 					      RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
1402 	} else {
1403 		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
1404 		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, 0);
1405 		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);
1406 		xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
1407 				     RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
1408 	}
1409 
1410 	if (init_flags & XE_LRC_CREATE_RUNALONE)
1411 		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
1412 				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
1413 				     _MASKED_BIT_ENABLE(CTX_CTRL_RUN_ALONE));
1414 
1415 	if (init_flags & XE_LRC_CREATE_PXP)
1416 		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
1417 				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
1418 				     _MASKED_BIT_ENABLE(CTX_CTRL_PXP_ENABLE));
1419 
1420 	lrc->ctx_timestamp = 0;
1421 	xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0);
1422 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
1423 		xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP_UDW, 0);
1424 
1425 	if (xe->info.has_asid && vm)
1426 		xe_lrc_write_ctx_reg(lrc, CTX_ASID, vm->usm.asid);
1427 
1428 	lrc->desc = LRC_VALID;
1429 	lrc->desc |= FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT);
1430 	/* TODO: Priority */
1431 
1432 	/* While this appears to have something about privileged batches or
1433 	 * some such, it really just means PPGTT mode.
1434 	 */
1435 	if (vm)
1436 		lrc->desc |= LRC_PRIVILEGE;
1437 
1438 	if (GRAPHICS_VERx100(xe) < 1250) {
1439 		lrc->desc |= FIELD_PREP(LRC_ENGINE_INSTANCE, hwe->instance);
1440 		lrc->desc |= FIELD_PREP(LRC_ENGINE_CLASS, hwe->class);
1441 	}
1442 
1443 	arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1444 	xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));
1445 
1446 	map = __xe_lrc_seqno_map(lrc);
1447 	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
1448 
1449 	map = __xe_lrc_start_seqno_map(lrc);
1450 	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
1451 
1452 	err = setup_wa_bb(lrc, hwe);
1453 	if (err)
1454 		goto err_lrc_finish;
1455 
1456 	err = setup_indirect_ctx(lrc, hwe);
1457 	if (err)
1458 		goto err_lrc_finish;
1459 
1460 	return 0;
1461 
1462 err_lrc_finish:
1463 	xe_lrc_finish(lrc);
1464 	return err;
1465 }
1466 
1467 /**
1468  * xe_lrc_create - Create a LRC
1469  * @hwe: Hardware Engine
1470  * @vm: The VM (address space)
1471  * @ring_size: LRC ring size
1472  * @msix_vec: MSI-X interrupt vector (for platforms that support it)
1473  * @flags: LRC initialization flags
1474  *
1475  * Allocate and initialize the Logical Ring Context (LRC).
1476  *
1477  * Return pointer to created LRC upon success and an error pointer
1478  * upon failure.
1479  */
1480 struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
1481 			     u32 ring_size, u16 msix_vec, u32 flags)
1482 {
1483 	struct xe_lrc *lrc;
1484 	int err;
1485 
1486 	lrc = kzalloc(sizeof(*lrc), GFP_KERNEL);
1487 	if (!lrc)
1488 		return ERR_PTR(-ENOMEM);
1489 
1490 	err = xe_lrc_init(lrc, hwe, vm, ring_size, msix_vec, flags);
1491 	if (err) {
1492 		kfree(lrc);
1493 		return ERR_PTR(err);
1494 	}
1495 
1496 	return lrc;
1497 }
1498 
1499 /**
1500  * xe_lrc_destroy - Destroy the LRC
1501  * @ref: reference to LRC
1502  *
1503  * Called when ref == 0, release resources held by the Logical Ring Context
1504  * (LRC) and free the LRC memory.
1505  */
1506 void xe_lrc_destroy(struct kref *ref)
1507 {
1508 	struct xe_lrc *lrc = container_of(ref, struct xe_lrc, refcount);
1509 
1510 	xe_lrc_finish(lrc);
1511 	kfree(lrc);
1512 }
1513 
1514 /**
1515  * xe_lrc_update_hwctx_regs_with_address - Re-compute GGTT references within given LRC.
1516  * @lrc: the &xe_lrc struct instance
1517  */
1518 void xe_lrc_update_hwctx_regs_with_address(struct xe_lrc *lrc)
1519 {
1520 	if (xe_lrc_has_indirect_ring_state(lrc)) {
1521 		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
1522 				     __xe_lrc_indirect_ring_ggtt_addr(lrc));
1523 
1524 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
1525 					      __xe_lrc_ring_ggtt_addr(lrc));
1526 	} else {
1527 		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
1528 	}
1529 }
1530 
1531 void xe_lrc_set_ring_tail(struct xe_lrc *lrc, u32 tail)
1532 {
1533 	if (xe_lrc_has_indirect_ring_state(lrc))
1534 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, tail);
1535 	else
1536 		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, tail);
1537 }
1538 
1539 u32 xe_lrc_ring_tail(struct xe_lrc *lrc)
1540 {
1541 	if (xe_lrc_has_indirect_ring_state(lrc))
1542 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL) & TAIL_ADDR;
1543 	else
1544 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL) & TAIL_ADDR;
1545 }
1546 
1547 static u32 xe_lrc_ring_start(struct xe_lrc *lrc)
1548 {
1549 	if (xe_lrc_has_indirect_ring_state(lrc))
1550 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START);
1551 	else
1552 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_START);
1553 }
1554 
1555 void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
1556 {
1557 	if (xe_lrc_has_indirect_ring_state(lrc))
1558 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, head);
1559 	else
1560 		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
1561 }
1562 
1563 u32 xe_lrc_ring_head(struct xe_lrc *lrc)
1564 {
1565 	if (xe_lrc_has_indirect_ring_state(lrc))
1566 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD) & HEAD_ADDR;
1567 	else
1568 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
1569 }
1570 
1571 u32 xe_lrc_ring_space(struct xe_lrc *lrc)
1572 {
1573 	const u32 head = xe_lrc_ring_head(lrc);
1574 	const u32 tail = lrc->ring.tail;
1575 	const u32 size = lrc->ring.size;
1576 
1577 	return ((head - tail - 1) & (size - 1)) + 1;
1578 }
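
/*
 * Example of the ring-space calculation above (hypothetical values): since
 * the ring size is a power of two, size - 1 acts as a wrap mask. With
 * size = 0x4000, head = 0x80 and tail = 0x100 (the tail has wrapped past
 * the head):
 *
 *   ((0x80 - 0x100 - 1) & 0x3fff) + 1 = 0x3f7f + 1 = 0x3f80
 *
 * i.e. the bytes from the tail to the end of the ring plus the bytes from
 * the start of the ring up to (but not including) the head.
 */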
1579 
1580 static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
1581 				const void *data, size_t size)
1582 {
1583 	struct xe_device *xe = lrc_to_xe(lrc);
1584 
1585 	iosys_map_incr(&ring, lrc->ring.tail);
1586 	xe_map_memcpy_to(xe, &ring, 0, data, size);
1587 	lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
1588 }
1589 
1590 void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
1591 {
1592 	struct xe_device *xe = lrc_to_xe(lrc);
1593 	struct iosys_map ring;
1594 	u32 rhs;
1595 	size_t aligned_size;
1596 
1597 	xe_assert(xe, IS_ALIGNED(size, 4));
1598 	aligned_size = ALIGN(size, 8);
1599 
1600 	ring = __xe_lrc_ring_map(lrc);
1601 
1602 	xe_assert(xe, lrc->ring.tail < lrc->ring.size);
1603 	rhs = lrc->ring.size - lrc->ring.tail;
1604 	if (size > rhs) {
1605 		__xe_lrc_write_ring(lrc, ring, data, rhs);
1606 		__xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
1607 	} else {
1608 		__xe_lrc_write_ring(lrc, ring, data, size);
1609 	}
1610 
1611 	if (aligned_size > size) {
1612 		u32 noop = MI_NOOP;
1613 
1614 		__xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
1615 	}
1616 }
1617 
1618 u64 xe_lrc_descriptor(struct xe_lrc *lrc)
1619 {
1620 	return lrc->desc | xe_lrc_ggtt_addr(lrc);
1621 }
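
/*
 * Illustration (hypothetical address): for a VM-bound LRC on a pre-12.50
 * platform whose PPHWSP sits at GGTT address 0x00800000, xe_lrc_init() has
 * prepared lrc->desc as
 *
 *   LRC_VALID | FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT) |
 *   LRC_PRIVILEGE | FIELD_PREP(LRC_ENGINE_CLASS, hwe->class) |
 *   FIELD_PREP(LRC_ENGINE_INSTANCE, hwe->instance)
 *
 * and the descriptor returned by xe_lrc_descriptor() is simply that value
 * OR'ed with 0x00800000, the xe_lrc_ggtt_addr() of the PPHWSP.
 */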
1622 
1623 u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
1624 {
1625 	return __xe_lrc_seqno_ggtt_addr(lrc);
1626 }
1627 
1628 /**
1629  * xe_lrc_alloc_seqno_fence() - Allocate an lrc seqno fence.
1630  *
1631  * Allocate but don't initialize an lrc seqno fence.
1632  *
1633  * Return: Pointer to the allocated fence or
1634  * negative error pointer on error.
1635  */
1636 struct dma_fence *xe_lrc_alloc_seqno_fence(void)
1637 {
1638 	return xe_hw_fence_alloc();
1639 }
1640 
1641 /**
1642  * xe_lrc_free_seqno_fence() - Free an lrc seqno fence.
1643  * @fence: Pointer to the fence to free.
1644  *
1645  * Frees an lrc seqno fence that hasn't yet been
1646  * initialized.
1647  */
1648 void xe_lrc_free_seqno_fence(struct dma_fence *fence)
1649 {
1650 	xe_hw_fence_free(fence);
1651 }
1652 
1653 /**
1654  * xe_lrc_init_seqno_fence() - Initialize an lrc seqno fence.
1655  * @lrc: Pointer to the lrc.
1656  * @fence: Pointer to the fence to initialize.
1657  *
1658  * Initializes a pre-allocated lrc seqno fence.
1659  * After initialization, the fence is subject to normal
1660  * dma-fence refcounting.
1661  */
1662 void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence)
1663 {
1664 	xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc));
1665 }
1666 
1667 s32 xe_lrc_seqno(struct xe_lrc *lrc)
1668 {
1669 	struct iosys_map map = __xe_lrc_seqno_map(lrc);
1670 
1671 	return xe_map_read32(lrc_to_xe(lrc), &map);
1672 }
1673 
1674 s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
1675 {
1676 	struct iosys_map map = __xe_lrc_start_seqno_map(lrc);
1677 
1678 	return xe_map_read32(lrc_to_xe(lrc), &map);
1679 }
1680 
1681 u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
1682 {
1683 	return __xe_lrc_start_seqno_ggtt_addr(lrc);
1684 }
1685 
1686 u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
1687 {
1688 	return __xe_lrc_parallel_ggtt_addr(lrc);
1689 }
1690 
1691 struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
1692 {
1693 	return __xe_lrc_parallel_map(lrc);
1694 }
1695 
1696 /**
1697  * xe_lrc_engine_id() - Read engine id value
1698  * @lrc: Pointer to the lrc.
1699  *
1700  * Returns: engine id value
1701  */
1702 static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
1703 {
1704 	struct xe_device *xe = lrc_to_xe(lrc);
1705 	struct iosys_map map;
1706 
1707 	map = __xe_lrc_engine_id_map(lrc);
1708 	return xe_map_read32(xe, &map);
1709 }
1710 
1711 static int instr_dw(u32 cmd_header)
1712 {
1713 	/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
1714 	if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
1715 	    GFXPIPE_SINGLE_DW_CMD(0, 0))
1716 		return 1;
1717 
1718 	/* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
1719 	if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
1720 		return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;
1721 
1722 	/* Most instructions have the # of dwords (minus 2) in 7:0 */
1723 	return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
1724 }
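
/*
 * Example (illustrative): a header whose XE_INSTR_LEN_MASK field reads 5
 * describes a 7-dword instruction (the encoded value is the dword count
 * minus two), while a GFXPIPE "SINGLE_DW" opcode is always a single dword
 * and 3DSTATE_SO_DECL_LIST uses its wider 9-bit length field instead.
 */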
1725 
1726 static int dump_mi_command(struct drm_printer *p,
1727 			   struct xe_gt *gt,
1728 			   u32 *dw,
1729 			   int remaining_dw)
1730 {
1731 	u32 inst_header = *dw;
1732 	u32 numdw = instr_dw(inst_header);
1733 	u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
1734 	int num_noop;
1735 
1736 	/* First check for commands that don't have/use a '# DW' field */
1737 	switch (inst_header & MI_OPCODE) {
1738 	case MI_NOOP:
1739 		num_noop = 1;
1740 		while (num_noop < remaining_dw &&
1741 		       (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
1742 			num_noop++;
1743 		drm_printf(p, "[%#010x] MI_NOOP (%d dwords)\n", inst_header, num_noop);
1744 		return num_noop;
1745 
1746 	case MI_TOPOLOGY_FILTER:
1747 		drm_printf(p, "[%#010x] MI_TOPOLOGY_FILTER\n", inst_header);
1748 		return 1;
1749 
1750 	case MI_BATCH_BUFFER_END:
1751 		drm_printf(p, "[%#010x] MI_BATCH_BUFFER_END\n", inst_header);
1752 		/* Return 'remaining_dw' to consume the rest of the LRC */
1753 		return remaining_dw;
1754 	}
1755 
1756 	/*
1757 	 * Any remaining commands include a # of dwords.  We should make sure
1758 	 * it doesn't exceed the remaining size of the LRC.
1759 	 */
1760 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
1761 		numdw = remaining_dw;
1762 
1763 	switch (inst_header & MI_OPCODE) {
1764 	case MI_LOAD_REGISTER_IMM:
1765 		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
1766 			   inst_header, (numdw - 1) / 2);
1767 		for (int i = 1; i < numdw; i += 2)
1768 			drm_printf(p, " - %#6x = %#010x\n", dw[i], dw[i + 1]);
1769 		return numdw;
1770 
1771 	case MI_LOAD_REGISTER_MEM & MI_OPCODE:
1772 		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_MEM: %s%s\n",
1773 			   inst_header,
1774 			   dw[0] & MI_LRI_LRM_CS_MMIO ? "CS_MMIO " : "",
1775 			   dw[0] & MI_LRM_USE_GGTT ? "USE_GGTT " : "");
1776 		if (numdw == 4)
1777 			drm_printf(p, " - %#6x = %#010llx\n",
1778 				   dw[1], ((u64)(dw[3]) << 32 | (u64)(dw[2])));
1779 		else
1780 			drm_printf(p, " - %*ph (%s)\n",
1781 				   (int)sizeof(u32) * (numdw - 1), dw + 1,
1782 				   numdw < 4 ? "truncated" : "malformed");
1783 		return numdw;
1784 
1785 	case MI_FORCE_WAKEUP:
1786 		drm_printf(p, "[%#010x] MI_FORCE_WAKEUP\n", inst_header);
1787 		return numdw;
1788 
1789 	default:
1790 		drm_printf(p, "[%#010x] unknown MI opcode %#x, likely %d dwords\n",
1791 			   inst_header, opcode, numdw);
1792 		return numdw;
1793 	}
1794 }
1795 
1796 static int dump_gfxpipe_command(struct drm_printer *p,
1797 				struct xe_gt *gt,
1798 				u32 *dw,
1799 				int remaining_dw)
1800 {
1801 	u32 numdw = instr_dw(*dw);
1802 	u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
1803 	u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
1804 	u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);
1805 
1806 	/*
1807 	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
1808 	 * remaining size of the LRC.
1809 	 */
1810 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
1811 		numdw = remaining_dw;
1812 
1813 	switch (*dw & GFXPIPE_MATCH_MASK) {
1814 #define MATCH(cmd) \
1815 	case cmd: \
1816 		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
1817 		return numdw
1818 #define MATCH3D(cmd) \
1819 	case CMD_##cmd: \
1820 		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
1821 		return numdw
1822 
1823 	MATCH(STATE_BASE_ADDRESS);
1824 	MATCH(STATE_SIP);
1825 	MATCH(GPGPU_CSR_BASE_ADDRESS);
1826 	MATCH(STATE_COMPUTE_MODE);
1827 	MATCH3D(3DSTATE_BTD);
1828 	MATCH(STATE_SYSTEM_MEM_FENCE_ADDRESS);
1829 	MATCH(STATE_CONTEXT_DATA_BASE_ADDRESS);
1830 
1831 	MATCH3D(3DSTATE_VF_STATISTICS);
1832 
1833 	MATCH(PIPELINE_SELECT);
1834 
1835 	MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST);
1836 	MATCH3D(3DSTATE_CLEAR_PARAMS);
1837 	MATCH3D(3DSTATE_DEPTH_BUFFER);
1838 	MATCH3D(3DSTATE_STENCIL_BUFFER);
1839 	MATCH3D(3DSTATE_HIER_DEPTH_BUFFER);
1840 	MATCH3D(3DSTATE_VERTEX_BUFFERS);
1841 	MATCH3D(3DSTATE_VERTEX_ELEMENTS);
1842 	MATCH3D(3DSTATE_INDEX_BUFFER);
1843 	MATCH3D(3DSTATE_VF);
1844 	MATCH3D(3DSTATE_MULTISAMPLE);
1845 	MATCH3D(3DSTATE_CC_STATE_POINTERS);
1846 	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS);
1847 	MATCH3D(3DSTATE_VS);
1848 	MATCH3D(3DSTATE_GS);
1849 	MATCH3D(3DSTATE_CLIP);
1850 	MATCH3D(3DSTATE_SF);
1851 	MATCH3D(3DSTATE_WM);
1852 	MATCH3D(3DSTATE_CONSTANT_VS);
1853 	MATCH3D(3DSTATE_CONSTANT_GS);
1854 	MATCH3D(3DSTATE_CONSTANT_PS);
1855 	MATCH3D(3DSTATE_SAMPLE_MASK);
1856 	MATCH3D(3DSTATE_CONSTANT_HS);
1857 	MATCH3D(3DSTATE_CONSTANT_DS);
1858 	MATCH3D(3DSTATE_HS);
1859 	MATCH3D(3DSTATE_TE);
1860 	MATCH3D(3DSTATE_DS);
1861 	MATCH3D(3DSTATE_STREAMOUT);
1862 	MATCH3D(3DSTATE_SBE);
1863 	MATCH3D(3DSTATE_PS);
1864 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
1865 	MATCH3D(3DSTATE_CPS_POINTERS);
1866 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC);
1867 	MATCH3D(3DSTATE_BLEND_STATE_POINTERS);
1868 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS);
1869 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS);
1870 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS);
1871 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS);
1872 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS);
1873 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS);
1874 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS);
1875 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS);
1876 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS);
1877 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS);
1878 	MATCH3D(3DSTATE_VF_INSTANCING);
1879 	MATCH3D(3DSTATE_VF_SGVS);
1880 	MATCH3D(3DSTATE_VF_TOPOLOGY);
1881 	MATCH3D(3DSTATE_WM_CHROMAKEY);
1882 	MATCH3D(3DSTATE_PS_BLEND);
1883 	MATCH3D(3DSTATE_WM_DEPTH_STENCIL);
1884 	MATCH3D(3DSTATE_PS_EXTRA);
1885 	MATCH3D(3DSTATE_RASTER);
1886 	MATCH3D(3DSTATE_SBE_SWIZ);
1887 	MATCH3D(3DSTATE_WM_HZ_OP);
1888 	MATCH3D(3DSTATE_VF_COMPONENT_PACKING);
1889 	MATCH3D(3DSTATE_VF_SGVS_2);
1890 	MATCH3D(3DSTATE_VFG);
1891 	MATCH3D(3DSTATE_URB_ALLOC_VS);
1892 	MATCH3D(3DSTATE_URB_ALLOC_HS);
1893 	MATCH3D(3DSTATE_URB_ALLOC_DS);
1894 	MATCH3D(3DSTATE_URB_ALLOC_GS);
1895 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_0);
1896 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_1);
1897 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_2);
1898 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_3);
1899 	MATCH3D(3DSTATE_PRIMITIVE_REPLICATION);
1900 	MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO);
1901 	MATCH3D(3DSTATE_AMFS);
1902 	MATCH3D(3DSTATE_DEPTH_BOUNDS);
1903 	MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS);
1904 	MATCH3D(3DSTATE_CONSTANT_TS_POINTER);
1905 	MATCH3D(3DSTATE_MESH_CONTROL);
1906 	MATCH3D(3DSTATE_MESH_DISTRIB);
1907 	MATCH3D(3DSTATE_TASK_REDISTRIB);
1908 	MATCH3D(3DSTATE_MESH_SHADER);
1909 	MATCH3D(3DSTATE_MESH_SHADER_DATA);
1910 	MATCH3D(3DSTATE_TASK_CONTROL);
1911 	MATCH3D(3DSTATE_TASK_SHADER);
1912 	MATCH3D(3DSTATE_TASK_SHADER_DATA);
1913 	MATCH3D(3DSTATE_URB_ALLOC_MESH);
1914 	MATCH3D(3DSTATE_URB_ALLOC_TASK);
1915 	MATCH3D(3DSTATE_CLIP_MESH);
1916 	MATCH3D(3DSTATE_SBE_MESH);
1917 	MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER);
1918 	MATCH3D(3DSTATE_COARSE_PIXEL);
1919 
1920 	MATCH3D(3DSTATE_DRAWING_RECTANGLE);
1921 	MATCH3D(3DSTATE_CHROMA_KEY);
1922 	MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET);
1923 	MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN);
1924 	MATCH3D(3DSTATE_LINE_STIPPLE);
1925 	MATCH3D(3DSTATE_AA_LINE_PARAMETERS);
1926 	MATCH3D(3DSTATE_MONOFILTER_SIZE);
1927 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS);
1928 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS);
1929 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS);
1930 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS);
1931 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS);
1932 	MATCH3D(3DSTATE_SO_DECL_LIST);
1933 	MATCH3D(3DSTATE_SO_BUFFER);
1934 	MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC);
1935 	MATCH3D(3DSTATE_SAMPLE_PATTERN);
1936 	MATCH3D(3DSTATE_3D_MODE);
1937 	MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE);
1938 	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS);
1939 	MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO);
1940 
1941 	default:
1942 		drm_printf(p, "[%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n",
1943 			   *dw, pipeline, opcode, subopcode, numdw);
1944 		return numdw;
1945 	}
1946 }
1947 
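/*
 * Decode a single GFX_STATE instruction, printing its name (or a note that
 * it is unrecognized) and returning the number of dwords it occupies.
 */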
1948 static int dump_gfx_state_command(struct drm_printer *p,
1949 				  struct xe_gt *gt,
1950 				  u32 *dw,
1951 				  int remaining_dw)
1952 {
1953 	u32 numdw = instr_dw(*dw);
1954 	u32 opcode = REG_FIELD_GET(GFX_STATE_OPCODE, *dw);
1955 
1956 	/*
1957 	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
1958 	 * remaining size of the LRC.
1959 	 */
1960 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
1961 		numdw = remaining_dw;
1962 
1963 	switch (*dw & (XE_INSTR_GFX_STATE | GFX_STATE_OPCODE)) {
1964 	MATCH(STATE_WRITE_INLINE);
1965 
1966 	default:
1967 		drm_printf(p, "[%#010x] unknown GFX_STATE command (opcode=%#x), likely %d dwords\n",
1968 			   *dw, opcode, numdw);
1969 		return numdw;
1970 	}
1971 }
1972 
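/**
 * xe_lrc_dump_default() - Decode and print the default LRC image
 * @p: The drm_printer to dump the LRC to
 * @gt: GT that owns the default LRC
 * @hwe_class: Engine class of the default LRC to dump
 *
 * Walk the software copy of the default LRC for @hwe_class, skipping the
 * PPHWSP, and pretty-print every MI, GFXPIPE and GFX_STATE instruction that
 * is recognized; unknown instructions are reported with their raw header.
 */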
1973 void xe_lrc_dump_default(struct drm_printer *p,
1974 			 struct xe_gt *gt,
1975 			 enum xe_engine_class hwe_class)
1976 {
1977 	u32 *dw;
1978 	int remaining_dw, num_dw;
1979 
1980 	if (!gt->default_lrc[hwe_class]) {
1981 		drm_printf(p, "No default LRC for class %d\n", hwe_class);
1982 		return;
1983 	}
1984 
1985 	/*
1986 	 * Skip the beginning of the LRC since it contains the per-process
1987 	 * hardware status page.
1988 	 */
1989 	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
1990 	remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;
1991 
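	/*
	 * Decode one instruction per iteration; each decoder returns the
	 * number of dwords the instruction occupies (clamped to what remains)
	 * so the walk advances to the next instruction header.
	 */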
1992 	while (remaining_dw > 0) {
1993 		if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
1994 			num_dw = dump_mi_command(p, gt, dw, remaining_dw);
1995 		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
1996 			num_dw = dump_gfxpipe_command(p, gt, dw, remaining_dw);
1997 		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFX_STATE) {
1998 			num_dw = dump_gfx_state_command(p, gt, dw, remaining_dw);
1999 		} else {
2000 			num_dw = min(instr_dw(*dw), remaining_dw);
2001 			drm_printf(p, "[%#010x] Unknown instruction of type %#x, likely %d dwords\n",
2002 				   *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw),
2003 				   num_dw);
2004 		}
2005 
2006 		dw += num_dw;
2007 		remaining_dw -= num_dw;
2008 	}
2009 }
2010 
2011 struct instr_state {
2012 	u32 instr;
2013 	u16 num_dw;
2014 };
2015 
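/*
 * SVG state instructions emitted into the default LRC for Wa_14019789679
 * (see xe_lrc_emit_hwe_state_instructions()).  num_dw is the total length of
 * each instruction in dwords, including the header dword.
 */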
2016 static const struct instr_state xe_hpg_svg_state[] = {
2017 	{ .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 },
2018 	{ .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 },
2019 	{ .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 },
2020 	{ .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 },
2021 	{ .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 },
2022 	{ .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 },
2023 	{ .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 },
2024 	{ .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 },
2025 	{ .instr = CMD_3DSTATE_VS, .num_dw = 9 },
2026 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 },
2027 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 },
2028 	{ .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 },
2029 	{ .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 },
2030 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 },
2031 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 },
2032 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 },
2033 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 },
2034 	{ .instr = CMD_3DSTATE_CLIP, .num_dw = 4 },
2035 	{ .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 },
2036 	{ .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 },
2037 	{ .instr = CMD_3DSTATE_SF, .num_dw = 4 },
2038 	{ .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 },
2039 	{ .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 },
2040 	{ .instr = CMD_3DSTATE_RASTER, .num_dw = 5 },
2041 	{ .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 },
2042 	{ .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 },
2043 	{ .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 },
2044 	{ .instr = CMD_3DSTATE_HS, .num_dw = 9 },
2045 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 },
2046 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 },
2047 	{ .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 },
2048 	{ .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 },
2049 	{ .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 },
2050 	{ .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 },
2051 	{ .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 },
2052 	{ .instr = CMD_3DSTATE_TE, .num_dw = 5 },
2053 	{ .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 },
2054 	{ .instr = CMD_3DSTATE_DS, .num_dw = 11 },
2055 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
2056 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
2057 	{ .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
2058 	{ .instr = CMD_3DSTATE_GS, .num_dw = 10 },
2059 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
2060 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
2061 	{ .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
2062 	{ .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
2063 	{ .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
2064 	{ .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
2065 	{ .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
2066 	{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
2067 };
2068 
2069 u32 *xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, u32 *cs)
2070 {
2071 	struct xe_gt *gt = q->hwe->gt;
2072 	struct xe_device *xe = gt_to_xe(gt);
2073 	const struct instr_state *state_table = NULL;
2074 	int state_table_size = 0;
2075 
2076 	/*
2077 	 * Wa_14019789679
2078 	 *
2079 	 * If the driver doesn't explicitly emit the SVG instructions while
2080 	 * setting up the default LRC, the context switch will write 0's
2081 	 * (noops) into the LRC memory rather than the expected instruction
2082 	 * headers.  Application contexts start out as a copy of the default
2083 	 * LRC, and if they also do not emit specific settings for some SVG
2084 	 * state, then on context restore they'll unintentionally inherit
2085 	 * whatever state setting the previous context had programmed into the
2086 	 * hardware (i.e., the lack of a 3DSTATE_* instruction in the LRC will
2087 	 * prevent the hardware from resetting that state back to any specific
2088 	 * value).
2089 	 *
2090 	 * The official workaround only requires emitting 3DSTATE_MESH_CONTROL
2091 	 * since that's a specific state setting that can easily cause GPU
2092 	 * hangs if unintentionally inherited.  However, to be safe, we'll
2093 	 * continue to emit all of the SVG state, since it's best not to leak
2094 	 * any of that state between contexts, even if the leakage is harmless.
2095 	 */
2096 	if (XE_GT_WA(gt, 14019789679) && q->hwe->class == XE_ENGINE_CLASS_RENDER) {
2097 		state_table = xe_hpg_svg_state;
2098 		state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
2099 	}
2100 
2101 	if (!state_table) {
2102 		xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
2103 			  GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
2104 		return cs;
2105 	}
2106 
2107 	for (int i = 0; i < state_table_size; i++) {
2108 		u32 instr = state_table[i].instr;
2109 		u16 num_dw = state_table[i].num_dw;
2110 		bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);
2111 
2112 		xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
2113 		xe_gt_assert(gt, num_dw != 0);
2114 		xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));
2115 
2116 		/*
2117 		 * Xe2's SVG context is the same as the one on DG2 / MTL
2118 		 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
2119 		 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
2120 		 * Just make the replacement here rather than defining a
2121 		 * whole separate table for the single trivial change.
2122 		 */
2123 		if (GRAPHICS_VER(xe) >= 20 &&
2124 		    instr == CMD_3DSTATE_DRAWING_RECTANGLE)
2125 			instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;
2126 
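		/*
		 * Emit only the instruction header; for multi-dword
		 * instructions the length field is the total dword count
		 * minus the standard two-dword bias.  The payload dwords are
		 * skipped rather than written here, the point of the
		 * workaround being to get valid instruction headers into the
		 * default LRC image.
		 */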
2127 		*cs = instr;
2128 		if (!is_single_dw)
2129 			*cs |= (num_dw - 2);
2130 
2131 		cs += num_dw;
2132 	}
2133 
2134 	return cs;
2135 }
2136 
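/**
 * xe_lrc_snapshot_capture() - Capture the lightweight part of an LRC snapshot
 * @lrc: Pointer to the lrc.
 *
 * Record ring/context bookkeeping (descriptors, head/tail, seqnos,
 * timestamps) and take a reference on the LRC BO.  The LRC contents are
 * copied out later by xe_lrc_snapshot_capture_delayed().  The snapshot
 * itself is allocated with GFP_NOWAIT.
 *
 * Return: Pointer to the new snapshot, or NULL if the allocation fails.
 */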
2137 struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
2138 {
2139 	struct xe_lrc_snapshot *snapshot = kmalloc(sizeof(*snapshot), GFP_NOWAIT);
2140 
2141 	if (!snapshot)
2142 		return NULL;
2143 
2144 	snapshot->context_desc = xe_lrc_ggtt_addr(lrc);
2145 	snapshot->ring_addr = __xe_lrc_ring_ggtt_addr(lrc);
2146 	snapshot->indirect_context_desc = xe_lrc_indirect_ring_ggtt_addr(lrc);
2147 	snapshot->head = xe_lrc_ring_head(lrc);
2148 	snapshot->tail.internal = lrc->ring.tail;
2149 	snapshot->tail.memory = xe_lrc_ring_tail(lrc);
2150 	snapshot->start = xe_lrc_ring_start(lrc);
2151 	snapshot->start_seqno = xe_lrc_start_seqno(lrc);
2152 	snapshot->seqno = xe_lrc_seqno(lrc);
2153 	snapshot->lrc_bo = xe_bo_get(lrc->bo);
2154 	snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
2155 	snapshot->lrc_size = lrc->size;
2156 	snapshot->lrc_snapshot = NULL;
2157 	snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc));
2158 	snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc);
2159 	return snapshot;
2160 }
2161 
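/**
 * xe_lrc_snapshot_capture_delayed() - Copy the LRC contents into a snapshot
 * @snapshot: Snapshot previously created by xe_lrc_snapshot_capture()
 *
 * Map the LRC BO captured earlier, copy out the context image starting at
 * the PPHWSP, and drop the BO reference.  May sleep, so this is meant to run
 * from a context where blocking is allowed.  On mapping or allocation
 * failure the snapshot is left without LRC contents.
 */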
2162 void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot)
2163 {
2164 	struct xe_bo *bo;
2165 	struct iosys_map src;
2166 
2167 	if (!snapshot)
2168 		return;
2169 
2170 	bo = snapshot->lrc_bo;
2171 	snapshot->lrc_bo = NULL;
2172 
2173 	snapshot->lrc_snapshot = kvmalloc(snapshot->lrc_size, GFP_KERNEL);
2174 	if (!snapshot->lrc_snapshot)
2175 		goto put_bo;
2176 
2177 	xe_bo_lock(bo, false);
2178 	if (!ttm_bo_vmap(&bo->ttm, &src)) {
2179 		xe_map_memcpy_from(xe_bo_device(bo),
2180 				   snapshot->lrc_snapshot, &src, snapshot->lrc_offset,
2181 				   snapshot->lrc_size);
2182 		ttm_bo_vunmap(&bo->ttm, &src);
2183 	} else {
2184 		kvfree(snapshot->lrc_snapshot);
2185 		snapshot->lrc_snapshot = NULL;
2186 	}
2187 	xe_bo_unlock(bo);
2188 put_bo:
2189 	xe_bo_put(bo);
2190 }
2191 
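/**
 * xe_lrc_snapshot_print() - Print an LRC snapshot
 * @snapshot: Snapshot to print, may be NULL
 * @p: The drm_printer to print to
 *
 * Print the captured descriptors, ring pointers, seqnos and timestamps,
 * followed by an ascii85 dump of the PPHWSP and the context image if the
 * delayed capture has populated them.
 */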
2192 void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer *p)
2193 {
2194 	unsigned long i;
2195 
2196 	if (!snapshot)
2197 		return;
2198 
2199 	drm_printf(p, "\tHW Context Desc: 0x%08x\n", snapshot->context_desc);
2200 	drm_printf(p, "\tHW Ring address: 0x%08x\n",
2201 		   snapshot->ring_addr);
2202 	drm_printf(p, "\tHW Indirect Ring State: 0x%08x\n",
2203 		   snapshot->indirect_context_desc);
2204 	drm_printf(p, "\tLRC Head: (memory) %u\n", snapshot->head);
2205 	drm_printf(p, "\tLRC Tail: (internal) %u, (memory) %u\n",
2206 		   snapshot->tail.internal, snapshot->tail.memory);
2207 	drm_printf(p, "\tRing start: (memory) 0x%08x\n", snapshot->start);
2208 	drm_printf(p, "\tStart seqno: (memory) %d\n", snapshot->start_seqno);
2209 	drm_printf(p, "\tSeqno: (memory) %d\n", snapshot->seqno);
2210 	drm_printf(p, "\tTimestamp: 0x%08x\n", snapshot->ctx_timestamp);
2211 	drm_printf(p, "\tJob Timestamp: 0x%08x\n", snapshot->ctx_job_timestamp);
2212 
2213 	if (!snapshot->lrc_snapshot)
2214 		return;
2215 
2216 	drm_printf(p, "\t[HWSP].length: 0x%x\n", LRC_PPHWSP_SIZE);
2217 	drm_puts(p, "\t[HWSP].data: ");
2218 	for (i = 0; i < LRC_PPHWSP_SIZE; i += sizeof(u32)) {
2219 		u32 *val = snapshot->lrc_snapshot + i;
2220 		char dumped[ASCII85_BUFSZ];
2221 
2222 		drm_puts(p, ascii85_encode(*val, dumped));
2223 	}
2224 
2225 	drm_printf(p, "\n\t[HWCTX].length: 0x%lx\n", snapshot->lrc_size - LRC_PPHWSP_SIZE);
2226 	drm_puts(p, "\t[HWCTX].data: ");
2227 	for (; i < snapshot->lrc_size; i += sizeof(u32)) {
2228 		u32 *val = snapshot->lrc_snapshot + i;
2229 		char dumped[ASCII85_BUFSZ];
2230 
2231 		drm_puts(p, ascii85_encode(*val, dumped));
2232 	}
2233 	drm_puts(p, "\n");
2234 }
2235 
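/**
 * xe_lrc_snapshot_free() - Free an LRC snapshot
 * @snapshot: Snapshot to free, may be NULL
 *
 * Free the copied LRC contents, drop the BO reference if the delayed capture
 * never ran, and free the snapshot itself.
 */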
2236 void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
2237 {
2238 	if (!snapshot)
2239 		return;
2240 
2241 	kvfree(snapshot->lrc_snapshot);
2242 	if (snapshot->lrc_bo)
2243 		xe_bo_put(snapshot->lrc_bo);
2244 
2245 	kfree(snapshot);
2246 }
2247 
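/*
 * Read the current value of the CTX_TIMESTAMP register of the engine
 * identified by @engine_id, using a 2x32 read when the platform exposes a
 * 64-bit timestamp.  Returns 0 on success with the value in @reg_ctx_ts, or
 * -1 if the engine cannot be resolved.
 */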
2248 static int get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id, u64 *reg_ctx_ts)
2249 {
2250 	u16 class = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id);
2251 	u16 instance = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id);
2252 	struct xe_hw_engine *hwe;
2253 	u64 val;
2254 
2255 	hwe = xe_gt_hw_engine(lrc->gt, class, instance, false);
2256 	if (xe_gt_WARN_ONCE(lrc->gt, !hwe || xe_hw_engine_is_reserved(hwe),
2257 			    "Unexpected engine class:instance %d:%d for context utilization\n",
2258 			    class, instance))
2259 		return -1;
2260 
2261 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
2262 		val = xe_mmio_read64_2x32(&hwe->gt->mmio,
2263 					  RING_CTX_TIMESTAMP(hwe->mmio_base));
2264 	else
2265 		val = xe_mmio_read32(&hwe->gt->mmio,
2266 				     RING_CTX_TIMESTAMP(hwe->mmio_base));
2267 
2268 	*reg_ctx_ts = val;
2269 
2270 	return 0;
2271 }
2272 
2273 /**
2274  * xe_lrc_update_timestamp() - Update ctx timestamp
2275  * @lrc: Pointer to the lrc.
2276  * @old_ts: Old timestamp value
2277  *
2278  * Populate @old_ts with the currently saved ctx timestamp, read the new ctx
2279  * timestamp and update the saved value. With support for active contexts the
2280  * register read may race with a context switch, so the LRC timestamp is read
2281  * again afterwards and used instead if the context is no longer active.
2282  *
2283  * Return: New ctx timestamp value
2284  */
2285 u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts)
2286 {
2287 	u64 lrc_ts, reg_ts;
2288 	u32 engine_id;
2289 
2290 	*old_ts = lrc->ctx_timestamp;
2291 
2292 	lrc_ts = xe_lrc_ctx_timestamp(lrc);
2293 	/* CTX_TIMESTAMP mmio read is invalid on VF, so return the LRC value */
2294 	if (IS_SRIOV_VF(lrc_to_xe(lrc))) {
2295 		lrc->ctx_timestamp = lrc_ts;
2296 		goto done;
2297 	}
2298 
2299 	if (lrc_ts == CONTEXT_ACTIVE) {
2300 		engine_id = xe_lrc_engine_id(lrc);
2301 		if (!get_ctx_timestamp(lrc, engine_id, &reg_ts))
2302 			lrc->ctx_timestamp = reg_ts;
2303 
2304 		/* read lrc again to ensure context is still active */
2305 		lrc_ts = xe_lrc_ctx_timestamp(lrc);
2306 	}
2307 
2308 	/*
2309 	 * If the context has switched out, just use lrc_ts. This must be a
2310 	 * separate check since the context may switch out during the MMIO read.
2311 	 */
2312 	if (lrc_ts != CONTEXT_ACTIVE)
2313 		lrc->ctx_timestamp = lrc_ts;
2314 
2315 done:
2316 	trace_xe_lrc_update_timestamp(lrc, *old_ts);
2317 
2318 	return lrc->ctx_timestamp;
2319 }
2320 
2321 /**
2322  * xe_lrc_ring_is_idle() - Check whether the LRC ring is idle
2323  * @lrc: Pointer to the lrc.
2324  *
2325  * Compare LRC ring head and tail to determine whether the ring is idle.
2326  *
2327  * Return: True if the ring is idle, False otherwise
2328  */
2329 bool xe_lrc_ring_is_idle(struct xe_lrc *lrc)
2330 {
2331 	return xe_lrc_ring_head(lrc) == xe_lrc_ring_tail(lrc);
2332 }
2333