xref: /linux/drivers/gpu/drm/xe/xe_lrc.c (revision f5bd7da05a5988506dedcb3e67aecb3a13a4cdf0)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5 
6 #include "xe_lrc.h"
7 
8 #include <generated/xe_wa_oob.h>
9 
10 #include <linux/ascii85.h>
11 #include <linux/panic.h>
12 
13 #include "instructions/xe_mi_commands.h"
14 #include "instructions/xe_gfxpipe_commands.h"
15 #include "instructions/xe_gfx_state_commands.h"
16 #include "regs/xe_engine_regs.h"
17 #include "regs/xe_lrc_layout.h"
18 #include "xe_bb.h"
19 #include "xe_bo.h"
20 #include "xe_configfs.h"
21 #include "xe_device.h"
22 #include "xe_drm_client.h"
23 #include "xe_exec_queue_types.h"
24 #include "xe_gt.h"
25 #include "xe_gt_printk.h"
26 #include "xe_hw_fence.h"
27 #include "xe_map.h"
28 #include "xe_memirq.h"
29 #include "xe_mmio.h"
30 #include "xe_sriov.h"
31 #include "xe_trace_lrc.h"
32 #include "xe_vm.h"
33 #include "xe_wa.h"
34 
35 #define LRC_VALID				BIT_ULL(0)
36 #define LRC_PRIVILEGE				BIT_ULL(8)
37 #define LRC_ADDRESSING_MODE			GENMASK_ULL(4, 3)
38 #define LRC_LEGACY_64B_CONTEXT			3
39 
40 #define LRC_ENGINE_CLASS			GENMASK_ULL(63, 61)
41 #define LRC_ENGINE_INSTANCE			GENMASK_ULL(53, 48)
42 
43 #define LRC_PPHWSP_SIZE				SZ_4K
44 #define LRC_INDIRECT_CTX_BO_SIZE		SZ_4K
45 #define LRC_INDIRECT_RING_STATE_SIZE		SZ_4K
46 
47 #define LRC_PRIORITY				GENMASK_ULL(10, 9)
48 #define LRC_PRIORITY_LOW			0
49 #define LRC_PRIORITY_NORMAL			1
50 #define LRC_PRIORITY_HIGH			2
51 
52 /*
53  * Layout of the LRC and associated data allocated as
54  * lrc->bo:
55  *
56  *   Region                       Size
57  *  +============================+=================================+ <- __xe_lrc_ring_offset()
58  *  | Ring                       | ring_size, see                  |
59  *  |                            | xe_lrc_init()                   |
60  *  +============================+=================================+ <- __xe_lrc_pphwsp_offset()
61  *  | PPHWSP (includes SW state) | 4K                              |
62  *  +----------------------------+---------------------------------+ <- __xe_lrc_regs_offset()
63  *  | Engine Context Image       | n * 4K, see                     |
64  *  |                            | xe_gt_lrc_size()                |
65  *  +----------------------------+---------------------------------+ <- __xe_lrc_indirect_ring_offset()
66  *  | Indirect Ring State Page   | 0 or 4K, see                    |
67  *  |                            | XE_LRC_FLAG_INDIRECT_RING_STATE |
68  *  +============================+=================================+ <- __xe_lrc_indirect_ctx_offset()
69  *  | Indirect Context Page      | 0 or 4K, see                    |
70  *  |                            | XE_LRC_FLAG_INDIRECT_CTX        |
71  *  +============================+=================================+ <- __xe_lrc_wa_bb_offset()
72  *  | WA BB Per Ctx              | 4K                              |
73  *  +============================+=================================+ <- xe_bo_size(lrc->bo)
74  */
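
/*
 * For illustration only: assuming a hypothetical LRC with a 16K ring, a
 * 12K engine context image, indirect ring state enabled and no indirect
 * context page, the regions above would land at:
 *
 *   __xe_lrc_ring_offset()          = 0
 *   __xe_lrc_pphwsp_offset()        = 16K
 *   __xe_lrc_regs_offset()          = 16K + 4K
 *   __xe_lrc_indirect_ring_offset() = xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE - 4K
 *   __xe_lrc_wa_bb_offset()         = xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE
 */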
75 
76 static struct xe_device *
77 lrc_to_xe(struct xe_lrc *lrc)
78 {
79 	return gt_to_xe(lrc->fence_ctx.gt);
80 }
81 
82 static bool
83 gt_engine_needs_indirect_ctx(struct xe_gt *gt, enum xe_engine_class class)
84 {
85 	struct xe_device *xe = gt_to_xe(gt);
86 
87 	if (XE_GT_WA(gt, 16010904313) &&
88 	    (class == XE_ENGINE_CLASS_RENDER ||
89 	     class == XE_ENGINE_CLASS_COMPUTE))
90 		return true;
91 
92 	if (xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
93 					       class, NULL))
94 		return true;
95 
96 	return false;
97 }
98 
99 /**
100  * xe_gt_lrc_hang_replay_size() - Hang replay size
101  * @gt: The GT
102  * @class: Hardware engine class
103  *
104  * Determine size of GPU hang replay state for a GT and hardware engine class.
105  *
106  * Return: Size of the GPU hang replay state in bytes
107  */
108 size_t xe_gt_lrc_hang_replay_size(struct xe_gt *gt, enum xe_engine_class class)
109 {
110 	struct xe_device *xe = gt_to_xe(gt);
111 	size_t size = 0;
112 
113 	/* Engine context image */
114 	switch (class) {
115 	case XE_ENGINE_CLASS_RENDER:
116 		if (GRAPHICS_VERx100(xe) >= 3510)
117 			size += 7 * SZ_4K;
118 		else if (GRAPHICS_VER(xe) >= 20)
119 			size += 3 * SZ_4K;
120 		else
121 			size += 13 * SZ_4K;
122 		break;
123 	case XE_ENGINE_CLASS_COMPUTE:
124 		if (GRAPHICS_VERx100(xe) >= 3510)
125 			size += 5 * SZ_4K;
126 		else if (GRAPHICS_VER(xe) >= 20)
127 			size += 2 * SZ_4K;
128 		else
129 			size += 13 * SZ_4K;
130 		break;
131 	default:
132 		WARN(1, "Unknown engine class: %d", class);
133 		fallthrough;
134 	case XE_ENGINE_CLASS_COPY:
135 	case XE_ENGINE_CLASS_VIDEO_DECODE:
136 	case XE_ENGINE_CLASS_VIDEO_ENHANCE:
137 	case XE_ENGINE_CLASS_OTHER:
138 		size += 1 * SZ_4K;
139 	}
140 
141 	return size;
142 }
143 
144 size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
145 {
146 	size_t size = xe_gt_lrc_hang_replay_size(gt, class);
147 
148 	/* Add indirect ring state page */
149 	if (xe_gt_has_indirect_ring_state(gt))
150 		size += LRC_INDIRECT_RING_STATE_SIZE;
151 
152 	return size + LRC_PPHWSP_SIZE;
153 }
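
/*
 * For example, a render engine on a GRAPHICS_VER(xe) == 20 GT with
 * indirect ring state works out to 3 * 4K (hang replay / context image)
 * + 4K (indirect ring state) + 4K (PPHWSP) = 20K.
 */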
154 
155 /*
156  * The per-platform tables are u8-encoded in @data. Decode @data and set the
157  * addresses' offset and commands in @regs. The following encoding is used
158  * for each byte. There are 2 steps: decoding commands and decoding addresses.
159  *
160  * Commands:
161  * [7]: create NOPs - number of NOPs are set in lower bits
162  * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set
163  *      MI_LRI_FORCE_POSTED
164  * [5:0]: Number of NOPs or registers to set values to in case of
165  *        MI_LOAD_REGISTER_IMM
166  *
167  * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
168  * number of registers. They are set by using the REG/REG16 macros: the former
169  * is used for offsets smaller than 0x200 while the latter is for values bigger
170  * than that. Those macros already set all the bits documented below correctly:
171  *
172  * [7]: When a register offset needs more than 6 bits, additional bytes
173  *      follow, encoding the lower bits
174  * [6:0]: Register offset, without considering the engine base.
175  *
176  * This function only tweaks the commands and register offsets. Values are not
177  * filled out.
178  */
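
/*
 * Worked example of the encoding: the table entries
 *
 *	LRI(2, POSTED), REG16(0x244), REG(0x034)
 *
 * encode to the bytes 0x42, 0x81, 0x11, 0x0d. 0x42 is one
 * MI_LOAD_REGISTER_IMM of two registers with MI_LRI_FORCE_POSTED
 * (flags << 6 | count). REG16(0x244) emits 0x81 0x11: bit 7 of the
 * first byte means "more bytes follow", and the payloads reassemble to
 * (0x1 << 7 | 0x11) << 2 = 0x244. REG(0x034) emits the single byte
 * 0x0d, i.e. 0x0d << 2 = 0x034. Both offsets are relative to
 * hwe->mmio_base.
 */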
179 static void set_offsets(u32 *regs,
180 			const u8 *data,
181 			const struct xe_hw_engine *hwe)
182 #define NOP(x) (BIT(7) | (x))
183 #define LRI(count, flags) ((flags) << 6 | (count) | \
184 			   BUILD_BUG_ON_ZERO(count >= BIT(6)))
185 #define POSTED BIT(0)
186 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
187 #define REG16(x) \
188 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
189 	(((x) >> 2) & 0x7f)
190 {
191 	const u32 base = hwe->mmio_base;
192 
193 	while (*data) {
194 		u8 count, flags;
195 
196 		if (*data & BIT(7)) { /* skip */
197 			count = *data++ & ~BIT(7);
198 			regs += count;
199 			continue;
200 		}
201 
202 		count = *data & 0x3f;
203 		flags = *data >> 6;
204 		data++;
205 
206 		*regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
207 		if (flags & POSTED)
208 			*regs |= MI_LRI_FORCE_POSTED;
209 		*regs |= MI_LRI_LRM_CS_MMIO;
210 		regs++;
211 
212 		xe_gt_assert(hwe->gt, count);
213 		do {
214 			u32 offset = 0;
215 			u8 v;
216 
217 			do {
218 				v = *data++;
219 				offset <<= 7;
220 				offset |= v & ~BIT(7);
221 			} while (v & BIT(7));
222 
223 			regs[0] = base + (offset << 2);
224 			regs += 2;
225 		} while (--count);
226 	}
227 
228 	*regs = MI_BATCH_BUFFER_END | BIT(0);
229 }
230 
231 static const u8 gen12_xcs_offsets[] = {
232 	NOP(1),
233 	LRI(13, POSTED),
234 	REG16(0x244),
235 	REG(0x034),
236 	REG(0x030),
237 	REG(0x038),
238 	REG(0x03c),
239 	REG(0x168),
240 	REG(0x140),
241 	REG(0x110),
242 	REG(0x1c0),
243 	REG(0x1c4),
244 	REG(0x1c8),
245 	REG(0x180),
246 	REG16(0x2b4),
247 
248 	NOP(5),
249 	LRI(9, POSTED),
250 	REG16(0x3a8),
251 	REG16(0x28c),
252 	REG16(0x288),
253 	REG16(0x284),
254 	REG16(0x280),
255 	REG16(0x27c),
256 	REG16(0x278),
257 	REG16(0x274),
258 	REG16(0x270),
259 
260 	0
261 };
262 
263 static const u8 dg2_xcs_offsets[] = {
264 	NOP(1),
265 	LRI(15, POSTED),
266 	REG16(0x244),
267 	REG(0x034),
268 	REG(0x030),
269 	REG(0x038),
270 	REG(0x03c),
271 	REG(0x168),
272 	REG(0x140),
273 	REG(0x110),
274 	REG(0x1c0),
275 	REG(0x1c4),
276 	REG(0x1c8),
277 	REG(0x180),
278 	REG16(0x2b4),
279 	REG(0x120),
280 	REG(0x124),
281 
282 	NOP(1),
283 	LRI(9, POSTED),
284 	REG16(0x3a8),
285 	REG16(0x28c),
286 	REG16(0x288),
287 	REG16(0x284),
288 	REG16(0x280),
289 	REG16(0x27c),
290 	REG16(0x278),
291 	REG16(0x274),
292 	REG16(0x270),
293 
294 	0
295 };
296 
297 static const u8 gen12_rcs_offsets[] = {
298 	NOP(1),
299 	LRI(13, POSTED),
300 	REG16(0x244),
301 	REG(0x034),
302 	REG(0x030),
303 	REG(0x038),
304 	REG(0x03c),
305 	REG(0x168),
306 	REG(0x140),
307 	REG(0x110),
308 	REG(0x1c0),
309 	REG(0x1c4),
310 	REG(0x1c8),
311 	REG(0x180),
312 	REG16(0x2b4),
313 
314 	NOP(5),
315 	LRI(9, POSTED),
316 	REG16(0x3a8),
317 	REG16(0x28c),
318 	REG16(0x288),
319 	REG16(0x284),
320 	REG16(0x280),
321 	REG16(0x27c),
322 	REG16(0x278),
323 	REG16(0x274),
324 	REG16(0x270),
325 
326 	LRI(3, POSTED),
327 	REG(0x1b0),
328 	REG16(0x5a8),
329 	REG16(0x5ac),
330 
331 	NOP(6),
332 	LRI(1, 0),
333 	REG(0x0c8),
334 	NOP(3 + 9 + 1),
335 
336 	LRI(51, POSTED),
337 	REG16(0x588),
338 	REG16(0x588),
339 	REG16(0x588),
340 	REG16(0x588),
341 	REG16(0x588),
342 	REG16(0x588),
343 	REG(0x028),
344 	REG(0x09c),
345 	REG(0x0c0),
346 	REG(0x178),
347 	REG(0x17c),
348 	REG16(0x358),
349 	REG(0x170),
350 	REG(0x150),
351 	REG(0x154),
352 	REG(0x158),
353 	REG16(0x41c),
354 	REG16(0x600),
355 	REG16(0x604),
356 	REG16(0x608),
357 	REG16(0x60c),
358 	REG16(0x610),
359 	REG16(0x614),
360 	REG16(0x618),
361 	REG16(0x61c),
362 	REG16(0x620),
363 	REG16(0x624),
364 	REG16(0x628),
365 	REG16(0x62c),
366 	REG16(0x630),
367 	REG16(0x634),
368 	REG16(0x638),
369 	REG16(0x63c),
370 	REG16(0x640),
371 	REG16(0x644),
372 	REG16(0x648),
373 	REG16(0x64c),
374 	REG16(0x650),
375 	REG16(0x654),
376 	REG16(0x658),
377 	REG16(0x65c),
378 	REG16(0x660),
379 	REG16(0x664),
380 	REG16(0x668),
381 	REG16(0x66c),
382 	REG16(0x670),
383 	REG16(0x674),
384 	REG16(0x678),
385 	REG16(0x67c),
386 	REG(0x068),
387 	REG(0x084),
388 	NOP(1),
389 
390 	0
391 };
392 
393 static const u8 xehp_rcs_offsets[] = {
394 	NOP(1),
395 	LRI(13, POSTED),
396 	REG16(0x244),
397 	REG(0x034),
398 	REG(0x030),
399 	REG(0x038),
400 	REG(0x03c),
401 	REG(0x168),
402 	REG(0x140),
403 	REG(0x110),
404 	REG(0x1c0),
405 	REG(0x1c4),
406 	REG(0x1c8),
407 	REG(0x180),
408 	REG16(0x2b4),
409 
410 	NOP(5),
411 	LRI(9, POSTED),
412 	REG16(0x3a8),
413 	REG16(0x28c),
414 	REG16(0x288),
415 	REG16(0x284),
416 	REG16(0x280),
417 	REG16(0x27c),
418 	REG16(0x278),
419 	REG16(0x274),
420 	REG16(0x270),
421 
422 	LRI(3, POSTED),
423 	REG(0x1b0),
424 	REG16(0x5a8),
425 	REG16(0x5ac),
426 
427 	NOP(6),
428 	LRI(1, 0),
429 	REG(0x0c8),
430 
431 	0
432 };
433 
434 static const u8 dg2_rcs_offsets[] = {
435 	NOP(1),
436 	LRI(15, POSTED),
437 	REG16(0x244),
438 	REG(0x034),
439 	REG(0x030),
440 	REG(0x038),
441 	REG(0x03c),
442 	REG(0x168),
443 	REG(0x140),
444 	REG(0x110),
445 	REG(0x1c0),
446 	REG(0x1c4),
447 	REG(0x1c8),
448 	REG(0x180),
449 	REG16(0x2b4),
450 	REG(0x120),
451 	REG(0x124),
452 
453 	NOP(1),
454 	LRI(9, POSTED),
455 	REG16(0x3a8),
456 	REG16(0x28c),
457 	REG16(0x288),
458 	REG16(0x284),
459 	REG16(0x280),
460 	REG16(0x27c),
461 	REG16(0x278),
462 	REG16(0x274),
463 	REG16(0x270),
464 
465 	LRI(3, POSTED),
466 	REG(0x1b0),
467 	REG16(0x5a8),
468 	REG16(0x5ac),
469 
470 	NOP(6),
471 	LRI(1, 0),
472 	REG(0x0c8),
473 
474 	0
475 };
476 
477 static const u8 mtl_rcs_offsets[] = {
478 	NOP(1),
479 	LRI(15, POSTED),
480 	REG16(0x244),
481 	REG(0x034),
482 	REG(0x030),
483 	REG(0x038),
484 	REG(0x03c),
485 	REG(0x168),
486 	REG(0x140),
487 	REG(0x110),
488 	REG(0x1c0),
489 	REG(0x1c4),
490 	REG(0x1c8),
491 	REG(0x180),
492 	REG16(0x2b4),
493 	REG(0x120),
494 	REG(0x124),
495 
496 	NOP(1),
497 	LRI(9, POSTED),
498 	REG16(0x3a8),
499 	REG16(0x28c),
500 	REG16(0x288),
501 	REG16(0x284),
502 	REG16(0x280),
503 	REG16(0x27c),
504 	REG16(0x278),
505 	REG16(0x274),
506 	REG16(0x270),
507 
508 	NOP(2),
509 	LRI(2, POSTED),
510 	REG16(0x5a8),
511 	REG16(0x5ac),
512 
513 	NOP(6),
514 	LRI(1, 0),
515 	REG(0x0c8),
516 
517 	0
518 };
519 
520 #define XE2_CTX_COMMON \
521 	NOP(1),                 /* [0x00] */ \
522 	LRI(15, POSTED),        /* [0x01] */ \
523 	REG16(0x244),           /* [0x02] CTXT_SR_CTL */ \
524 	REG(0x034),             /* [0x04] RING_BUFFER_HEAD */ \
525 	REG(0x030),             /* [0x06] RING_BUFFER_TAIL */ \
526 	REG(0x038),             /* [0x08] RING_BUFFER_START */ \
527 	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */ \
528 	REG(0x168),             /* [0x0c] BB_ADDR_UDW */ \
529 	REG(0x140),             /* [0x0e] BB_ADDR */ \
530 	REG(0x110),             /* [0x10] BB_STATE */ \
531 	REG(0x1c0),             /* [0x12] BB_PER_CTX_PTR */ \
532 	REG(0x1c4),             /* [0x14] RCS_INDIRECT_CTX */ \
533 	REG(0x1c8),             /* [0x16] RCS_INDIRECT_CTX_OFFSET */ \
534 	REG(0x180),             /* [0x18] CCID */ \
535 	REG16(0x2b4),           /* [0x1a] SEMAPHORE_TOKEN */ \
536 	REG(0x120),             /* [0x1c] PRT_BB_STATE */ \
537 	REG(0x124),             /* [0x1e] PRT_BB_STATE_UDW */ \
538 	\
539 	NOP(1),                 /* [0x20] */ \
540 	LRI(9, POSTED),         /* [0x21] */ \
541 	REG16(0x3a8),           /* [0x22] CTX_TIMESTAMP */ \
542 	REG16(0x3ac),           /* [0x24] CTX_TIMESTAMP_UDW */ \
543 	REG(0x108),             /* [0x26] INDIRECT_RING_STATE */ \
544 	REG16(0x284),           /* [0x28] dummy reg */ \
545 	REG16(0x280),           /* [0x2a] CS_ACC_CTR_THOLD */ \
546 	REG16(0x27c),           /* [0x2c] CS_CTX_SYS_PASID */ \
547 	REG16(0x278),           /* [0x2e] CS_CTX_ASID */ \
548 	REG16(0x274),           /* [0x30] PTBP_UDW */ \
549 	REG16(0x270)            /* [0x32] PTBP_LDW */
550 
551 static const u8 xe2_rcs_offsets[] = {
552 	XE2_CTX_COMMON,
553 
554 	NOP(2),                 /* [0x34] */
555 	LRI(2, POSTED),         /* [0x36] */
556 	REG16(0x5a8),           /* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
557 	REG16(0x5ac),           /* [0x39] PREEMPTION_STATUS */
558 
559 	NOP(6),                 /* [0x3b] */
560 	LRI(1, 0),              /* [0x41] */
561 	REG(0x0c8),             /* [0x42] R_PWR_CLK_STATE */
562 
563 	0
564 };
565 
566 static const u8 xe2_bcs_offsets[] = {
567 	XE2_CTX_COMMON,
568 
569 	NOP(4 + 8 + 1),         /* [0x34] */
570 	LRI(2, POSTED),         /* [0x41] */
571 	REG16(0x200),           /* [0x42] BCS_SWCTRL */
572 	REG16(0x204),           /* [0x44] BLIT_CCTL */
573 
574 	0
575 };
576 
577 static const u8 xe2_xcs_offsets[] = {
578 	XE2_CTX_COMMON,
579 
580 	0
581 };
582 
583 static const u8 xe2_indirect_ring_state_offsets[] = {
584 	NOP(1),                 /* [0x00] */
585 	LRI(5, POSTED),         /* [0x01] */
586 	REG(0x034),             /* [0x02] RING_BUFFER_HEAD */
587 	REG(0x030),             /* [0x04] RING_BUFFER_TAIL */
588 	REG(0x038),             /* [0x06] RING_BUFFER_START */
589 	REG(0x048),             /* [0x08] RING_BUFFER_START_UDW */
590 	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */
591 
592 	NOP(5),                 /* [0x0c] */
593 	LRI(9, POSTED),         /* [0x11] */
594 	REG(0x168),             /* [0x12] BB_ADDR_UDW */
595 	REG(0x140),             /* [0x14] BB_ADDR */
596 	REG(0x110),             /* [0x16] BB_STATE */
597 	REG16(0x588),           /* [0x18] BB_STACK_WRITE_PORT */
598 	REG16(0x588),           /* [0x1a] BB_STACK_WRITE_PORT */
599 	REG16(0x588),           /* [0x1c] BB_STACK_WRITE_PORT */
600 	REG16(0x588),           /* [0x1e] BB_STACK_WRITE_PORT */
601 	REG16(0x588),           /* [0x20] BB_STACK_WRITE_PORT */
602 	REG16(0x588),           /* [0x22] BB_STACK_WRITE_PORT */
603 
604 	NOP(12),                /* [0x24] */
605 
606 	0
607 };
608 
609 #undef REG16
610 #undef REG
611 #undef LRI
612 #undef NOP
613 
614 static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
615 {
616 	if (class == XE_ENGINE_CLASS_RENDER) {
617 		if (GRAPHICS_VER(xe) >= 20)
618 			return xe2_rcs_offsets;
619 		else if (GRAPHICS_VERx100(xe) >= 1270)
620 			return mtl_rcs_offsets;
621 		else if (GRAPHICS_VERx100(xe) >= 1255)
622 			return dg2_rcs_offsets;
623 		else if (GRAPHICS_VERx100(xe) >= 1250)
624 			return xehp_rcs_offsets;
625 		else
626 			return gen12_rcs_offsets;
627 	} else if (class == XE_ENGINE_CLASS_COPY) {
628 		if (GRAPHICS_VER(xe) >= 20)
629 			return xe2_bcs_offsets;
630 		else
631 			return gen12_xcs_offsets;
632 	} else {
633 		if (GRAPHICS_VER(xe) >= 20)
634 			return xe2_xcs_offsets;
635 		else if (GRAPHICS_VERx100(xe) >= 1255)
636 			return dg2_xcs_offsets;
637 		else
638 			return gen12_xcs_offsets;
639 	}
640 }
641 
642 static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
643 {
644 	regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
645 						       CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
646 
647 	if (xe_gt_has_indirect_ring_state(hwe->gt))
648 		regs[CTX_CONTEXT_CONTROL] |=
649 			_MASKED_BIT_ENABLE(CTX_CTRL_INDIRECT_RING_STATE_ENABLE);
650 }
651 
652 static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe)
653 {
654 	struct xe_memirq *memirq = &gt_to_tile(hwe->gt)->memirq;
655 	struct xe_device *xe = gt_to_xe(hwe->gt);
656 	u8 num_regs;
657 
658 	if (!xe_device_uses_memirq(xe))
659 		return;
660 
661 	regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM |
662 					MI_LRI_LRM_CS_MMIO | MI_LRM_USE_GGTT;
663 	regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr;
664 	regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq);
665 
666 	num_regs = xe_device_has_msix(xe) ? 3 : 2;
667 	regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(num_regs) |
668 				       MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED;
669 	regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr;
670 	regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq, hwe);
671 	regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr;
672 	regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq, hwe);
673 
674 	if (xe_device_has_msix(xe)) {
675 		regs[CTX_CS_INT_VEC_REG] = CS_INT_VEC(0).addr;
676 		/* CTX_CS_INT_VEC_DATA will be set in xe_lrc_init */
677 	}
678 }
679 
680 static int lrc_ring_mi_mode(struct xe_hw_engine *hwe)
681 {
682 	struct xe_device *xe = gt_to_xe(hwe->gt);
683 
684 	if (GRAPHICS_VERx100(xe) >= 1250)
685 		return 0x70;
686 	else
687 		return 0x60;
688 }
689 
690 static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe)
691 {
692 	int x;
693 
694 	x = lrc_ring_mi_mode(hwe);
695 	regs[x + 1] &= ~STOP_RING;
696 	regs[x + 1] |= STOP_RING << 16;
697 }
698 
699 static inline bool xe_lrc_has_indirect_ring_state(struct xe_lrc *lrc)
700 {
701 	return lrc->flags & XE_LRC_FLAG_INDIRECT_RING_STATE;
702 }
703 
704 static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
705 {
706 	return 0;
707 }
708 
709 u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
710 {
711 	return lrc->ring.size;
712 }
713 
714 /* Make the magic macros work */
715 #define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset
716 #define __xe_lrc_regs_offset xe_lrc_regs_offset
717 
718 #define LRC_CTX_JOB_TIMESTAMP_OFFSET 512
719 #define LRC_ENGINE_ID_PPHWSP_OFFSET 1024
720 #define LRC_PARALLEL_PPHWSP_OFFSET 2048
721 
722 #define LRC_SEQNO_OFFSET 0
723 #define LRC_START_SEQNO_OFFSET (LRC_SEQNO_OFFSET + 8)
724 
725 u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
726 {
727 	return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
728 }
729 
730 /**
731  * xe_lrc_reg_size() - Get size of the LRC registers area within queues
732  * @xe: the &xe_device struct instance
733  *
734  * Returns: Size of the LRC registers area for current platform
735  */
736 size_t xe_lrc_reg_size(struct xe_device *xe)
737 {
738 	if (GRAPHICS_VERx100(xe) >= 1250)
739 		return 96 * sizeof(u32);
740 	else
741 		return 80 * sizeof(u32);
742 }
743 
744 size_t xe_lrc_skip_size(struct xe_device *xe)
745 {
746 	return LRC_PPHWSP_SIZE + xe_lrc_reg_size(xe);
747 }
748 
749 static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
750 {
751 	return LRC_SEQNO_OFFSET;
752 }
753 
754 static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
755 {
756 	return LRC_START_SEQNO_OFFSET;
757 }
758 
759 static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc)
760 {
761 	/* This is stored in the driver-defined portion of PPHWSP */
762 	return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET;
763 }
764 
765 static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
766 {
767 	/* The parallel scratch area is stored in the driver-defined portion of PPHWSP */
768 	return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
769 }
770 
771 static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc)
772 {
773 	return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET;
774 }
775 
776 static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
777 {
778 	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
779 }
780 
781 static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc)
782 {
783 	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32);
784 }
785 
786 static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
787 {
788 	u32 offset = xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE -
789 		     LRC_INDIRECT_RING_STATE_SIZE;
790 
791 	if (lrc->flags & XE_LRC_FLAG_INDIRECT_CTX)
792 		offset -= LRC_INDIRECT_CTX_BO_SIZE;
793 
794 	return offset;
795 }
796 
797 static inline u32 __xe_lrc_indirect_ctx_offset(struct xe_lrc *lrc)
798 {
799 	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE - LRC_INDIRECT_CTX_BO_SIZE;
800 }
801 
802 static inline u32 __xe_lrc_wa_bb_offset(struct xe_lrc *lrc)
803 {
804 	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE;
805 }
806 
807 #define DECL_MAP_ADDR_HELPERS(elem, bo_expr) \
808 static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \
809 { \
810 	struct xe_bo *bo = (bo_expr); \
811 	struct iosys_map map = bo->vmap; \
812 \
813 	xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map));  \
814 	iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \
815 	return map; \
816 } \
817 static inline u32 __maybe_unused __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \
818 { \
819 	struct xe_bo *bo = (bo_expr); \
820 \
821 	return xe_bo_ggtt_addr(bo) + __xe_lrc_##elem##_offset(lrc); \
822 } \
823 
824 DECL_MAP_ADDR_HELPERS(ring, lrc->bo)
825 DECL_MAP_ADDR_HELPERS(pphwsp, lrc->bo)
826 DECL_MAP_ADDR_HELPERS(seqno, lrc->seqno_bo)
827 DECL_MAP_ADDR_HELPERS(regs, lrc->bo)
828 DECL_MAP_ADDR_HELPERS(start_seqno, lrc->seqno_bo)
829 DECL_MAP_ADDR_HELPERS(ctx_job_timestamp, lrc->bo)
830 DECL_MAP_ADDR_HELPERS(ctx_timestamp, lrc->bo)
831 DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw, lrc->bo)
832 DECL_MAP_ADDR_HELPERS(parallel, lrc->bo)
833 DECL_MAP_ADDR_HELPERS(indirect_ring, lrc->bo)
834 DECL_MAP_ADDR_HELPERS(engine_id, lrc->bo)
835 
836 #undef DECL_MAP_ADDR_HELPERS
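
/*
 * As a sketch, DECL_MAP_ADDR_HELPERS(seqno, lrc->seqno_bo) above expands
 * to a pair of helpers equivalent to:
 *
 *	static inline u32 __xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
 *	{
 *		return xe_bo_ggtt_addr(lrc->seqno_bo) +
 *		       __xe_lrc_seqno_offset(lrc);
 *	}
 *
 * plus a matching __xe_lrc_seqno_map() returning the backing bo's
 * iosys_map advanced by the same offset.
 */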
837 
838 /**
839  * xe_lrc_ctx_timestamp_ggtt_addr() - Get ctx timestamp GGTT address
840  * @lrc: Pointer to the lrc.
841  *
842  * Returns: ctx timestamp GGTT address
843  */
844 u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc)
845 {
846 	return __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
847 }
848 
849 /**
850  * xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address
851  * @lrc: Pointer to the lrc.
852  *
853  * Returns: ctx timestamp udw GGTT address
854  */
855 u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc)
856 {
857 	return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
858 }
859 
860 /**
861  * xe_lrc_ctx_timestamp() - Read ctx timestamp value
862  * @lrc: Pointer to the lrc.
863  *
864  * Returns: ctx timestamp value
865  */
866 static u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
867 {
868 	struct xe_device *xe = lrc_to_xe(lrc);
869 	struct iosys_map map;
870 	u32 ldw, udw = 0;
871 
872 	map = __xe_lrc_ctx_timestamp_map(lrc);
873 	ldw = xe_map_read32(xe, &map);
874 
875 	if (xe->info.has_64bit_timestamp) {
876 		map = __xe_lrc_ctx_timestamp_udw_map(lrc);
877 		udw = xe_map_read32(xe, &map);
878 	}
879 
880 	return (u64)udw << 32 | ldw;
881 }
882 
883 /**
884  * xe_lrc_ctx_job_timestamp_ggtt_addr() - Get ctx job timestamp GGTT address
885  * @lrc: Pointer to the lrc.
886  *
887  * Returns: ctx job timestamp GGTT address
888  */
889 u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc)
890 {
891 	return __xe_lrc_ctx_job_timestamp_ggtt_addr(lrc);
892 }
893 
894 /**
895  * xe_lrc_ctx_job_timestamp() - Read ctx job timestamp value
896  * @lrc: Pointer to the lrc.
897  *
898  * Returns: ctx job timestamp value
899  */
900 u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc)
901 {
902 	struct xe_device *xe = lrc_to_xe(lrc);
903 	struct iosys_map map;
904 
905 	map = __xe_lrc_ctx_job_timestamp_map(lrc);
906 	return xe_map_read32(xe, &map);
907 }
908 
909 u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
910 {
911 	return __xe_lrc_pphwsp_ggtt_addr(lrc);
912 }
913 
914 u32 xe_lrc_indirect_ring_ggtt_addr(struct xe_lrc *lrc)
915 {
916 	if (!xe_lrc_has_indirect_ring_state(lrc))
917 		return 0;
918 
919 	return __xe_lrc_indirect_ring_ggtt_addr(lrc);
920 }
921 
922 static u32 xe_lrc_read_indirect_ctx_reg(struct xe_lrc *lrc, int reg_nr)
923 {
924 	struct xe_device *xe = lrc_to_xe(lrc);
925 	struct iosys_map map;
926 
927 	map = __xe_lrc_indirect_ring_map(lrc);
928 	iosys_map_incr(&map, reg_nr * sizeof(u32));
929 	return xe_map_read32(xe, &map);
930 }
931 
932 static void xe_lrc_write_indirect_ctx_reg(struct xe_lrc *lrc,
933 					  int reg_nr, u32 val)
934 {
935 	struct xe_device *xe = lrc_to_xe(lrc);
936 	struct iosys_map map;
937 
938 	map = __xe_lrc_indirect_ring_map(lrc);
939 	iosys_map_incr(&map, reg_nr * sizeof(u32));
940 	xe_map_write32(xe, &map, val);
941 }
942 
943 u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
944 {
945 	struct xe_device *xe = lrc_to_xe(lrc);
946 	struct iosys_map map;
947 
948 	map = __xe_lrc_regs_map(lrc);
949 	iosys_map_incr(&map, reg_nr * sizeof(u32));
950 	return xe_map_read32(xe, &map);
951 }
952 
953 void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
954 {
955 	struct xe_device *xe = lrc_to_xe(lrc);
956 	struct iosys_map map;
957 
958 	map = __xe_lrc_regs_map(lrc);
959 	iosys_map_incr(&map, reg_nr * sizeof(u32));
960 	xe_map_write32(xe, &map, val);
961 }
962 
963 static void *empty_lrc_data(struct xe_hw_engine *hwe)
964 {
965 	struct xe_gt *gt = hwe->gt;
966 	void *data;
967 	u32 *regs;
968 
969 	data = kzalloc(xe_gt_lrc_size(gt, hwe->class), GFP_KERNEL);
970 	if (!data)
971 		return NULL;
972 
973 	/* 1st page: Per-Process HW Status Page (PPHWSP) */
974 	regs = data + LRC_PPHWSP_SIZE;
975 	set_offsets(regs, reg_offsets(gt_to_xe(gt), hwe->class), hwe);
976 	set_context_control(regs, hwe);
977 	set_memory_based_intr(regs, hwe);
978 	reset_stop_ring(regs, hwe);
979 	if (xe_gt_has_indirect_ring_state(gt)) {
980 		regs = data + xe_gt_lrc_size(gt, hwe->class) -
981 		       LRC_INDIRECT_RING_STATE_SIZE;
982 		set_offsets(regs, xe2_indirect_ring_state_offsets, hwe);
983 	}
984 
985 	return data;
986 }
987 
988 /**
989  * xe_default_lrc_update_memirq_regs_with_address() - Re-compute GGTT references
990  * in the default LRC of a given engine.
991  * @hwe: the &xe_hw_engine struct instance
992  */
993 void xe_default_lrc_update_memirq_regs_with_address(struct xe_hw_engine *hwe)
994 {
995 	struct xe_gt *gt = hwe->gt;
996 	u32 *regs;
997 
998 	if (!gt->default_lrc[hwe->class])
999 		return;
1000 
1001 	regs = gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE;
1002 	set_memory_based_intr(regs, hwe);
1003 }
1004 
1005 /**
1006  * xe_lrc_update_memirq_regs_with_address() - Re-compute GGTT references in mem
1007  * interrupt data for a given LRC.
1008  * @lrc: the &xe_lrc struct instance
1009  * @hwe: the &xe_hw_engine struct instance
1010  * @regs: scratch buffer to be used as temporary storage
1011  */
1012 void xe_lrc_update_memirq_regs_with_address(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
1013 					    u32 *regs)
1014 {
1015 	struct xe_gt *gt = hwe->gt;
1016 	struct iosys_map map;
1017 	size_t regs_len;
1018 
1019 	if (!xe_device_uses_memirq(gt_to_xe(gt)))
1020 		return;
1021 
1022 	map = __xe_lrc_regs_map(lrc);
1023 	regs_len = xe_lrc_reg_size(gt_to_xe(gt));
1024 	xe_map_memcpy_from(gt_to_xe(gt), regs, &map, 0, regs_len);
1025 	set_memory_based_intr(regs, hwe);
1026 	xe_map_memcpy_to(gt_to_xe(gt), &map, 0, regs, regs_len);
1027 }
1028 
1029 static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
1030 {
1031 	u64 desc = xe_vm_pdp4_descriptor(vm, gt_to_tile(lrc->gt));
1032 
1033 	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
1034 	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
1035 }
1036 
1037 static void xe_lrc_finish(struct xe_lrc *lrc)
1038 {
1039 	xe_hw_fence_ctx_finish(&lrc->fence_ctx);
1040 	xe_bo_unpin_map_no_vm(lrc->bo);
1041 	xe_bo_unpin_map_no_vm(lrc->seqno_bo);
1042 }
1043 
1044 /*
1045  * setup_utilization_wa() - Write commands to the WA BB to assist
1046  * in calculating active context run ticks.
1047  *
1048  * Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the
1049  * context, but only gets updated when the context switches out. In order to
1050  * check how long a context has been active before it switches out, two things
1051  * are required:
1052  *
1053  * (1) Determine if the context is running:
1054  * To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in
1055  * the LRC. The value chosen is 1 since 0 is the initial value when the LRC is
1056  * initialized. During a query, we just check for this value to determine if the
1057  * context is active. If the context switched out, it would overwrite this
1058  * location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as
1059  * the last part of context restore, so reusing this LRC location will not
1060  * clobber anything.
1061  *
1062  * (2) Calculate the time that the context has been active for:
1063  * The CTX_TIMESTAMP ticks only when the context is active. If a context is
1064  * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization.
1065  * While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific
1066  * engine instance. Since we do not know which instance the context is running
1067  * on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and
1068  * store it in the PPHWSP.
1069  */
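
/*
 * A minimal sketch of the query side, assuming a hypothetical caller
 * that only needs the two facts above (CONTEXT_ACTIVE is defined just
 * below):
 *
 *	u64 ts = xe_lrc_ctx_timestamp(lrc);
 *
 *	if (ts == CONTEXT_ACTIVE)
 *		ts = read_engine_ctx_timestamp_mmio(lrc);
 *
 * where read_engine_ctx_timestamp_mmio() is a made-up helper that
 * samples RING_CTX_TIMESTAMP on the engine instance recorded by the
 * WA BB at __xe_lrc_engine_id_offset(); otherwise ts already holds the
 * run ticks accumulated on context save.
 */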
1070 #define CONTEXT_ACTIVE 1ULL
1071 static ssize_t setup_utilization_wa(struct xe_lrc *lrc,
1072 				    struct xe_hw_engine *hwe,
1073 				    u32 *batch,
1074 				    size_t max_len)
1075 {
1076 	u32 *cmd = batch;
1077 
1078 	if (IS_SRIOV_VF(gt_to_xe(lrc->gt)))
1079 		return 0;
1080 
1081 	if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
1082 		return -ENOSPC;
1083 
1084 	*cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET;
1085 	*cmd++ = ENGINE_ID(0).addr;
1086 	*cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc);
1087 	*cmd++ = 0;
1088 
1089 	*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
1090 	*cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
1091 	*cmd++ = 0;
1092 	*cmd++ = lower_32_bits(CONTEXT_ACTIVE);
1093 
1094 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp) {
1095 		*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
1096 		*cmd++ = __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
1097 		*cmd++ = 0;
1098 		*cmd++ = upper_32_bits(CONTEXT_ACTIVE);
1099 	}
1100 
1101 	return cmd - batch;
1102 }
1103 
1104 static ssize_t setup_timestamp_wa(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
1105 				  u32 *batch, size_t max_len)
1106 {
1107 	const u32 ts_addr = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
1108 	u32 *cmd = batch;
1109 
1110 	if (!XE_GT_WA(lrc->gt, 16010904313) ||
1111 	    !(hwe->class == XE_ENGINE_CLASS_RENDER ||
1112 	      hwe->class == XE_ENGINE_CLASS_COMPUTE ||
1113 	      hwe->class == XE_ENGINE_CLASS_COPY ||
1114 	      hwe->class == XE_ENGINE_CLASS_VIDEO_DECODE ||
1115 	      hwe->class == XE_ENGINE_CLASS_VIDEO_ENHANCE))
1116 		return 0;
1117 
1118 	if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
1119 		return -ENOSPC;
1120 
1121 	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
1122 		 MI_LRM_ASYNC;
1123 	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
1124 	*cmd++ = ts_addr;
1125 	*cmd++ = 0;
1126 
1127 	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
1128 		 MI_LRM_ASYNC;
1129 	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
1130 	*cmd++ = ts_addr;
1131 	*cmd++ = 0;
1132 
1133 	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO;
1134 	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
1135 	*cmd++ = ts_addr;
1136 	*cmd++ = 0;
1137 
1138 	return cmd - batch;
1139 }
1140 
1141 static ssize_t setup_configfs_post_ctx_restore_bb(struct xe_lrc *lrc,
1142 						  struct xe_hw_engine *hwe,
1143 						  u32 *batch, size_t max_len)
1144 {
1145 	struct xe_device *xe = gt_to_xe(lrc->gt);
1146 	const u32 *user_batch;
1147 	u32 *cmd = batch;
1148 	u32 count;
1149 
1150 	count = xe_configfs_get_ctx_restore_post_bb(to_pci_dev(xe->drm.dev),
1151 						    hwe->class, &user_batch);
1152 	if (!count)
1153 		return 0;
1154 
1155 	if (count > max_len)
1156 		return -ENOSPC;
1157 
1158 	/*
1159 	 * This should be used only for tests and validation. Taint the kernel
1160 	 * as anything could be submitted directly in context switches
1161 	 */
1162 	add_taint(TAINT_TEST, LOCKDEP_STILL_OK);
1163 
1164 	memcpy(cmd, user_batch, count * sizeof(u32));
1165 	cmd += count;
1166 
1167 	return cmd - batch;
1168 }
1169 
1170 static ssize_t setup_configfs_mid_ctx_restore_bb(struct xe_lrc *lrc,
1171 						 struct xe_hw_engine *hwe,
1172 						 u32 *batch, size_t max_len)
1173 {
1174 	struct xe_device *xe = gt_to_xe(lrc->gt);
1175 	const u32 *user_batch;
1176 	u32 *cmd = batch;
1177 	u32 count;
1178 
1179 	count = xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
1180 						   hwe->class, &user_batch);
1181 	if (!count)
1182 		return 0;
1183 
1184 	if (count > max_len)
1185 		return -ENOSPC;
1186 
1187 	/*
1188 	 * This should be used only for tests and validation. Taint the kernel
1189 	 * as anything could be submitted directly in context switches
1190 	 */
1191 	add_taint(TAINT_TEST, LOCKDEP_STILL_OK);
1192 
1193 	memcpy(cmd, user_batch, count * sizeof(u32));
1194 	cmd += count;
1195 
1196 	return cmd - batch;
1197 }
1198 
1199 static ssize_t setup_invalidate_state_cache_wa(struct xe_lrc *lrc,
1200 					       struct xe_hw_engine *hwe,
1201 					       u32 *batch, size_t max_len)
1202 {
1203 	u32 *cmd = batch;
1204 
1205 	if (!XE_GT_WA(lrc->gt, 18022495364) ||
1206 	    hwe->class != XE_ENGINE_CLASS_RENDER)
1207 		return 0;
1208 
1209 	if (xe_gt_WARN_ON(lrc->gt, max_len < 3))
1210 		return -ENOSPC;
1211 
1212 	*cmd++ = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1);
1213 	*cmd++ = CS_DEBUG_MODE2(0).addr;
1214 	*cmd++ = _MASKED_BIT_ENABLE(INSTRUCTION_STATE_CACHE_INVALIDATE);
1215 
1216 	return cmd - batch;
1217 }
1218 
1219 struct bo_setup {
1220 	ssize_t (*setup)(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
1221 			 u32 *batch, size_t max_size);
1222 };
1223 
1224 struct bo_setup_state {
1225 	/* Input: */
1226 	struct xe_lrc		*lrc;
1227 	struct xe_hw_engine	*hwe;
1228 	size_t			max_size;
1229 	size_t                  reserve_dw;
1230 	unsigned int		offset;
1231 	const struct bo_setup	*funcs;
1232 	unsigned int		num_funcs;
1233 
1234 	/* State: */
1235 	u32			*buffer;
1236 	u32			*ptr;
1237 	unsigned int		written;
1238 };
1239 
1240 static int setup_bo(struct bo_setup_state *state)
1241 {
1242 	ssize_t remain;
1243 
1244 	if (state->lrc->bo->vmap.is_iomem) {
1245 		xe_gt_assert(state->hwe->gt, state->buffer);
1246 		state->ptr = state->buffer;
1247 	} else {
1248 		state->ptr = state->lrc->bo->vmap.vaddr + state->offset;
1249 	}
1250 
1251 	remain = state->max_size / sizeof(u32);
1252 
1253 	for (size_t i = 0; i < state->num_funcs; i++) {
1254 		ssize_t len = state->funcs[i].setup(state->lrc, state->hwe,
1255 						    state->ptr, remain);
1256 
1257 		remain -= len;
1258 
1259 		/*
1260 		 * Caller has asked for at least reserve_dw to remain unused.
1261 		 */
1262 		if (len < 0 ||
1263 		    xe_gt_WARN_ON(state->lrc->gt, remain < state->reserve_dw))
1264 			goto fail;
1265 
1266 		state->ptr += len;
1267 		state->written += len;
1268 	}
1269 
1270 	return 0;
1271 
1272 fail:
1273 	return -ENOSPC;
1274 }
1275 
1276 static void finish_bo(struct bo_setup_state *state)
1277 {
1278 	if (!state->lrc->bo->vmap.is_iomem)
1279 		return;
1280 
1281 	xe_map_memcpy_to(gt_to_xe(state->lrc->gt), &state->lrc->bo->vmap,
1282 			 state->offset, state->buffer,
1283 			 state->written * sizeof(u32));
1284 }
1285 
1286 /**
1287  * xe_lrc_setup_wa_bb_with_scratch() - Execute all WA BB setup callbacks.
1288  * @lrc: the &xe_lrc struct instance
1289  * @hwe: the &xe_hw_engine struct instance
1290  * @scratch: preallocated scratch buffer for temporary storage
1291  * Return: 0 on success, negative error code on failure
1292  */
1293 int xe_lrc_setup_wa_bb_with_scratch(struct xe_lrc *lrc, struct xe_hw_engine *hwe, u32 *scratch)
1294 {
1295 	static const struct bo_setup funcs[] = {
1296 		{ .setup = setup_timestamp_wa },
1297 		{ .setup = setup_invalidate_state_cache_wa },
1298 		{ .setup = setup_utilization_wa },
1299 		{ .setup = setup_configfs_post_ctx_restore_bb },
1300 	};
1301 	struct bo_setup_state state = {
1302 		.lrc = lrc,
1303 		.hwe = hwe,
1304 		.max_size = LRC_WA_BB_SIZE,
1305 		.buffer = scratch,
1306 		.reserve_dw = 1,
1307 		.offset = __xe_lrc_wa_bb_offset(lrc),
1308 		.funcs = funcs,
1309 		.num_funcs = ARRAY_SIZE(funcs),
1310 	};
1311 	int ret;
1312 
1313 	ret = setup_bo(&state);
1314 	if (ret)
1315 		return ret;
1316 
1317 	*state.ptr++ = MI_BATCH_BUFFER_END;
1318 	state.written++;
1319 
1320 	finish_bo(&state);
1321 
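	/*
	 * Bit 0 of BB_PER_CTX_PTR acts as the valid bit on current
	 * hardware, hence the "+ 1" folded into the GGTT address below.
	 */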
1322 	xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
1323 			     xe_bo_ggtt_addr(lrc->bo) + state.offset + 1);
1324 
1325 	return 0;
1326 }
1327 
1328 static int setup_wa_bb(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
1329 {
1330 	u32 *buf = NULL;
1331 	int ret;
1332 
1333 	if (lrc->bo->vmap.is_iomem) {
1334 		buf = kmalloc(LRC_WA_BB_SIZE, GFP_KERNEL);
1335 		if (!buf)
1336 			return -ENOMEM;
1337 	}
1338 
1339 	ret = xe_lrc_setup_wa_bb_with_scratch(lrc, hwe, buf);
1340 
1341 	kfree(buf);
1342 
1343 	return ret;
1344 }
1345 
1346 static int
1347 setup_indirect_ctx(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
1348 {
1349 	static const struct bo_setup rcs_funcs[] = {
1350 		{ .setup = setup_timestamp_wa },
1351 		{ .setup = setup_configfs_mid_ctx_restore_bb },
1352 	};
1353 	static const struct bo_setup xcs_funcs[] = {
1354 		{ .setup = setup_configfs_mid_ctx_restore_bb },
1355 	};
1356 	struct bo_setup_state state = {
1357 		.lrc = lrc,
1358 		.hwe = hwe,
1359 		.max_size = (63 * 64) /* max 63 cachelines */,
1360 		.buffer = NULL,
1361 		.offset = __xe_lrc_indirect_ctx_offset(lrc),
1362 	};
1363 	int ret;
1364 
1365 	if (!(lrc->flags & XE_LRC_FLAG_INDIRECT_CTX))
1366 		return 0;
1367 
1368 	if (hwe->class == XE_ENGINE_CLASS_RENDER ||
1369 	    hwe->class == XE_ENGINE_CLASS_COMPUTE) {
1370 		state.funcs = rcs_funcs;
1371 		state.num_funcs = ARRAY_SIZE(rcs_funcs);
1372 	} else {
1373 		state.funcs = xcs_funcs;
1374 		state.num_funcs = ARRAY_SIZE(xcs_funcs);
1375 	}
1376 
1377 	if (xe_gt_WARN_ON(lrc->gt, !state.funcs))
1378 		return 0;
1379 
1380 	if (lrc->bo->vmap.is_iomem) {
1381 		state.buffer = kmalloc(state.max_size, GFP_KERNEL);
1382 		if (!state.buffer)
1383 			return -ENOMEM;
1384 	}
1385 
1386 	ret = setup_bo(&state);
1387 	if (ret) {
1388 		kfree(state.buffer);
1389 		return ret;
1390 	}
1391 
1392 	/*
1393 	 * Align to 64B cacheline so there's no garbage at the end for CS to
1394 	 * execute: size for indirect ctx must be a multiple of 64.
1395 	 */
1396 	while (state.written & 0xf) {
1397 		*state.ptr++ = MI_NOOP;
1398 		state.written++;
1399 	}
1400 
1401 	finish_bo(&state);
1402 	kfree(state.buffer);
1403 
1404 	/*
1405 	 * Enable INDIRECT_CTX leaving INDIRECT_CTX_OFFSET at its default: it
1406 	 * varies per engine class, but the default is good enough
1407 	 */
1408 	xe_lrc_write_ctx_reg(lrc,
1409 			     CTX_CS_INDIRECT_CTX,
1410 			     (xe_bo_ggtt_addr(lrc->bo) + state.offset) |
1411 			     /* Size in CLs. */
1412 			     (state.written * sizeof(u32) / 64));
1413 
1414 	return 0;
1415 }
1416 
1417 static u8 xe_multi_queue_prio_to_lrc(struct xe_lrc *lrc, enum xe_multi_queue_priority priority)
1418 {
1419 	struct xe_device *xe = gt_to_xe(lrc->gt);
1420 
1421 	xe_assert(xe, (priority >= XE_MULTI_QUEUE_PRIORITY_LOW &&
1422 		       priority <= XE_MULTI_QUEUE_PRIORITY_HIGH));
1423 
1424 	/* xe_multi_queue_priority is directly mapped to LRC priority values */
1425 	return priority;
1426 }
1427 
1428 /**
1429  * xe_lrc_set_multi_queue_priority() - Set multi queue priority in LRC
1430  * @lrc: Logical Ring Context
1431  * @priority: Multi queue priority of the exec queue
1432  *
1433  * Convert @priority to LRC multi queue priority and update the @lrc descriptor
1434  */
1435 void xe_lrc_set_multi_queue_priority(struct xe_lrc *lrc, enum xe_multi_queue_priority priority)
1436 {
1437 	lrc->desc &= ~LRC_PRIORITY;
1438 	lrc->desc |= FIELD_PREP(LRC_PRIORITY, xe_multi_queue_prio_to_lrc(lrc, priority));
1439 }
1440 
1441 static int xe_lrc_ctx_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
1442 			   void *replay_state, u16 msix_vec, u32 init_flags)
1443 {
1444 	struct xe_gt *gt = hwe->gt;
1445 	struct xe_tile *tile = gt_to_tile(gt);
1446 	struct xe_device *xe = gt_to_xe(gt);
1447 	struct iosys_map map;
1448 	u32 arb_enable;
1449 	int err;
1450 
1451 	/*
1452 	 * Init the Per-Process HW Status Page and LRC / context state to
1453 	 * known values. If a primed default_lrc already exists, just copy
1454 	 * it; otherwise this is the early submission used to record the
1455 	 * default LRC, so build a new empty one from scratch.
1456 	 */
1457 	map = __xe_lrc_pphwsp_map(lrc);
1458 	if (gt->default_lrc[hwe->class] || replay_state) {
1459 		xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE);	/* PPHWSP */
1460 		xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
1461 				 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
1462 				 lrc->size - LRC_PPHWSP_SIZE);
1463 		if (replay_state)
1464 			xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
1465 					 replay_state, lrc->replay_size);
1466 	} else {
1467 		void *init_data = empty_lrc_data(hwe);
1468 
1469 		if (!init_data)
1470 			return -ENOMEM;
1472 
1473 		xe_map_memcpy_to(xe, &map, 0, init_data, lrc->size);
1474 		kfree(init_data);
1475 	}
1476 
1477 	if (vm)
1478 		xe_lrc_set_ppgtt(lrc, vm);
1479 
1480 	if (xe_device_has_msix(xe)) {
1481 		xe_lrc_write_ctx_reg(lrc, CTX_INT_STATUS_REPORT_PTR,
1482 				     xe_memirq_status_ptr(&tile->memirq, hwe));
1483 		xe_lrc_write_ctx_reg(lrc, CTX_INT_SRC_REPORT_PTR,
1484 				     xe_memirq_source_ptr(&tile->memirq, hwe));
1485 		xe_lrc_write_ctx_reg(lrc, CTX_CS_INT_VEC_DATA, msix_vec << 16 | msix_vec);
1486 	}
1487 
1488 	if (xe_gt_has_indirect_ring_state(gt)) {
1489 		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
1490 				     __xe_lrc_indirect_ring_ggtt_addr(lrc));
1491 
1492 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
1493 					      __xe_lrc_ring_ggtt_addr(lrc));
1494 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START_UDW, 0);
1495 
1496 		/* Match head and tail pointers */
1497 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, lrc->ring.tail);
1498 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, lrc->ring.tail);
1499 
1500 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_CTL,
1501 					      RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
1502 	} else {
1503 		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
1504 
1505 		/* Match head and tail pointers */
1506 		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, lrc->ring.tail);
1507 		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);
1508 
1509 		xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
1510 				     RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
1511 	}
1512 
1513 	if (init_flags & XE_LRC_CREATE_RUNALONE)
1514 		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
1515 				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
1516 				     _MASKED_BIT_ENABLE(CTX_CTRL_RUN_ALONE));
1517 
1518 	if (init_flags & XE_LRC_CREATE_PXP)
1519 		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
1520 				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
1521 				     _MASKED_BIT_ENABLE(CTX_CTRL_PXP_ENABLE));
1522 
1523 	lrc->ctx_timestamp = 0;
1524 	xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0);
1525 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
1526 		xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP_UDW, 0);
1527 
1528 	if (xe->info.has_asid && vm)
1529 		xe_lrc_write_ctx_reg(lrc, CTX_ASID, vm->usm.asid);
1530 
1531 	lrc->desc = LRC_VALID;
1532 	lrc->desc |= FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT);
1533 	/* TODO: Priority */
1534 
1535 	/* While this appears to have something about privileged batches or
1536 	 * some such, it really just means PPGTT mode.
1537 	 */
1538 	if (vm)
1539 		lrc->desc |= LRC_PRIVILEGE;
1540 
1541 	if (GRAPHICS_VERx100(xe) < 1250) {
1542 		lrc->desc |= FIELD_PREP(LRC_ENGINE_INSTANCE, hwe->instance);
1543 		lrc->desc |= FIELD_PREP(LRC_ENGINE_CLASS, hwe->class);
1544 	}
1545 
1546 	arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1547 	xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));
1548 
1549 	map = __xe_lrc_seqno_map(lrc);
1550 	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
1551 
1552 	map = __xe_lrc_start_seqno_map(lrc);
1553 	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
1554 
1555 	err = setup_wa_bb(lrc, hwe);
1556 	if (err)
1557 		return err;
1558 
1559 	err = setup_indirect_ctx(lrc, hwe);
1560 
1561 	return err;
1562 }
1563 
1564 static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
1565 		       void *replay_state, u32 ring_size, u16 msix_vec, u32 init_flags)
1566 {
1567 	struct xe_gt *gt = hwe->gt;
1568 	const u32 lrc_size = xe_gt_lrc_size(gt, hwe->class);
1569 	u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
1570 	struct xe_tile *tile = gt_to_tile(gt);
1571 	struct xe_device *xe = gt_to_xe(gt);
1572 	struct xe_bo *bo;
1573 	u32 bo_flags;
1574 	int err;
1575 
1576 	kref_init(&lrc->refcount);
1577 	lrc->gt = gt;
1578 	lrc->replay_size = xe_gt_lrc_hang_replay_size(gt, hwe->class);
1579 	lrc->size = lrc_size;
1580 	lrc->flags = 0;
1581 	lrc->ring.size = ring_size;
1582 	lrc->ring.tail = 0;
1583 
1584 	if (gt_engine_needs_indirect_ctx(gt, hwe->class)) {
1585 		lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
1586 		bo_size += LRC_INDIRECT_CTX_BO_SIZE;
1587 	}
1588 
1589 	if (xe_gt_has_indirect_ring_state(gt))
1590 		lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;
1591 
1592 	bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
1593 		   XE_BO_FLAG_GGTT_INVALIDATE;
1594 
1595 	if ((vm && vm->xef) || init_flags & XE_LRC_CREATE_USER_CTX) /* userspace */
1596 		bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE | XE_BO_FLAG_FORCE_USER_VRAM;
1597 
1598 	bo = xe_bo_create_pin_map_novm(xe, tile, bo_size,
1599 				       ttm_bo_type_kernel,
1600 				       bo_flags, false);
1601 	if (IS_ERR(bo))
1602 		return PTR_ERR(bo);
1603 
1604 	lrc->bo = bo;
1605 
1606 	bo = xe_bo_create_pin_map_novm(xe, tile, PAGE_SIZE,
1607 				       ttm_bo_type_kernel,
1608 				       XE_BO_FLAG_GGTT |
1609 				       XE_BO_FLAG_GGTT_INVALIDATE |
1610 				       XE_BO_FLAG_SYSTEM, false);
1611 	if (IS_ERR(bo)) {
1612 		err = PTR_ERR(bo);
1613 		goto err_lrc_finish;
1614 	}
1615 	lrc->seqno_bo = bo;
1616 
1617 	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
1618 			     hwe->fence_irq, hwe->name);
1619 
1620 	err = xe_lrc_ctx_init(lrc, hwe, vm, replay_state, msix_vec, init_flags);
1621 	if (err)
1622 		goto err_lrc_finish;
1623 
1624 	if (vm && vm->xef)
1625 		xe_drm_client_add_bo(vm->xef->client, lrc->bo);
1626 
1627 	return 0;
1628 
1629 err_lrc_finish:
1630 	xe_lrc_finish(lrc);
1631 	return err;
1632 }
1633 
1634 /**
1635  * xe_lrc_create - Create a LRC
1636  * @hwe: Hardware Engine
1637  * @vm: The VM (address space)
1638  * @replay_state: GPU hang replay state
1639  * @ring_size: LRC ring size
1640  * @msix_vec: MSI-X interrupt vector (for platforms that support it)
1641  * @flags: LRC initialization flags
1642  *
1643  * Allocate and initialize the Logical Ring Context (LRC).
1644  *
1645  * Return pointer to created LRC upon success and an error pointer
1646  * upon failure.
1647  */
1648 struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
1649 			     void *replay_state, u32 ring_size, u16 msix_vec, u32 flags)
1650 {
1651 	struct xe_lrc *lrc;
1652 	int err;
1653 
1654 	lrc = kzalloc(sizeof(*lrc), GFP_KERNEL);
1655 	if (!lrc)
1656 		return ERR_PTR(-ENOMEM);
1657 
1658 	err = xe_lrc_init(lrc, hwe, vm, replay_state, ring_size, msix_vec, flags);
1659 	if (err) {
1660 		kfree(lrc);
1661 		return ERR_PTR(err);
1662 	}
1663 
1664 	return lrc;
1665 }
1666 
1667 /**
1668  * xe_lrc_destroy - Destroy the LRC
1669  * @ref: reference to LRC
1670  *
1671  * Called when ref == 0, release resources held by the Logical Ring Context
1672  * (LRC) and free the LRC memory.
1673  */
1674 void xe_lrc_destroy(struct kref *ref)
1675 {
1676 	struct xe_lrc *lrc = container_of(ref, struct xe_lrc, refcount);
1677 
1678 	xe_lrc_finish(lrc);
1679 	kfree(lrc);
1680 }
1681 
1682 /**
1683  * xe_lrc_update_hwctx_regs_with_address - Re-compute GGTT references within given LRC.
1684  * @lrc: the &xe_lrc struct instance
1685  */
1686 void xe_lrc_update_hwctx_regs_with_address(struct xe_lrc *lrc)
1687 {
1688 	if (xe_lrc_has_indirect_ring_state(lrc)) {
1689 		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
1690 				     __xe_lrc_indirect_ring_ggtt_addr(lrc));
1691 
1692 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
1693 					      __xe_lrc_ring_ggtt_addr(lrc));
1694 	} else {
1695 		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
1696 	}
1697 }
1698 
1699 void xe_lrc_set_ring_tail(struct xe_lrc *lrc, u32 tail)
1700 {
1701 	if (xe_lrc_has_indirect_ring_state(lrc))
1702 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, tail);
1703 	else
1704 		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, tail);
1705 }
1706 
1707 u32 xe_lrc_ring_tail(struct xe_lrc *lrc)
1708 {
1709 	if (xe_lrc_has_indirect_ring_state(lrc))
1710 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL) & TAIL_ADDR;
1711 	else
1712 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL) & TAIL_ADDR;
1713 }
1714 
1715 static u32 xe_lrc_ring_start(struct xe_lrc *lrc)
1716 {
1717 	if (xe_lrc_has_indirect_ring_state(lrc))
1718 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START);
1719 	else
1720 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_START);
1721 }
1722 
1723 void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
1724 {
1725 	if (xe_lrc_has_indirect_ring_state(lrc))
1726 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, head);
1727 	else
1728 		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
1729 }
1730 
1731 u32 xe_lrc_ring_head(struct xe_lrc *lrc)
1732 {
1733 	if (xe_lrc_has_indirect_ring_state(lrc))
1734 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD) & HEAD_ADDR;
1735 	else
1736 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
1737 }
1738 
1739 u32 xe_lrc_ring_space(struct xe_lrc *lrc)
1740 {
1741 	const u32 head = xe_lrc_ring_head(lrc);
1742 	const u32 tail = lrc->ring.tail;
1743 	const u32 size = lrc->ring.size;
1744 
1745 	return ((head - tail - 1) & (size - 1)) + 1;
1746 }
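
/*
 * The expression above is standard power-of-two ring accounting: e.g.
 * with size = 0x1000, head = 0x100 and tail = 0x800, the free space is
 * ((0x100 - 0x800 - 1) & 0xfff) + 1 = 0x900 bytes, and head == tail
 * (empty ring) reports the full size as free.
 */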
1747 
1748 static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
1749 				const void *data, size_t size)
1750 {
1751 	struct xe_device *xe = lrc_to_xe(lrc);
1752 
1753 	iosys_map_incr(&ring, lrc->ring.tail);
1754 	xe_map_memcpy_to(xe, &ring, 0, data, size);
1755 	lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
1756 }
1757 
1758 void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
1759 {
1760 	struct xe_device *xe = lrc_to_xe(lrc);
1761 	struct iosys_map ring;
1762 	u32 rhs;
1763 	size_t aligned_size;
1764 
1765 	xe_assert(xe, IS_ALIGNED(size, 4));
1766 	aligned_size = ALIGN(size, 8);
1767 
1768 	ring = __xe_lrc_ring_map(lrc);
1769 
1770 	xe_assert(xe, lrc->ring.tail < lrc->ring.size);
1771 	rhs = lrc->ring.size - lrc->ring.tail;
1772 	if (size > rhs) {
1773 		__xe_lrc_write_ring(lrc, ring, data, rhs);
1774 		__xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
1775 	} else {
1776 		__xe_lrc_write_ring(lrc, ring, data, size);
1777 	}
1778 
1779 	if (aligned_size > size) {
1780 		u32 noop = MI_NOOP;
1781 
1782 		__xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
1783 	}
1784 }
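
/*
 * E.g. with ring.size = 4096 and ring.tail = 4080, writing 24 bytes
 * copies the first 16 bytes up to the end of the ring, wraps, copies
 * the remaining 8 bytes to offset 0 and leaves ring.tail = 8.
 */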
1785 
1786 u64 xe_lrc_descriptor(struct xe_lrc *lrc)
1787 {
1788 	return lrc->desc | xe_lrc_ggtt_addr(lrc);
1789 }
1790 
1791 u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
1792 {
1793 	return __xe_lrc_seqno_ggtt_addr(lrc);
1794 }
1795 
1796 /**
1797  * xe_lrc_alloc_seqno_fence() - Allocate an lrc seqno fence.
1798  *
1799  * Allocate but don't initialize an lrc seqno fence.
1800  *
1801  * Return: Pointer to the allocated fence or
1802  * negative error pointer on error.
1803  */
1804 struct dma_fence *xe_lrc_alloc_seqno_fence(void)
1805 {
1806 	return xe_hw_fence_alloc();
1807 }
1808 
1809 /**
1810  * xe_lrc_free_seqno_fence() - Free an lrc seqno fence.
1811  * @fence: Pointer to the fence to free.
1812  *
1813  * Frees an lrc seqno fence that hasn't yet been
1814  * initialized.
1815  */
1816 void xe_lrc_free_seqno_fence(struct dma_fence *fence)
1817 {
1818 	xe_hw_fence_free(fence);
1819 }
1820 
1821 /**
1822  * xe_lrc_init_seqno_fence() - Initialize an lrc seqno fence.
1823  * @lrc: Pointer to the lrc.
1824  * @fence: Pointer to the fence to initialize.
1825  *
1826  * Initializes a pre-allocated lrc seqno fence.
1827  * After initialization, the fence is subject to normal
1828  * dma-fence refcounting.
1829  */
1830 void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence)
1831 {
1832 	xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc));
1833 }
1834 
1835 s32 xe_lrc_seqno(struct xe_lrc *lrc)
1836 {
1837 	struct iosys_map map = __xe_lrc_seqno_map(lrc);
1838 
1839 	return xe_map_read32(lrc_to_xe(lrc), &map);
1840 }
1841 
1842 s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
1843 {
1844 	struct iosys_map map = __xe_lrc_start_seqno_map(lrc);
1845 
1846 	return xe_map_read32(lrc_to_xe(lrc), &map);
1847 }
1848 
1849 u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
1850 {
1851 	return __xe_lrc_start_seqno_ggtt_addr(lrc);
1852 }
1853 
1854 u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
1855 {
1856 	return __xe_lrc_parallel_ggtt_addr(lrc);
1857 }
1858 
1859 struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
1860 {
1861 	return __xe_lrc_parallel_map(lrc);
1862 }
1863 
1864 /**
1865  * xe_lrc_engine_id() - Read engine id value
1866  * @lrc: Pointer to the lrc.
1867  *
1868  * Returns: engine id value
1869  */
1870 static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
1871 {
1872 	struct xe_device *xe = lrc_to_xe(lrc);
1873 	struct iosys_map map;
1874 
1875 	map = __xe_lrc_engine_id_map(lrc);
1876 	return xe_map_read32(xe, &map);
1877 }
1878 
1879 static int instr_dw(u32 cmd_header)
1880 {
1881 	/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
1882 	if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
1883 	    GFXPIPE_SINGLE_DW_CMD(0, 0))
1884 		return 1;
1885 
1886 	/* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
1887 	if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
1888 		return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;
1889 
1890 	/* Most instructions have the # of dwords (minus 2) in 7:0 */
1891 	return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
1892 }
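
/*
 * For example, the canonical 6-dword PIPE_CONTROL header 0x7a000004
 * falls through to the last case: XE_INSTR_LEN_MASK extracts 4 from
 * bits 7:0, giving 4 + 2 = 6 dwords.
 */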
1893 
1894 static int dump_mi_command(struct drm_printer *p,
1895 			   struct xe_gt *gt,
1896 			   u32 *dw,
1897 			   int remaining_dw)
1898 {
1899 	u32 inst_header = *dw;
1900 	u32 numdw = instr_dw(inst_header);
1901 	u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
1902 	int num_noop;
1903 
1904 	/* First check for commands that don't have/use a '# DW' field */
1905 	switch (inst_header & MI_OPCODE) {
1906 	case MI_NOOP:
1907 		num_noop = 1;
1908 		while (num_noop < remaining_dw &&
1909 		       (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
1910 			num_noop++;
1911 		drm_printf(p, "[%#010x] MI_NOOP (%d dwords)\n", inst_header, num_noop);
1912 		return num_noop;
1913 
1914 	case MI_TOPOLOGY_FILTER:
1915 		drm_printf(p, "[%#010x] MI_TOPOLOGY_FILTER\n", inst_header);
1916 		return 1;
1917 
1918 	case MI_BATCH_BUFFER_END:
1919 		drm_printf(p, "[%#010x] MI_BATCH_BUFFER_END\n", inst_header);
1920 		/* Return 'remaining_dw' to consume the rest of the LRC */
1921 		return remaining_dw;
1922 	}
1923 
1924 	/*
1925 	 * Any remaining commands include a # of dwords.  We should make sure
1926 	 * it doesn't exceed the remaining size of the LRC.
1927 	 */
1928 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
1929 		numdw = remaining_dw;
1930 
1931 	switch (inst_header & MI_OPCODE) {
1932 	case MI_LOAD_REGISTER_IMM:
1933 		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
1934 			   inst_header, (numdw - 1) / 2);
1935 		for (int i = 1; i < numdw; i += 2)
1936 			drm_printf(p, " - %#6x = %#010x\n", dw[i], dw[i + 1]);
1937 		return numdw;
1938 
1939 	case MI_LOAD_REGISTER_MEM & MI_OPCODE:
1940 		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_MEM: %s%s\n",
1941 			   inst_header,
1942 			   dw[0] & MI_LRI_LRM_CS_MMIO ? "CS_MMIO " : "",
1943 			   dw[0] & MI_LRM_USE_GGTT ? "USE_GGTT " : "");
1944 		if (numdw == 4)
1945 			drm_printf(p, " - %#6x = %#010llx\n",
1946 				   dw[1], ((u64)(dw[3]) << 32 | (u64)(dw[2])));
1947 		else
1948 			drm_printf(p, " - %*ph (%s)\n",
1949 				   (int)sizeof(u32) * (numdw - 1), dw + 1,
1950 				   numdw < 4 ? "truncated" : "malformed");
1951 		return numdw;
1952 
1953 	case MI_FORCE_WAKEUP:
1954 		drm_printf(p, "[%#010x] MI_FORCE_WAKEUP\n", inst_header);
1955 		return numdw;
1956 
1957 	default:
1958 		drm_printf(p, "[%#010x] unknown MI opcode %#x, likely %d dwords\n",
1959 			   inst_header, opcode, numdw);
1960 		return numdw;
1961 	}
1962 }
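/*
 * Decode note for the MI_LOAD_REGISTER_MEM case above: a well-formed,
 * 4-dword instruction prints its flags followed by the register offset from
 * dw[1] and the 64-bit address reassembled from dw[2] (low dword) and dw[3]
 * (high dword); any other length is dumped raw and labeled "truncated" or
 * "malformed".
 */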
1963 
1964 static int dump_gfxpipe_command(struct drm_printer *p,
1965 				struct xe_gt *gt,
1966 				u32 *dw,
1967 				int remaining_dw)
1968 {
1969 	u32 numdw = instr_dw(*dw);
1970 	u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
1971 	u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
1972 	u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);
1973 
1974 	/*
1975 	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
1976 	 * remaining size of the LRC.
1977 	 */
1978 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
1979 		numdw = remaining_dw;
1980 
1981 	switch (*dw & GFXPIPE_MATCH_MASK) {
1982 #define MATCH(cmd) \
1983 	case cmd: \
1984 		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
1985 		return numdw
1986 #define MATCH3D(cmd) \
1987 	case CMD_##cmd: \
1988 		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
1989 		return numdw
1990 
1991 	MATCH(STATE_BASE_ADDRESS);
1992 	MATCH(STATE_SIP);
1993 	MATCH(GPGPU_CSR_BASE_ADDRESS);
1994 	MATCH(STATE_COMPUTE_MODE);
1995 	MATCH3D(3DSTATE_BTD);
1996 	MATCH(STATE_SYSTEM_MEM_FENCE_ADDRESS);
1997 	MATCH(STATE_CONTEXT_DATA_BASE_ADDRESS);
1998 
1999 	MATCH3D(3DSTATE_VF_STATISTICS);
2000 
2001 	MATCH(PIPELINE_SELECT);
2002 
2003 	MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST);
2004 	MATCH3D(3DSTATE_CUSTOM_SAMPLE_PATTERN);
2005 	MATCH3D(3DSTATE_CLEAR_PARAMS);
2006 	MATCH3D(3DSTATE_DEPTH_BUFFER);
2007 	MATCH3D(3DSTATE_STENCIL_BUFFER);
2008 	MATCH3D(3DSTATE_HIER_DEPTH_BUFFER);
2009 	MATCH3D(3DSTATE_VERTEX_BUFFERS);
2010 	MATCH3D(3DSTATE_VERTEX_ELEMENTS);
2011 	MATCH3D(3DSTATE_INDEX_BUFFER);
2012 	MATCH3D(3DSTATE_VF);
2013 	MATCH3D(3DSTATE_MULTISAMPLE);
2014 	MATCH3D(3DSTATE_CC_STATE_POINTERS);
2015 	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS);
2016 	MATCH3D(3DSTATE_VS);
2017 	MATCH3D(3DSTATE_GS);
2018 	MATCH3D(3DSTATE_CLIP);
2019 	MATCH3D(3DSTATE_SF);
2020 	MATCH3D(3DSTATE_WM);
2021 	MATCH3D(3DSTATE_CONSTANT_VS);
2022 	MATCH3D(3DSTATE_CONSTANT_GS);
2023 	MATCH3D(3DSTATE_CONSTANT_PS);
2024 	MATCH3D(3DSTATE_SAMPLE_MASK);
2025 	MATCH3D(3DSTATE_CONSTANT_HS);
2026 	MATCH3D(3DSTATE_CONSTANT_DS);
2027 	MATCH3D(3DSTATE_HS);
2028 	MATCH3D(3DSTATE_TE);
2029 	MATCH3D(3DSTATE_DS);
2030 	MATCH3D(3DSTATE_STREAMOUT);
2031 	MATCH3D(3DSTATE_SBE);
2032 	MATCH3D(3DSTATE_PS);
2033 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
2034 	MATCH3D(3DSTATE_CPS_POINTERS);
2035 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC);
2036 	MATCH3D(3DSTATE_BLEND_STATE_POINTERS);
2037 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS);
2038 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS);
2039 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS);
2040 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS);
2041 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS);
2042 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS);
2043 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS);
2044 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS);
2045 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS);
2046 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS);
2047 	MATCH3D(3DSTATE_VF_INSTANCING);
2048 	MATCH3D(3DSTATE_VF_SGVS);
2049 	MATCH3D(3DSTATE_VF_TOPOLOGY);
2050 	MATCH3D(3DSTATE_WM_CHROMAKEY);
2051 	MATCH3D(3DSTATE_PS_BLEND);
2052 	MATCH3D(3DSTATE_WM_DEPTH_STENCIL);
2053 	MATCH3D(3DSTATE_PS_EXTRA);
2054 	MATCH3D(3DSTATE_RASTER);
2055 	MATCH3D(3DSTATE_SBE_SWIZ);
2056 	MATCH3D(3DSTATE_WM_HZ_OP);
2057 	MATCH3D(3DSTATE_VF_COMPONENT_PACKING);
2058 	MATCH3D(3DSTATE_VF_SGVS_2);
2059 	MATCH3D(3DSTATE_VFG);
2060 	MATCH3D(3DSTATE_URB_ALLOC_VS);
2061 	MATCH3D(3DSTATE_URB_ALLOC_HS);
2062 	MATCH3D(3DSTATE_URB_ALLOC_DS);
2063 	MATCH3D(3DSTATE_URB_ALLOC_GS);
2064 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_0);
2065 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_1);
2066 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_2);
2067 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_3);
2068 	MATCH3D(3DSTATE_PRIMITIVE_REPLICATION);
2069 	MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO);
2070 	MATCH3D(3DSTATE_AMFS);
2071 	MATCH3D(3DSTATE_DEPTH_BOUNDS);
2072 	MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS);
2073 	MATCH3D(3DSTATE_CONSTANT_TS_POINTER);
2074 	MATCH3D(3DSTATE_MESH_CONTROL);
2075 	MATCH3D(3DSTATE_MESH_DISTRIB);
2076 	MATCH3D(3DSTATE_TASK_REDISTRIB);
2077 	MATCH3D(3DSTATE_MESH_SHADER);
2078 	MATCH3D(3DSTATE_MESH_SHADER_DATA);
2079 	MATCH3D(3DSTATE_TASK_CONTROL);
2080 	MATCH3D(3DSTATE_TASK_SHADER);
2081 	MATCH3D(3DSTATE_TASK_SHADER_DATA);
2082 	MATCH3D(3DSTATE_URB_ALLOC_MESH);
2083 	MATCH3D(3DSTATE_URB_ALLOC_TASK);
2084 	MATCH3D(3DSTATE_CLIP_MESH);
2085 	MATCH3D(3DSTATE_SBE_MESH);
2086 	MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER);
2087 	MATCH3D(3DSTATE_COARSE_PIXEL);
2088 	MATCH3D(3DSTATE_MESH_SHADER_DATA_EXT);
2089 	MATCH3D(3DSTATE_TASK_SHADER_DATA_EXT);
2090 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC_2);
2091 	MATCH3D(3DSTATE_CC_STATE_POINTERS_2);
2092 	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS_2);
2093 	MATCH3D(3DSTATE_BLEND_STATE_POINTERS_2);
2094 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP_2);
2095 
2096 	MATCH3D(3DSTATE_DRAWING_RECTANGLE);
2097 	MATCH3D(3DSTATE_URB_MEMORY);
2098 	MATCH3D(3DSTATE_CHROMA_KEY);
2099 	MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET);
2100 	MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN);
2101 	MATCH3D(3DSTATE_LINE_STIPPLE);
2102 	MATCH3D(3DSTATE_AA_LINE_PARAMETERS);
2103 	MATCH3D(3DSTATE_MONOFILTER_SIZE);
2104 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS);
2105 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS);
2106 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS);
2107 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS);
2108 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS);
2109 	MATCH3D(3DSTATE_SO_DECL_LIST);
2110 	MATCH3D(3DSTATE_SO_BUFFER);
2111 	MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC);
2112 	MATCH3D(3DSTATE_SAMPLE_PATTERN);
2113 	MATCH3D(3DSTATE_3D_MODE);
2114 	MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE);
2115 	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS);
2116 	MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO);
2117 	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTER_2);
2118 
2119 	default:
2120 		drm_printf(p, "[%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n",
2121 			   *dw, pipeline, opcode, subopcode, numdw);
2122 		return numdw;
2123 	}
2124 }
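/*
 * The MATCH()/MATCH3D() macros above expand to plain case labels; e.g.
 * MATCH3D(3DSTATE_VS) becomes
 *
 *	case CMD_3DSTATE_VS:
 *		drm_printf(p, "[%#010x] 3DSTATE_VS (%d dwords)\n", *dw, numdw);
 *		return numdw;
 *
 * so every recognized command prints its name and consumes its full length.
 */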
2125 
2126 static int dump_gfx_state_command(struct drm_printer *p,
2127 				  struct xe_gt *gt,
2128 				  u32 *dw,
2129 				  int remaining_dw)
2130 {
2131 	u32 numdw = instr_dw(*dw);
2132 	u32 opcode = REG_FIELD_GET(GFX_STATE_OPCODE, *dw);
2133 
2134 	/*
2135 	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
2136 	 * remaining size of the LRC.
2137 	 */
2138 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
2139 		numdw = remaining_dw;
2140 
2141 	switch (*dw & (XE_INSTR_GFX_STATE | GFX_STATE_OPCODE)) {
2142 	MATCH(STATE_WRITE_INLINE);
2143 
2144 	default:
2145 		drm_printf(p, "[%#010x] unknown GFX_STATE command (opcode=%#x), likely %d dwords\n",
2146 			   *dw, opcode, numdw);
2147 		return numdw;
2148 	}
2149 }
2150 
2151 void xe_lrc_dump_default(struct drm_printer *p,
2152 			 struct xe_gt *gt,
2153 			 enum xe_engine_class hwe_class)
2154 {
2155 	u32 *dw;
2156 	int remaining_dw, num_dw;
2157 
2158 	if (!gt->default_lrc[hwe_class]) {
2159 		drm_printf(p, "No default LRC for class %d\n", hwe_class);
2160 		return;
2161 	}
2162 
2163 	/*
2164 	 * Skip the beginning of the LRC since it contains the per-process
2165 	 * hardware status page.
2166 	 */
2167 	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
2168 	remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;
2169 
2170 	while (remaining_dw > 0) {
2171 		if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
2172 			num_dw = dump_mi_command(p, gt, dw, remaining_dw);
2173 		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
2174 			num_dw = dump_gfxpipe_command(p, gt, dw, remaining_dw);
2175 		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFX_STATE) {
2176 			num_dw = dump_gfx_state_command(p, gt, dw, remaining_dw);
2177 		} else {
2178 			num_dw = min(instr_dw(*dw), remaining_dw);
2179 			drm_printf(p, "[%#10x] Unknown instruction of type %#x, likely %d dwords\n",
2180 			drm_printf(p, "[%#010x] Unknown instruction of type %#x, likely %d dwords\n",
2181 				   num_dw);
2182 		}
2183 
2184 		dw += num_dw;
2185 		remaining_dw -= num_dw;
2186 	}
2187 }
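/*
 * Example output (hypothetical excerpt; the actual contents depend on the
 * platform's default context image):
 *
 *	[0x11001011] MI_LOAD_REGISTER_IMM: 9 regs
 *	 - 0x2244 = 0x00090009
 *	 ...
 *	[0x05000000] MI_BATCH_BUFFER_END
 */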
2188 
2189 /*
2190  * Look up the value of a register within the offset/value pairs of an
2191  * MI_LOAD_REGISTER_IMM instruction.
2192  *
2193  * Return -ENOENT if the register is not present in the MI_LRI instruction.
2194  */
2195 static int lookup_reg_in_mi_lri(u32 offset, u32 *value,
2196 				const u32 *dword_pair, int num_regs)
2197 {
2198 	for (int i = 0; i < num_regs; i++) {
2199 		if (dword_pair[2 * i] == offset) {
2200 			*value = dword_pair[2 * i + 1];
2201 			return 0;
2202 		}
2203 	}
2204 
2205 	return -ENOENT;
2206 }
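/*
 * Illustrative layout (offsets/values invented for the example): an MI_LRI
 * loading two registers spans five dwords,
 *
 *	dw[0]: MI_LOAD_REGISTER_IMM header
 *	dw[1]: 0x2244 (offset)	dw[2]: 0x00000001 (value)
 *	dw[3]: 0x2248 (offset)	dw[4]: 0x00000080 (value)
 *
 * so lookup_reg_in_mi_lri(0x2248, &value, dw + 1, 2) stores 0x00000080 in
 * *value and returns 0.
 */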
2207 
2208 /*
2209  * Look up the value of a register in a specific engine type's default LRC.
2210  *
2211  * Return -EINVAL if the default LRC doesn't exist, or -ENOENT if the register
2212  * cannot be found in the default LRC.
2213  */
2214 int xe_lrc_lookup_default_reg_value(struct xe_gt *gt,
2215 				    enum xe_engine_class hwe_class,
2216 				    u32 offset,
2217 				    u32 *value)
2218 {
2219 	u32 *dw;
2220 	int remaining_dw, ret;
2221 
2222 	if (!gt->default_lrc[hwe_class])
2223 		return -EINVAL;
2224 
2225 	/*
2226 	 * Skip the beginning of the LRC since it contains the per-process
2227 	 * hardware status page.
2228 	 */
2229 	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
2230 	remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;
2231 
2232 	while (remaining_dw > 0) {
2233 		u32 num_dw = instr_dw(*dw);
2234 
2235 		if (num_dw > remaining_dw)
2236 			num_dw = remaining_dw;
2237 
2238 		switch (*dw & XE_INSTR_CMD_TYPE) {
2239 		case XE_INSTR_MI:
2240 			switch (*dw & MI_OPCODE) {
2241 			case MI_BATCH_BUFFER_END:
2242 				/* End of LRC; register not found */
2243 				return -ENOENT;
2244 
2245 			case MI_NOOP:
2246 			case MI_TOPOLOGY_FILTER:
2247 				/*
2248 				 * MI_NOOP and MI_TOPOLOGY_FILTER don't have
2249 				 * a length field and are always 1-dword
2250 				 * instructions.
2251 				 */
2252 				remaining_dw--;
2253 				dw++;
2254 				break;
2255 
2256 			case MI_LOAD_REGISTER_IMM:
2257 				ret = lookup_reg_in_mi_lri(offset, value,
2258 							   dw + 1, (num_dw - 1) / 2);
2259 				if (ret == 0)
2260 					return 0;
2261 
2262 				fallthrough;
2263 
2264 			default:
2265 				/*
2266 				 * Jump to next instruction based on length
2267 				 * field.
2268 				 */
2269 				remaining_dw -= num_dw;
2270 				dw += num_dw;
2271 				break;
2272 			}
2273 			break;
2274 
2275 		default:
2276 			/* Jump to next instruction based on length field. */
2277 			remaining_dw -= num_dw;
2278 			dw += num_dw;
2279 		}
2280 	}
2281 
2282 	return -ENOENT;
2283 }
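/*
 * Usage sketch (hypothetical caller; the 0x2580 offset merely stands in for
 * whichever register is of interest):
 *
 *	u32 def;
 *	int err;
 *
 *	err = xe_lrc_lookup_default_reg_value(gt, XE_ENGINE_CLASS_RENDER,
 *					      0x2580, &def);
 *	if (!err)
 *		...	(def now holds the value the default LRC programs)
 */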
2284 
2285 struct instr_state {
2286 	u32 instr;
2287 	u16 num_dw;
2288 };
2289 
2290 static const struct instr_state xe_hpg_svg_state[] = {
2291 	{ .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 },
2292 	{ .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 },
2293 	{ .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 },
2294 	{ .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 },
2295 	{ .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 },
2296 	{ .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 },
2297 	{ .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 },
2298 	{ .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 },
2299 	{ .instr = CMD_3DSTATE_VS, .num_dw = 9 },
2300 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 },
2301 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 },
2302 	{ .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 },
2303 	{ .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 },
2304 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 },
2305 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 },
2306 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 },
2307 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 },
2308 	{ .instr = CMD_3DSTATE_CLIP, .num_dw = 4 },
2309 	{ .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 },
2310 	{ .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 },
2311 	{ .instr = CMD_3DSTATE_SF, .num_dw = 4 },
2312 	{ .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 },
2313 	{ .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 },
2314 	{ .instr = CMD_3DSTATE_RASTER, .num_dw = 5 },
2315 	{ .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 },
2316 	{ .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 },
2317 	{ .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 },
2318 	{ .instr = CMD_3DSTATE_HS, .num_dw = 9 },
2319 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 },
2320 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 },
2321 	{ .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 },
2322 	{ .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 },
2323 	{ .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 },
2324 	{ .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 },
2325 	{ .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 },
2326 	{ .instr = CMD_3DSTATE_TE, .num_dw = 5 },
2327 	{ .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 },
2328 	{ .instr = CMD_3DSTATE_DS, .num_dw = 11 },
2329 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
2330 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
2331 	{ .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
2332 	{ .instr = CMD_3DSTATE_GS, .num_dw = 10 },
2333 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
2334 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
2335 	{ .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
2336 	{ .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
2337 	{ .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
2338 	{ .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
2339 	{ .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
2340 	{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
2341 };
2342 
2343 u32 *xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, u32 *cs)
2344 {
2345 	struct xe_gt *gt = q->hwe->gt;
2346 	struct xe_device *xe = gt_to_xe(gt);
2347 	const struct instr_state *state_table = NULL;
2348 	int state_table_size = 0;
2349 
2350 	/*
2351 	 * Wa_14019789679
2352 	 *
2353 	 * If the driver doesn't explicitly emit the SVG instructions while
2354 	 * setting up the default LRC, the context switch will write 0's
2355 	 * (noops) into the LRC memory rather than the expected instruction
2356 	 * headers.  Application contexts start out as a copy of the default
2357 	 * LRC, and if they also do not emit specific settings for some SVG
2358 	 * state, then on context restore they'll unintentionally inherit
2359 	 * whatever state setting the previous context had programmed into the
2360 	 * hardware (i.e., the lack of a 3DSTATE_* instruction in the LRC will
2361 	 * prevent the hardware from resetting that state back to any specific
2362 	 * value).
2363 	 *
2364 	 * The official workaround only requires emitting 3DSTATE_MESH_CONTROL
2365 	 * since that's a specific state setting that can easily cause GPU
2366 	 * hangs if unintentionally inherited.  However to be safe we'll
2367 	 * hangs if unintentionally inherited.  However, to be safe, we'll
2368 	 * any of the state between contexts, even if that leakage is harmless.
2369 	 */
2370 	if (XE_GT_WA(gt, 14019789679) && q->hwe->class == XE_ENGINE_CLASS_RENDER) {
2371 		state_table = xe_hpg_svg_state;
2372 		state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
2373 	}
2374 
2375 	if (!state_table) {
2376 		xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
2377 			  GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
2378 		return cs;
2379 	}
2380 
2381 	for (int i = 0; i < state_table_size; i++) {
2382 		u32 instr = state_table[i].instr;
2383 		u16 num_dw = state_table[i].num_dw;
2384 		bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);
2385 
2386 		xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
2387 		xe_gt_assert(gt, num_dw != 0);
2388 		xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));
2389 
2390 		/*
2391 		 * Xe2's SVG context is the same as the one on DG2 / MTL
2392 		 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
2393 		 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
2394 		 * Just make the replacement here rather than defining a
2395 		 * whole separate table for the single trivial change.
2396 		 */
2397 		if (GRAPHICS_VER(xe) >= 20 &&
2398 		    instr == CMD_3DSTATE_DRAWING_RECTANGLE)
2399 			instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;
2400 
2401 		*cs = instr;
2402 		if (!is_single_dw)
2403 			*cs |= (num_dw - 2);
2404 
2405 		cs += num_dw;
2406 	}
2407 
2408 	return cs;
2409 }
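/*
 * To make the emission above concrete (numbers illustrative): a table entry
 * with num_dw = 5 writes its header with a DWord Length field of 5 - 2 = 3,
 * then advances the cursor past four payload dwords, which are assumed to be
 * pre-cleared in the buffer; instr_dw() would decode that header back to 5
 * dwords.
 */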
2410 
2411 struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
2412 {
2413 	struct xe_lrc_snapshot *snapshot = kmalloc_obj(*snapshot, GFP_NOWAIT);
2414 
2415 	if (!snapshot)
2416 		return NULL;
2417 
2418 	snapshot->context_desc = xe_lrc_ggtt_addr(lrc);
2419 	snapshot->ring_addr = __xe_lrc_ring_ggtt_addr(lrc);
2420 	snapshot->indirect_context_desc = xe_lrc_indirect_ring_ggtt_addr(lrc);
2421 	snapshot->head = xe_lrc_ring_head(lrc);
2422 	snapshot->tail.internal = lrc->ring.tail;
2423 	snapshot->tail.memory = xe_lrc_ring_tail(lrc);
2424 	snapshot->start = xe_lrc_ring_start(lrc);
2425 	snapshot->start_seqno = xe_lrc_start_seqno(lrc);
2426 	snapshot->seqno = xe_lrc_seqno(lrc);
2427 	snapshot->lrc_bo = xe_bo_get(lrc->bo);
2428 	snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
2429 	snapshot->lrc_size = lrc->size;
2430 	snapshot->replay_offset = 0;
2431 	snapshot->replay_size = lrc->replay_size;
2432 	snapshot->lrc_snapshot = NULL;
2433 	snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc));
2434 	snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc);
2435 	return snapshot;
2436 }
2437 
2438 void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot)
2439 {
2440 	struct xe_bo *bo;
2441 	struct iosys_map src;
2442 
2443 	if (!snapshot)
2444 		return;
2445 
2446 	bo = snapshot->lrc_bo;
2447 	snapshot->lrc_bo = NULL;
2448 
2449 	snapshot->lrc_snapshot = kvmalloc(snapshot->lrc_size, GFP_KERNEL);
2450 	if (!snapshot->lrc_snapshot)
2451 		goto put_bo;
2452 
2453 	xe_bo_lock(bo, false);
2454 	if (!ttm_bo_vmap(&bo->ttm, &src)) {
2455 		xe_map_memcpy_from(xe_bo_device(bo),
2456 				   snapshot->lrc_snapshot, &src, snapshot->lrc_offset,
2457 				   snapshot->lrc_size);
2458 		ttm_bo_vunmap(&bo->ttm, &src);
2459 	} else {
2460 		kvfree(snapshot->lrc_snapshot);
2461 		snapshot->lrc_snapshot = NULL;
2462 	}
2463 	xe_bo_unlock(bo);
2464 put_bo:
2465 	xe_bo_put(bo);
2466 }
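/*
 * Two-phase capture sketch (exact call sites assumed, e.g. the devcoredump
 * path): xe_lrc_snapshot_capture() uses GFP_NOWAIT and only records metadata
 * plus a BO reference, while the delayed step performs the GFP_KERNEL
 * allocation and the actual copy:
 *
 *	snap = xe_lrc_snapshot_capture(lrc);
 *	...	(later, from process context)
 *	xe_lrc_snapshot_capture_delayed(snap);
 *	xe_lrc_snapshot_print(snap, p);
 *	xe_lrc_snapshot_free(snap);
 */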
2467 
2468 void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer *p)
2469 {
2470 	unsigned long i;
2471 
2472 	if (!snapshot)
2473 		return;
2474 
2475 	drm_printf(p, "\tHW Context Desc: 0x%08x\n", snapshot->context_desc);
2476 	drm_printf(p, "\tHW Ring address: 0x%08x\n",
2477 		   snapshot->ring_addr);
2478 	drm_printf(p, "\tHW Indirect Ring State: 0x%08x\n",
2479 		   snapshot->indirect_context_desc);
2480 	drm_printf(p, "\tLRC Head: (memory) %u\n", snapshot->head);
2481 	drm_printf(p, "\tLRC Tail: (internal) %u, (memory) %u\n",
2482 		   snapshot->tail.internal, snapshot->tail.memory);
2483 	drm_printf(p, "\tRing start: (memory) 0x%08x\n", snapshot->start);
2484 	drm_printf(p, "\tStart seqno: (memory) %d\n", snapshot->start_seqno);
2485 	drm_printf(p, "\tSeqno: (memory) %d\n", snapshot->seqno);
2486 	drm_printf(p, "\tTimestamp: 0x%08x\n", snapshot->ctx_timestamp);
2487 	drm_printf(p, "\tJob Timestamp: 0x%08x\n", snapshot->ctx_job_timestamp);
2488 
2489 	if (!snapshot->lrc_snapshot)
2490 		return;
2491 
2492 	drm_printf(p, "\t[HWSP].length: 0x%x\n", LRC_PPHWSP_SIZE);
2493 	drm_puts(p, "\t[HWSP].data: ");
2494 	for (i = 0; i < LRC_PPHWSP_SIZE; i += sizeof(u32)) {
2495 		u32 *val = snapshot->lrc_snapshot + i;
2496 		char dumped[ASCII85_BUFSZ];
2497 
2498 		drm_puts(p, ascii85_encode(*val, dumped));
2499 	}
2500 
2501 	drm_printf(p, "\n\t[HWCTX].length: 0x%lx\n", snapshot->lrc_size - LRC_PPHWSP_SIZE);
2502 	drm_printf(p, "\n\t[HWCTX].replay_offset: 0x%lx\n", snapshot->replay_offset);
2503 	drm_printf(p, "\n\t[HWCTX].replay_length: 0x%lx\n", snapshot->replay_size);
2504 
2505 	drm_puts(p, "\t[HWCTX].data: ");
2506 	for (; i < snapshot->lrc_size; i += sizeof(u32)) {
2507 		u32 *val = snapshot->lrc_snapshot + i;
2508 		char dumped[ASCII85_BUFSZ];
2509 
2510 		drm_puts(p, ascii85_encode(*val, dumped));
2511 	}
2512 	drm_puts(p, "\n");
2513 }
2514 
2515 void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
2516 {
2517 	if (!snapshot)
2518 		return;
2519 
2520 	kvfree(snapshot->lrc_snapshot);
2521 	if (snapshot->lrc_bo)
2522 		xe_bo_put(snapshot->lrc_bo);
2523 
2524 	kfree(snapshot);
2525 }
2526 
2527 static int get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id, u64 *reg_ctx_ts)
2528 {
2529 	u16 class = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id);
2530 	u16 instance = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id);
2531 	struct xe_hw_engine *hwe;
2532 	u64 val;
2533 
2534 	hwe = xe_gt_hw_engine(lrc->gt, class, instance, false);
2535 	if (xe_gt_WARN_ONCE(lrc->gt, !hwe || xe_hw_engine_is_reserved(hwe),
2536 			    "Unexpected engine class:instance %d:%d for context utilization\n",
2537 			    class, instance))
2538 		return -1;
2539 
2540 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
2541 		val = xe_mmio_read64_2x32(&hwe->gt->mmio,
2542 					  RING_CTX_TIMESTAMP(hwe->mmio_base));
2543 	else
2544 		val = xe_mmio_read32(&hwe->gt->mmio,
2545 				     RING_CTX_TIMESTAMP(hwe->mmio_base));
2546 
2547 	*reg_ctx_ts = val;
2548 
2549 	return 0;
2550 }
2551 
2552 /**
2553  * xe_lrc_timestamp() - Current ctx timestamp
2554  * @lrc: Pointer to the lrc.
2555  *
2556  * Return the latest ctx timestamp. With support for active contexts, the
2557  * calculation may be slightly racy, so a read-again check is used to ensure
2558  * the context is still active before deciding which timestamp to return.
2559  *
2560  * Returns: New ctx timestamp value
2561  */
2562 u64 xe_lrc_timestamp(struct xe_lrc *lrc)
2563 {
2564 	u64 lrc_ts, reg_ts, new_ts;
2565 	u32 engine_id;
2566 
2567 	lrc_ts = xe_lrc_ctx_timestamp(lrc);
2568 	/* CTX_TIMESTAMP mmio read is invalid on VF, so return the LRC value */
2569 	if (IS_SRIOV_VF(lrc_to_xe(lrc))) {
2570 		new_ts = lrc_ts;
2571 		goto done;
2572 	}
2573 
2574 	if (lrc_ts == CONTEXT_ACTIVE) {
2575 		engine_id = xe_lrc_engine_id(lrc);
2576 		if (!get_ctx_timestamp(lrc, engine_id, &reg_ts))
2577 			new_ts = reg_ts;
2578 
2579 		/* read lrc again to ensure context is still active */
2580 		lrc_ts = xe_lrc_ctx_timestamp(lrc);
2581 	}
2582 
2583 	/*
2584 	 * If context switched out, just use the lrc_ts. Note that this needs to
2585 	 * be a separate if condition.
2586 	 */
2587 	if (lrc_ts != CONTEXT_ACTIVE)
2588 		new_ts = lrc_ts;
2589 
2590 done:
2591 	return new_ts;
2592 }
2593 
2594 /**
2595  * xe_lrc_update_timestamp() - Update ctx timestamp
2596  * @lrc: Pointer to the lrc.
2597  * @old_ts: Old timestamp value
2598  *
2599  * Populate @old_ts with the current saved ctx timestamp, read the new ctx
2600  * timestamp and update the saved value.
2601  *
2602  * Returns: New ctx timestamp value
2603  */
2604 u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts)
2605 {
2606 	*old_ts = lrc->ctx_timestamp;
2607 	lrc->ctx_timestamp = xe_lrc_timestamp(lrc);
2608 
2609 	trace_xe_lrc_update_timestamp(lrc, *old_ts);
2610 
2611 	return lrc->ctx_timestamp;
2612 }
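/*
 * A busyness-tracking caller might use this as follows (sketch with
 * hypothetical variable names):
 *
 *	u64 old_ts, new_ts, delta;
 *
 *	new_ts = xe_lrc_update_timestamp(lrc, &old_ts);
 *	delta = new_ts - old_ts;	(ticks accrued since the last sample)
 */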
2613 
2614 /**
2615  * xe_lrc_ring_is_idle() - LRC is idle
2616  * @lrc: Pointer to the lrc.
2617  *
2618  * Compare the LRC ring head and tail to determine whether the ring is idle.
2619  *
2620  * Return: True if the ring is idle, false otherwise
2621  */
2622 bool xe_lrc_ring_is_idle(struct xe_lrc *lrc)
2623 {
2624 	return xe_lrc_ring_head(lrc) == xe_lrc_ring_tail(lrc);
2625 }
2626