xref: /linux/drivers/gpu/drm/xe/xe_lrc.c (revision bf4afc53b77aeaa48b5409da5c8da6bb4eff7f43)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5 
6 #include "xe_lrc.h"
7 
8 #include <generated/xe_wa_oob.h>
9 
10 #include <linux/ascii85.h>
11 #include <linux/panic.h>
12 
13 #include "instructions/xe_mi_commands.h"
14 #include "instructions/xe_gfxpipe_commands.h"
15 #include "instructions/xe_gfx_state_commands.h"
16 #include "regs/xe_engine_regs.h"
17 #include "regs/xe_lrc_layout.h"
18 #include "xe_bb.h"
19 #include "xe_bo.h"
20 #include "xe_configfs.h"
21 #include "xe_device.h"
22 #include "xe_drm_client.h"
23 #include "xe_exec_queue_types.h"
24 #include "xe_gt.h"
25 #include "xe_gt_printk.h"
26 #include "xe_hw_fence.h"
27 #include "xe_map.h"
28 #include "xe_memirq.h"
29 #include "xe_mmio.h"
30 #include "xe_sriov.h"
31 #include "xe_trace_lrc.h"
32 #include "xe_vm.h"
33 #include "xe_wa.h"
34 
35 #define LRC_VALID				BIT_ULL(0)
36 #define LRC_PRIVILEGE				BIT_ULL(8)
37 #define LRC_ADDRESSING_MODE			GENMASK_ULL(4, 3)
38 #define LRC_LEGACY_64B_CONTEXT			3
39 
40 #define LRC_ENGINE_CLASS			GENMASK_ULL(63, 61)
41 #define LRC_ENGINE_INSTANCE			GENMASK_ULL(53, 48)
42 
43 #define LRC_PPHWSP_SIZE				SZ_4K
44 #define LRC_INDIRECT_CTX_BO_SIZE		SZ_4K
45 #define LRC_INDIRECT_RING_STATE_SIZE		SZ_4K
46 
47 #define LRC_PRIORITY				GENMASK_ULL(10, 9)
48 #define LRC_PRIORITY_LOW			0
49 #define LRC_PRIORITY_NORMAL			1
50 #define LRC_PRIORITY_HIGH			2
51 
52 /*
53  * Layout of the LRC and associated data allocated as
54  * lrc->bo:
55  *
56  *   Region                       Size
57  *  +============================+=================================+ <- __xe_lrc_ring_offset()
58  *  | Ring                       | ring_size, see                  |
59  *  |                            | xe_lrc_init()                   |
60  *  +============================+=================================+ <- __xe_lrc_pphwsp_offset()
61  *  | PPHWSP (includes SW state) | 4K                              |
62  *  +----------------------------+---------------------------------+ <- __xe_lrc_regs_offset()
63  *  | Engine Context Image       | n * 4K, see                     |
64  *  |                            | xe_gt_lrc_size()                |
65  *  +----------------------------+---------------------------------+ <- __xe_lrc_indirect_ring_offset()
66  *  | Indirect Ring State Page   | 0 or 4k, see                    |
67  *  |                            | XE_LRC_FLAG_INDIRECT_RING_STATE |
68  *  +============================+=================================+ <- __xe_lrc_indirect_ctx_offset()
69  *  | Indirect Context Page      | 0 or 4k, see                    |
70  *  |                            | XE_LRC_FLAG_INDIRECT_CTX        |
71  *  +============================+=================================+ <- __xe_lrc_wa_bb_offset()
72  *  | WA BB Per Ctx              | 4k                              |
73  *  +============================+=================================+ <- xe_bo_size(lrc->bo)
74  */
75 
/* Resolve the owning xe_device via the GT recorded in the LRC's fence context. */
static struct xe_device *
lrc_to_xe(struct xe_lrc *lrc)
{
	return gt_to_xe(lrc->fence_ctx.gt);
}
81 
82 static bool
gt_engine_needs_indirect_ctx(struct xe_gt * gt,enum xe_engine_class class)83 gt_engine_needs_indirect_ctx(struct xe_gt *gt, enum xe_engine_class class)
84 {
85 	struct xe_device *xe = gt_to_xe(gt);
86 
87 	if (XE_GT_WA(gt, 16010904313) &&
88 	    (class == XE_ENGINE_CLASS_RENDER ||
89 	     class == XE_ENGINE_CLASS_COMPUTE))
90 		return true;
91 
92 	if (xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
93 					       class, NULL))
94 		return true;
95 
96 	return false;
97 }
98 
99 /**
100  * xe_gt_lrc_hang_replay_size() - Hang replay size
101  * @gt: The GT
102  * @class: Hardware engine class
103  *
104  * Determine size of GPU hang replay state for a GT and hardware engine class.
105  *
106  * Return: Size of GPU hang replay size
107  */
xe_gt_lrc_hang_replay_size(struct xe_gt * gt,enum xe_engine_class class)108 size_t xe_gt_lrc_hang_replay_size(struct xe_gt *gt, enum xe_engine_class class)
109 {
110 	struct xe_device *xe = gt_to_xe(gt);
111 	size_t size = 0;
112 
113 	/* Engine context image */
114 	switch (class) {
115 	case XE_ENGINE_CLASS_RENDER:
116 		if (GRAPHICS_VER(xe) >= 20)
117 			size += 3 * SZ_4K;
118 		else
119 			size += 13 * SZ_4K;
120 		break;
121 	case XE_ENGINE_CLASS_COMPUTE:
122 		if (GRAPHICS_VER(xe) >= 20)
123 			size += 2 * SZ_4K;
124 		else
125 			size += 13 * SZ_4K;
126 		break;
127 	default:
128 		WARN(1, "Unknown engine class: %d", class);
129 		fallthrough;
130 	case XE_ENGINE_CLASS_COPY:
131 	case XE_ENGINE_CLASS_VIDEO_DECODE:
132 	case XE_ENGINE_CLASS_VIDEO_ENHANCE:
133 	case XE_ENGINE_CLASS_OTHER:
134 		size += 1 * SZ_4K;
135 	}
136 
137 	return size;
138 }
139 
xe_gt_lrc_size(struct xe_gt * gt,enum xe_engine_class class)140 size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
141 {
142 	size_t size = xe_gt_lrc_hang_replay_size(gt, class);
143 
144 	/* Add indirect ring state page */
145 	if (xe_gt_has_indirect_ring_state(gt))
146 		size += LRC_INDIRECT_RING_STATE_SIZE;
147 
148 	return size + LRC_PPHWSP_SIZE;
149 }
150 
/*
 * The per-platform tables are u8-encoded in @data. Decode @data and set the
 * addresses' offset and commands in @regs. The following encoding is used
 * for each byte. There are 2 steps: decoding commands and decoding addresses.
 *
 * Commands:
 * [7]: create NOPs - number of NOPs are set in lower bits
 * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set
 *      MI_LRI_FORCE_POSTED
 * [5:0]: Number of NOPs or registers to set values to in case of
 *        MI_LOAD_REGISTER_IMM
 *
 * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
 * number of registers. They are set by using the REG/REG16 macros: the former
 * is used for offsets smaller than 0x200 while the latter is for values bigger
 * than that. Those macros already set all the bits documented below correctly:
 *
 * [7]: When a register offset needs more than 6 bits, use additional bytes, to
 *      follow, for the lower bits
 * [6:0]: Register offset, without considering the engine base.
 *
 * This function only tweaks the commands and register offsets. Values are not
 * filled out.
 */
static void set_offsets(u32 *regs,
			const u8 *data,
			const struct xe_hw_engine *hwe)
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | \
			   BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
{
	const u32 base = hwe->mmio_base;

	while (*data) {
		u8 count, flags;

		if (*data & BIT(7)) { /* skip */
			count = *data++ & ~BIT(7);
			regs += count;
			continue;
		}

		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		/* Emit the MI_LOAD_REGISTER_IMM header covering "count" registers */
		*regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		*regs |= MI_LRI_LRM_CS_MMIO;
		regs++;

		xe_gt_assert(hwe->gt, count);
		do {
			u32 offset = 0;
			u8 v;

			/* Accumulate 7 bits per byte while the continuation bit is set */
			do {
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			/* Offsets are dword-encoded; the value slot is left unfilled */
			regs[0] = base + (offset << 2);
			regs += 2;
		} while (--count);
	}

	*regs = MI_BATCH_BUFFER_END | BIT(0);
}
226 
/* Gen12 non-render engines: basic ring/BB registers + page-table pointers */
static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	0
};

/* DG2 non-render engines: adds PRT_BB_STATE (0x120/0x124) vs gen12 */
static const u8 dg2_xcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	0
};

/* Gen12 render engine: extends the xcs layout with render-only state */
static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),
	NOP(3 + 9 + 1),

	LRI(51, POSTED),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),
	REG(0x084),
	NOP(1),

	0
};

/* XeHP render engine layout */
static const u8 xehp_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};

/* DG2 render engine: xehp layout plus PRT_BB_STATE (0x120/0x124) */
static const u8 dg2_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};

/* MTL render engine: like dg2 but without the 0x1b0 entry in the third LRI */
static const u8 mtl_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(2),
	LRI(2, POSTED),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};

/* Layout prefix shared by all Xe2+ engine classes */
#define XE2_CTX_COMMON \
	NOP(1),                 /* [0x00] */ \
	LRI(15, POSTED),        /* [0x01] */ \
	REG16(0x244),           /* [0x02] CTXT_SR_CTL */ \
	REG(0x034),             /* [0x04] RING_BUFFER_HEAD */ \
	REG(0x030),             /* [0x06] RING_BUFFER_TAIL */ \
	REG(0x038),             /* [0x08] RING_BUFFER_START */ \
	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */ \
	REG(0x168),             /* [0x0c] BB_ADDR_UDW */ \
	REG(0x140),             /* [0x0e] BB_ADDR */ \
	REG(0x110),             /* [0x10] BB_STATE */ \
	REG(0x1c0),             /* [0x12] BB_PER_CTX_PTR */ \
	REG(0x1c4),             /* [0x14] RCS_INDIRECT_CTX */ \
	REG(0x1c8),             /* [0x16] RCS_INDIRECT_CTX_OFFSET */ \
	REG(0x180),             /* [0x18] CCID */ \
	REG16(0x2b4),           /* [0x1a] SEMAPHORE_TOKEN */ \
	REG(0x120),             /* [0x1c] PRT_BB_STATE */ \
	REG(0x124),             /* [0x1e] PRT_BB_STATE_UDW */ \
	\
	NOP(1),                 /* [0x20] */ \
	LRI(9, POSTED),         /* [0x21] */ \
	REG16(0x3a8),           /* [0x22] CTX_TIMESTAMP */ \
	REG16(0x3ac),           /* [0x24] CTX_TIMESTAMP_UDW */ \
	REG(0x108),             /* [0x26] INDIRECT_RING_STATE */ \
	REG16(0x284),           /* [0x28] dummy reg */ \
	REG16(0x280),           /* [0x2a] CS_ACC_CTR_THOLD */ \
	REG16(0x27c),           /* [0x2c] CS_CTX_SYS_PASID */ \
	REG16(0x278),           /* [0x2e] CS_CTX_ASID */ \
	REG16(0x274),           /* [0x30] PTBP_UDW */ \
	REG16(0x270)            /* [0x32] PTBP_LDW */

static const u8 xe2_rcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(2),                 /* [0x34] */
	LRI(2, POSTED),         /* [0x36] */
	REG16(0x5a8),           /* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
	REG16(0x5ac),           /* [0x39] PREEMPTION_STATUS */

	NOP(6),                 /* [0x41] */
	LRI(1, 0),              /* [0x47] */
	REG(0x0c8),             /* [0x48] R_PWR_CLK_STATE */

	0
};

static const u8 xe2_bcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(4 + 8 + 1),         /* [0x34] */
	LRI(2, POSTED),         /* [0x41] */
	REG16(0x200),           /* [0x42] BCS_SWCTRL */
	REG16(0x204),           /* [0x44] BLIT_CCTL */

	0
};

static const u8 xe2_xcs_offsets[] = {
	XE2_CTX_COMMON,

	0
};

/* Layout of the separate indirect ring state page on Xe2+ */
static const u8 xe2_indirect_ring_state_offsets[] = {
	NOP(1),                 /* [0x00] */
	LRI(5, POSTED),         /* [0x01] */
	REG(0x034),             /* [0x02] RING_BUFFER_HEAD */
	REG(0x030),             /* [0x04] RING_BUFFER_TAIL */
	REG(0x038),             /* [0x06] RING_BUFFER_START */
	REG(0x048),             /* [0x08] RING_BUFFER_START_UDW */
	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */

	NOP(5),                 /* [0x0c] */
	LRI(9, POSTED),         /* [0x11] */
	REG(0x168),             /* [0x12] BB_ADDR_UDW */
	REG(0x140),             /* [0x14] BB_ADDR */
	REG(0x110),             /* [0x16] BB_STATE */
	REG16(0x588),           /* [0x18] BB_STACK_WRITE_PORT */
	REG16(0x588),           /* [0x20] BB_STACK_WRITE_PORT */
	REG16(0x588),           /* [0x22] BB_STACK_WRITE_PORT */
	REG16(0x588),           /* [0x24] BB_STACK_WRITE_PORT */
	REG16(0x588),           /* [0x26] BB_STACK_WRITE_PORT */
	REG16(0x588),           /* [0x28] BB_STACK_WRITE_PORT */

	NOP(12),                 /* [0x00] */

	0
};

#undef REG16
#undef REG
#undef LRI
#undef NOP
609 
reg_offsets(struct xe_device * xe,enum xe_engine_class class)610 static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
611 {
612 	if (class == XE_ENGINE_CLASS_RENDER) {
613 		if (GRAPHICS_VER(xe) >= 20)
614 			return xe2_rcs_offsets;
615 		else if (GRAPHICS_VERx100(xe) >= 1270)
616 			return mtl_rcs_offsets;
617 		else if (GRAPHICS_VERx100(xe) >= 1255)
618 			return dg2_rcs_offsets;
619 		else if (GRAPHICS_VERx100(xe) >= 1250)
620 			return xehp_rcs_offsets;
621 		else
622 			return gen12_rcs_offsets;
623 	} else if (class == XE_ENGINE_CLASS_COPY) {
624 		if (GRAPHICS_VER(xe) >= 20)
625 			return xe2_bcs_offsets;
626 		else
627 			return gen12_xcs_offsets;
628 	} else {
629 		if (GRAPHICS_VER(xe) >= 20)
630 			return xe2_xcs_offsets;
631 		else if (GRAPHICS_VERx100(xe) >= 1255)
632 			return dg2_xcs_offsets;
633 		else
634 			return gen12_xcs_offsets;
635 	}
636 }
637 
/*
 * Program CTX_CONTEXT_CONTROL in the context image: inhibit syntax context
 * switch and context restore, and enable indirect ring state when the GT
 * supports it. Uses masked-bit writes (enable bit + mask in the high word).
 */
static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
{
	regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
						       CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);

	if (xe_gt_has_indirect_ring_state(hwe->gt))
		regs[CTX_CONTEXT_CONTROL] |=
			_MASKED_BIT_ENABLE(CTX_CTRL_INDIRECT_RING_STATE_ENABLE);
}
647 
/*
 * Program the memory-based interrupt registers in the context image:
 * an LRM to load the IRQ enable mask and an LRI pointing the status/source
 * reports at the tile's memirq pages. No-op unless the device uses memirq.
 */
static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe)
{
	struct xe_memirq *memirq = &gt_to_tile(hwe->gt)->memirq;
	struct xe_device *xe = gt_to_xe(hwe->gt);
	u8 num_regs;

	if (!xe_device_uses_memirq(xe))
		return;

	regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM |
					MI_LRI_LRM_CS_MMIO | MI_LRM_USE_GGTT;
	regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr;
	regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq);

	/* With MSI-X one extra register pair (CS_INT_VEC) is loaded */
	num_regs = xe_device_has_msix(xe) ? 3 : 2;
	regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(num_regs) |
				       MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED;
	regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr;
	regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq, hwe);
	regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr;
	regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq, hwe);

	if (xe_device_has_msix(xe)) {
		regs[CTX_CS_INT_VEC_REG] = CS_INT_VEC(0).addr;
		/* CTX_CS_INT_VEC_DATA will be set in xe_lrc_init */
	}
}
675 
lrc_ring_mi_mode(struct xe_hw_engine * hwe)676 static int lrc_ring_mi_mode(struct xe_hw_engine *hwe)
677 {
678 	struct xe_device *xe = gt_to_xe(hwe->gt);
679 
680 	if (GRAPHICS_VERx100(xe) >= 1250)
681 		return 0x70;
682 	else
683 		return 0x60;
684 }
685 
/*
 * Clear STOP_RING in the context image's MI_MODE value while setting the
 * corresponding mask bit in the high word, so the masked write takes effect.
 */
static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe)
{
	int x;

	x = lrc_ring_mi_mode(hwe);
	regs[x + 1] &= ~STOP_RING;
	regs[x + 1] |= STOP_RING << 16;
}
694 
/* True when this LRC was created with an indirect ring state page. */
static inline bool xe_lrc_has_indirect_ring_state(struct xe_lrc *lrc)
{
	return lrc->flags & XE_LRC_FLAG_INDIRECT_RING_STATE;
}
699 
/* The ring is the first region in the BO layout (see diagram above). */
static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
{
	return 0;
}

/* PPHWSP immediately follows the ring. */
u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
{
	return lrc->ring.size;
}
709 
710 /* Make the magic macros work */
711 #define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset
712 #define __xe_lrc_regs_offset xe_lrc_regs_offset
713 
714 #define LRC_SEQNO_PPHWSP_OFFSET 512
715 #define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
716 #define LRC_CTX_JOB_TIMESTAMP_OFFSET (LRC_START_SEQNO_PPHWSP_OFFSET + 8)
717 #define LRC_ENGINE_ID_PPHWSP_OFFSET 1024
718 #define LRC_PARALLEL_PPHWSP_OFFSET 2048
719 
/* Context registers start right after the 4K PPHWSP. */
u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
{
	return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
}
724 
725 /**
726  * xe_lrc_reg_size() - Get size of the LRC registers area within queues
727  * @xe: the &xe_device struct instance
728  *
729  * Returns: Size of the LRC registers area for current platform
730  */
xe_lrc_reg_size(struct xe_device * xe)731 size_t xe_lrc_reg_size(struct xe_device *xe)
732 {
733 	if (GRAPHICS_VERx100(xe) >= 1250)
734 		return 96 * sizeof(u32);
735 	else
736 		return 80 * sizeof(u32);
737 }
738 
/* Bytes to skip to reach the engine context state: PPHWSP + register area. */
size_t xe_lrc_skip_size(struct xe_device *xe)
{
	return LRC_PPHWSP_SIZE + xe_lrc_reg_size(xe);
}
743 
static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
{
	/* The seqno is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_SEQNO_PPHWSP_OFFSET;
}

static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
{
	/* The start seqno is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_START_SEQNO_PPHWSP_OFFSET;
}

static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc)
{
	/* This is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET;
}

static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
{
	/* The parallel is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
}

static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc)
{
	/* Engine ID scratch slot in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET;
}

static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
{
	/* CTX_TIMESTAMP lives in the register area, addressed by dword index */
	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
}

static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc)
{
	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32);
}

static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
{
	/* Counted back from the end of the BO: WA BB, then (optionally)
	 * indirect context page, then indirect ring state page. */
	u32 offset = xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE -
		     LRC_INDIRECT_RING_STATE_SIZE;

	if (lrc->flags & XE_LRC_FLAG_INDIRECT_CTX)
		offset -= LRC_INDIRECT_CTX_BO_SIZE;

	return offset;
}

static inline u32 __xe_lrc_indirect_ctx_offset(struct xe_lrc *lrc)
{
	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE - LRC_INDIRECT_CTX_BO_SIZE;
}

static inline u32 __xe_lrc_wa_bb_offset(struct xe_lrc *lrc)
{
	/* The WA BB occupies the final LRC_WA_BB_SIZE bytes of the BO */
	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE;
}
803 
/*
 * For each LRC sub-region, generate an iosys_map accessor and a GGTT
 * address accessor built on the matching __xe_lrc_<elem>_offset() helper.
 */
#define DECL_MAP_ADDR_HELPERS(elem) \
static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \
{ \
	struct iosys_map map = lrc->bo->vmap; \
\
	xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map));  \
	iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \
	return map; \
} \
static inline u32 __maybe_unused __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \
{ \
	return xe_bo_ggtt_addr(lrc->bo) + __xe_lrc_##elem##_offset(lrc); \
} \

DECL_MAP_ADDR_HELPERS(ring)
DECL_MAP_ADDR_HELPERS(pphwsp)
DECL_MAP_ADDR_HELPERS(seqno)
DECL_MAP_ADDR_HELPERS(regs)
DECL_MAP_ADDR_HELPERS(start_seqno)
DECL_MAP_ADDR_HELPERS(ctx_job_timestamp)
DECL_MAP_ADDR_HELPERS(ctx_timestamp)
DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw)
DECL_MAP_ADDR_HELPERS(parallel)
DECL_MAP_ADDR_HELPERS(indirect_ring)
DECL_MAP_ADDR_HELPERS(engine_id)

#undef DECL_MAP_ADDR_HELPERS
831 
/**
 * xe_lrc_ctx_timestamp_ggtt_addr() - Get ctx timestamp GGTT address
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx timestamp GGTT address
 */
u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
}

/**
 * xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx timestamp udw GGTT address
 */
u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
}
853 
854 /**
855  * xe_lrc_ctx_timestamp() - Read ctx timestamp value
856  * @lrc: Pointer to the lrc.
857  *
858  * Returns: ctx timestamp value
859  */
xe_lrc_ctx_timestamp(struct xe_lrc * lrc)860 static u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
861 {
862 	struct xe_device *xe = lrc_to_xe(lrc);
863 	struct iosys_map map;
864 	u32 ldw, udw = 0;
865 
866 	map = __xe_lrc_ctx_timestamp_map(lrc);
867 	ldw = xe_map_read32(xe, &map);
868 
869 	if (xe->info.has_64bit_timestamp) {
870 		map = __xe_lrc_ctx_timestamp_udw_map(lrc);
871 		udw = xe_map_read32(xe, &map);
872 	}
873 
874 	return (u64)udw << 32 | ldw;
875 }
876 
/**
 * xe_lrc_ctx_job_timestamp_ggtt_addr() - Get ctx job timestamp GGTT address
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx timestamp job GGTT address
 */
u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_ctx_job_timestamp_ggtt_addr(lrc);
}

/**
 * xe_lrc_ctx_job_timestamp() - Read ctx job timestamp value
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx timestamp job value
 */
u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_ctx_job_timestamp_map(lrc);
	return xe_map_read32(xe, &map);
}
902 
/* The LRC's GGTT address is that of its PPHWSP (the ring precedes it). */
u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_pphwsp_ggtt_addr(lrc);
}

/* Returns 0 when the LRC has no indirect ring state page. */
u32 xe_lrc_indirect_ring_ggtt_addr(struct xe_lrc *lrc)
{
	if (!xe_lrc_has_indirect_ring_state(lrc))
		return 0;

	return __xe_lrc_indirect_ring_ggtt_addr(lrc);
}
915 
xe_lrc_read_indirect_ctx_reg(struct xe_lrc * lrc,int reg_nr)916 static u32 xe_lrc_read_indirect_ctx_reg(struct xe_lrc *lrc, int reg_nr)
917 {
918 	struct xe_device *xe = lrc_to_xe(lrc);
919 	struct iosys_map map;
920 
921 	map = __xe_lrc_indirect_ring_map(lrc);
922 	iosys_map_incr(&map, reg_nr * sizeof(u32));
923 	return xe_map_read32(xe, &map);
924 }
925 
xe_lrc_write_indirect_ctx_reg(struct xe_lrc * lrc,int reg_nr,u32 val)926 static void xe_lrc_write_indirect_ctx_reg(struct xe_lrc *lrc,
927 					  int reg_nr, u32 val)
928 {
929 	struct xe_device *xe = lrc_to_xe(lrc);
930 	struct iosys_map map;
931 
932 	map = __xe_lrc_indirect_ring_map(lrc);
933 	iosys_map_incr(&map, reg_nr * sizeof(u32));
934 	xe_map_write32(xe, &map, val);
935 }
936 
xe_lrc_read_ctx_reg(struct xe_lrc * lrc,int reg_nr)937 u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
938 {
939 	struct xe_device *xe = lrc_to_xe(lrc);
940 	struct iosys_map map;
941 
942 	map = __xe_lrc_regs_map(lrc);
943 	iosys_map_incr(&map, reg_nr * sizeof(u32));
944 	return xe_map_read32(xe, &map);
945 }
946 
xe_lrc_write_ctx_reg(struct xe_lrc * lrc,int reg_nr,u32 val)947 void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
948 {
949 	struct xe_device *xe = lrc_to_xe(lrc);
950 	struct iosys_map map;
951 
952 	map = __xe_lrc_regs_map(lrc);
953 	iosys_map_incr(&map, reg_nr * sizeof(u32));
954 	xe_map_write32(xe, &map, val);
955 }
956 
/*
 * Build a zeroed default LRC image for @hwe in kernel memory, with the
 * register area (and indirect ring state, when present) pre-populated.
 * Caller owns the returned buffer (kfree); returns NULL on allocation failure.
 */
static void *empty_lrc_data(struct xe_hw_engine *hwe)
{
	struct xe_gt *gt = hwe->gt;
	void *data;
	u32 *regs;

	data = kzalloc(xe_gt_lrc_size(gt, hwe->class), GFP_KERNEL);
	if (!data)
		return NULL;

	/* 1st page: Per-Process of HW status Page */
	regs = data + LRC_PPHWSP_SIZE;
	set_offsets(regs, reg_offsets(gt_to_xe(gt), hwe->class), hwe);
	set_context_control(regs, hwe);
	set_memory_based_intr(regs, hwe);
	reset_stop_ring(regs, hwe);
	if (xe_gt_has_indirect_ring_state(gt)) {
		/* Indirect ring state occupies the last page of the image */
		regs = data + xe_gt_lrc_size(gt, hwe->class) -
		       LRC_INDIRECT_RING_STATE_SIZE;
		set_offsets(regs, xe2_indirect_ring_state_offsets, hwe);
	}

	return data;
}
981 
/**
 * xe_default_lrc_update_memirq_regs_with_address - Re-compute GGTT references in default LRC
 * of given engine.
 * @hwe: the &xe_hw_engine struct instance
 */
void xe_default_lrc_update_memirq_regs_with_address(struct xe_hw_engine *hwe)
{
	struct xe_gt *gt = hwe->gt;
	u32 *regs;

	/* Nothing to patch if no default LRC was captured for this class */
	if (!gt->default_lrc[hwe->class])
		return;

	regs = gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE;
	set_memory_based_intr(regs, hwe);
}
998 
999 /**
1000  * xe_lrc_update_memirq_regs_with_address - Re-compute GGTT references in mem interrupt data
1001  * for given LRC.
1002  * @lrc: the &xe_lrc struct instance
1003  * @hwe: the &xe_hw_engine struct instance
1004  * @regs: scratch buffer to be used as temporary storage
1005  */
xe_lrc_update_memirq_regs_with_address(struct xe_lrc * lrc,struct xe_hw_engine * hwe,u32 * regs)1006 void xe_lrc_update_memirq_regs_with_address(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
1007 					    u32 *regs)
1008 {
1009 	struct xe_gt *gt = hwe->gt;
1010 	struct iosys_map map;
1011 	size_t regs_len;
1012 
1013 	if (!xe_device_uses_memirq(gt_to_xe(gt)))
1014 		return;
1015 
1016 	map = __xe_lrc_regs_map(lrc);
1017 	regs_len = xe_lrc_reg_size(gt_to_xe(gt));
1018 	xe_map_memcpy_from(gt_to_xe(gt), regs, &map, 0, regs_len);
1019 	set_memory_based_intr(regs, hwe);
1020 	xe_map_memcpy_to(gt_to_xe(gt), &map, 0, regs, regs_len);
1021 }
1022 
/* Point the context's PDP0 entry at @vm's page-table root for this tile. */
static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
{
	u64 desc = xe_vm_pdp4_descriptor(vm, gt_to_tile(lrc->gt));

	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
}
1030 
/* Tear down the LRC: release its fence context and unpin/unmap the BO. */
static void xe_lrc_finish(struct xe_lrc *lrc)
{
	xe_hw_fence_ctx_finish(&lrc->fence_ctx);
	xe_bo_unpin_map_no_vm(lrc->bo);
}
1036 
1037 /*
1038  * wa_bb_setup_utilization() - Write commands to wa bb to assist
1039  * in calculating active context run ticks.
1040  *
1041  * Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the
1042  * context, but only gets updated when the context switches out. In order to
1043  * check how long a context has been active before it switches out, two things
1044  * are required:
1045  *
1046  * (1) Determine if the context is running:
1047  * To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in
1048  * the LRC. The value chosen is 1 since 0 is the initial value when the LRC is
1049  * initialized. During a query, we just check for this value to determine if the
1050  * context is active. If the context switched out, it would overwrite this
1051  * location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as
1052  * the last part of context restore, so reusing this LRC location will not
1053  * clobber anything.
1054  *
1055  * (2) Calculate the time that the context has been active for:
1056  * The CTX_TIMESTAMP ticks only when the context is active. If a context is
1057  * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization.
1058  * While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific
1059  * engine instance. Since we do not know which instance the context is running
1060  * on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and
 * store it in the PPHWSP.
1062  */
1063 #define CONTEXT_ACTIVE 1ULL
static ssize_t setup_utilization_wa(struct xe_lrc *lrc,
				    struct xe_hw_engine *hwe,
				    u32 *batch,
				    size_t max_len)
{
	u32 *cmd = batch;

	/* VFs emit nothing for this workaround. */
	if (IS_SRIOV_VF(gt_to_xe(lrc->gt)))
		return 0;

	/* Worst-case emission below: 8 dwords + optional 4 for the UDW. */
	if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
		return -ENOSPC;

	/* Save the engine id into the PPHWSP, see (2) in the comment above. */
	*cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET;
	*cmd++ = ENGINE_ID(0).addr;
	*cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc);
	*cmd++ = 0;

	/* Seed CTX_TIMESTAMP in the LRC to mark the context active, see (1). */
	*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
	*cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
	*cmd++ = 0;
	*cmd++ = lower_32_bits(CONTEXT_ACTIVE);

	/* Platforms with a 64-bit timestamp also need the upper dword seeded. */
	if (lrc_to_xe(lrc)->info.has_64bit_timestamp) {
		*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
		*cmd++ = __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
		*cmd++ = 0;
		*cmd++ = upper_32_bits(CONTEXT_ACTIVE);
	}

	/* Number of dwords emitted. */
	return cmd - batch;
}
1096 
/*
 * Emit the Wa_16010904313 sequence: reload RING_CTX_TIMESTAMP from the
 * value saved in the LRC. Only applies to the listed engine classes.
 */
static ssize_t setup_timestamp_wa(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
				  u32 *batch, size_t max_len)
{
	const u32 ts_addr = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
	u32 *cmd = batch;

	if (!XE_GT_WA(lrc->gt, 16010904313) ||
	    !(hwe->class == XE_ENGINE_CLASS_RENDER ||
	      hwe->class == XE_ENGINE_CLASS_COMPUTE ||
	      hwe->class == XE_ENGINE_CLASS_COPY ||
	      hwe->class == XE_ENGINE_CLASS_VIDEO_DECODE ||
	      hwe->class == XE_ENGINE_CLASS_VIDEO_ENHANCE))
		return 0;

	/* Three 4-dword MI_LOAD_REGISTER_MEM commands are emitted below. */
	if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
		return -ENOSPC;

	/*
	 * Load the register three times: two posted (ASYNC) loads followed by
	 * a final synchronizing one.
	 * NOTE(review): the triple-load shape presumably comes from the WA
	 * description — confirm against the workaround database/Bspec.
	 */
	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
		 MI_LRM_ASYNC;
	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
	*cmd++ = ts_addr;
	*cmd++ = 0;

	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
		 MI_LRM_ASYNC;
	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
	*cmd++ = ts_addr;
	*cmd++ = 0;

	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO;
	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
	*cmd++ = ts_addr;
	*cmd++ = 0;

	/* Number of dwords emitted. */
	return cmd - batch;
}
1133 
setup_configfs_post_ctx_restore_bb(struct xe_lrc * lrc,struct xe_hw_engine * hwe,u32 * batch,size_t max_len)1134 static ssize_t setup_configfs_post_ctx_restore_bb(struct xe_lrc *lrc,
1135 						  struct xe_hw_engine *hwe,
1136 						  u32 *batch, size_t max_len)
1137 {
1138 	struct xe_device *xe = gt_to_xe(lrc->gt);
1139 	const u32 *user_batch;
1140 	u32 *cmd = batch;
1141 	u32 count;
1142 
1143 	count = xe_configfs_get_ctx_restore_post_bb(to_pci_dev(xe->drm.dev),
1144 						    hwe->class, &user_batch);
1145 	if (!count)
1146 		return 0;
1147 
1148 	if (count > max_len)
1149 		return -ENOSPC;
1150 
1151 	/*
1152 	 * This should be used only for tests and validation. Taint the kernel
1153 	 * as anything could be submitted directly in context switches
1154 	 */
1155 	add_taint(TAINT_TEST, LOCKDEP_STILL_OK);
1156 
1157 	memcpy(cmd, user_batch, count * sizeof(u32));
1158 	cmd += count;
1159 
1160 	return cmd - batch;
1161 }
1162 
setup_configfs_mid_ctx_restore_bb(struct xe_lrc * lrc,struct xe_hw_engine * hwe,u32 * batch,size_t max_len)1163 static ssize_t setup_configfs_mid_ctx_restore_bb(struct xe_lrc *lrc,
1164 						 struct xe_hw_engine *hwe,
1165 						 u32 *batch, size_t max_len)
1166 {
1167 	struct xe_device *xe = gt_to_xe(lrc->gt);
1168 	const u32 *user_batch;
1169 	u32 *cmd = batch;
1170 	u32 count;
1171 
1172 	count = xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
1173 						   hwe->class, &user_batch);
1174 	if (!count)
1175 		return 0;
1176 
1177 	if (count > max_len)
1178 		return -ENOSPC;
1179 
1180 	/*
1181 	 * This should be used only for tests and validation. Taint the kernel
1182 	 * as anything could be submitted directly in context switches
1183 	 */
1184 	add_taint(TAINT_TEST, LOCKDEP_STILL_OK);
1185 
1186 	memcpy(cmd, user_batch, count * sizeof(u32));
1187 	cmd += count;
1188 
1189 	return cmd - batch;
1190 }
1191 
/*
 * Emit the Wa_18022495364 sequence for the render engine: a masked LRI
 * that triggers an instruction state cache invalidation.
 */
static ssize_t setup_invalidate_state_cache_wa(struct xe_lrc *lrc,
					       struct xe_hw_engine *hwe,
					       u32 *batch, size_t max_len)
{
	u32 *cmd = batch;

	if (!XE_GT_WA(lrc->gt, 18022495364) ||
	    hwe->class != XE_ENGINE_CLASS_RENDER)
		return 0;

	/* One 3-dword MI_LOAD_REGISTER_IMM is emitted below. */
	if (xe_gt_WARN_ON(lrc->gt, max_len < 3))
		return -ENOSPC;

	*cmd++ = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1);
	*cmd++ = CS_DEBUG_MODE2(0).addr;
	*cmd++ = _MASKED_BIT_ENABLE(INSTRUCTION_STATE_CACHE_INVALIDATE);

	/* Number of dwords emitted. */
	return cmd - batch;
}
1211 
/*
 * A single emission callback for setup_bo(). @setup writes at most
 * @max_size dwords of commands into @batch and returns the number of
 * dwords written, or a negative error code.
 */
struct bo_setup {
	ssize_t (*setup)(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
			 u32 *batch, size_t max_size);
};
1216 
/* Shared context for running a sequence of bo_setup callbacks. */
struct bo_setup_state {
	/* Input: */
	struct xe_lrc		*lrc;
	struct xe_hw_engine	*hwe;
	size_t			max_size;	/* total space in bytes */
	size_t                  reserve_dw;	/* dwords to keep unused for the caller */
	unsigned int		offset;		/* byte offset into lrc->bo */
	const struct bo_setup	*funcs;
	unsigned int		num_funcs;

	/* State: */
	u32			*buffer;	/* CPU staging buffer (iomem BOs only) */
	u32			*ptr;		/* current write position */
	unsigned int		written;	/* dwords emitted so far */
};
1232 
/*
 * Run every setup callback in @state->funcs in order, accumulating the
 * emitted dwords either directly into the BO's CPU mapping or into the
 * caller-provided staging buffer when the BO is in iomem.
 *
 * Return: 0 on success, -ENOSPC when a callback failed or the caller's
 * reserve_dw could not be honoured.
 */
static int setup_bo(struct bo_setup_state *state)
{
	ssize_t remain;

	/* iomem BOs must be staged through a CPU buffer (see finish_bo()). */
	if (state->lrc->bo->vmap.is_iomem) {
		xe_gt_assert(state->hwe->gt, state->buffer);
		state->ptr = state->buffer;
	} else {
		state->ptr = state->lrc->bo->vmap.vaddr + state->offset;
	}

	remain = state->max_size / sizeof(u32);

	for (size_t i = 0; i < state->num_funcs; i++) {
		ssize_t len = state->funcs[i].setup(state->lrc, state->hwe,
						    state->ptr, remain);

		/* May "grow" remain when len < 0; the check below catches it. */
		remain -= len;

		/*
		 * Caller has asked for at least reserve_dw to remain unused.
		 */
		if (len < 0 ||
		    xe_gt_WARN_ON(state->lrc->gt, remain < state->reserve_dw))
			goto fail;

		state->ptr += len;
		state->written += len;
	}

	return 0;

fail:
	return -ENOSPC;
}
1268 
finish_bo(struct bo_setup_state * state)1269 static void finish_bo(struct bo_setup_state *state)
1270 {
1271 	if (!state->lrc->bo->vmap.is_iomem)
1272 		return;
1273 
1274 	xe_map_memcpy_to(gt_to_xe(state->lrc->gt), &state->lrc->bo->vmap,
1275 			 state->offset, state->buffer,
1276 			 state->written * sizeof(u32));
1277 }
1278 
1279 /**
1280  * xe_lrc_setup_wa_bb_with_scratch - Execute all wa bb setup callbacks.
1281  * @lrc: the &xe_lrc struct instance
1282  * @hwe: the &xe_hw_engine struct instance
1283  * @scratch: preallocated scratch buffer for temporary storage
1284  * Return: 0 on success, negative error code on failure
1285  */
int xe_lrc_setup_wa_bb_with_scratch(struct xe_lrc *lrc, struct xe_hw_engine *hwe, u32 *scratch)
{
	static const struct bo_setup funcs[] = {
		{ .setup = setup_timestamp_wa },
		{ .setup = setup_invalidate_state_cache_wa },
		{ .setup = setup_utilization_wa },
		{ .setup = setup_configfs_post_ctx_restore_bb },
	};
	struct bo_setup_state state = {
		.lrc = lrc,
		.hwe = hwe,
		.max_size = LRC_WA_BB_SIZE,
		.buffer = scratch,
		.reserve_dw = 1,	/* keep one dword for MI_BATCH_BUFFER_END below */
		.offset = __xe_lrc_wa_bb_offset(lrc),
		.funcs = funcs,
		.num_funcs = ARRAY_SIZE(funcs),
	};
	int ret;

	ret = setup_bo(&state);
	if (ret)
		return ret;

	/* Guaranteed to fit thanks to .reserve_dw above. */
	*state.ptr++ = MI_BATCH_BUFFER_END;
	state.written++;

	finish_bo(&state);

	/*
	 * NOTE(review): the "+ 1" sets the low bit of the pointer, which
	 * looks like the valid/enable bit of BB_PER_CTX_PTR — confirm
	 * against Bspec.
	 */
	xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
			     xe_bo_ggtt_addr(lrc->bo) + state.offset + 1);

	return 0;
}
1320 
setup_wa_bb(struct xe_lrc * lrc,struct xe_hw_engine * hwe)1321 static int setup_wa_bb(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
1322 {
1323 	u32 *buf = NULL;
1324 	int ret;
1325 
1326 	if (lrc->bo->vmap.is_iomem) {
1327 		buf = kmalloc(LRC_WA_BB_SIZE, GFP_KERNEL);
1328 		if (!buf)
1329 			return -ENOMEM;
1330 	}
1331 
1332 	ret = xe_lrc_setup_wa_bb_with_scratch(lrc, hwe, buf);
1333 
1334 	kfree(buf);
1335 
1336 	return ret;
1337 }
1338 
/*
 * Build the INDIRECT_CTX batch (run by the CS in the middle of context
 * restore) and enable it in the context image. No-op unless the LRC was
 * created with XE_LRC_FLAG_INDIRECT_CTX.
 */
static int
setup_indirect_ctx(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
{
	static const struct bo_setup rcs_funcs[] = {
		{ .setup = setup_timestamp_wa },
		{ .setup = setup_configfs_mid_ctx_restore_bb },
	};
	static const struct bo_setup xcs_funcs[] = {
		{ .setup = setup_configfs_mid_ctx_restore_bb },
	};
	struct bo_setup_state state = {
		.lrc = lrc,
		.hwe = hwe,
		.max_size = (63 * 64) /* max 63 cachelines */,
		.buffer = NULL,
		.offset = __xe_lrc_indirect_ctx_offset(lrc),
	};
	int ret;

	if (!(lrc->flags & XE_LRC_FLAG_INDIRECT_CTX))
		return 0;

	/* Render/compute additionally get the timestamp WA. */
	if (hwe->class == XE_ENGINE_CLASS_RENDER ||
	    hwe->class == XE_ENGINE_CLASS_COMPUTE) {
		state.funcs = rcs_funcs;
		state.num_funcs = ARRAY_SIZE(rcs_funcs);
	} else {
		state.funcs = xcs_funcs;
		state.num_funcs = ARRAY_SIZE(xcs_funcs);
	}

	if (xe_gt_WARN_ON(lrc->gt, !state.funcs))
		return 0;

	/* iomem BOs are staged through a CPU buffer (see setup_bo()). */
	if (lrc->bo->vmap.is_iomem) {
		state.buffer = kmalloc(state.max_size, GFP_KERNEL);
		if (!state.buffer)
			return -ENOMEM;
	}

	ret = setup_bo(&state);
	if (ret) {
		kfree(state.buffer);
		return ret;
	}

	/*
	 * Align to 64B cacheline so there's no garbage at the end for CS to
	 * execute: size for indirect ctx must be a multiple of 64.
	 */
	while (state.written & 0xf) {	/* 16 dwords == 64 bytes */
		*state.ptr++ = MI_NOOP;
		state.written++;
	}

	finish_bo(&state);
	kfree(state.buffer);

	/*
	 * Enable INDIRECT_CTX leaving INDIRECT_CTX_OFFSET at its default: it
	 * varies per engine class, but the default is good enough
	 */
	xe_lrc_write_ctx_reg(lrc,
			     CTX_CS_INDIRECT_CTX,
			     (xe_bo_ggtt_addr(lrc->bo) + state.offset) |
			     /* Size in CLs. */
			     (state.written * sizeof(u32) / 64));

	return 0;
}
1409 
xe_multi_queue_prio_to_lrc(struct xe_lrc * lrc,enum xe_multi_queue_priority priority)1410 static u8 xe_multi_queue_prio_to_lrc(struct xe_lrc *lrc, enum xe_multi_queue_priority priority)
1411 {
1412 	struct xe_device *xe = gt_to_xe(lrc->gt);
1413 
1414 	xe_assert(xe, (priority >= XE_MULTI_QUEUE_PRIORITY_LOW &&
1415 		       priority <= XE_MULTI_QUEUE_PRIORITY_HIGH));
1416 
1417 	/* xe_multi_queue_priority is directly mapped to LRC priority values */
1418 	return priority;
1419 }
1420 
1421 /**
1422  * xe_lrc_set_multi_queue_priority() - Set multi queue priority in LRC
1423  * @lrc: Logical Ring Context
1424  * @priority: Multi queue priority of the exec queue
1425  *
1426  * Convert @priority to LRC multi queue priority and update the @lrc descriptor
1427  */
xe_lrc_set_multi_queue_priority(struct xe_lrc * lrc,enum xe_multi_queue_priority priority)1428 void xe_lrc_set_multi_queue_priority(struct xe_lrc *lrc, enum xe_multi_queue_priority priority)
1429 {
1430 	lrc->desc &= ~LRC_PRIORITY;
1431 	lrc->desc |= FIELD_PREP(LRC_PRIORITY, xe_multi_queue_prio_to_lrc(lrc, priority));
1432 }
1433 
/*
 * Initialize @lrc: allocate and pin the backing BO (ring + context image +
 * WA BB [+ optional indirect ctx]), seed the context image, and program
 * the ring registers and the context descriptor.
 *
 * Return: 0 on success, negative error code on failure. On failure all
 * resources acquired here are released via xe_lrc_finish().
 */
static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
		       struct xe_vm *vm, void *replay_state, u32 ring_size,
		       u16 msix_vec,
		       u32 init_flags)
{
	struct xe_gt *gt = hwe->gt;
	const u32 lrc_size = xe_gt_lrc_size(gt, hwe->class);
	u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
	struct xe_tile *tile = gt_to_tile(gt);
	struct xe_device *xe = gt_to_xe(gt);
	struct iosys_map map;
	u32 arb_enable;
	u32 bo_flags;
	int err;

	kref_init(&lrc->refcount);
	lrc->gt = gt;
	lrc->replay_size = xe_gt_lrc_hang_replay_size(gt, hwe->class);
	lrc->size = lrc_size;
	lrc->flags = 0;
	lrc->ring.size = ring_size;
	lrc->ring.tail = 0;

	/* Engines needing an indirect ctx batch get an extra region in the BO. */
	if (gt_engine_needs_indirect_ctx(gt, hwe->class)) {
		lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
		bo_size += LRC_INDIRECT_CTX_BO_SIZE;
	}

	if (xe_gt_has_indirect_ring_state(gt))
		lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;

	bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
		   XE_BO_FLAG_GGTT_INVALIDATE;

	if ((vm && vm->xef) || init_flags & XE_LRC_CREATE_USER_CTX) /* userspace */
		bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE | XE_BO_FLAG_FORCE_USER_VRAM;

	lrc->bo = xe_bo_create_pin_map_novm(xe, tile,
					    bo_size,
					    ttm_bo_type_kernel,
					    bo_flags, false);
	if (IS_ERR(lrc->bo))
		return PTR_ERR(lrc->bo);

	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
			     hwe->fence_irq, hwe->name);

	/*
	 * Init Per-Process of HW status Page, LRC / context state to known
	 * values. If there's already a primed default_lrc, just copy it, otherwise
	 * it's the early submission to record the lrc: build a new empty one from
	 * scratch.
	 */
	map = __xe_lrc_pphwsp_map(lrc);
	if (gt->default_lrc[hwe->class] || replay_state) {
		xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE);	/* PPHWSP */
		xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
				 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
				 lrc_size - LRC_PPHWSP_SIZE);
		/* A hang replay state overrides the default context image. */
		if (replay_state)
			xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
					 replay_state, lrc->replay_size);
	} else {
		void *init_data = empty_lrc_data(hwe);

		if (!init_data) {
			err = -ENOMEM;
			goto err_lrc_finish;
		}

		xe_map_memcpy_to(xe, &map, 0, init_data, lrc_size);
		kfree(init_data);
	}

	if (vm) {
		xe_lrc_set_ppgtt(lrc, vm);

		if (vm->xef)
			xe_drm_client_add_bo(vm->xef->client, lrc->bo);
	}

	/* Memory-based interrupt pointers for MSI-X capable devices. */
	if (xe_device_has_msix(xe)) {
		xe_lrc_write_ctx_reg(lrc, CTX_INT_STATUS_REPORT_PTR,
				     xe_memirq_status_ptr(&tile->memirq, hwe));
		xe_lrc_write_ctx_reg(lrc, CTX_INT_SRC_REPORT_PTR,
				     xe_memirq_source_ptr(&tile->memirq, hwe));
		xe_lrc_write_ctx_reg(lrc, CTX_CS_INT_VEC_DATA, msix_vec << 16 | msix_vec);
	}

	/* Ring registers live either in the indirect ring state or the LRC. */
	if (xe_gt_has_indirect_ring_state(gt)) {
		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
				     __xe_lrc_indirect_ring_ggtt_addr(lrc));

		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
					      __xe_lrc_ring_ggtt_addr(lrc));
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START_UDW, 0);
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, 0);
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, lrc->ring.tail);
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_CTL,
					      RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
	} else {
		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, 0);
		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);
		xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
				     RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
	}

	if (init_flags & XE_LRC_CREATE_RUNALONE)
		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
				     _MASKED_BIT_ENABLE(CTX_CTRL_RUN_ALONE));

	if (init_flags & XE_LRC_CREATE_PXP)
		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
				     _MASKED_BIT_ENABLE(CTX_CTRL_PXP_ENABLE));

	/* Start with a clean run-tick accumulator (see setup_utilization_wa()). */
	lrc->ctx_timestamp = 0;
	xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0);
	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
		xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP_UDW, 0);

	if (xe->info.has_asid && vm)
		xe_lrc_write_ctx_reg(lrc, CTX_ASID, vm->usm.asid);

	lrc->desc = LRC_VALID;
	lrc->desc |= FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT);
	/* TODO: Priority */

	/* While this appears to have something about privileged batches or
	 * some such, it really just means PPGTT mode.
	 */
	if (vm)
		lrc->desc |= LRC_PRIVILEGE;

	/* Pre-Xe_HP platforms encode the engine in the descriptor itself. */
	if (GRAPHICS_VERx100(xe) < 1250) {
		lrc->desc |= FIELD_PREP(LRC_ENGINE_INSTANCE, hwe->instance);
		lrc->desc |= FIELD_PREP(LRC_ENGINE_CLASS, hwe->class);
	}

	/* Make sure arbitration is enabled from the very start of the ring. */
	arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
	xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));

	/* Seed seqnos to "previous seqno already signaled". */
	map = __xe_lrc_seqno_map(lrc);
	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);

	map = __xe_lrc_start_seqno_map(lrc);
	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);

	err = setup_wa_bb(lrc, hwe);
	if (err)
		goto err_lrc_finish;

	err = setup_indirect_ctx(lrc, hwe);
	if (err)
		goto err_lrc_finish;

	return 0;

err_lrc_finish:
	xe_lrc_finish(lrc);
	return err;
}
1598 
1599 /**
1600  * xe_lrc_create - Create a LRC
1601  * @hwe: Hardware Engine
1602  * @vm: The VM (address space)
1603  * @replay_state: GPU hang replay state
1604  * @ring_size: LRC ring size
1605  * @msix_vec: MSI-X interrupt vector (for platforms that support it)
1606  * @flags: LRC initialization flags
1607  *
1608  * Allocate and initialize the Logical Ring Context (LRC).
1609  *
1610  * Return pointer to created LRC upon success and an error pointer
1611  * upon failure.
1612  */
struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
			     void *replay_state, u32 ring_size, u16 msix_vec, u32 flags)
{
	struct xe_lrc *lrc;
	int err;

	lrc = kzalloc_obj(*lrc);
	if (!lrc)
		return ERR_PTR(-ENOMEM);

	err = xe_lrc_init(lrc, hwe, vm, replay_state, ring_size, msix_vec, flags);
	if (!err)
		return lrc;

	/* Init failed and already tore down what it acquired; just free. */
	kfree(lrc);
	return ERR_PTR(err);
}
1631 
1632 /**
1633  * xe_lrc_destroy - Destroy the LRC
1634  * @ref: reference to LRC
1635  *
1636  * Called when ref == 0, release resources held by the Logical Ring Context
1637  * (LRC) and free the LRC memory.
1638  */
void xe_lrc_destroy(struct kref *ref)
{
	/* kref release callback: recover the LRC from its embedded refcount. */
	struct xe_lrc *lrc = container_of(ref, struct xe_lrc, refcount);

	xe_lrc_finish(lrc);
	kfree(lrc);
}
1646 
1647 /**
1648  * xe_lrc_update_hwctx_regs_with_address - Re-compute GGTT references within given LRC.
1649  * @lrc: the &xe_lrc struct instance
1650  */
xe_lrc_update_hwctx_regs_with_address(struct xe_lrc * lrc)1651 void xe_lrc_update_hwctx_regs_with_address(struct xe_lrc *lrc)
1652 {
1653 	if (xe_lrc_has_indirect_ring_state(lrc)) {
1654 		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
1655 				     __xe_lrc_indirect_ring_ggtt_addr(lrc));
1656 
1657 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
1658 					      __xe_lrc_ring_ggtt_addr(lrc));
1659 	} else {
1660 		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
1661 	}
1662 }
1663 
/* RING_TAIL lives in the indirect ring state when the GT supports it. */
void xe_lrc_set_ring_tail(struct xe_lrc *lrc, u32 tail)
{
	if (!xe_lrc_has_indirect_ring_state(lrc))
		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, tail);
	else
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, tail);
}
1671 
xe_lrc_ring_tail(struct xe_lrc * lrc)1672 u32 xe_lrc_ring_tail(struct xe_lrc *lrc)
1673 {
1674 	if (xe_lrc_has_indirect_ring_state(lrc))
1675 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL) & TAIL_ADDR;
1676 	else
1677 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL) & TAIL_ADDR;
1678 }
1679 
xe_lrc_ring_start(struct xe_lrc * lrc)1680 static u32 xe_lrc_ring_start(struct xe_lrc *lrc)
1681 {
1682 	if (xe_lrc_has_indirect_ring_state(lrc))
1683 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START);
1684 	else
1685 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_START);
1686 }
1687 
/* RING_HEAD lives in the indirect ring state when the GT supports it. */
void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
{
	if (!xe_lrc_has_indirect_ring_state(lrc))
		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
	else
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, head);
}
1695 
xe_lrc_ring_head(struct xe_lrc * lrc)1696 u32 xe_lrc_ring_head(struct xe_lrc *lrc)
1697 {
1698 	if (xe_lrc_has_indirect_ring_state(lrc))
1699 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD) & HEAD_ADDR;
1700 	else
1701 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
1702 }
1703 
xe_lrc_ring_space(struct xe_lrc * lrc)1704 u32 xe_lrc_ring_space(struct xe_lrc *lrc)
1705 {
1706 	const u32 head = xe_lrc_ring_head(lrc);
1707 	const u32 tail = lrc->ring.tail;
1708 	const u32 size = lrc->ring.size;
1709 
1710 	return ((head - tail - 1) & (size - 1)) + 1;
1711 }
1712 
/* Copy @size bytes at the current tail and advance it, wrapping via mask. */
static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
				const void *data, size_t size)
{
	iosys_map_incr(&ring, lrc->ring.tail);
	xe_map_memcpy_to(lrc_to_xe(lrc), &ring, 0, data, size);
	lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
}
1722 
/*
 * Copy @size bytes of commands into the ring at the software tail,
 * splitting the copy at the end of the ring buffer and padding
 * odd-dword writes to a qword boundary with MI_NOOP.
 */
void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map ring;
	u32 rhs;
	size_t aligned_size;

	xe_assert(xe, IS_ALIGNED(size, 4));
	aligned_size = ALIGN(size, 8);

	ring = __xe_lrc_ring_map(lrc);

	xe_assert(xe, lrc->ring.tail < lrc->ring.size);
	rhs = lrc->ring.size - lrc->ring.tail;	/* room before the wrap point */
	if (size > rhs) {
		/* Split the copy across the ring boundary. */
		__xe_lrc_write_ring(lrc, ring, data, rhs);
		__xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
	} else {
		__xe_lrc_write_ring(lrc, ring, data, size);
	}

	/* Pad to a qword with a single MI_NOOP when needed. */
	if (aligned_size > size) {
		u32 noop = MI_NOOP;

		__xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
	}
}
1750 
/* Full context descriptor: flag bits in lrc->desc plus the LRC GGTT address. */
u64 xe_lrc_descriptor(struct xe_lrc *lrc)
{
	return lrc->desc | xe_lrc_ggtt_addr(lrc);
}
1755 
/* GGTT address of the fence seqno slot within the LRC BO. */
u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_seqno_ggtt_addr(lrc);
}
1760 
1761 /**
1762  * xe_lrc_alloc_seqno_fence() - Allocate an lrc seqno fence.
1763  *
1764  * Allocate but don't initialize an lrc seqno fence.
1765  *
1766  * Return: Pointer to the allocated fence or
1767  * negative error pointer on error.
1768  */
struct dma_fence *xe_lrc_alloc_seqno_fence(void)
{
	/* Allocation only; initialization happens in xe_lrc_init_seqno_fence(). */
	return xe_hw_fence_alloc();
}
1773 
1774 /**
1775  * xe_lrc_free_seqno_fence() - Free an lrc seqno fence.
1776  * @fence: Pointer to the fence to free.
1777  *
1778  * Frees an lrc seqno fence that hasn't yet been
1779  * initialized.
1780  */
void xe_lrc_free_seqno_fence(struct dma_fence *fence)
{
	/* Counterpart to xe_lrc_alloc_seqno_fence() for never-initialized fences. */
	xe_hw_fence_free(fence);
}
1785 
1786 /**
1787  * xe_lrc_init_seqno_fence() - Initialize an lrc seqno fence.
1788  * @lrc: Pointer to the lrc.
1789  * @fence: Pointer to the fence to initialize.
1790  *
1791  * Initializes a pre-allocated lrc seqno fence.
1792  * After initialization, the fence is subject to normal
1793  * dma-fence refcounting.
1794  */
void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence)
{
	/* Bind the fence to this LRC's fence context and its seqno slot. */
	xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc));
}
1799 
xe_lrc_seqno(struct xe_lrc * lrc)1800 s32 xe_lrc_seqno(struct xe_lrc *lrc)
1801 {
1802 	struct iosys_map map = __xe_lrc_seqno_map(lrc);
1803 
1804 	return xe_map_read32(lrc_to_xe(lrc), &map);
1805 }
1806 
xe_lrc_start_seqno(struct xe_lrc * lrc)1807 s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
1808 {
1809 	struct iosys_map map = __xe_lrc_start_seqno_map(lrc);
1810 
1811 	return xe_map_read32(lrc_to_xe(lrc), &map);
1812 }
1813 
/* GGTT address of the start-seqno slot within the LRC BO. */
u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_start_seqno_ggtt_addr(lrc);
}
1818 
/* GGTT address of the parallel-submission scratch area within the LRC BO. */
u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_parallel_ggtt_addr(lrc);
}
1823 
/* CPU mapping of the parallel-submission scratch area within the LRC BO. */
struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
{
	return __xe_lrc_parallel_map(lrc);
}
1828 
1829 /**
1830  * xe_lrc_engine_id() - Read engine id value
1831  * @lrc: Pointer to the lrc.
1832  *
1833  * Returns: context id value
1834  */
xe_lrc_engine_id(struct xe_lrc * lrc)1835 static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
1836 {
1837 	struct xe_device *xe = lrc_to_xe(lrc);
1838 	struct iosys_map map;
1839 
1840 	map = __xe_lrc_engine_id_map(lrc);
1841 	return xe_map_read32(xe, &map);
1842 }
1843 
/*
 * Return the length in dwords of the instruction whose header is
 * @cmd_header, accounting for the few encodings that deviate from the
 * common "length minus 2 in bits 7:0" rule.
 */
static int instr_dw(u32 cmd_header)
{
	/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
	if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
	    GFXPIPE_SINGLE_DW_CMD(0, 0))
		return 1;

	/* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
	if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
		return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;

	/* Most instructions have the # of dwords (minus 2) in 7:0 */
	return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
}
1858 
/*
 * Pretty-print the MI_* instruction at @dw and return the number of
 * dwords it occupies, so the caller can advance past it. @remaining_dw
 * bounds how far past @dw this function may read.
 */
static int dump_mi_command(struct drm_printer *p,
			   struct xe_gt *gt,
			   u32 *dw,
			   int remaining_dw)
{
	u32 inst_header = *dw;
	u32 numdw = instr_dw(inst_header);
	u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
	int num_noop;

	/* First check for commands that don't have/use a '# DW' field */
	switch (inst_header & MI_OPCODE) {
	case MI_NOOP:
		/* Collapse a run of consecutive NOOPs into one output line. */
		num_noop = 1;
		while (num_noop < remaining_dw &&
		       (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
			num_noop++;
		drm_printf(p, "[%#010x] MI_NOOP (%d dwords)\n", inst_header, num_noop);
		return num_noop;

	case MI_TOPOLOGY_FILTER:
		drm_printf(p, "[%#010x] MI_TOPOLOGY_FILTER\n", inst_header);
		return 1;

	case MI_BATCH_BUFFER_END:
		drm_printf(p, "[%#010x] MI_BATCH_BUFFER_END\n", inst_header);
		/* Return 'remaining_dw' to consume the rest of the LRC */
		return remaining_dw;
	}

	/*
	 * Any remaining commands include a # of dwords.  We should make sure
	 * it doesn't exceed the remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (inst_header & MI_OPCODE) {
	case MI_LOAD_REGISTER_IMM:
		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
			   inst_header, (numdw - 1) / 2);
		for (int i = 1; i < numdw; i += 2)
			drm_printf(p, " - %#6x = %#010x\n", dw[i], dw[i + 1]);
		return numdw;

	/* MI_LOAD_REGISTER_MEM carries flag bits; mask down to the opcode. */
	case MI_LOAD_REGISTER_MEM & MI_OPCODE:
		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_MEM: %s%s\n",
			   inst_header,
			   dw[0] & MI_LRI_LRM_CS_MMIO ? "CS_MMIO " : "",
			   dw[0] & MI_LRM_USE_GGTT ? "USE_GGTT " : "");
		if (numdw == 4)
			drm_printf(p, " - %#6x = %#010llx\n",
				   dw[1], ((u64)(dw[3]) << 32 | (u64)(dw[2])));
		else
			drm_printf(p, " - %*ph (%s)\n",
				   (int)sizeof(u32) * (numdw - 1), dw + 1,
				   numdw < 4 ? "truncated" : "malformed");
		return numdw;

	case MI_FORCE_WAKEUP:
		drm_printf(p, "[%#010x] MI_FORCE_WAKEUP\n", inst_header);
		return numdw;

	default:
		drm_printf(p, "[%#010x] unknown MI opcode %#x, likely %d dwords\n",
			   inst_header, opcode, numdw);
		return numdw;
	}
}
1928 
/*
 * Decode and print one GFXPIPE instruction from an LRC dump.
 *
 * Prints a single line describing the instruction at @dw and returns the
 * number of dwords the instruction occupies, clamped to @remaining_dw so a
 * mis-parsed length can never run past the end of the LRC.
 */
static int dump_gfxpipe_command(struct drm_printer *p,
				struct xe_gt *gt,
				u32 *dw,
				int remaining_dw)
{
	u32 numdw = instr_dw(*dw);
	u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
	u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
	u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);

	/*
	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
	 * remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (*dw & GFXPIPE_MATCH_MASK) {
	/*
	 * NOTE: MATCH/MATCH3D are deliberately not #undef'd at the end of this
	 * function; MATCH is reused by dump_gfx_state_command() below.
	 */
#define MATCH(cmd) \
	case cmd: \
		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
		return numdw
#define MATCH3D(cmd) \
	case CMD_##cmd: \
		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
		return numdw

	MATCH(STATE_BASE_ADDRESS);
	MATCH(STATE_SIP);
	MATCH(GPGPU_CSR_BASE_ADDRESS);
	MATCH(STATE_COMPUTE_MODE);
	MATCH3D(3DSTATE_BTD);
	MATCH(STATE_SYSTEM_MEM_FENCE_ADDRESS);
	MATCH(STATE_CONTEXT_DATA_BASE_ADDRESS);

	MATCH3D(3DSTATE_VF_STATISTICS);

	MATCH(PIPELINE_SELECT);

	MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST);
	MATCH3D(3DSTATE_CLEAR_PARAMS);
	MATCH3D(3DSTATE_DEPTH_BUFFER);
	MATCH3D(3DSTATE_STENCIL_BUFFER);
	MATCH3D(3DSTATE_HIER_DEPTH_BUFFER);
	MATCH3D(3DSTATE_VERTEX_BUFFERS);
	MATCH3D(3DSTATE_VERTEX_ELEMENTS);
	MATCH3D(3DSTATE_INDEX_BUFFER);
	MATCH3D(3DSTATE_VF);
	MATCH3D(3DSTATE_MULTISAMPLE);
	MATCH3D(3DSTATE_CC_STATE_POINTERS);
	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS);
	MATCH3D(3DSTATE_VS);
	MATCH3D(3DSTATE_GS);
	MATCH3D(3DSTATE_CLIP);
	MATCH3D(3DSTATE_SF);
	MATCH3D(3DSTATE_WM);
	MATCH3D(3DSTATE_CONSTANT_VS);
	MATCH3D(3DSTATE_CONSTANT_GS);
	MATCH3D(3DSTATE_CONSTANT_PS);
	MATCH3D(3DSTATE_SAMPLE_MASK);
	MATCH3D(3DSTATE_CONSTANT_HS);
	MATCH3D(3DSTATE_CONSTANT_DS);
	MATCH3D(3DSTATE_HS);
	MATCH3D(3DSTATE_TE);
	MATCH3D(3DSTATE_DS);
	MATCH3D(3DSTATE_STREAMOUT);
	MATCH3D(3DSTATE_SBE);
	MATCH3D(3DSTATE_PS);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
	MATCH3D(3DSTATE_CPS_POINTERS);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC);
	MATCH3D(3DSTATE_BLEND_STATE_POINTERS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS);
	MATCH3D(3DSTATE_VF_INSTANCING);
	MATCH3D(3DSTATE_VF_SGVS);
	MATCH3D(3DSTATE_VF_TOPOLOGY);
	MATCH3D(3DSTATE_WM_CHROMAKEY);
	MATCH3D(3DSTATE_PS_BLEND);
	MATCH3D(3DSTATE_WM_DEPTH_STENCIL);
	MATCH3D(3DSTATE_PS_EXTRA);
	MATCH3D(3DSTATE_RASTER);
	MATCH3D(3DSTATE_SBE_SWIZ);
	MATCH3D(3DSTATE_WM_HZ_OP);
	MATCH3D(3DSTATE_VF_COMPONENT_PACKING);
	MATCH3D(3DSTATE_VF_SGVS_2);
	MATCH3D(3DSTATE_VFG);
	MATCH3D(3DSTATE_URB_ALLOC_VS);
	MATCH3D(3DSTATE_URB_ALLOC_HS);
	MATCH3D(3DSTATE_URB_ALLOC_DS);
	MATCH3D(3DSTATE_URB_ALLOC_GS);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_0);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_1);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_2);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_3);
	MATCH3D(3DSTATE_PRIMITIVE_REPLICATION);
	MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO);
	MATCH3D(3DSTATE_AMFS);
	MATCH3D(3DSTATE_DEPTH_BOUNDS);
	MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS);
	MATCH3D(3DSTATE_CONSTANT_TS_POINTER);
	MATCH3D(3DSTATE_MESH_CONTROL);
	MATCH3D(3DSTATE_MESH_DISTRIB);
	MATCH3D(3DSTATE_TASK_REDISTRIB);
	MATCH3D(3DSTATE_MESH_SHADER);
	MATCH3D(3DSTATE_MESH_SHADER_DATA);
	MATCH3D(3DSTATE_TASK_CONTROL);
	MATCH3D(3DSTATE_TASK_SHADER);
	MATCH3D(3DSTATE_TASK_SHADER_DATA);
	MATCH3D(3DSTATE_URB_ALLOC_MESH);
	MATCH3D(3DSTATE_URB_ALLOC_TASK);
	MATCH3D(3DSTATE_CLIP_MESH);
	MATCH3D(3DSTATE_SBE_MESH);
	MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER);
	MATCH3D(3DSTATE_COARSE_PIXEL);

	MATCH3D(3DSTATE_DRAWING_RECTANGLE);
	MATCH3D(3DSTATE_CHROMA_KEY);
	MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET);
	MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN);
	MATCH3D(3DSTATE_LINE_STIPPLE);
	MATCH3D(3DSTATE_AA_LINE_PARAMETERS);
	MATCH3D(3DSTATE_MONOFILTER_SIZE);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS);
	MATCH3D(3DSTATE_SO_DECL_LIST);
	MATCH3D(3DSTATE_SO_BUFFER);
	MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC);
	MATCH3D(3DSTATE_SAMPLE_PATTERN);
	MATCH3D(3DSTATE_3D_MODE);
	MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE);
	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS);
	MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO);

	default:
		drm_printf(p, "[%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n",
			   *dw, pipeline, opcode, subopcode, numdw);
		return numdw;
	}
}
2080 
/*
 * Decode and print one GFX_STATE instruction from an LRC dump.
 *
 * Relies on the MATCH() macro defined in dump_gfxpipe_command() above.
 * Returns the number of dwords the instruction occupies, clamped to
 * @remaining_dw.
 */
static int dump_gfx_state_command(struct drm_printer *p,
				  struct xe_gt *gt,
				  u32 *dw,
				  int remaining_dw)
{
	u32 numdw = instr_dw(*dw);
	u32 opcode = REG_FIELD_GET(GFX_STATE_OPCODE, *dw);

	/*
	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
	 * remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (*dw & (XE_INSTR_GFX_STATE | GFX_STATE_OPCODE)) {
	MATCH(STATE_WRITE_INLINE);

	default:
		drm_printf(p, "[%#010x] unknown GFX_STATE command (opcode=%#x), likely %d dwords\n",
			   *dw, opcode, numdw);
		return numdw;
	}
}
2105 
xe_lrc_dump_default(struct drm_printer * p,struct xe_gt * gt,enum xe_engine_class hwe_class)2106 void xe_lrc_dump_default(struct drm_printer *p,
2107 			 struct xe_gt *gt,
2108 			 enum xe_engine_class hwe_class)
2109 {
2110 	u32 *dw;
2111 	int remaining_dw, num_dw;
2112 
2113 	if (!gt->default_lrc[hwe_class]) {
2114 		drm_printf(p, "No default LRC for class %d\n", hwe_class);
2115 		return;
2116 	}
2117 
2118 	/*
2119 	 * Skip the beginning of the LRC since it contains the per-process
2120 	 * hardware status page.
2121 	 */
2122 	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
2123 	remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;
2124 
2125 	while (remaining_dw > 0) {
2126 		if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
2127 			num_dw = dump_mi_command(p, gt, dw, remaining_dw);
2128 		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
2129 			num_dw = dump_gfxpipe_command(p, gt, dw, remaining_dw);
2130 		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFX_STATE) {
2131 			num_dw = dump_gfx_state_command(p, gt, dw, remaining_dw);
2132 		} else {
2133 			num_dw = min(instr_dw(*dw), remaining_dw);
2134 			drm_printf(p, "[%#10x] Unknown instruction of type %#x, likely %d dwords\n",
2135 				   *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw),
2136 				   num_dw);
2137 		}
2138 
2139 		dw += num_dw;
2140 		remaining_dw -= num_dw;
2141 	}
2142 }
2143 
/*
 * Descriptor for one non-register (SVG) state instruction emitted into an
 * LRC: the raw GFXPIPE instruction header and its total length.
 */
struct instr_state {
	u32 instr;	/* GFXPIPE instruction header dword */
	u16 num_dw;	/* total instruction length in dwords, including the header */
};
2148 
/*
 * SVG (geometry pipeline) state instructions emitted for Wa_14019789679;
 * see the comment in xe_lrc_emit_hwe_state_instructions() for why these
 * headers must be written explicitly into the LRC.
 */
static const struct instr_state xe_hpg_svg_state[] = {
	{ .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 },
	{ .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_VS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_CLIP, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SF, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_RASTER, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_HS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 },
	{ .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TE, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_GS, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
};
2201 
/**
 * xe_lrc_emit_hwe_state_instructions() - Emit non-register state into an LRC
 * @q: exec queue whose engine class selects the state to emit
 * @cs: write pointer into the LRC/batch memory
 *
 * Emits the SVG state instruction headers required by Wa_14019789679 (see
 * the comment below).  Only each instruction's header dword is written; the
 * payload dwords are skipped over and keep whatever contents the buffer
 * already has.
 *
 * Return: updated write pointer, advanced past all emitted instructions
 */
u32 *xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, u32 *cs)
{
	struct xe_gt *gt = q->hwe->gt;
	struct xe_device *xe = gt_to_xe(gt);
	const struct instr_state *state_table = NULL;
	int state_table_size = 0;

	/*
	 * Wa_14019789679
	 *
	 * If the driver doesn't explicitly emit the SVG instructions while
	 * setting up the default LRC, the context switch will write 0's
	 * (noops) into the LRC memory rather than the expected instruction
	 * headers.  Application contexts start out as a copy of the default
	 * LRC, and if they also do not emit specific settings for some SVG
	 * state, then on context restore they'll unintentionally inherit
	 * whatever state setting the previous context had programmed into the
	 * hardware (i.e., the lack of a 3DSTATE_* instruction in the LRC will
	 * prevent the hardware from resetting that state back to any specific
	 * value).
	 *
	 * The official workaround only requires emitting 3DSTATE_MESH_CONTROL
	 * since that's a specific state setting that can easily cause GPU
	 * hangs if unintentionally inherited.  However to be safe we'll
	 * continue to emit all of the SVG state since it's best not to leak
	 * any of the state between contexts, even if that leakage is harmless.
	 */
	if (XE_GT_WA(gt, 14019789679) && q->hwe->class == XE_ENGINE_CLASS_RENDER) {
		state_table = xe_hpg_svg_state;
		state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
	}

	if (!state_table) {
		xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
			  GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
		return cs;
	}

	for (int i = 0; i < state_table_size; i++) {
		u32 instr = state_table[i].instr;
		u16 num_dw = state_table[i].num_dw;
		bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);

		/* Table entries must be GFXPIPE instructions with a sane length */
		xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
		xe_gt_assert(gt, num_dw != 0);
		xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));

		/*
		 * Xe2's SVG context is the same as the one on DG2 / MTL
		 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
		 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
		 * Just make the replacement here rather than defining a
		 * whole separate table for the single trivial change.
		 */
		if (GRAPHICS_VER(xe) >= 20 &&
		    instr == CMD_3DSTATE_DRAWING_RECTANGLE)
			instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;

		/* Multi-dword instructions encode (length - 2) in the header */
		*cs = instr;
		if (!is_single_dw)
			*cs |= (num_dw - 2);

		cs += num_dw;
	}

	return cs;
}
2269 
/**
 * xe_lrc_snapshot_capture() - Capture the software-visible state of an LRC
 * @lrc: the &struct xe_lrc to snapshot
 *
 * Records ring head/tail/start, seqnos, timestamps and GGTT addresses, and
 * takes a reference on the backing BO so its contents can be copied later by
 * xe_lrc_snapshot_capture_delayed().  Allocated with GFP_NOWAIT, so this
 * never sleeps waiting for memory.
 *
 * Return: newly allocated snapshot, or NULL on allocation failure.  The
 * caller owns the snapshot and must release it with xe_lrc_snapshot_free().
 */
struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
{
	struct xe_lrc_snapshot *snapshot = kmalloc_obj(*snapshot, GFP_NOWAIT);

	if (!snapshot)
		return NULL;

	snapshot->context_desc = xe_lrc_ggtt_addr(lrc);
	snapshot->ring_addr = __xe_lrc_ring_ggtt_addr(lrc);
	snapshot->indirect_context_desc = xe_lrc_indirect_ring_ggtt_addr(lrc);
	snapshot->head = xe_lrc_ring_head(lrc);
	/* Tail is captured twice: the driver-side copy and the in-memory LRC value */
	snapshot->tail.internal = lrc->ring.tail;
	snapshot->tail.memory = xe_lrc_ring_tail(lrc);
	snapshot->start = xe_lrc_ring_start(lrc);
	snapshot->start_seqno = xe_lrc_start_seqno(lrc);
	snapshot->seqno = xe_lrc_seqno(lrc);
	/* Hold a BO reference until the delayed capture (or free) drops it */
	snapshot->lrc_bo = xe_bo_get(lrc->bo);
	snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
	snapshot->lrc_size = lrc->size;
	/* NOTE(review): replay region presumed to start at capture offset 0 — confirm */
	snapshot->replay_offset = 0;
	snapshot->replay_size = lrc->replay_size;
	/* Contents are copied later by xe_lrc_snapshot_capture_delayed() */
	snapshot->lrc_snapshot = NULL;
	snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc));
	snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc);
	return snapshot;
}
2296 
xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot * snapshot)2297 void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot)
2298 {
2299 	struct xe_bo *bo;
2300 	struct iosys_map src;
2301 
2302 	if (!snapshot)
2303 		return;
2304 
2305 	bo = snapshot->lrc_bo;
2306 	snapshot->lrc_bo = NULL;
2307 
2308 	snapshot->lrc_snapshot = kvmalloc(snapshot->lrc_size, GFP_KERNEL);
2309 	if (!snapshot->lrc_snapshot)
2310 		goto put_bo;
2311 
2312 	xe_bo_lock(bo, false);
2313 	if (!ttm_bo_vmap(&bo->ttm, &src)) {
2314 		xe_map_memcpy_from(xe_bo_device(bo),
2315 				   snapshot->lrc_snapshot, &src, snapshot->lrc_offset,
2316 				   snapshot->lrc_size);
2317 		ttm_bo_vunmap(&bo->ttm, &src);
2318 	} else {
2319 		kvfree(snapshot->lrc_snapshot);
2320 		snapshot->lrc_snapshot = NULL;
2321 	}
2322 	xe_bo_unlock(bo);
2323 put_bo:
2324 	xe_bo_put(bo);
2325 }
2326 
xe_lrc_snapshot_print(struct xe_lrc_snapshot * snapshot,struct drm_printer * p)2327 void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer *p)
2328 {
2329 	unsigned long i;
2330 
2331 	if (!snapshot)
2332 		return;
2333 
2334 	drm_printf(p, "\tHW Context Desc: 0x%08x\n", snapshot->context_desc);
2335 	drm_printf(p, "\tHW Ring address: 0x%08x\n",
2336 		   snapshot->ring_addr);
2337 	drm_printf(p, "\tHW Indirect Ring State: 0x%08x\n",
2338 		   snapshot->indirect_context_desc);
2339 	drm_printf(p, "\tLRC Head: (memory) %u\n", snapshot->head);
2340 	drm_printf(p, "\tLRC Tail: (internal) %u, (memory) %u\n",
2341 		   snapshot->tail.internal, snapshot->tail.memory);
2342 	drm_printf(p, "\tRing start: (memory) 0x%08x\n", snapshot->start);
2343 	drm_printf(p, "\tStart seqno: (memory) %d\n", snapshot->start_seqno);
2344 	drm_printf(p, "\tSeqno: (memory) %d\n", snapshot->seqno);
2345 	drm_printf(p, "\tTimestamp: 0x%08x\n", snapshot->ctx_timestamp);
2346 	drm_printf(p, "\tJob Timestamp: 0x%08x\n", snapshot->ctx_job_timestamp);
2347 
2348 	if (!snapshot->lrc_snapshot)
2349 		return;
2350 
2351 	drm_printf(p, "\t[HWSP].length: 0x%x\n", LRC_PPHWSP_SIZE);
2352 	drm_puts(p, "\t[HWSP].data: ");
2353 	for (i = 0; i < LRC_PPHWSP_SIZE; i += sizeof(u32)) {
2354 		u32 *val = snapshot->lrc_snapshot + i;
2355 		char dumped[ASCII85_BUFSZ];
2356 
2357 		drm_puts(p, ascii85_encode(*val, dumped));
2358 	}
2359 
2360 	drm_printf(p, "\n\t[HWCTX].length: 0x%lx\n", snapshot->lrc_size - LRC_PPHWSP_SIZE);
2361 	drm_printf(p, "\n\t[HWCTX].replay_offset: 0x%lx\n", snapshot->replay_offset);
2362 	drm_printf(p, "\n\t[HWCTX].replay_length: 0x%lx\n", snapshot->replay_size);
2363 
2364 	drm_puts(p, "\t[HWCTX].data: ");
2365 	for (; i < snapshot->lrc_size; i += sizeof(u32)) {
2366 		u32 *val = snapshot->lrc_snapshot + i;
2367 		char dumped[ASCII85_BUFSZ];
2368 
2369 		drm_puts(p, ascii85_encode(*val, dumped));
2370 	}
2371 	drm_puts(p, "\n");
2372 }
2373 
xe_lrc_snapshot_free(struct xe_lrc_snapshot * snapshot)2374 void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
2375 {
2376 	if (!snapshot)
2377 		return;
2378 
2379 	kvfree(snapshot->lrc_snapshot);
2380 	if (snapshot->lrc_bo)
2381 		xe_bo_put(snapshot->lrc_bo);
2382 
2383 	kfree(snapshot);
2384 }
2385 
/*
 * Read the current CTX_TIMESTAMP register of the engine identified by
 * @engine_id into @reg_ctx_ts.  Returns 0 on success, -1 if the engine id
 * does not resolve to a usable hardware engine.
 */
static int get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id, u64 *reg_ctx_ts)
{
	u16 engine_class = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id);
	u16 engine_instance = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id);
	struct xe_hw_engine *hwe;

	hwe = xe_gt_hw_engine(lrc->gt, engine_class, engine_instance, false);
	if (xe_gt_WARN_ONCE(lrc->gt, !hwe || xe_hw_engine_is_reserved(hwe),
			    "Unexpected engine class:instance %d:%d for context utilization\n",
			    engine_class, engine_instance))
		return -1;

	/* Platforms with a 64-bit timestamp need the 2x32 read dance */
	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
		*reg_ctx_ts = xe_mmio_read64_2x32(&hwe->gt->mmio,
						  RING_CTX_TIMESTAMP(hwe->mmio_base));
	else
		*reg_ctx_ts = xe_mmio_read32(&hwe->gt->mmio,
					     RING_CTX_TIMESTAMP(hwe->mmio_base));

	return 0;
}
2410 
2411 /**
2412  * xe_lrc_timestamp() - Current ctx timestamp
2413  * @lrc: Pointer to the lrc.
2414  *
2415  * Return latest ctx timestamp. With support for active contexts, the
2416  * calculation may bb slightly racy, so follow a read-again logic to ensure that
2417  * the context is still active before returning the right timestamp.
2418  *
2419  * Returns: New ctx timestamp value
2420  */
xe_lrc_timestamp(struct xe_lrc * lrc)2421 u64 xe_lrc_timestamp(struct xe_lrc *lrc)
2422 {
2423 	u64 lrc_ts, reg_ts, new_ts;
2424 	u32 engine_id;
2425 
2426 	lrc_ts = xe_lrc_ctx_timestamp(lrc);
2427 	/* CTX_TIMESTAMP mmio read is invalid on VF, so return the LRC value */
2428 	if (IS_SRIOV_VF(lrc_to_xe(lrc))) {
2429 		new_ts = lrc_ts;
2430 		goto done;
2431 	}
2432 
2433 	if (lrc_ts == CONTEXT_ACTIVE) {
2434 		engine_id = xe_lrc_engine_id(lrc);
2435 		if (!get_ctx_timestamp(lrc, engine_id, &reg_ts))
2436 			new_ts = reg_ts;
2437 
2438 		/* read lrc again to ensure context is still active */
2439 		lrc_ts = xe_lrc_ctx_timestamp(lrc);
2440 	}
2441 
2442 	/*
2443 	 * If context switched out, just use the lrc_ts. Note that this needs to
2444 	 * be a separate if condition.
2445 	 */
2446 	if (lrc_ts != CONTEXT_ACTIVE)
2447 		new_ts = lrc_ts;
2448 
2449 done:
2450 	return new_ts;
2451 }
2452 
2453 /**
2454  * xe_lrc_update_timestamp() - Update ctx timestamp
2455  * @lrc: Pointer to the lrc.
2456  * @old_ts: Old timestamp value
2457  *
2458  * Populate @old_ts current saved ctx timestamp, read new ctx timestamp and
2459  * update saved value.
2460  *
2461  * Returns: New ctx timestamp value
2462  */
xe_lrc_update_timestamp(struct xe_lrc * lrc,u64 * old_ts)2463 u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts)
2464 {
2465 	*old_ts = lrc->ctx_timestamp;
2466 	lrc->ctx_timestamp = xe_lrc_timestamp(lrc);
2467 
2468 	trace_xe_lrc_update_timestamp(lrc, *old_ts);
2469 
2470 	return lrc->ctx_timestamp;
2471 }
2472 
2473 /**
2474  * xe_lrc_ring_is_idle() - LRC is idle
2475  * @lrc: Pointer to the lrc.
2476  *
2477  * Compare LRC ring head and tail to determine if idle.
2478  *
2479  * Return: True is ring is idle, False otherwise
2480  */
xe_lrc_ring_is_idle(struct xe_lrc * lrc)2481 bool xe_lrc_ring_is_idle(struct xe_lrc *lrc)
2482 {
2483 	return xe_lrc_ring_head(lrc) == xe_lrc_ring_tail(lrc);
2484 }
2485