1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5 
6 #include "xe_lrc.h"
7 
8 #include <generated/xe_wa_oob.h>
9 
10 #include <linux/ascii85.h>
11 #include <linux/panic.h>
12 
13 #include "instructions/xe_mi_commands.h"
14 #include "instructions/xe_gfxpipe_commands.h"
15 #include "instructions/xe_gfx_state_commands.h"
16 #include "regs/xe_engine_regs.h"
17 #include "regs/xe_gt_regs.h"
18 #include "regs/xe_lrc_layout.h"
19 #include "xe_bb.h"
20 #include "xe_bo.h"
21 #include "xe_configfs.h"
22 #include "xe_device.h"
23 #include "xe_drm_client.h"
24 #include "xe_exec_queue_types.h"
25 #include "xe_gt.h"
26 #include "xe_gt_printk.h"
27 #include "xe_hw_fence.h"
28 #include "xe_map.h"
29 #include "xe_memirq.h"
30 #include "xe_mmio.h"
31 #include "xe_ring_ops.h"
32 #include "xe_sriov.h"
33 #include "xe_trace_lrc.h"
34 #include "xe_vm.h"
35 #include "xe_wa.h"
36 
37 #define LRC_VALID				BIT_ULL(0)
38 #define LRC_PRIVILEGE				BIT_ULL(8)
39 #define LRC_ADDRESSING_MODE			GENMASK_ULL(4, 3)
40 #define LRC_LEGACY_64B_CONTEXT			3
41 
42 #define LRC_ENGINE_CLASS			GENMASK_ULL(63, 61)
43 #define LRC_ENGINE_INSTANCE			GENMASK_ULL(53, 48)
44 
45 #define LRC_PPHWSP_SIZE				SZ_4K
46 #define LRC_INDIRECT_CTX_BO_SIZE		SZ_4K
47 #define LRC_INDIRECT_RING_STATE_SIZE		SZ_4K
48 
49 #define LRC_PRIORITY				GENMASK_ULL(10, 9)
50 #define LRC_PRIORITY_LOW			0
51 #define LRC_PRIORITY_NORMAL			1
52 #define LRC_PRIORITY_HIGH			2
53 
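/*
 * Annotation (not in the original source): per the field definitions above,
 * xe_lrc_ctx_init() below assembles the descriptor for a PPGTT-backed
 * context on a pre-12.50 platform roughly as
 *
 *	lrc->desc = LRC_VALID | LRC_PRIVILEGE |
 *		    FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT) |
 *		    FIELD_PREP(LRC_ENGINE_CLASS, hwe->class) |
 *		    FIELD_PREP(LRC_ENGINE_INSTANCE, hwe->instance);
 *
 * and xe_lrc_descriptor() later ORs in the PPHWSP GGTT address before the
 * descriptor is handed to the hardware.
 */
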
54 /*
55  * Layout of the LRC and associated data allocated as
56  * lrc->bo:
57  *
58  *   Region                       Size
59  *  +============================+=================================+ <- __xe_lrc_ring_offset()
60  *  | Ring                       | ring_size, see                  |
61  *  |                            | xe_lrc_init()                   |
62  *  +============================+=================================+ <- __xe_lrc_pphwsp_offset()
63  *  | PPHWSP (includes SW state) | 4K                              |
64  *  +----------------------------+---------------------------------+ <- __xe_lrc_regs_offset()
65  *  | Engine Context Image       | n * 4K, see                     |
66  *  |                            | xe_gt_lrc_size()                |
67  *  +----------------------------+---------------------------------+ <- __xe_lrc_indirect_ring_offset()
68  *  | Indirect Ring State Page   | 0 or 4K, see                    |
69  *  |                            | XE_LRC_FLAG_INDIRECT_RING_STATE |
70  *  +============================+=================================+ <- __xe_lrc_indirect_ctx_offset()
71  *  | Indirect Context Page      | 0 or 4K, see                    |
72  *  |                            | XE_LRC_FLAG_INDIRECT_CTX        |
73  *  +============================+=================================+ <- __xe_lrc_wa_bb_offset()
74  *  | WA BB Per Ctx              | 4K                              |
75  *  +============================+=================================+ <- xe_bo_size(lrc->bo)
76  */
77 
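/*
 * Annotation: a worked example of the layout above, assuming a 16K ring and
 * both optional pages present:
 *
 *	__xe_lrc_ring_offset()          = 0
 *	__xe_lrc_pphwsp_offset()        = 16K                (ring size)
 *	__xe_lrc_regs_offset()          = 16K + 4K           (+ PPHWSP)
 *	__xe_lrc_indirect_ring_offset() = bo size - WA BB - indirect ctx - 4K
 *	__xe_lrc_indirect_ctx_offset()  = bo size - WA BB - 4K
 *	__xe_lrc_wa_bb_offset()         = bo size - LRC_WA_BB_SIZE
 *
 * matching the helpers of the same names defined further down.
 */
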
78 static struct xe_device *
79 lrc_to_xe(struct xe_lrc *lrc)
80 {
81 	return gt_to_xe(lrc->fence_ctx.gt);
82 }
83 
84 static bool
85 gt_engine_needs_indirect_ctx(struct xe_gt *gt, enum xe_engine_class class)
86 {
87 	struct xe_device *xe = gt_to_xe(gt);
88 
89 	if (XE_GT_WA(gt, 16010904313) &&
90 	    (class == XE_ENGINE_CLASS_RENDER ||
91 	     class == XE_ENGINE_CLASS_COMPUTE))
92 		return true;
93 
94 	if (xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
95 					       class, NULL))
96 		return true;
97 
98 	if (gt->ring_ops[class]->emit_aux_table_inv)
99 		return true;
100 
101 	return false;
102 }
103 
104 /**
105  * xe_gt_lrc_hang_replay_size() - Hang replay size
106  * @gt: The GT
107  * @class: Hardware engine class
108  *
109  * Determine size of GPU hang replay state for a GT and hardware engine class.
110  *
111  * Return: Size of the GPU hang replay state
112  */
113 size_t xe_gt_lrc_hang_replay_size(struct xe_gt *gt, enum xe_engine_class class)
114 {
115 	struct xe_device *xe = gt_to_xe(gt);
116 	size_t size = 0;
117 
118 	/* Engine context image */
119 	switch (class) {
120 	case XE_ENGINE_CLASS_RENDER:
121 		if (GRAPHICS_VERx100(xe) >= 3510)
122 			size += 7 * SZ_4K;
123 		else if (GRAPHICS_VER(xe) >= 20)
124 			size += 3 * SZ_4K;
125 		else
126 			size += 13 * SZ_4K;
127 		break;
128 	case XE_ENGINE_CLASS_COMPUTE:
129 		if (GRAPHICS_VERx100(xe) >= 3510)
130 			size += 5 * SZ_4K;
131 		else if (GRAPHICS_VER(xe) >= 20)
132 			size += 2 * SZ_4K;
133 		else
134 			size += 13 * SZ_4K;
135 		break;
136 	default:
137 		WARN(1, "Unknown engine class: %d", class);
138 		fallthrough;
139 	case XE_ENGINE_CLASS_COPY:
140 	case XE_ENGINE_CLASS_VIDEO_DECODE:
141 	case XE_ENGINE_CLASS_VIDEO_ENHANCE:
142 	case XE_ENGINE_CLASS_OTHER:
143 		size += 1 * SZ_4K;
144 	}
145 
146 	return size;
147 }
148 
149 size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
150 {
151 	size_t size = xe_gt_lrc_hang_replay_size(gt, class);
152 
153 	/* Add indirect ring state page */
154 	if (xe_gt_has_indirect_ring_state(gt))
155 		size += LRC_INDIRECT_RING_STATE_SIZE;
156 
157 	return size + LRC_PPHWSP_SIZE;
158 }
159 
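/*
 * Annotation: a worked example of the two size helpers above. For a render
 * engine on a GRAPHICS_VER 20 platform with indirect ring state,
 * xe_gt_lrc_hang_replay_size() returns 3 * SZ_4K, and xe_gt_lrc_size()
 * adds LRC_INDIRECT_RING_STATE_SIZE (4K) and LRC_PPHWSP_SIZE (4K) for a
 * total of 20K.
 */
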
160 /*
161  * The per-platform tables are u8-encoded in @data. Decode @data and set the
162  * commands and register offsets in @regs. The following encoding is used
163  * for each byte. There are 2 steps: decoding commands and decoding addresses.
164  *
165  * Commands:
166  * [7]: create NOPs - number of NOPs are set in lower bits
167  * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set
168  *      MI_LRI_FORCE_POSTED
169  * [5:0]: Number of NOPs or registers to set values to in case of
170  *        MI_LOAD_REGISTER_IMM
171  *
172  * Addresses: these are decoded after an MI_LOAD_REGISTER_IMM command, one for
173  * each of its "count" registers. They are set by using the REG/REG16 macros:
174  * the former is used for offsets smaller than 0x200 while the latter is for
175  * larger ones. Those macros already set all the bits documented below correctly:
176  *
177  * [7]: When a register offset needs more than 6 bits, additional bytes
178  *      follow, carrying the lower bits
179  * [6:0]: Register offset, without considering the engine base.
180  *
181  * This function only tweaks the commands and register offsets. Values are not
182  * filled out.
183  */
184 static void set_offsets(u32 *regs,
185 			const u8 *data,
186 			const struct xe_hw_engine *hwe)
187 #define NOP(x) (BIT(7) | (x))
188 #define LRI(count, flags) ((flags) << 6 | (count) | \
189 			   BUILD_BUG_ON_ZERO(count >= BIT(6)))
190 #define POSTED BIT(0)
191 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
192 #define REG16(x) \
193 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
194 	(((x) >> 2) & 0x7f)
195 {
196 	const u32 base = hwe->mmio_base;
197 
198 	while (*data) {
199 		u8 count, flags;
200 
201 		if (*data & BIT(7)) { /* skip */
202 			count = *data++ & ~BIT(7);
203 			regs += count;
204 			continue;
205 		}
206 
207 		count = *data & 0x3f;
208 		flags = *data >> 6;
209 		data++;
210 
211 		*regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
212 		if (flags & POSTED)
213 			*regs |= MI_LRI_FORCE_POSTED;
214 		*regs |= MI_LRI_LRM_CS_MMIO;
215 		regs++;
216 
217 		xe_gt_assert(hwe->gt, count);
218 		do {
219 			u32 offset = 0;
220 			u8 v;
221 
222 			do {
223 				v = *data++;
224 				offset <<= 7;
225 				offset |= v & ~BIT(7);
226 			} while (v & BIT(7));
227 
228 			regs[0] = base + (offset << 2);
229 			regs += 2;
230 		} while (--count);
231 	}
232 
233 	*regs = MI_BATCH_BUFFER_END | BIT(0);
234 }
235 
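/*
 * Annotation: a worked decode example for the encoding described above. The
 * table fragment NOP(1), LRI(2, POSTED), REG16(0x244), REG(0x034) encodes
 * to the bytes { 0x81, 0x42, 0x81, 0x11, 0x0d } and makes set_offsets()
 * emit:
 *
 *	regs[0]                             skipped by NOP(1)
 *	regs[1] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(2) |
 *		  MI_LRI_FORCE_POSTED | MI_LRI_LRM_CS_MMIO
 *	regs[2] = mmio_base + 0x244         0x81,0x11: ((1 << 7) | 0x11) << 2
 *	regs[3]                             value, filled out elsewhere
 *	regs[4] = mmio_base + 0x034         0x0d: 0x0d << 2
 *	regs[5]                             value, filled out elsewhere
 */
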
236 static const u8 gen12_xcs_offsets[] = {
237 	NOP(1),
238 	LRI(13, POSTED),
239 	REG16(0x244),
240 	REG(0x034),
241 	REG(0x030),
242 	REG(0x038),
243 	REG(0x03c),
244 	REG(0x168),
245 	REG(0x140),
246 	REG(0x110),
247 	REG(0x1c0),
248 	REG(0x1c4),
249 	REG(0x1c8),
250 	REG(0x180),
251 	REG16(0x2b4),
252 
253 	NOP(5),
254 	LRI(9, POSTED),
255 	REG16(0x3a8),
256 	REG16(0x28c),
257 	REG16(0x288),
258 	REG16(0x284),
259 	REG16(0x280),
260 	REG16(0x27c),
261 	REG16(0x278),
262 	REG16(0x274),
263 	REG16(0x270),
264 
265 	0
266 };
267 
268 static const u8 dg2_xcs_offsets[] = {
269 	NOP(1),
270 	LRI(15, POSTED),
271 	REG16(0x244),
272 	REG(0x034),
273 	REG(0x030),
274 	REG(0x038),
275 	REG(0x03c),
276 	REG(0x168),
277 	REG(0x140),
278 	REG(0x110),
279 	REG(0x1c0),
280 	REG(0x1c4),
281 	REG(0x1c8),
282 	REG(0x180),
283 	REG16(0x2b4),
284 	REG(0x120),
285 	REG(0x124),
286 
287 	NOP(1),
288 	LRI(9, POSTED),
289 	REG16(0x3a8),
290 	REG16(0x28c),
291 	REG16(0x288),
292 	REG16(0x284),
293 	REG16(0x280),
294 	REG16(0x27c),
295 	REG16(0x278),
296 	REG16(0x274),
297 	REG16(0x270),
298 
299 	0
300 };
301 
302 static const u8 gen12_rcs_offsets[] = {
303 	NOP(1),
304 	LRI(13, POSTED),
305 	REG16(0x244),
306 	REG(0x034),
307 	REG(0x030),
308 	REG(0x038),
309 	REG(0x03c),
310 	REG(0x168),
311 	REG(0x140),
312 	REG(0x110),
313 	REG(0x1c0),
314 	REG(0x1c4),
315 	REG(0x1c8),
316 	REG(0x180),
317 	REG16(0x2b4),
318 
319 	NOP(5),
320 	LRI(9, POSTED),
321 	REG16(0x3a8),
322 	REG16(0x28c),
323 	REG16(0x288),
324 	REG16(0x284),
325 	REG16(0x280),
326 	REG16(0x27c),
327 	REG16(0x278),
328 	REG16(0x274),
329 	REG16(0x270),
330 
331 	LRI(3, POSTED),
332 	REG(0x1b0),
333 	REG16(0x5a8),
334 	REG16(0x5ac),
335 
336 	NOP(6),
337 	LRI(1, 0),
338 	REG(0x0c8),
339 	NOP(3 + 9 + 1),
340 
341 	LRI(51, POSTED),
342 	REG16(0x588),
343 	REG16(0x588),
344 	REG16(0x588),
345 	REG16(0x588),
346 	REG16(0x588),
347 	REG16(0x588),
348 	REG(0x028),
349 	REG(0x09c),
350 	REG(0x0c0),
351 	REG(0x178),
352 	REG(0x17c),
353 	REG16(0x358),
354 	REG(0x170),
355 	REG(0x150),
356 	REG(0x154),
357 	REG(0x158),
358 	REG16(0x41c),
359 	REG16(0x600),
360 	REG16(0x604),
361 	REG16(0x608),
362 	REG16(0x60c),
363 	REG16(0x610),
364 	REG16(0x614),
365 	REG16(0x618),
366 	REG16(0x61c),
367 	REG16(0x620),
368 	REG16(0x624),
369 	REG16(0x628),
370 	REG16(0x62c),
371 	REG16(0x630),
372 	REG16(0x634),
373 	REG16(0x638),
374 	REG16(0x63c),
375 	REG16(0x640),
376 	REG16(0x644),
377 	REG16(0x648),
378 	REG16(0x64c),
379 	REG16(0x650),
380 	REG16(0x654),
381 	REG16(0x658),
382 	REG16(0x65c),
383 	REG16(0x660),
384 	REG16(0x664),
385 	REG16(0x668),
386 	REG16(0x66c),
387 	REG16(0x670),
388 	REG16(0x674),
389 	REG16(0x678),
390 	REG16(0x67c),
391 	REG(0x068),
392 	REG(0x084),
393 	NOP(1),
394 
395 	0
396 };
397 
398 static const u8 xehp_rcs_offsets[] = {
399 	NOP(1),
400 	LRI(13, POSTED),
401 	REG16(0x244),
402 	REG(0x034),
403 	REG(0x030),
404 	REG(0x038),
405 	REG(0x03c),
406 	REG(0x168),
407 	REG(0x140),
408 	REG(0x110),
409 	REG(0x1c0),
410 	REG(0x1c4),
411 	REG(0x1c8),
412 	REG(0x180),
413 	REG16(0x2b4),
414 
415 	NOP(5),
416 	LRI(9, POSTED),
417 	REG16(0x3a8),
418 	REG16(0x28c),
419 	REG16(0x288),
420 	REG16(0x284),
421 	REG16(0x280),
422 	REG16(0x27c),
423 	REG16(0x278),
424 	REG16(0x274),
425 	REG16(0x270),
426 
427 	LRI(3, POSTED),
428 	REG(0x1b0),
429 	REG16(0x5a8),
430 	REG16(0x5ac),
431 
432 	NOP(6),
433 	LRI(1, 0),
434 	REG(0x0c8),
435 
436 	0
437 };
438 
439 static const u8 dg2_rcs_offsets[] = {
440 	NOP(1),
441 	LRI(15, POSTED),
442 	REG16(0x244),
443 	REG(0x034),
444 	REG(0x030),
445 	REG(0x038),
446 	REG(0x03c),
447 	REG(0x168),
448 	REG(0x140),
449 	REG(0x110),
450 	REG(0x1c0),
451 	REG(0x1c4),
452 	REG(0x1c8),
453 	REG(0x180),
454 	REG16(0x2b4),
455 	REG(0x120),
456 	REG(0x124),
457 
458 	NOP(1),
459 	LRI(9, POSTED),
460 	REG16(0x3a8),
461 	REG16(0x28c),
462 	REG16(0x288),
463 	REG16(0x284),
464 	REG16(0x280),
465 	REG16(0x27c),
466 	REG16(0x278),
467 	REG16(0x274),
468 	REG16(0x270),
469 
470 	LRI(3, POSTED),
471 	REG(0x1b0),
472 	REG16(0x5a8),
473 	REG16(0x5ac),
474 
475 	NOP(6),
476 	LRI(1, 0),
477 	REG(0x0c8),
478 
479 	0
480 };
481 
482 static const u8 mtl_rcs_offsets[] = {
483 	NOP(1),
484 	LRI(15, POSTED),
485 	REG16(0x244),
486 	REG(0x034),
487 	REG(0x030),
488 	REG(0x038),
489 	REG(0x03c),
490 	REG(0x168),
491 	REG(0x140),
492 	REG(0x110),
493 	REG(0x1c0),
494 	REG(0x1c4),
495 	REG(0x1c8),
496 	REG(0x180),
497 	REG16(0x2b4),
498 	REG(0x120),
499 	REG(0x124),
500 
501 	NOP(1),
502 	LRI(9, POSTED),
503 	REG16(0x3a8),
504 	REG16(0x28c),
505 	REG16(0x288),
506 	REG16(0x284),
507 	REG16(0x280),
508 	REG16(0x27c),
509 	REG16(0x278),
510 	REG16(0x274),
511 	REG16(0x270),
512 
513 	NOP(2),
514 	LRI(2, POSTED),
515 	REG16(0x5a8),
516 	REG16(0x5ac),
517 
518 	NOP(6),
519 	LRI(1, 0),
520 	REG(0x0c8),
521 
522 	0
523 };
524 
525 #define XE2_CTX_COMMON \
526 	NOP(1),                 /* [0x00] */ \
527 	LRI(15, POSTED),        /* [0x01] */ \
528 	REG16(0x244),           /* [0x02] CTXT_SR_CTL */ \
529 	REG(0x034),             /* [0x04] RING_BUFFER_HEAD */ \
530 	REG(0x030),             /* [0x06] RING_BUFFER_TAIL */ \
531 	REG(0x038),             /* [0x08] RING_BUFFER_START */ \
532 	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */ \
533 	REG(0x168),             /* [0x0c] BB_ADDR_UDW */ \
534 	REG(0x140),             /* [0x0e] BB_ADDR */ \
535 	REG(0x110),             /* [0x10] BB_STATE */ \
536 	REG(0x1c0),             /* [0x12] BB_PER_CTX_PTR */ \
537 	REG(0x1c4),             /* [0x14] RCS_INDIRECT_CTX */ \
538 	REG(0x1c8),             /* [0x16] RCS_INDIRECT_CTX_OFFSET */ \
539 	REG(0x180),             /* [0x18] CCID */ \
540 	REG16(0x2b4),           /* [0x1a] SEMAPHORE_TOKEN */ \
541 	REG(0x120),             /* [0x1c] PRT_BB_STATE */ \
542 	REG(0x124),             /* [0x1e] PRT_BB_STATE_UDW */ \
543 	\
544 	NOP(1),                 /* [0x20] */ \
545 	LRI(9, POSTED),         /* [0x21] */ \
546 	REG16(0x3a8),           /* [0x22] CTX_TIMESTAMP */ \
547 	REG16(0x3ac),           /* [0x24] CTX_TIMESTAMP_UDW */ \
548 	REG(0x108),             /* [0x26] INDIRECT_RING_STATE */ \
549 	REG16(0x284),           /* [0x28] dummy reg */ \
550 	REG16(0x280),           /* [0x2a] CS_ACC_CTR_THOLD */ \
551 	REG16(0x27c),           /* [0x2c] CS_CTX_SYS_PASID */ \
552 	REG16(0x278),           /* [0x2e] CS_CTX_ASID */ \
553 	REG16(0x274),           /* [0x30] PTBP_UDW */ \
554 	REG16(0x270)            /* [0x32] PTBP_LDW */
555 
556 static const u8 xe2_rcs_offsets[] = {
557 	XE2_CTX_COMMON,
558 
559 	NOP(2),                 /* [0x34] */
560 	LRI(2, POSTED),         /* [0x36] */
561 	REG16(0x5a8),           /* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
562 	REG16(0x5ac),           /* [0x39] PREEMPTION_STATUS */
563 
564 	NOP(6),                 /* [0x3b] */
565 	LRI(1, 0),              /* [0x41] */
566 	REG(0x0c8),             /* [0x42] R_PWR_CLK_STATE */
567 
568 	0
569 };
570 
571 static const u8 xe2_bcs_offsets[] = {
572 	XE2_CTX_COMMON,
573 
574 	NOP(4 + 8 + 1),         /* [0x34] */
575 	LRI(2, POSTED),         /* [0x41] */
576 	REG16(0x200),           /* [0x42] BCS_SWCTRL */
577 	REG16(0x204),           /* [0x44] BLIT_CCTL */
578 
579 	0
580 };
581 
582 static const u8 xe2_xcs_offsets[] = {
583 	XE2_CTX_COMMON,
584 
585 	0
586 };
587 
588 static const u8 xe2_indirect_ring_state_offsets[] = {
589 	NOP(1),                 /* [0x00] */
590 	LRI(5, POSTED),         /* [0x01] */
591 	REG(0x034),             /* [0x02] RING_BUFFER_HEAD */
592 	REG(0x030),             /* [0x04] RING_BUFFER_TAIL */
593 	REG(0x038),             /* [0x06] RING_BUFFER_START */
594 	REG(0x048),             /* [0x08] RING_BUFFER_START_UDW */
595 	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */
596 
597 	NOP(5),                 /* [0x0c] */
598 	LRI(9, POSTED),         /* [0x11] */
599 	REG(0x168),             /* [0x12] BB_ADDR_UDW */
600 	REG(0x140),             /* [0x14] BB_ADDR */
601 	REG(0x110),             /* [0x16] BB_STATE */
602 	REG16(0x588),           /* [0x18] BB_STACK_WRITE_PORT */
603 	REG16(0x588),           /* [0x1a] BB_STACK_WRITE_PORT */
604 	REG16(0x588),           /* [0x1c] BB_STACK_WRITE_PORT */
605 	REG16(0x588),           /* [0x1e] BB_STACK_WRITE_PORT */
606 	REG16(0x588),           /* [0x20] BB_STACK_WRITE_PORT */
607 	REG16(0x588),           /* [0x22] BB_STACK_WRITE_PORT */
608 
609 	NOP(12),                /* [0x24] */
610 
611 	0
612 };
613 
614 #undef REG16
615 #undef REG
616 #undef LRI
617 #undef NOP
618 
619 static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
620 {
621 	if (class == XE_ENGINE_CLASS_RENDER) {
622 		if (GRAPHICS_VER(xe) >= 20)
623 			return xe2_rcs_offsets;
624 		else if (GRAPHICS_VERx100(xe) >= 1270)
625 			return mtl_rcs_offsets;
626 		else if (GRAPHICS_VERx100(xe) >= 1255)
627 			return dg2_rcs_offsets;
628 		else if (GRAPHICS_VERx100(xe) >= 1250)
629 			return xehp_rcs_offsets;
630 		else
631 			return gen12_rcs_offsets;
632 	} else if (class == XE_ENGINE_CLASS_COPY) {
633 		if (GRAPHICS_VER(xe) >= 20)
634 			return xe2_bcs_offsets;
635 		else
636 			return gen12_xcs_offsets;
637 	} else {
638 		if (GRAPHICS_VER(xe) >= 20)
639 			return xe2_xcs_offsets;
640 		else if (GRAPHICS_VERx100(xe) >= 1255)
641 			return dg2_xcs_offsets;
642 		else
643 			return gen12_xcs_offsets;
644 	}
645 }
646 
647 static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
648 {
649 	regs[CTX_CONTEXT_CONTROL] = REG_MASKED_FIELD_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
650 							    CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
651 
652 	if (xe_gt_has_indirect_ring_state(hwe->gt))
653 		regs[CTX_CONTEXT_CONTROL] |=
654 			REG_MASKED_FIELD_ENABLE(CTX_CTRL_INDIRECT_RING_STATE_ENABLE);
655 }
656 
657 static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe)
658 {
659 	struct xe_memirq *memirq = &gt_to_tile(hwe->gt)->memirq;
660 	struct xe_device *xe = gt_to_xe(hwe->gt);
661 	u8 num_regs;
662 
663 	if (!xe_device_uses_memirq(xe))
664 		return;
665 
666 	regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM |
667 					MI_LRI_LRM_CS_MMIO | MI_LRM_USE_GGTT;
668 	regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr;
669 	regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq);
670 
671 	num_regs = xe_device_has_msix(xe) ? 3 : 2;
672 	regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(num_regs) |
673 				       MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED;
674 	regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr;
675 	regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq, hwe);
676 	regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr;
677 	regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq, hwe);
678 
679 	if (xe_device_has_msix(xe)) {
680 		regs[CTX_CS_INT_VEC_REG] = CS_INT_VEC(0).addr;
681 		/* CTX_CS_INT_VEC_DATA will be set in xe_lrc_init */
682 	}
683 }
684 
685 static int lrc_ring_mi_mode(struct xe_hw_engine *hwe)
686 {
687 	struct xe_device *xe = gt_to_xe(hwe->gt);
688 
689 	if (GRAPHICS_VERx100(xe) >= 1250)
690 		return 0x70;
691 	else
692 		return 0x60;
693 }
694 
695 static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe)
696 {
697 	int x;
698 
699 	x = lrc_ring_mi_mode(hwe);
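	/*
	 * MI_MODE is a masked register: the upper 16 bits are write enables.
	 * Setting only STOP_RING << 16 with the value bit cleared makes the
	 * context restore clear STOP_RING.
	 */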
700 	regs[x + 1] &= ~STOP_RING;
701 	regs[x + 1] |= STOP_RING << 16;
702 }
703 
704 static inline bool xe_lrc_has_indirect_ring_state(struct xe_lrc *lrc)
705 {
706 	return lrc->flags & XE_LRC_FLAG_INDIRECT_RING_STATE;
707 }
708 
709 static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
710 {
711 	return 0;
712 }
713 
714 u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
715 {
716 	return lrc->ring.size;
717 }
718 
719 /* Make the magic macros work */
720 #define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset
721 #define __xe_lrc_regs_offset xe_lrc_regs_offset
722 
723 #define LRC_CTX_JOB_TIMESTAMP_OFFSET 512
724 #define LRC_ENGINE_ID_PPHWSP_OFFSET 1024
725 #define LRC_PARALLEL_PPHWSP_OFFSET 2048
726 
727 #define LRC_SEQNO_OFFSET 0
728 #define LRC_START_SEQNO_OFFSET (LRC_SEQNO_OFFSET + 8)
729 
730 u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
731 {
732 	return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
733 }
734 
735 /**
736  * xe_lrc_reg_size() - Get size of the LRC registers area within queues
737  * @xe: the &xe_device struct instance
738  *
739  * Returns: Size of the LRC registers area for current platform
740  */
741 size_t xe_lrc_reg_size(struct xe_device *xe)
742 {
743 	if (GRAPHICS_VERx100(xe) >= 1250)
744 		return 96 * sizeof(u32);
745 	else
746 		return 80 * sizeof(u32);
747 }
748 
749 /**
750  * xe_lrc_engine_state_size() - Get size of the engine state within LRC
751  * @gt: the &xe_gt struct instance
752  * @class: Hardware engine class
753  *
754  * Returns: Size of the engine state
755  */
756 size_t xe_lrc_engine_state_size(struct xe_gt *gt, enum xe_engine_class class)
757 {
758 	return xe_gt_lrc_hang_replay_size(gt, class) - xe_lrc_reg_size(gt_to_xe(gt));
759 }
760 
761 static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
762 {
763 	return LRC_SEQNO_OFFSET;
764 }
765 
766 static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
767 {
768 	return LRC_START_SEQNO_OFFSET;
769 }
770 
771 static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc)
772 {
773 	/* This is stored in the driver-defined portion of PPHWSP */
774 	return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET;
775 }
776 
777 static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
778 {
779 	/* The parallel is stored in the driver-defined portion of PPHWSP */
780 	return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
781 }
782 
783 static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc)
784 {
785 	return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET;
786 }
787 
788 static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
789 {
790 	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
791 }
792 
793 static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc)
794 {
795 	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32);
796 }
797 
798 static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
799 {
800 	u32 offset = xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE -
801 		     LRC_INDIRECT_RING_STATE_SIZE;
802 
803 	if (lrc->flags & XE_LRC_FLAG_INDIRECT_CTX)
804 		offset -= LRC_INDIRECT_CTX_BO_SIZE;
805 
806 	return offset;
807 }
808 
809 static inline u32 __xe_lrc_indirect_ctx_offset(struct xe_lrc *lrc)
810 {
811 	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE - LRC_INDIRECT_CTX_BO_SIZE;
812 }
813 
814 static inline u32 __xe_lrc_wa_bb_offset(struct xe_lrc *lrc)
815 {
816 	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE;
817 }
818 
819 #define DECL_MAP_ADDR_HELPERS(elem, bo_expr) \
820 static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \
821 { \
822 	struct xe_bo *bo = (bo_expr); \
823 	struct iosys_map map = bo->vmap; \
824 \
825 	xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map));  \
826 	iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \
827 	return map; \
828 } \
829 static inline u32 __maybe_unused __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \
830 { \
831 	struct xe_bo *bo = (bo_expr); \
832 \
833 	return xe_bo_ggtt_addr(bo) + __xe_lrc_##elem##_offset(lrc); \
834 } \
835 
836 DECL_MAP_ADDR_HELPERS(ring, lrc->bo)
837 DECL_MAP_ADDR_HELPERS(pphwsp, lrc->bo)
838 DECL_MAP_ADDR_HELPERS(seqno, lrc->seqno_bo)
839 DECL_MAP_ADDR_HELPERS(regs, lrc->bo)
840 DECL_MAP_ADDR_HELPERS(start_seqno, lrc->seqno_bo)
841 DECL_MAP_ADDR_HELPERS(ctx_job_timestamp, lrc->bo)
842 DECL_MAP_ADDR_HELPERS(ctx_timestamp, lrc->bo)
843 DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw, lrc->bo)
844 DECL_MAP_ADDR_HELPERS(parallel, lrc->bo)
845 DECL_MAP_ADDR_HELPERS(indirect_ring, lrc->bo)
846 DECL_MAP_ADDR_HELPERS(engine_id, lrc->bo)
847 
848 #undef DECL_MAP_ADDR_HELPERS
849 
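/*
 * Annotation: for reference, DECL_MAP_ADDR_HELPERS(seqno, lrc->seqno_bo)
 * above expands (minus the assert) to roughly
 *
 *	static inline struct iosys_map __xe_lrc_seqno_map(struct xe_lrc *lrc)
 *	{
 *		struct iosys_map map = lrc->seqno_bo->vmap;
 *
 *		iosys_map_incr(&map, __xe_lrc_seqno_offset(lrc));
 *		return map;
 *	}
 *
 * plus a matching __xe_lrc_seqno_ggtt_addr() built on xe_bo_ggtt_addr().
 */
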
850 /**
851  * xe_lrc_ctx_timestamp_ggtt_addr() - Get ctx timestamp GGTT address
852  * @lrc: Pointer to the lrc.
853  *
854  * Returns: ctx timestamp GGTT address
855  */
856 u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc)
857 {
858 	return __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
859 }
860 
861 /**
862  * xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address
863  * @lrc: Pointer to the lrc.
864  *
865  * Returns: ctx timestamp udw GGTT address
866  */
867 u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc)
868 {
869 	return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
870 }
871 
872 /**
873  * xe_lrc_ctx_timestamp() - Read ctx timestamp value
874  * @lrc: Pointer to the lrc.
875  *
876  * Returns: ctx timestamp value
877  */
878 static u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
879 {
880 	struct xe_device *xe = lrc_to_xe(lrc);
881 	struct iosys_map map;
882 	u32 ldw, udw = 0;
883 
884 	map = __xe_lrc_ctx_timestamp_map(lrc);
885 	ldw = xe_map_read32(xe, &map);
886 
887 	if (xe->info.has_64bit_timestamp) {
888 		map = __xe_lrc_ctx_timestamp_udw_map(lrc);
889 		udw = xe_map_read32(xe, &map);
890 	}
891 
892 	return (u64)udw << 32 | ldw;
893 }
894 
895 /**
896  * xe_lrc_ctx_job_timestamp_ggtt_addr() - Get ctx job timestamp GGTT address
897  * @lrc: Pointer to the lrc.
898  *
899  * Returns: ctx job timestamp GGTT address
900  */
901 u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc)
902 {
903 	return __xe_lrc_ctx_job_timestamp_ggtt_addr(lrc);
904 }
905 
906 /**
907  * xe_lrc_ctx_job_timestamp() - Read ctx job timestamp value
908  * @lrc: Pointer to the lrc.
909  *
910  * Returns: ctx job timestamp value
911  */
912 u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc)
913 {
914 	struct xe_device *xe = lrc_to_xe(lrc);
915 	struct iosys_map map;
916 
917 	map = __xe_lrc_ctx_job_timestamp_map(lrc);
918 	return xe_map_read32(xe, &map);
919 }
920 
921 u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
922 {
923 	return __xe_lrc_pphwsp_ggtt_addr(lrc);
924 }
925 
926 u32 xe_lrc_indirect_ring_ggtt_addr(struct xe_lrc *lrc)
927 {
928 	if (!xe_lrc_has_indirect_ring_state(lrc))
929 		return 0;
930 
931 	return __xe_lrc_indirect_ring_ggtt_addr(lrc);
932 }
933 
934 static u32 xe_lrc_read_indirect_ctx_reg(struct xe_lrc *lrc, int reg_nr)
935 {
936 	struct xe_device *xe = lrc_to_xe(lrc);
937 	struct iosys_map map;
938 
939 	map = __xe_lrc_indirect_ring_map(lrc);
940 	iosys_map_incr(&map, reg_nr * sizeof(u32));
941 	return xe_map_read32(xe, &map);
942 }
943 
944 static void xe_lrc_write_indirect_ctx_reg(struct xe_lrc *lrc,
945 					  int reg_nr, u32 val)
946 {
947 	struct xe_device *xe = lrc_to_xe(lrc);
948 	struct iosys_map map;
949 
950 	map = __xe_lrc_indirect_ring_map(lrc);
951 	iosys_map_incr(&map, reg_nr * sizeof(u32));
952 	xe_map_write32(xe, &map, val);
953 }
954 
955 u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
956 {
957 	struct xe_device *xe = lrc_to_xe(lrc);
958 	struct iosys_map map;
959 
960 	map = __xe_lrc_regs_map(lrc);
961 	iosys_map_incr(&map, reg_nr * sizeof(u32));
962 	return xe_map_read32(xe, &map);
963 }
964 
965 void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
966 {
967 	struct xe_device *xe = lrc_to_xe(lrc);
968 	struct iosys_map map;
969 
970 	map = __xe_lrc_regs_map(lrc);
971 	iosys_map_incr(&map, reg_nr * sizeof(u32));
972 	xe_map_write32(xe, &map, val);
973 }
974 
975 static void *empty_lrc_data(struct xe_hw_engine *hwe)
976 {
977 	struct xe_gt *gt = hwe->gt;
978 	void *data;
979 	u32 *regs;
980 
981 	data = kzalloc(xe_gt_lrc_size(gt, hwe->class), GFP_KERNEL);
982 	if (!data)
983 		return NULL;
984 
985 	/* 1st page: Per-Process of HW status Page */
986 	regs = data + LRC_PPHWSP_SIZE;
987 	set_offsets(regs, reg_offsets(gt_to_xe(gt), hwe->class), hwe);
988 	set_context_control(regs, hwe);
989 	set_memory_based_intr(regs, hwe);
990 	reset_stop_ring(regs, hwe);
991 	if (xe_gt_has_indirect_ring_state(gt)) {
992 		regs = data + xe_gt_lrc_size(gt, hwe->class) -
993 		       LRC_INDIRECT_RING_STATE_SIZE;
994 		set_offsets(regs, xe2_indirect_ring_state_offsets, hwe);
995 	}
996 
997 	return data;
998 }
999 
1000 /**
1001  * xe_default_lrc_update_memirq_regs_with_address - Re-compute GGTT references in the
1002  * default LRC of a given engine.
1003  * @hwe: the &xe_hw_engine struct instance
1004  */
1005 void xe_default_lrc_update_memirq_regs_with_address(struct xe_hw_engine *hwe)
1006 {
1007 	struct xe_gt *gt = hwe->gt;
1008 	u32 *regs;
1009 
1010 	if (!gt->default_lrc[hwe->class])
1011 		return;
1012 
1013 	regs = gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE;
1014 	set_memory_based_intr(regs, hwe);
1015 }
1016 
1017 /**
1018  * xe_lrc_update_memirq_regs_with_address - Re-compute GGTT references in mem interrupt data
1019  * for a given LRC.
1020  * @lrc: the &xe_lrc struct instance
1021  * @hwe: the &xe_hw_engine struct instance
1022  * @regs: scratch buffer to be used as temporary storage
1023  */
1024 void xe_lrc_update_memirq_regs_with_address(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
1025 					    u32 *regs)
1026 {
1027 	struct xe_gt *gt = hwe->gt;
1028 	struct iosys_map map;
1029 	size_t regs_len;
1030 
1031 	if (!xe_device_uses_memirq(gt_to_xe(gt)))
1032 		return;
1033 
1034 	map = __xe_lrc_regs_map(lrc);
1035 	regs_len = xe_lrc_reg_size(gt_to_xe(gt));
1036 	xe_map_memcpy_from(gt_to_xe(gt), regs, &map, 0, regs_len);
1037 	set_memory_based_intr(regs, hwe);
1038 	xe_map_memcpy_to(gt_to_xe(gt), &map, 0, regs, regs_len);
1039 }
1040 
1041 static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
1042 {
1043 	u64 desc = xe_vm_pdp4_descriptor(vm, gt_to_tile(lrc->gt));
1044 
1045 	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
1046 	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
1047 }
1048 
1049 static void xe_lrc_finish(struct xe_lrc *lrc)
1050 {
1051 	xe_hw_fence_ctx_finish(&lrc->fence_ctx);
1052 	xe_bo_unpin_map_no_vm(lrc->bo);
1053 	xe_bo_unpin_map_no_vm(lrc->seqno_bo);
1054 }
1055 
1056 /*
1057  * setup_utilization_wa() - Write commands to the WA BB to assist
1058  * in calculating active context run ticks.
1059  *
1060  * Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the
1061  * context, but only gets updated when the context switches out. In order to
1062  * check how long a context has been active before it switches out, two things
1063  * are required:
1064  *
1065  * (1) Determine if the context is running:
1066  * To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in
1067  * the LRC. The value chosen is 1 since 0 is the initial value when the LRC is
1068  * initialized. During a query, we just check for this value to determine if the
1069  * context is active. If the context switched out, it would overwrite this
1070  * location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as
1071  * the last part of context restore, so reusing this LRC location will not
1072  * clobber anything.
1073  *
1074  * (2) Calculate the time that the context has been active for:
1075  * The CTX_TIMESTAMP ticks only when the context is active. If a context is
1076  * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization.
1077  * While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific
1078  * engine instance. Since we do not know which instance the context is running
1079  * on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and
1080  * store it in the PPHWSP.
1081  */
1082 #define CONTEXT_ACTIVE 1ULL
1083 static ssize_t setup_utilization_wa(struct xe_lrc *lrc,
1084 				    struct xe_hw_engine *hwe,
1085 				    u32 *batch,
1086 				    size_t max_len)
1087 {
1088 	u32 *cmd = batch;
1089 
1090 	if (IS_SRIOV_VF(gt_to_xe(lrc->gt)))
1091 		return 0;
1092 
1093 	if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
1094 		return -ENOSPC;
1095 
1096 	*cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET;
1097 	*cmd++ = ENGINE_ID(0).addr;
1098 	*cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc);
1099 	*cmd++ = 0;
1100 
1101 	*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
1102 	*cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
1103 	*cmd++ = 0;
1104 	*cmd++ = lower_32_bits(CONTEXT_ACTIVE);
1105 
1106 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp) {
1107 		*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
1108 		*cmd++ = __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
1109 		*cmd++ = 0;
1110 		*cmd++ = upper_32_bits(CONTEXT_ACTIVE);
1111 	}
1112 
1113 	return cmd - batch;
1114 }
1115 
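/*
 * Annotation: a minimal sketch (not from the original source) of the query
 * side described in the comment above setup_utilization_wa():
 *
 *	if (xe_lrc_ctx_timestamp(lrc) == CONTEXT_ACTIVE) {
 *		// Context hasn't switched out since restore: sample the
 *		// engine's RING_CTX_TIMESTAMP MMIO, using the engine
 *		// instance saved in the PPHWSP ENGINE_ID slot.
 *	} else {
 *		// Context switched out: the LRC already holds the
 *		// accumulated run ticks.
 *	}
 */
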
1116 static ssize_t setup_timestamp_wa(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
1117 				  u32 *batch, size_t max_len)
1118 {
1119 	const u32 ts_addr = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
1120 	u32 *cmd = batch;
1121 
1122 	if (!XE_GT_WA(lrc->gt, 16010904313) ||
1123 	    !(hwe->class == XE_ENGINE_CLASS_RENDER ||
1124 	      hwe->class == XE_ENGINE_CLASS_COMPUTE ||
1125 	      hwe->class == XE_ENGINE_CLASS_COPY ||
1126 	      hwe->class == XE_ENGINE_CLASS_VIDEO_DECODE ||
1127 	      hwe->class == XE_ENGINE_CLASS_VIDEO_ENHANCE))
1128 		return 0;
1129 
1130 	if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
1131 		return -ENOSPC;
1132 
1133 	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
1134 		 MI_LRM_ASYNC;
1135 	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
1136 	*cmd++ = ts_addr;
1137 	*cmd++ = 0;
1138 
1139 	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
1140 		 MI_LRM_ASYNC;
1141 	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
1142 	*cmd++ = ts_addr;
1143 	*cmd++ = 0;
1144 
1145 	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO;
1146 	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
1147 	*cmd++ = ts_addr;
1148 	*cmd++ = 0;
1149 
1150 	return cmd - batch;
1151 }
1152 
1153 static ssize_t setup_configfs_post_ctx_restore_bb(struct xe_lrc *lrc,
1154 						  struct xe_hw_engine *hwe,
1155 						  u32 *batch, size_t max_len)
1156 {
1157 	struct xe_device *xe = gt_to_xe(lrc->gt);
1158 	const u32 *user_batch;
1159 	u32 *cmd = batch;
1160 	u32 count;
1161 
1162 	count = xe_configfs_get_ctx_restore_post_bb(to_pci_dev(xe->drm.dev),
1163 						    hwe->class, &user_batch);
1164 	if (!count)
1165 		return 0;
1166 
1167 	if (count > max_len)
1168 		return -ENOSPC;
1169 
1170 	/*
1171 	 * This should be used only for tests and validation. Taint the kernel
1172 	 * as anything could be submitted directly in context switches
1173 	 */
1174 	add_taint(TAINT_TEST, LOCKDEP_STILL_OK);
1175 
1176 	memcpy(cmd, user_batch, count * sizeof(u32));
1177 	cmd += count;
1178 
1179 	return cmd - batch;
1180 }
1181 
1182 static ssize_t setup_configfs_mid_ctx_restore_bb(struct xe_lrc *lrc,
1183 						 struct xe_hw_engine *hwe,
1184 						 u32 *batch, size_t max_len)
1185 {
1186 	struct xe_device *xe = gt_to_xe(lrc->gt);
1187 	const u32 *user_batch;
1188 	u32 *cmd = batch;
1189 	u32 count;
1190 
1191 	count = xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
1192 						   hwe->class, &user_batch);
1193 	if (!count)
1194 		return 0;
1195 
1196 	if (count > max_len)
1197 		return -ENOSPC;
1198 
1199 	/*
1200 	 * This should be used only for tests and validation. Taint the kernel
1201 	 * as anything could be submitted directly in context switches
1202 	 */
1203 	add_taint(TAINT_TEST, LOCKDEP_STILL_OK);
1204 
1205 	memcpy(cmd, user_batch, count * sizeof(u32));
1206 	cmd += count;
1207 
1208 	return cmd - batch;
1209 }
1210 
1211 static ssize_t setup_invalidate_state_cache_wa(struct xe_lrc *lrc,
1212 					       struct xe_hw_engine *hwe,
1213 					       u32 *batch, size_t max_len)
1214 {
1215 	u32 *cmd = batch;
1216 
1217 	if (!XE_GT_WA(lrc->gt, 18022495364) ||
1218 	    hwe->class != XE_ENGINE_CLASS_RENDER)
1219 		return 0;
1220 
1221 	if (xe_gt_WARN_ON(lrc->gt, max_len < 3))
1222 		return -ENOSPC;
1223 
1224 	*cmd++ = MI_LOAD_REGISTER_IMM | MI_LRI_LRM_CS_MMIO | MI_LRI_NUM_REGS(1);
1225 	*cmd++ = CS_DEBUG_MODE2(0).addr;
1226 	*cmd++ = REG_MASKED_FIELD_ENABLE(INSTRUCTION_STATE_CACHE_INVALIDATE);
1227 
1228 	return cmd - batch;
1229 }
1230 
1231 static ssize_t setup_invalidate_auxccs_wa(struct xe_lrc *lrc,
1232 					  struct xe_hw_engine *hwe,
1233 					  u32 *batch, size_t max_len)
1234 {
1235 	struct xe_gt *gt = lrc->gt;
1236 	u32 *(*emit)(struct xe_gt *gt, u32 *cmd) =
1237 		gt->ring_ops[hwe->class]->emit_aux_table_inv;
1238 
1239 	if (!emit)
1240 		return 0;
1241 
1242 	if (xe_gt_WARN_ON(gt, max_len < 8))
1243 		return -ENOSPC;
1244 
1245 	return emit(gt, batch) - batch;
1246 }
1247 
1248 struct bo_setup {
1249 	ssize_t (*setup)(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
1250 			 u32 *batch, size_t max_size);
1251 };
1252 
1253 struct bo_setup_state {
1254 	/* Input: */
1255 	struct xe_lrc		*lrc;
1256 	struct xe_hw_engine	*hwe;
1257 	size_t			max_size;
1258 	size_t                  reserve_dw;
1259 	unsigned int		offset;
1260 	const struct bo_setup	*funcs;
1261 	unsigned int		num_funcs;
1262 
1263 	/* State: */
1264 	u32			*buffer;
1265 	u32			*ptr;
1266 	unsigned int		written;
1267 };
1268 
1269 static int setup_bo(struct bo_setup_state *state)
1270 {
1271 	ssize_t remain;
1272 
1273 	if (state->lrc->bo->vmap.is_iomem) {
1274 		xe_gt_assert(state->hwe->gt, state->buffer);
1275 		state->ptr = state->buffer;
1276 	} else {
1277 		state->ptr = state->lrc->bo->vmap.vaddr + state->offset;
1278 	}
1279 
1280 	remain = state->max_size / sizeof(u32);
1281 
1282 	for (size_t i = 0; i < state->num_funcs; i++) {
1283 		ssize_t len = state->funcs[i].setup(state->lrc, state->hwe,
1284 						    state->ptr, remain);
1285 
1286 		remain -= len;
1287 
1288 		/*
1289 		 * Caller has asked for at least reserve_dw to remain unused.
1290 		 */
1291 		if (len < 0 ||
1292 		    xe_gt_WARN_ON(state->lrc->gt, remain < state->reserve_dw))
1293 			goto fail;
1294 
1295 		state->ptr += len;
1296 		state->written += len;
1297 	}
1298 
1299 	return 0;
1300 
1301 fail:
1302 	return -ENOSPC;
1303 }
1304 
1305 static void finish_bo(struct bo_setup_state *state)
1306 {
1307 	if (!state->lrc->bo->vmap.is_iomem)
1308 		return;
1309 
1310 	xe_map_memcpy_to(gt_to_xe(state->lrc->gt), &state->lrc->bo->vmap,
1311 			 state->offset, state->buffer,
1312 			 state->written * sizeof(u32));
1313 }
1314 
1315 /**
1316  * xe_lrc_setup_wa_bb_with_scratch - Execute all wa bb setup callbacks.
1317  * @lrc: the &xe_lrc struct instance
1318  * @hwe: the &xe_hw_engine struct instance
1319  * @scratch: preallocated scratch buffer for temporary storage
1320  * Return: 0 on success, negative error code on failure
1321  */
1322 int xe_lrc_setup_wa_bb_with_scratch(struct xe_lrc *lrc, struct xe_hw_engine *hwe, u32 *scratch)
1323 {
1324 	static const struct bo_setup funcs[] = {
1325 		{ .setup = setup_timestamp_wa },
1326 		{ .setup = setup_invalidate_state_cache_wa },
1327 		{ .setup = setup_utilization_wa },
1328 		{ .setup = setup_configfs_post_ctx_restore_bb },
1329 	};
1330 	struct bo_setup_state state = {
1331 		.lrc = lrc,
1332 		.hwe = hwe,
1333 		.max_size = LRC_WA_BB_SIZE,
1334 		.buffer = scratch,
1335 		.reserve_dw = 1,
1336 		.offset = __xe_lrc_wa_bb_offset(lrc),
1337 		.funcs = funcs,
1338 		.num_funcs = ARRAY_SIZE(funcs),
1339 	};
1340 	int ret;
1341 
1342 	ret = setup_bo(&state);
1343 	if (ret)
1344 		return ret;
1345 
1346 	*state.ptr++ = MI_BATCH_BUFFER_END;
1347 	state.written++;
1348 
1349 	finish_bo(&state);
1350 
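	/*
	 * The "+ 1" below sets bit 0 of BB_PER_CTX_PTR, which (as programmed
	 * here) marks the per-context batch as valid so the CS executes it.
	 */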
1351 	xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
1352 			     xe_bo_ggtt_addr(lrc->bo) + state.offset + 1);
1353 
1354 	return 0;
1355 }
1356 
1357 static int setup_wa_bb(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
1358 {
1359 	u32 *buf = NULL;
1360 	int ret;
1361 
1362 	if (lrc->bo->vmap.is_iomem) {
1363 		buf = kmalloc(LRC_WA_BB_SIZE, GFP_KERNEL);
1364 		if (!buf)
1365 			return -ENOMEM;
1366 	}
1367 
1368 	ret = xe_lrc_setup_wa_bb_with_scratch(lrc, hwe, buf);
1369 
1370 	kfree(buf);
1371 
1372 	return ret;
1373 }
1374 
1375 static int
1376 setup_indirect_ctx(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
1377 {
1378 	static const struct bo_setup rcs_funcs[] = {
1379 		{ .setup = setup_timestamp_wa },
1380 		{ .setup = setup_invalidate_auxccs_wa },
1381 		{ .setup = setup_configfs_mid_ctx_restore_bb },
1382 	};
1383 	static const struct bo_setup xcs_funcs[] = {
1384 		{ .setup = setup_invalidate_auxccs_wa },
1385 		{ .setup = setup_configfs_mid_ctx_restore_bb },
1386 	};
1387 	struct bo_setup_state state = {
1388 		.lrc = lrc,
1389 		.hwe = hwe,
1390 		.max_size = (63 * 64) /* max 63 cachelines */,
1391 		.buffer = NULL,
1392 		.offset = __xe_lrc_indirect_ctx_offset(lrc),
1393 	};
1394 	int ret;
1395 
1396 	if (!(lrc->flags & XE_LRC_FLAG_INDIRECT_CTX))
1397 		return 0;
1398 
1399 	if (hwe->class == XE_ENGINE_CLASS_RENDER ||
1400 	    hwe->class == XE_ENGINE_CLASS_COMPUTE) {
1401 		state.funcs = rcs_funcs;
1402 		state.num_funcs = ARRAY_SIZE(rcs_funcs);
1403 	} else {
1404 		state.funcs = xcs_funcs;
1405 		state.num_funcs = ARRAY_SIZE(xcs_funcs);
1406 	}
1407 
1408 	if (xe_gt_WARN_ON(lrc->gt, !state.funcs))
1409 		return 0;
1410 
1411 	if (lrc->bo->vmap.is_iomem) {
1412 		state.buffer = kmalloc(state.max_size, GFP_KERNEL);
1413 		if (!state.buffer)
1414 			return -ENOMEM;
1415 	}
1416 
1417 	ret = setup_bo(&state);
1418 	if (ret) {
1419 		kfree(state.buffer);
1420 		return ret;
1421 	}
1422 
1423 	/*
1424 	 * Align to 64B cacheline so there's no garbage at the end for CS to
1425 	 * execute: size for indirect ctx must be a multiple of 64.
1426 	 */
1427 	while (state.written & 0xf) {
1428 		*state.ptr++ = MI_NOOP;
1429 		state.written++;
1430 	}
1431 
1432 	finish_bo(&state);
1433 	kfree(state.buffer);
1434 
1435 	/*
1436 	 * Enable INDIRECT_CTX leaving INDIRECT_CTX_OFFSET at its default: it
1437 	 * varies per engine class, but the default is good enough
1438 	 */
1439 	xe_lrc_write_ctx_reg(lrc,
1440 			     CTX_CS_INDIRECT_CTX,
1441 			     (xe_bo_ggtt_addr(lrc->bo) + state.offset) |
1442 			     /* Size in CLs. */
1443 			     (state.written * sizeof(u32) / 64));
1444 
1445 	return 0;
1446 }
1447 
1448 static u8 xe_multi_queue_prio_to_lrc(struct xe_lrc *lrc, enum xe_multi_queue_priority priority)
1449 {
1450 	struct xe_device *xe = gt_to_xe(lrc->gt);
1451 
1452 	xe_assert(xe, (priority >= XE_MULTI_QUEUE_PRIORITY_LOW &&
1453 		       priority <= XE_MULTI_QUEUE_PRIORITY_HIGH));
1454 
1455 	/* xe_multi_queue_priority is directly mapped to LRC priority values */
1456 	return priority;
1457 }
1458 
1459 /**
1460  * xe_lrc_set_multi_queue_priority() - Set multi queue priority in LRC
1461  * @lrc: Logical Ring Context
1462  * @priority: Multi queue priority of the exec queue
1463  *
1464  * Convert @priority to LRC multi queue priority and update the @lrc descriptor
1465  */
1466 void xe_lrc_set_multi_queue_priority(struct xe_lrc *lrc, enum xe_multi_queue_priority priority)
1467 {
1468 	lrc->desc &= ~LRC_PRIORITY;
1469 	lrc->desc |= FIELD_PREP(LRC_PRIORITY, xe_multi_queue_prio_to_lrc(lrc, priority));
1470 }
1471 
1472 static int xe_lrc_ctx_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
1473 			   void *replay_state, u16 msix_vec, u32 init_flags)
1474 {
1475 	struct xe_gt *gt = hwe->gt;
1476 	struct xe_tile *tile = gt_to_tile(gt);
1477 	struct xe_device *xe = gt_to_xe(gt);
1478 	struct iosys_map map;
1479 	u32 arb_enable;
1480 	u32 state_cache_perf_fix[3];
1481 	int err;
1482 
1483 	/*
1484 	 * Init the Per-Process HW Status Page and the LRC / context state to
1485 	 * known values. If there's already a primed default_lrc, just copy it;
1486 	 * otherwise this is the early submission used to record the default
1487 	 * LRC, so build a new empty one from scratch.
1488 	 */
1489 	map = __xe_lrc_pphwsp_map(lrc);
1490 	if (gt->default_lrc[hwe->class] || replay_state) {
1491 		xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE);	/* PPHWSP */
1492 		xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
1493 				 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
1494 				 lrc->size - LRC_PPHWSP_SIZE);
1495 		if (replay_state)
1496 			xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
1497 					 replay_state, lrc->replay_size);
1498 	} else {
1499 		void *init_data = empty_lrc_data(hwe);
1500 
1501 		if (!init_data) {
1502 			return -ENOMEM;
1503 		}
1504 
1505 		xe_map_memcpy_to(xe, &map, 0, init_data, lrc->size);
1506 		kfree(init_data);
1507 	}
1508 
1509 	if (vm)
1510 		xe_lrc_set_ppgtt(lrc, vm);
1511 
1512 	if (xe_device_has_msix(xe)) {
1513 		xe_lrc_write_ctx_reg(lrc, CTX_INT_STATUS_REPORT_PTR,
1514 				     xe_memirq_status_ptr(&tile->memirq, hwe));
1515 		xe_lrc_write_ctx_reg(lrc, CTX_INT_SRC_REPORT_PTR,
1516 				     xe_memirq_source_ptr(&tile->memirq, hwe));
1517 		xe_lrc_write_ctx_reg(lrc, CTX_CS_INT_VEC_DATA, msix_vec << 16 | msix_vec);
1518 	}
1519 
1520 	if (xe_gt_has_indirect_ring_state(gt)) {
1521 		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
1522 				     __xe_lrc_indirect_ring_ggtt_addr(lrc));
1523 
1524 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
1525 					      __xe_lrc_ring_ggtt_addr(lrc));
1526 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START_UDW, 0);
1527 
1528 		/* Match head and tail pointers */
1529 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, lrc->ring.tail);
1530 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, lrc->ring.tail);
1531 
1532 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_CTL,
1533 					      RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
1534 	} else {
1535 		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
1536 
1537 		/* Match head and tail pointers */
1538 		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, lrc->ring.tail);
1539 		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);
1540 
1541 		xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
1542 				     RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
1543 	}
1544 
1545 	if (init_flags & XE_LRC_CREATE_RUNALONE)
1546 		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
1547 				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
1548 				     REG_MASKED_FIELD_ENABLE(CTX_CTRL_RUN_ALONE));
1549 
1550 	if (init_flags & XE_LRC_CREATE_PXP)
1551 		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
1552 				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
1553 				     REG_MASKED_FIELD_ENABLE(CTX_CTRL_PXP_ENABLE));
1554 
1555 	lrc->ctx_timestamp = 0;
1556 	xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0);
1557 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
1558 		xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP_UDW, 0);
1559 
1560 	if (xe->info.has_asid && vm)
1561 		xe_lrc_write_ctx_reg(lrc, CTX_ASID, vm->usm.asid);
1562 
1563 	lrc->desc = LRC_VALID;
1564 	lrc->desc |= FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT);
1565 	/* TODO: Priority */
1566 
1567 	/* While this appears to have something about privileged batches or
1568 	 * some such, it really just means PPGTT mode.
1569 	 */
1570 	if (vm)
1571 		lrc->desc |= LRC_PRIVILEGE;
1572 
1573 	if (GRAPHICS_VERx100(xe) < 1250) {
1574 		lrc->desc |= FIELD_PREP(LRC_ENGINE_INSTANCE, hwe->instance);
1575 		lrc->desc |= FIELD_PREP(LRC_ENGINE_CLASS, hwe->class);
1576 	}
1577 
1578 	arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1579 	xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));
1580 
1581 	if (init_flags & XE_LRC_DISABLE_STATE_CACHE_PERF_FIX) {
1582 		state_cache_perf_fix[0] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1);
1583 		state_cache_perf_fix[1] = COMMON_SLICE_CHICKEN3.addr;
1584 		state_cache_perf_fix[2] = REG_MASKED_FIELD_ENABLE(DISABLE_STATE_CACHE_PERF_FIX);
1585 		xe_lrc_write_ring(lrc, state_cache_perf_fix, sizeof(state_cache_perf_fix));
1586 	}
1587 
1588 	map = __xe_lrc_seqno_map(lrc);
1589 	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
1590 
1591 	map = __xe_lrc_start_seqno_map(lrc);
1592 	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
1593 
1594 	err = setup_wa_bb(lrc, hwe);
1595 	if (err)
1596 		return err;
1597 
1598 	err = setup_indirect_ctx(lrc, hwe);
1599 
1600 	return err;
1601 }
1602 
1603 static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
1604 		       void *replay_state, u32 ring_size, u16 msix_vec, u32 init_flags)
1605 {
1606 	struct xe_gt *gt = hwe->gt;
1607 	const u32 lrc_size = xe_gt_lrc_size(gt, hwe->class);
1608 	u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
1609 	struct xe_tile *tile = gt_to_tile(gt);
1610 	struct xe_device *xe = gt_to_xe(gt);
1611 	struct xe_bo *bo;
1612 	u32 bo_flags;
1613 	int err;
1614 
1615 	kref_init(&lrc->refcount);
1616 	lrc->gt = gt;
1617 	lrc->replay_size = xe_gt_lrc_hang_replay_size(gt, hwe->class);
1618 	lrc->size = lrc_size;
1619 	lrc->flags = 0;
1620 	lrc->ring.size = ring_size;
1621 	lrc->ring.tail = 0;
1622 
1623 	if (gt_engine_needs_indirect_ctx(gt, hwe->class)) {
1624 		lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
1625 		bo_size += LRC_INDIRECT_CTX_BO_SIZE;
1626 	}
1627 
1628 	if (xe_gt_has_indirect_ring_state(gt))
1629 		lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;
1630 
1631 	bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
1632 		   XE_BO_FLAG_GGTT_INVALIDATE;
1633 
1634 	if ((vm && vm->xef) || init_flags & XE_LRC_CREATE_USER_CTX) /* userspace */
1635 		bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE | XE_BO_FLAG_FORCE_USER_VRAM;
1636 
1637 	bo = xe_bo_create_pin_map_novm(xe, tile, bo_size,
1638 				       ttm_bo_type_kernel,
1639 				       bo_flags, false);
1640 	if (IS_ERR(bo))
1641 		return PTR_ERR(bo);
1642 
1643 	lrc->bo = bo;
1644 
1645 	bo = xe_bo_create_pin_map_novm(xe, tile, PAGE_SIZE,
1646 				       ttm_bo_type_kernel,
1647 				       XE_BO_FLAG_GGTT |
1648 				       XE_BO_FLAG_GGTT_INVALIDATE |
1649 				       XE_BO_FLAG_SYSTEM, false);
1650 	if (IS_ERR(bo)) {
1651 		err = PTR_ERR(bo);
1652 		goto err_lrc_finish;
1653 	}
1654 	lrc->seqno_bo = bo;
1655 
1656 	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
1657 			     hwe->fence_irq, hwe->name);
1658 
1659 	err = xe_lrc_ctx_init(lrc, hwe, vm, replay_state, msix_vec, init_flags);
1660 	if (err)
1661 		goto err_lrc_finish;
1662 
1663 	if (vm && vm->xef)
1664 		xe_drm_client_add_bo(vm->xef->client, lrc->bo);
1665 
1666 	return 0;
1667 
1668 err_lrc_finish:
1669 	xe_lrc_finish(lrc);
1670 	return err;
1671 }
1672 
1673 /**
1674  * xe_lrc_create - Create a LRC
1675  * @hwe: Hardware Engine
1676  * @vm: The VM (address space)
1677  * @replay_state: GPU hang replay state
1678  * @ring_size: LRC ring size
1679  * @msix_vec: MSI-X interrupt vector (for platforms that support it)
1680  * @flags: LRC initialization flags
1681  *
1682  * Allocate and initialize the Logical Ring Context (LRC).
1683  *
1684  * Return pointer to created LRC upon success and an error pointer
1685  * upon failure.
1686  */
1687 struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
1688 			     void *replay_state, u32 ring_size, u16 msix_vec, u32 flags)
1689 {
1690 	struct xe_lrc *lrc;
1691 	int err;
1692 
1693 	lrc = kzalloc_obj(*lrc);
1694 	if (!lrc)
1695 		return ERR_PTR(-ENOMEM);
1696 
1697 	err = xe_lrc_init(lrc, hwe, vm, replay_state, ring_size, msix_vec, flags);
1698 	if (err) {
1699 		kfree(lrc);
1700 		return ERR_PTR(err);
1701 	}
1702 
1703 	return lrc;
1704 }
1705 
1706 /**
1707  * xe_lrc_destroy - Destroy the LRC
1708  * @ref: reference to LRC
1709  *
1710  * Called when ref == 0, release resources held by the Logical Ring Context
1711  * (LRC) and free the LRC memory.
1712  */
1713 void xe_lrc_destroy(struct kref *ref)
1714 {
1715 	struct xe_lrc *lrc = container_of(ref, struct xe_lrc, refcount);
1716 
1717 	xe_lrc_finish(lrc);
1718 	kfree(lrc);
1719 }
1720 
1721 /**
1722  * xe_lrc_update_hwctx_regs_with_address - Re-compute GGTT references within given LRC.
1723  * @lrc: the &xe_lrc struct instance
1724  */
1725 void xe_lrc_update_hwctx_regs_with_address(struct xe_lrc *lrc)
1726 {
1727 	if (xe_lrc_has_indirect_ring_state(lrc)) {
1728 		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
1729 				     __xe_lrc_indirect_ring_ggtt_addr(lrc));
1730 
1731 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
1732 					      __xe_lrc_ring_ggtt_addr(lrc));
1733 	} else {
1734 		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
1735 	}
1736 }
1737 
1738 void xe_lrc_set_ring_tail(struct xe_lrc *lrc, u32 tail)
1739 {
1740 	if (xe_lrc_has_indirect_ring_state(lrc))
1741 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, tail);
1742 	else
1743 		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, tail);
1744 }
1745 
1746 u32 xe_lrc_ring_tail(struct xe_lrc *lrc)
1747 {
1748 	if (xe_lrc_has_indirect_ring_state(lrc))
1749 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL) & TAIL_ADDR;
1750 	else
1751 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL) & TAIL_ADDR;
1752 }
1753 
1754 static u32 xe_lrc_ring_start(struct xe_lrc *lrc)
1755 {
1756 	if (xe_lrc_has_indirect_ring_state(lrc))
1757 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START);
1758 	else
1759 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_START);
1760 }
1761 
1762 void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
1763 {
1764 	if (xe_lrc_has_indirect_ring_state(lrc))
1765 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, head);
1766 	else
1767 		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
1768 }
1769 
1770 u32 xe_lrc_ring_head(struct xe_lrc *lrc)
1771 {
1772 	if (xe_lrc_has_indirect_ring_state(lrc))
1773 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD) & HEAD_ADDR;
1774 	else
1775 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
1776 }
1777 
1778 u32 xe_lrc_ring_space(struct xe_lrc *lrc)
1779 {
1780 	const u32 head = xe_lrc_ring_head(lrc);
1781 	const u32 tail = lrc->ring.tail;
1782 	const u32 size = lrc->ring.size;
1783 
1784 	return ((head - tail - 1) & (size - 1)) + 1;
1785 }
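
/*
 * Worked example for the free-space computation above, which relies on
 * lrc->ring.size being a power of two: with size = 16, head = 4 and tail = 9,
 * ((4 - 9 - 1) & 15) + 1 = (-6 & 15) + 1 = 11, i.e. the size minus the 5
 * bytes currently in flight between head and tail. An idle ring
 * (head == tail) reports the full size as available.
 */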
1786 
1787 static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
1788 				const void *data, size_t size)
1789 {
1790 	struct xe_device *xe = lrc_to_xe(lrc);
1791 
1792 	iosys_map_incr(&ring, lrc->ring.tail);
1793 	xe_map_memcpy_to(xe, &ring, 0, data, size);
1794 	lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
1795 }
1796 
1797 void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
1798 {
1799 	struct xe_device *xe = lrc_to_xe(lrc);
1800 	struct iosys_map ring;
1801 	u32 rhs;
1802 	size_t aligned_size;
1803 
1804 	xe_assert(xe, IS_ALIGNED(size, 4));
1805 	aligned_size = ALIGN(size, 8);
1806 
1807 	ring = __xe_lrc_ring_map(lrc);
1808 
1809 	xe_assert(xe, lrc->ring.tail < lrc->ring.size);
1810 	rhs = lrc->ring.size - lrc->ring.tail;
1811 	if (size > rhs) {
1812 		__xe_lrc_write_ring(lrc, ring, data, rhs);
1813 		__xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
1814 	} else {
1815 		__xe_lrc_write_ring(lrc, ring, data, size);
1816 	}
1817 
1818 	if (aligned_size > size) {
1819 		u32 noop = MI_NOOP;
1820 
1821 		__xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
1822 	}
1823 }
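
/*
 * Wrap-around illustration for the split write above (numbers are made up):
 * with a 4096-byte ring, tail = 4088 and a 16-byte payload, rhs = 8, so the
 * first __xe_lrc_write_ring() fills the 8 bytes up to the ring end and the
 * second copies the remaining 8 bytes at offset 0, leaving tail = 8.
 * Separately, a lone 4-byte dword gets padded with one MI_NOOP
 * (aligned_size > size) so the tail stays 8-byte aligned.
 */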
1824 
1825 u64 xe_lrc_descriptor(struct xe_lrc *lrc)
1826 {
1827 	return lrc->desc | xe_lrc_ggtt_addr(lrc);
1828 }
1829 
1830 u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
1831 {
1832 	return __xe_lrc_seqno_ggtt_addr(lrc);
1833 }
1834 
1835 /**
1836  * xe_lrc_alloc_seqno_fence() - Allocate an lrc seqno fence.
1837  *
1838  * Allocate but don't initialize an lrc seqno fence.
1839  *
1840  * Return: Pointer to the allocated fence or
1841  * negative error pointer on error.
1842  */
1843 struct dma_fence *xe_lrc_alloc_seqno_fence(void)
1844 {
1845 	return xe_hw_fence_alloc();
1846 }
1847 
1848 /**
1849  * xe_lrc_free_seqno_fence() - Free an lrc seqno fence.
1850  * @fence: Pointer to the fence to free.
1851  *
1852  * Frees an lrc seqno fence that hasn't yet been
1853  * initialized.
1854  */
1855 void xe_lrc_free_seqno_fence(struct dma_fence *fence)
1856 {
1857 	xe_hw_fence_free(fence);
1858 }
1859 
1860 /**
1861  * xe_lrc_init_seqno_fence() - Initialize an lrc seqno fence.
1862  * @lrc: Pointer to the lrc.
1863  * @fence: Pointer to the fence to initialize.
1864  *
1865  * Initializes a pre-allocated lrc seqno fence.
1866  * After initialization, the fence is subject to normal
1867  * dma-fence refcounting.
1868  */
1869 void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence)
1870 {
1871 	xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc));
1872 }
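
/*
 * The alloc/init split above exists so a fence can be allocated before its
 * seqno slot is committed; a sketch of the expected pattern (error handling
 * abbreviated):
 *
 *	fence = xe_lrc_alloc_seqno_fence();
 *	if (IS_ERR(fence))
 *		return PTR_ERR(fence);
 *	...
 *	if (err) {
 *		xe_lrc_free_seqno_fence(fence);	// never initialized
 *		return err;
 *	}
 *	xe_lrc_init_seqno_fence(lrc, fence);	// dma-fence refcounting applies now
 */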
1873 
1874 s32 xe_lrc_seqno(struct xe_lrc *lrc)
1875 {
1876 	struct iosys_map map = __xe_lrc_seqno_map(lrc);
1877 
1878 	return xe_map_read32(lrc_to_xe(lrc), &map);
1879 }
1880 
1881 s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
1882 {
1883 	struct iosys_map map = __xe_lrc_start_seqno_map(lrc);
1884 
1885 	return xe_map_read32(lrc_to_xe(lrc), &map);
1886 }
1887 
1888 u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
1889 {
1890 	return __xe_lrc_start_seqno_ggtt_addr(lrc);
1891 }
1892 
1893 u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
1894 {
1895 	return __xe_lrc_parallel_ggtt_addr(lrc);
1896 }
1897 
1898 struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
1899 {
1900 	return __xe_lrc_parallel_map(lrc);
1901 }
1902 
1903 /**
1904  * xe_lrc_engine_id() - Read engine id value
1905  * @lrc: Pointer to the lrc.
1906  *
1907  * Returns: engine id value
1908  */
1909 static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
1910 {
1911 	struct xe_device *xe = lrc_to_xe(lrc);
1912 	struct iosys_map map;
1913 
1914 	map = __xe_lrc_engine_id_map(lrc);
1915 	return xe_map_read32(xe, &map);
1916 }
1917 
1918 static int instr_dw(u32 cmd_header)
1919 {
1920 	/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
1921 	if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
1922 	    GFXPIPE_SINGLE_DW_CMD(0, 0))
1923 		return 1;
1924 
1925 	/* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
1926 	if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
1927 		return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;
1928 
1929 	/* Most instructions have the # of dwords (minus 2) in 7:0 */
1930 	return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
1931 }
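
/*
 * Decode example for the common case above: bits 7:0 of the header hold the
 * total dword count minus two, so a header with 0x04 in its low byte is a
 * 6-dword instruction. The two exceptions handled first are GFXPIPE
 * "single dword" opcodes (always 1 dword, no length field) and
 * 3DSTATE_SO_DECL_LIST (9-bit length field, same bias of 2).
 */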
1932 
1933 static int dump_mi_command(struct drm_printer *p,
1934 			   struct xe_gt *gt,
1935 			   u32 *start,
1936 			   u32 *dw,
1937 			   int remaining_dw)
1938 {
1939 	u32 inst_header = *dw;
1940 	u32 numdw = instr_dw(inst_header);
1941 	u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
1942 	int num_noop;
1943 
1944 	/* First check for commands that don't have/use a '# DW' field */
1945 	switch (inst_header & MI_OPCODE) {
1946 	case MI_NOOP:
1947 		num_noop = 1;
1948 		while (num_noop < remaining_dw &&
1949 		       (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
1950 			num_noop++;
1951 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] MI_NOOP (%d dwords)\n",
1952 			   dw - num_noop - start, inst_header, num_noop);
1953 		return num_noop;
1954 
1955 	case MI_TOPOLOGY_FILTER:
1956 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] MI_TOPOLOGY_FILTER\n",
1957 			   dw - start, inst_header);
1958 		return 1;
1959 
1960 	case MI_BATCH_BUFFER_END:
1961 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] MI_BATCH_BUFFER_END\n",
1962 			   dw - start, inst_header);
1963 		/* Return 'remaining_dw' to consume the rest of the LRC */
1964 		return remaining_dw;
1965 	}
1966 
1967 	/*
1968 	 * Any remaining commands include a # of dwords.  We should make sure
1969 	 * it doesn't exceed the remaining size of the LRC.
1970 	 */
1971 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
1972 		numdw = remaining_dw;
1973 
1974 	switch (inst_header & MI_OPCODE) {
1975 	case MI_LOAD_REGISTER_IMM:
1976 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
1977 			   dw - start, inst_header, (numdw - 1) / 2);
1978 		for (int i = 1; i < numdw; i += 2)
1979 			drm_printf(p, "LRC[%#5tx]  =  - %#6x = %#010x\n",
1980 				   &dw[i] - start, dw[i], dw[i + 1]);
1981 		return numdw;
1982 
1983 	case MI_LOAD_REGISTER_MEM & MI_OPCODE:
1984 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] MI_LOAD_REGISTER_MEM: %s%s\n",
1985 			   dw - start, inst_header,
1986 			   dw[0] & MI_LRI_LRM_CS_MMIO ? "CS_MMIO " : "",
1987 			   dw[0] & MI_LRM_USE_GGTT ? "USE_GGTT " : "");
1988 		if (numdw == 4)
1989 			drm_printf(p, "LRC[%#5tx]  =  - %#6x = %#010llx\n",
1990 				   dw - start,
1991 				   dw[1], ((u64)(dw[3]) << 32 | (u64)(dw[2])));
1992 		else
1993 			drm_printf(p, "LRC[%#5tx]  =  - %*ph (%s)\n",
1994 				   dw - start, (int)sizeof(u32) * (numdw - 1),
1995 				   dw + 1, numdw < 4 ? "truncated" : "malformed");
1996 		return numdw;
1997 
1998 	case MI_FORCE_WAKEUP:
1999 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] MI_FORCE_WAKEUP\n",
2000 			   dw - start, inst_header);
2001 		return numdw;
2002 
2003 	default:
2004 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] unknown MI opcode %#x, likely %d dwords\n",
2005 			   dw - start, inst_header, opcode, numdw);
2006 		return numdw;
2007 	}
2008 }
2009 
2010 static int dump_gfxpipe_command(struct drm_printer *p,
2011 				struct xe_gt *gt,
2012 				u32 *start,
2013 				u32 *dw,
2014 				int remaining_dw)
2015 {
2016 	u32 numdw = instr_dw(*dw);
2017 	u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
2018 	u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
2019 	u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);
2020 
2021 	/*
2022 	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
2023 	 * remaining size of the LRC.
2024 	 */
2025 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
2026 		numdw = remaining_dw;
2027 
2028 	switch (*dw & GFXPIPE_MATCH_MASK) {
2029 #define MATCH(cmd) \
2030 	case cmd: \
2031 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] " #cmd " (%d dwords)\n", \
2032 			   dw - start, *dw, numdw); \
2033 		return numdw
2034 #define MATCH3D(cmd) \
2035 	case CMD_##cmd: \
2036 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] " #cmd " (%d dwords)\n", \
2037 			   dw - start, *dw, numdw); \
2038 		return numdw
2039 
2040 	MATCH(STATE_BASE_ADDRESS);
2041 	MATCH(STATE_SIP);
2042 	MATCH(GPGPU_CSR_BASE_ADDRESS);
2043 	MATCH(STATE_COMPUTE_MODE);
2044 	MATCH3D(3DSTATE_BTD);
2045 	MATCH(STATE_SYSTEM_MEM_FENCE_ADDRESS);
2046 	MATCH(STATE_CONTEXT_DATA_BASE_ADDRESS);
2047 
2048 	MATCH3D(3DSTATE_VF_STATISTICS);
2049 
2050 	MATCH(PIPELINE_SELECT);
2051 
2052 	MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST);
2053 	MATCH3D(3DSTATE_CUSTOM_SAMPLE_PATTERN);
2054 	MATCH3D(3DSTATE_CLEAR_PARAMS);
2055 	MATCH3D(3DSTATE_DEPTH_BUFFER);
2056 	MATCH3D(3DSTATE_STENCIL_BUFFER);
2057 	MATCH3D(3DSTATE_HIER_DEPTH_BUFFER);
2058 	MATCH3D(3DSTATE_VERTEX_BUFFERS);
2059 	MATCH3D(3DSTATE_VERTEX_ELEMENTS);
2060 	MATCH3D(3DSTATE_INDEX_BUFFER);
2061 	MATCH3D(3DSTATE_VF);
2062 	MATCH3D(3DSTATE_MULTISAMPLE);
2063 	MATCH3D(3DSTATE_CC_STATE_POINTERS);
2064 	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS);
2065 	MATCH3D(3DSTATE_VS);
2066 	MATCH3D(3DSTATE_GS);
2067 	MATCH3D(3DSTATE_CLIP);
2068 	MATCH3D(3DSTATE_SF);
2069 	MATCH3D(3DSTATE_WM);
2070 	MATCH3D(3DSTATE_CONSTANT_VS);
2071 	MATCH3D(3DSTATE_CONSTANT_GS);
2072 	MATCH3D(3DSTATE_CONSTANT_PS);
2073 	MATCH3D(3DSTATE_SAMPLE_MASK);
2074 	MATCH3D(3DSTATE_CONSTANT_HS);
2075 	MATCH3D(3DSTATE_CONSTANT_DS);
2076 	MATCH3D(3DSTATE_HS);
2077 	MATCH3D(3DSTATE_TE);
2078 	MATCH3D(3DSTATE_DS);
2079 	MATCH3D(3DSTATE_STREAMOUT);
2080 	MATCH3D(3DSTATE_SBE);
2081 	MATCH3D(3DSTATE_PS);
2082 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
2083 	MATCH3D(3DSTATE_CPS_POINTERS);
2084 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC);
2085 	MATCH3D(3DSTATE_BLEND_STATE_POINTERS);
2086 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS);
2087 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS);
2088 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS);
2089 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS);
2090 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS);
2091 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS);
2092 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS);
2093 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS);
2094 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS);
2095 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS);
2096 	MATCH3D(3DSTATE_VF_INSTANCING);
2097 	MATCH3D(3DSTATE_VF_SGVS);
2098 	MATCH3D(3DSTATE_VF_TOPOLOGY);
2099 	MATCH3D(3DSTATE_WM_CHROMAKEY);
2100 	MATCH3D(3DSTATE_PS_BLEND);
2101 	MATCH3D(3DSTATE_WM_DEPTH_STENCIL);
2102 	MATCH3D(3DSTATE_PS_EXTRA);
2103 	MATCH3D(3DSTATE_RASTER);
2104 	MATCH3D(3DSTATE_SBE_SWIZ);
2105 	MATCH3D(3DSTATE_WM_HZ_OP);
2106 	MATCH3D(3DSTATE_VF_COMPONENT_PACKING);
2107 	MATCH3D(3DSTATE_VF_SGVS_2);
2108 	MATCH3D(3DSTATE_VFG);
2109 	MATCH3D(3DSTATE_URB_ALLOC_VS);
2110 	MATCH3D(3DSTATE_URB_ALLOC_HS);
2111 	MATCH3D(3DSTATE_URB_ALLOC_DS);
2112 	MATCH3D(3DSTATE_URB_ALLOC_GS);
2113 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_0);
2114 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_1);
2115 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_2);
2116 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_3);
2117 	MATCH3D(3DSTATE_PRIMITIVE_REPLICATION);
2118 	MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO);
2119 	MATCH3D(3DSTATE_AMFS);
2120 	MATCH3D(3DSTATE_DEPTH_BOUNDS);
2121 	MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS);
2122 	MATCH3D(3DSTATE_CONSTANT_TS_POINTER);
2123 	MATCH3D(3DSTATE_MESH_CONTROL);
2124 	MATCH3D(3DSTATE_MESH_DISTRIB);
2125 	MATCH3D(3DSTATE_TASK_REDISTRIB);
2126 	MATCH3D(3DSTATE_MESH_SHADER);
2127 	MATCH3D(3DSTATE_MESH_SHADER_DATA);
2128 	MATCH3D(3DSTATE_TASK_CONTROL);
2129 	MATCH3D(3DSTATE_TASK_SHADER);
2130 	MATCH3D(3DSTATE_TASK_SHADER_DATA);
2131 	MATCH3D(3DSTATE_URB_ALLOC_MESH);
2132 	MATCH3D(3DSTATE_URB_ALLOC_TASK);
2133 	MATCH3D(3DSTATE_CLIP_MESH);
2134 	MATCH3D(3DSTATE_SBE_MESH);
2135 	MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER);
2136 	MATCH3D(3DSTATE_COARSE_PIXEL);
2137 	MATCH3D(3DSTATE_MESH_SHADER_DATA_EXT);
2138 	MATCH3D(3DSTATE_TASK_SHADER_DATA_EXT);
2139 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC_2);
2140 	MATCH3D(3DSTATE_CC_STATE_POINTERS_2);
2141 	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS_2);
2142 	MATCH3D(3DSTATE_BLEND_STATE_POINTERS_2);
2143 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP_2);
2144 
2145 	MATCH3D(3DSTATE_DRAWING_RECTANGLE);
2146 	MATCH3D(3DSTATE_URB_MEMORY);
2147 	MATCH3D(3DSTATE_CHROMA_KEY);
2148 	MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET);
2149 	MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN);
2150 	MATCH3D(3DSTATE_LINE_STIPPLE);
2151 	MATCH3D(3DSTATE_AA_LINE_PARAMETERS);
2152 	MATCH3D(3DSTATE_MONOFILTER_SIZE);
2153 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS);
2154 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS);
2155 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS);
2156 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS);
2157 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS);
2158 	MATCH3D(3DSTATE_SO_DECL_LIST);
2159 	MATCH3D(3DSTATE_SO_BUFFER);
2160 	MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC);
2161 	MATCH3D(3DSTATE_SAMPLE_PATTERN);
2162 	MATCH3D(3DSTATE_3D_MODE);
2163 	MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE);
2164 	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS);
2165 	MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO);
2166 	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTER_2);
2167 
2168 	default:
2169 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n",
2170 			   dw - start, *dw, pipeline, opcode, subopcode, numdw);
2171 		return numdw;
2172 	}
2173 }
2174 
2175 static int dump_gfx_state_command(struct drm_printer *p,
2176 				  struct xe_gt *gt,
2177 				  u32 *start,
2178 				  u32 *dw,
2179 				  int remaining_dw)
2180 {
2181 	u32 numdw = instr_dw(*dw);
2182 	u32 opcode = REG_FIELD_GET(GFX_STATE_OPCODE, *dw);
2183 
2184 	/*
2185 	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
2186 	 * remaining size of the LRC.
2187 	 */
2188 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
2189 		numdw = remaining_dw;
2190 
2191 	switch (*dw & (XE_INSTR_GFX_STATE | GFX_STATE_OPCODE)) {
2192 	MATCH(STATE_WRITE_INLINE);
2193 
2194 	default:
2195 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] unknown GFX_STATE command (opcode=%#x), likely %d dwords\n",
2196 			   dw - start, *dw, opcode, numdw);
2197 		return numdw;
2198 	}
2199 }
2200 
2201 void xe_lrc_dump_default(struct drm_printer *p,
2202 			 struct xe_gt *gt,
2203 			 enum xe_engine_class hwe_class)
2204 {
2205 	u32 *dw, *start;
2206 	int remaining_dw, num_dw;
2207 
2208 	if (!gt->default_lrc[hwe_class]) {
2209 		drm_printf(p, "No default LRC for class %d\n", hwe_class);
2210 		return;
2211 	}
2212 
2213 	/*
2214 	 * Skip the beginning of the LRC since it contains the per-process
2215 	 * hardware status page.
2216 	 */
2217 	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
2218 	start = dw;
2219 	remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;
2220 
2221 	while (remaining_dw > 0) {
2222 		if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
2223 			num_dw = dump_mi_command(p, gt, start, dw, remaining_dw);
2224 		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
2225 			num_dw = dump_gfxpipe_command(p, gt, start, dw, remaining_dw);
2226 		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFX_STATE) {
2227 			num_dw = dump_gfx_state_command(p, gt, start, dw, remaining_dw);
2228 		} else {
2229 			num_dw = min(instr_dw(*dw), remaining_dw);
2230 			drm_printf(p, "LRC[%#5tx]  =  [%#010x] Unknown instruction of type %#x, likely %d dwords\n",
2231 				   dw - start,
2232 				   *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw),
2233 				   num_dw);
2234 		}
2235 
2236 		dw += num_dw;
2237 		remaining_dw -= num_dw;
2238 	}
2239 }
2240 
2241 /*
2242  * Lookup the value of a register within the offset/value pairs of an
2243  * MI_LOAD_REGISTER_IMM instruction.
2244  *
2245  * Return -ENOENT if the register is not present in the MI_LRI instruction.
2246  */
2247 static int lookup_reg_in_mi_lri(u32 offset, u32 *value,
2248 				const u32 *dword_pair, int num_regs)
2249 {
2250 	for (int i = 0; i < num_regs; i++) {
2251 		if (dword_pair[2 * i] == offset) {
2252 			*value = dword_pair[2 * i + 1];
2253 			return 0;
2254 		}
2255 	}
2256 
2257 	return -ENOENT;
2258 }
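
/*
 * For reference, the dword layout this helper walks: an MI_LOAD_REGISTER_IMM
 * carrying N register writes spans 1 + 2 * N dwords,
 *
 *	dw[0]		header (length field = 2 * N - 1)
 *	dw[1], dw[2]	offset #0, value #0
 *	dw[3], dw[4]	offset #1, value #1
 *	...
 *
 * so the caller below passes dw + 1 as dword_pair and (num_dw - 1) / 2 as
 * num_regs.
 */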
2259 
2260 /*
2261  * Lookup the value of a register in a specific engine type's default LRC.
2262  *
2263  * Return -EINVAL if the default LRC doesn't exist, or -ENOENT if the register
2264  * cannot be found in the default LRC.
2265  */
2266 int xe_lrc_lookup_default_reg_value(struct xe_gt *gt,
2267 				    enum xe_engine_class hwe_class,
2268 				    u32 offset,
2269 				    u32 *value)
2270 {
2271 	u32 *dw;
2272 	int remaining_dw, ret;
2273 
2274 	if (!gt->default_lrc[hwe_class])
2275 		return -EINVAL;
2276 
2277 	/*
2278 	 * Skip the beginning of the LRC since it contains the per-process
2279 	 * hardware status page.
2280 	 */
2281 	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
2282 	remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;
2283 
2284 	while (remaining_dw > 0) {
2285 		u32 num_dw = instr_dw(*dw);
2286 
2287 		if (num_dw > remaining_dw)
2288 			num_dw = remaining_dw;
2289 
2290 		switch (*dw & XE_INSTR_CMD_TYPE) {
2291 		case XE_INSTR_MI:
2292 			switch (*dw & MI_OPCODE) {
2293 			case MI_BATCH_BUFFER_END:
2294 				/* End of LRC; register not found */
2295 				return -ENOENT;
2296 
2297 			case MI_NOOP:
2298 			case MI_TOPOLOGY_FILTER:
2299 				/*
2300 				 * MI_NOOP and MI_TOPOLOGY_FILTER don't have
2301 				 * a length field and are always 1-dword
2302 				 * instructions.
2303 				 */
2304 				remaining_dw--;
2305 				dw++;
2306 				break;
2307 
2308 			case MI_LOAD_REGISTER_IMM:
2309 				ret = lookup_reg_in_mi_lri(offset, value,
2310 							   dw + 1, (num_dw - 1) / 2);
2311 				if (ret == 0)
2312 					return 0;
2313 
2314 				fallthrough;
2315 
2316 			default:
2317 				/*
2318 				 * Jump to next instruction based on length
2319 				 * field.
2320 				 */
2321 				remaining_dw -= num_dw;
2322 				dw += num_dw;
2323 				break;
2324 			}
2325 			break;
2326 
2327 		default:
2328 			/* Jump to next instruction based on length field. */
2329 			remaining_dw -= num_dw;
2330 			dw += num_dw;
2331 		}
2332 	}
2333 
2334 	return -ENOENT;
2335 }
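
/*
 * Usage sketch (the register offset is an arbitrary stand-in, not a real
 * recommendation):
 *
 *	u32 val;
 *
 *	if (!xe_lrc_lookup_default_reg_value(gt, XE_ENGINE_CLASS_RENDER,
 *					     0x2244, &val))
 *		xe_gt_dbg(gt, "default value: %#x\n", val);
 *
 * Only registers programmed via MI_LOAD_REGISTER_IMM in the default LRC can
 * be found this way; anything else yields -ENOENT.
 */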
2336 
2337 struct instr_state {
2338 	u32 instr;
2339 	u16 num_dw;
2340 };
2341 
2342 static const struct instr_state xe_hpg_svg_state[] = {
2343 	{ .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 },
2344 	{ .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 },
2345 	{ .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 },
2346 	{ .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 },
2347 	{ .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 },
2348 	{ .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 },
2349 	{ .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 },
2350 	{ .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 },
2351 	{ .instr = CMD_3DSTATE_VS, .num_dw = 9 },
2352 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 },
2353 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 },
2354 	{ .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 },
2355 	{ .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 },
2356 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 },
2357 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 },
2358 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 },
2359 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 },
2360 	{ .instr = CMD_3DSTATE_CLIP, .num_dw = 4 },
2361 	{ .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 },
2362 	{ .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 },
2363 	{ .instr = CMD_3DSTATE_SF, .num_dw = 4 },
2364 	{ .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 },
2365 	{ .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 },
2366 	{ .instr = CMD_3DSTATE_RASTER, .num_dw = 5 },
2367 	{ .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 },
2368 	{ .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 },
2369 	{ .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 },
2370 	{ .instr = CMD_3DSTATE_HS, .num_dw = 9 },
2371 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 },
2372 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 },
2373 	{ .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 },
2374 	{ .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 },
2375 	{ .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 },
2376 	{ .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 },
2377 	{ .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 },
2378 	{ .instr = CMD_3DSTATE_TE, .num_dw = 5 },
2379 	{ .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 },
2380 	{ .instr = CMD_3DSTATE_DS, .num_dw = 11 },
2381 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
2382 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
2383 	{ .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
2384 	{ .instr = CMD_3DSTATE_GS, .num_dw = 10 },
2385 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
2386 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
2387 	{ .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
2388 	{ .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
2389 	{ .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
2390 	{ .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
2391 	{ .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
2392 	{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
2393 };
2394 
2395 u32 *xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, u32 *cs)
2396 {
2397 	struct xe_gt *gt = q->hwe->gt;
2398 	struct xe_device *xe = gt_to_xe(gt);
2399 	const struct instr_state *state_table = NULL;
2400 	int state_table_size = 0;
2401 
2402 	/*
2403 	 * Wa_14019789679
2404 	 *
2405 	 * If the driver doesn't explicitly emit the SVG instructions while
2406 	 * setting up the default LRC, the context switch will write 0's
2407 	 * (noops) into the LRC memory rather than the expected instruction
2408 	 * headers.  Application contexts start out as a copy of the default
2409 	 * LRC, and if they also do not emit specific settings for some SVG
2410 	 * state, then on context restore they'll unintentionally inherit
2411 	 * whatever state setting the previous context had programmed into the
2412 	 * hardware (i.e., the lack of a 3DSTATE_* instruction in the LRC will
2413 	 * prevent the hardware from resetting that state back to any specific
2414 	 * value).
2415 	 *
2416 	 * The official workaround only requires emitting 3DSTATE_MESH_CONTROL
2417 	 * since that's a specific state setting that can easily cause GPU
2418 	 * hangs if unintentionally inherited.  However to be safe we'll
2419 	 * continue to emit all of the SVG state since it's best not to leak
2420 	 * any of the state between contexts, even if that leakage is harmless.
2421 	 */
2422 	if (XE_GT_WA(gt, 14019789679) && q->hwe->class == XE_ENGINE_CLASS_RENDER) {
2423 		state_table = xe_hpg_svg_state;
2424 		state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
2425 	}
2426 
2427 	if (!state_table) {
2428 		xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
2429 			  GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
2430 		return cs;
2431 	}
2432 
2433 	for (int i = 0; i < state_table_size; i++) {
2434 		u32 instr = state_table[i].instr;
2435 		u16 num_dw = state_table[i].num_dw;
2436 		bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);
2437 
2438 		xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
2439 		xe_gt_assert(gt, num_dw != 0);
2440 		xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));
2441 
2442 		/*
2443 		 * Xe2's SVG context is the same as the one on DG2 / MTL
2444 		 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
2445 		 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
2446 		 * Just make the replacement here rather than defining a
2447 		 * whole separate table for the single trivial change.
2448 		 */
2449 		if (GRAPHICS_VER(xe) >= 20 &&
2450 		    instr == CMD_3DSTATE_DRAWING_RECTANGLE)
2451 			instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;
2452 
2453 		*cs = instr;
2454 		if (!is_single_dw)
2455 			*cs |= (num_dw - 2);
2456 
2457 		cs += num_dw;
2458 	}
2459 
2460 	return cs;
2461 }
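
/*
 * Encoding illustration for the emission loop above: a table entry such as
 * { CMD_3DSTATE_VS, 9 } becomes the instruction header OR'd with (9 - 2) in
 * its length field, followed by 8 payload dwords that are deliberately left
 * untouched here; per the workaround comment, the hardware populates them on
 * context save once the headers exist.
 */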
2462 
2463 struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
2464 {
2465 	struct xe_lrc_snapshot *snapshot = kmalloc_obj(*snapshot, GFP_NOWAIT);
2466 
2467 	if (!snapshot)
2468 		return NULL;
2469 
2470 	snapshot->context_desc = xe_lrc_ggtt_addr(lrc);
2471 	snapshot->ring_addr = __xe_lrc_ring_ggtt_addr(lrc);
2472 	snapshot->indirect_context_desc = xe_lrc_indirect_ring_ggtt_addr(lrc);
2473 	snapshot->head = xe_lrc_ring_head(lrc);
2474 	snapshot->tail.internal = lrc->ring.tail;
2475 	snapshot->tail.memory = xe_lrc_ring_tail(lrc);
2476 	snapshot->start = xe_lrc_ring_start(lrc);
2477 	snapshot->start_seqno = xe_lrc_start_seqno(lrc);
2478 	snapshot->seqno = xe_lrc_seqno(lrc);
2479 	snapshot->lrc_bo = xe_bo_get(lrc->bo);
2480 	snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
2481 	snapshot->lrc_size = lrc->size;
2482 	snapshot->replay_offset = 0;
2483 	snapshot->replay_size = lrc->replay_size;
2484 	snapshot->lrc_snapshot = NULL;
2485 	snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc));
2486 	snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc);
2487 	return snapshot;
2488 }
2489 
2490 void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot)
2491 {
2492 	struct xe_bo *bo;
2493 	struct iosys_map src;
2494 
2495 	if (!snapshot)
2496 		return;
2497 
2498 	bo = snapshot->lrc_bo;
2499 	snapshot->lrc_bo = NULL;
2500 
2501 	snapshot->lrc_snapshot = kvmalloc(snapshot->lrc_size, GFP_KERNEL);
2502 	if (!snapshot->lrc_snapshot)
2503 		goto put_bo;
2504 
2505 	xe_bo_lock(bo, false);
2506 	if (!ttm_bo_vmap(&bo->ttm, &src)) {
2507 		xe_map_memcpy_from(xe_bo_device(bo),
2508 				   snapshot->lrc_snapshot, &src, snapshot->lrc_offset,
2509 				   snapshot->lrc_size);
2510 		ttm_bo_vunmap(&bo->ttm, &src);
2511 	} else {
2512 		kvfree(snapshot->lrc_snapshot);
2513 		snapshot->lrc_snapshot = NULL;
2514 	}
2515 	xe_bo_unlock(bo);
2516 put_bo:
2517 	xe_bo_put(bo);
2518 }
2519 
2520 void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer *p)
2521 {
2522 	unsigned long i;
2523 
2524 	if (!snapshot)
2525 		return;
2526 
2527 	drm_printf(p, "\tHW Context Desc: 0x%08x\n", snapshot->context_desc);
2528 	drm_printf(p, "\tHW Ring address: 0x%08x\n",
2529 		   snapshot->ring_addr);
2530 	drm_printf(p, "\tHW Indirect Ring State: 0x%08x\n",
2531 		   snapshot->indirect_context_desc);
2532 	drm_printf(p, "\tLRC Head: (memory) %u\n", snapshot->head);
2533 	drm_printf(p, "\tLRC Tail: (internal) %u, (memory) %u\n",
2534 		   snapshot->tail.internal, snapshot->tail.memory);
2535 	drm_printf(p, "\tRing start: (memory) 0x%08x\n", snapshot->start);
2536 	drm_printf(p, "\tStart seqno: (memory) %d\n", snapshot->start_seqno);
2537 	drm_printf(p, "\tSeqno: (memory) %d\n", snapshot->seqno);
2538 	drm_printf(p, "\tTimestamp: 0x%08x\n", snapshot->ctx_timestamp);
2539 	drm_printf(p, "\tJob Timestamp: 0x%08x\n", snapshot->ctx_job_timestamp);
2540 
2541 	if (!snapshot->lrc_snapshot)
2542 		return;
2543 
2544 	drm_printf(p, "\t[HWSP].length: 0x%x\n", LRC_PPHWSP_SIZE);
2545 	drm_puts(p, "\t[HWSP].data: ");
2546 	for (i = 0; i < LRC_PPHWSP_SIZE; i += sizeof(u32)) {
2547 		u32 *val = snapshot->lrc_snapshot + i;
2548 		char dumped[ASCII85_BUFSZ];
2549 
2550 		drm_puts(p, ascii85_encode(*val, dumped));
2551 	}
2552 
2553 	drm_printf(p, "\n\t[HWCTX].length: 0x%lx\n", snapshot->lrc_size - LRC_PPHWSP_SIZE);
2554 	drm_printf(p, "\n\t[HWCTX].replay_offset: 0x%lx\n", snapshot->replay_offset);
2555 	drm_printf(p, "\n\t[HWCTX].replay_length: 0x%lx\n", snapshot->replay_size);
2556 
2557 	drm_puts(p, "\t[HWCTX].data: ");
2558 	for (; i < snapshot->lrc_size; i += sizeof(u32)) {
2559 		u32 *val = snapshot->lrc_snapshot + i;
2560 		char dumped[ASCII85_BUFSZ];
2561 
2562 		drm_puts(p, ascii85_encode(*val, dumped));
2563 	}
2564 	drm_puts(p, "\n");
2565 }
2566 
2567 void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
2568 {
2569 	if (!snapshot)
2570 		return;
2571 
2572 	kvfree(snapshot->lrc_snapshot);
2573 	if (snapshot->lrc_bo)
2574 		xe_bo_put(snapshot->lrc_bo);
2575 
2576 	kfree(snapshot);
2577 }
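
/*
 * Snapshot lifecycle implied by the four functions above: the initial capture
 * is atomic-safe (GFP_NOWAIT, no BO access beyond taking a reference), while
 * the heavyweight copy is deferred to process context:
 *
 *	snapshot = xe_lrc_snapshot_capture(lrc);
 *	...
 *	xe_lrc_snapshot_capture_delayed(snapshot);	// may sleep: vmap + copy
 *	xe_lrc_snapshot_print(snapshot, p);
 *	xe_lrc_snapshot_free(snapshot);
 */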
2578 
2579 static int get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id, u64 *reg_ctx_ts)
2580 {
2581 	u16 class = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id);
2582 	u16 instance = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id);
2583 	struct xe_hw_engine *hwe;
2584 	u64 val;
2585 
2586 	hwe = xe_gt_hw_engine(lrc->gt, class, instance, false);
2587 	if (xe_gt_WARN_ONCE(lrc->gt, !hwe || xe_hw_engine_is_reserved(hwe),
2588 			    "Unexpected engine class:instance %d:%d for context utilization\n",
2589 			    class, instance))
2590 		return -1;
2591 
2592 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
2593 		val = xe_mmio_read64_2x32(&hwe->gt->mmio,
2594 					  RING_CTX_TIMESTAMP(hwe->mmio_base));
2595 	else
2596 		val = xe_mmio_read32(&hwe->gt->mmio,
2597 				     RING_CTX_TIMESTAMP(hwe->mmio_base));
2598 
2599 	*reg_ctx_ts = val;
2600 
2601 	return 0;
2602 }
2603 
2604 /**
2605  * xe_lrc_timestamp() - Current ctx timestamp
2606  * @lrc: Pointer to the lrc.
2607  *
2608  * Return latest ctx timestamp. With support for active contexts, the
2609  * calculation may be slightly racy, so follow a read-again logic to ensure that
2610  * the context is still active before returning the right timestamp.
2611  *
2612  * Returns: New ctx timestamp value
2613  */
2614 u64 xe_lrc_timestamp(struct xe_lrc *lrc)
2615 {
2616 	u64 lrc_ts, reg_ts, new_ts = lrc->ctx_timestamp;
2617 	u32 engine_id;
2618 
2619 	lrc_ts = xe_lrc_ctx_timestamp(lrc);
2620 	/* CTX_TIMESTAMP mmio read is invalid on VF, so return the LRC value */
2621 	if (IS_SRIOV_VF(lrc_to_xe(lrc))) {
2622 		new_ts = lrc_ts;
2623 		goto done;
2624 	}
2625 
2626 	if (lrc_ts == CONTEXT_ACTIVE) {
2627 		engine_id = xe_lrc_engine_id(lrc);
2628 		if (!get_ctx_timestamp(lrc, engine_id, &reg_ts))
2629 			new_ts = reg_ts;
2630 
2631 		/* read lrc again to ensure context is still active */
2632 		lrc_ts = xe_lrc_ctx_timestamp(lrc);
2633 	}
2634 
2635 	/*
2636 	 * If context switched out, just use the lrc_ts. Note that this needs to
2637 	 * be a separate if condition.
2638 	 */
2639 	if (lrc_ts != CONTEXT_ACTIVE)
2640 		new_ts = lrc_ts;
2641 
2642 done:
2643 	return new_ts;
2644 }
2645 
2646 /**
2647  * xe_lrc_update_timestamp() - Update ctx timestamp
2648  * @lrc: Pointer to the lrc.
2649  * @old_ts: Old timestamp value
2650  *
2651  * Populate @old_ts with the current saved ctx timestamp, read the new ctx
2652  * timestamp and update the saved value.
2653  *
2654  * Returns: New ctx timestamp value
2655  */
2656 u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts)
2657 {
2658 	*old_ts = lrc->ctx_timestamp;
2659 	lrc->ctx_timestamp = xe_lrc_timestamp(lrc);
2660 
2661 	trace_xe_lrc_update_timestamp(lrc, *old_ts);
2662 
2663 	return lrc->ctx_timestamp;
2664 }
2665 
2666 /**
2667  * xe_lrc_ring_is_idle() - LRC is idle
2668  * @lrc: Pointer to the lrc.
2669  *
2670  * Compare LRC ring head and tail to determine if idle.
2671  *
2672  * Return: True if the ring is idle, False otherwise
2673  */
2674 bool xe_lrc_ring_is_idle(struct xe_lrc *lrc)
2675 {
2676 	return xe_lrc_ring_head(lrc) == xe_lrc_ring_tail(lrc);
2677 }
2678