xref: /linux/drivers/gpu/drm/xe/xe_lrc.c (revision d6112dddbf354d21ff2fcd49338df68782492c73)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5 
6 #include "xe_lrc.h"
7 
8 #include <generated/xe_wa_oob.h>
9 
10 #include <linux/ascii85.h>
11 #include <linux/panic.h>
12 
13 #include "instructions/xe_mi_commands.h"
14 #include "instructions/xe_gfxpipe_commands.h"
15 #include "instructions/xe_gfx_state_commands.h"
16 #include "regs/xe_engine_regs.h"
17 #include "regs/xe_lrc_layout.h"
18 #include "xe_bb.h"
19 #include "xe_bo.h"
20 #include "xe_configfs.h"
21 #include "xe_device.h"
22 #include "xe_drm_client.h"
23 #include "xe_exec_queue_types.h"
24 #include "xe_gt.h"
25 #include "xe_gt_printk.h"
26 #include "xe_hw_fence.h"
27 #include "xe_map.h"
28 #include "xe_memirq.h"
29 #include "xe_mmio.h"
30 #include "xe_sriov.h"
31 #include "xe_trace_lrc.h"
32 #include "xe_vm.h"
33 #include "xe_wa.h"
34 
35 #define LRC_VALID				BIT_ULL(0)
36 #define LRC_PRIVILEGE				BIT_ULL(8)
37 #define LRC_ADDRESSING_MODE			GENMASK_ULL(4, 3)
38 #define LRC_LEGACY_64B_CONTEXT			3
39 
40 #define LRC_ENGINE_CLASS			GENMASK_ULL(63, 61)
41 #define LRC_ENGINE_INSTANCE			GENMASK_ULL(53, 48)
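/*
 * A minimal sketch of how these descriptor fields are consumed (see
 * xe_lrc_init() and xe_lrc_descriptor() below): lrc->desc is built from
 * LRC_VALID, LRC_ADDRESSING_MODE, optionally LRC_PRIVILEGE and, on graphics
 * versions below 12.50, the engine class/instance fields; the value handed
 * to the hardware is that mask ORed with the LRC's PPHWSP GGTT address.
 */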
42 
43 #define LRC_PPHWSP_SIZE				SZ_4K
44 #define LRC_INDIRECT_CTX_BO_SIZE		SZ_4K
45 #define LRC_INDIRECT_RING_STATE_SIZE		SZ_4K
46 
47 /*
48  * Layout of the LRC and associated data allocated as
49  * lrc->bo:
50  *
51  *   Region                       Size
52  *  +============================+=================================+ <- __xe_lrc_ring_offset()
53  *  | Ring                       | ring_size, see                  |
54  *  |                            | xe_lrc_init()                   |
55  *  +============================+=================================+ <- __xe_lrc_pphwsp_offset()
56  *  | PPHWSP (includes SW state) | 4K                              |
57  *  +----------------------------+---------------------------------+ <- __xe_lrc_regs_offset()
58  *  | Engine Context Image       | n * 4K, see                     |
59  *  |                            | xe_gt_lrc_size()                |
60  *  +----------------------------+---------------------------------+ <- __xe_lrc_indirect_ring_offset()
61  *  | Indirect Ring State Page   | 0 or 4K, see                    |
62  *  |                            | XE_LRC_FLAG_INDIRECT_RING_STATE |
63  *  +============================+=================================+ <- __xe_lrc_indirect_ctx_offset()
64  *  | Indirect Context Page      | 0 or 4K, see                    |
65  *  |                            | XE_LRC_FLAG_INDIRECT_CTX        |
66  *  +============================+=================================+ <- __xe_lrc_wa_bb_offset()
67  *  | WA BB Per Ctx              | 4K                              |
68  *  +============================+=================================+ <- xe_bo_size(lrc->bo)
69  */
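/*
 * Worked example of the layout above (hypothetical configuration, assuming
 * LRC_WA_BB_SIZE is the single 4K page drawn above): a 16K ring, a 3-page
 * engine context image and both optional pages present give region starts
 * of 0 (ring), 16K (PPHWSP), 20K (context image), 32K (indirect ring
 * state), 36K (indirect context) and 40K (WA BB), for an
 * xe_bo_size(lrc->bo) of 44K.
 */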
70 
71 static struct xe_device *
72 lrc_to_xe(struct xe_lrc *lrc)
73 {
74 	return gt_to_xe(lrc->fence_ctx.gt);
75 }
76 
77 static bool
78 gt_engine_needs_indirect_ctx(struct xe_gt *gt, enum xe_engine_class class)
79 {
80 	struct xe_device *xe = gt_to_xe(gt);
81 
82 	if (XE_GT_WA(gt, 16010904313) &&
83 	    (class == XE_ENGINE_CLASS_RENDER ||
84 	     class == XE_ENGINE_CLASS_COMPUTE))
85 		return true;
86 
87 	if (xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
88 					       class, NULL))
89 		return true;
90 
91 	return false;
92 }
93 
94 size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
95 {
96 	struct xe_device *xe = gt_to_xe(gt);
97 	size_t size;
98 
99 	/* Per-process HW status page (PPHWSP) */
100 	size = LRC_PPHWSP_SIZE;
101 
102 	/* Engine context image */
103 	switch (class) {
104 	case XE_ENGINE_CLASS_RENDER:
105 		if (GRAPHICS_VER(xe) >= 20)
106 			size += 3 * SZ_4K;
107 		else
108 			size += 13 * SZ_4K;
109 		break;
110 	case XE_ENGINE_CLASS_COMPUTE:
111 		if (GRAPHICS_VER(xe) >= 20)
112 			size += 2 * SZ_4K;
113 		else
114 			size += 13 * SZ_4K;
115 		break;
116 	default:
117 		WARN(1, "Unknown engine class: %d", class);
118 		fallthrough;
119 	case XE_ENGINE_CLASS_COPY:
120 	case XE_ENGINE_CLASS_VIDEO_DECODE:
121 	case XE_ENGINE_CLASS_VIDEO_ENHANCE:
122 	case XE_ENGINE_CLASS_OTHER:
123 		size += 1 * SZ_4K;
124 	}
125 
126 	/* Add indirect ring state page */
127 	if (xe_gt_has_indirect_ring_state(gt))
128 		size += LRC_INDIRECT_RING_STATE_SIZE;
129 
130 	return size;
131 }
132 
133 /*
134  * The per-platform tables are u8-encoded in @data. Decode @data and set the
135  * address offsets and commands in @regs. The following encoding is used
136  * for each byte. There are 2 steps: decoding commands and decoding addresses.
137  *
138  * Commands:
139  * [7]: create NOPs - the number of NOPs is set in the lower bits
140  * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set
141  *      MI_LRI_FORCE_POSTED
142  * [5:0]: Number of NOPs or registers to set values to in case of
143  *        MI_LOAD_REGISTER_IMM
144  *
145  * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
146  * number of registers. They are set by using the REG/REG16 macros: the former
147  * is used for offsets smaller than 0x200 while the latter is for values bigger
148  * than that. Those macros already set all the bits documented below correctly:
149  *
150  * [7]: When a register offset needs more than 6 bits, additional bytes
151  *      follow for the lower bits
152  * [6:0]: Register offset, without considering the engine base.
153  *
154  * This function only tweaks the commands and register offsets. Values are not
155  * filled out.
156  */
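/*
 * Worked example (hypothetical mini-table, using the macros defined below,
 * not one of the per-platform tables): the byte sequence
 * LRI(2, POSTED), REG(0x034), REG16(0x2b4), 0 decodes to
 *
 *	regs[0] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(2) |
 *		  MI_LRI_FORCE_POSTED | MI_LRI_LRM_CS_MMIO;
 *	regs[1] = hwe->mmio_base + 0x034;	(value slot regs[2] untouched)
 *	regs[3] = hwe->mmio_base + 0x2b4;	(value slot regs[4] untouched)
 *
 * REG16(0x2b4) takes two bytes, 0x81 then 0x2d, because 0x2b4 >> 2 does not
 * fit in the 7-bit [6:0] field of a single byte. The trailing 0 terminates
 * the table and an MI_BATCH_BUFFER_END marker is written after it.
 */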
157 static void set_offsets(u32 *regs,
158 			const u8 *data,
159 			const struct xe_hw_engine *hwe)
160 #define NOP(x) (BIT(7) | (x))
161 #define LRI(count, flags) ((flags) << 6 | (count) | \
162 			   BUILD_BUG_ON_ZERO(count >= BIT(6)))
163 #define POSTED BIT(0)
164 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
165 #define REG16(x) \
166 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
167 	(((x) >> 2) & 0x7f)
168 {
169 	const u32 base = hwe->mmio_base;
170 
171 	while (*data) {
172 		u8 count, flags;
173 
174 		if (*data & BIT(7)) { /* skip */
175 			count = *data++ & ~BIT(7);
176 			regs += count;
177 			continue;
178 		}
179 
180 		count = *data & 0x3f;
181 		flags = *data >> 6;
182 		data++;
183 
184 		*regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
185 		if (flags & POSTED)
186 			*regs |= MI_LRI_FORCE_POSTED;
187 		*regs |= MI_LRI_LRM_CS_MMIO;
188 		regs++;
189 
190 		xe_gt_assert(hwe->gt, count);
191 		do {
192 			u32 offset = 0;
193 			u8 v;
194 
195 			do {
196 				v = *data++;
197 				offset <<= 7;
198 				offset |= v & ~BIT(7);
199 			} while (v & BIT(7));
200 
201 			regs[0] = base + (offset << 2);
202 			regs += 2;
203 		} while (--count);
204 	}
205 
206 	*regs = MI_BATCH_BUFFER_END | BIT(0);
207 }
208 
209 static const u8 gen12_xcs_offsets[] = {
210 	NOP(1),
211 	LRI(13, POSTED),
212 	REG16(0x244),
213 	REG(0x034),
214 	REG(0x030),
215 	REG(0x038),
216 	REG(0x03c),
217 	REG(0x168),
218 	REG(0x140),
219 	REG(0x110),
220 	REG(0x1c0),
221 	REG(0x1c4),
222 	REG(0x1c8),
223 	REG(0x180),
224 	REG16(0x2b4),
225 
226 	NOP(5),
227 	LRI(9, POSTED),
228 	REG16(0x3a8),
229 	REG16(0x28c),
230 	REG16(0x288),
231 	REG16(0x284),
232 	REG16(0x280),
233 	REG16(0x27c),
234 	REG16(0x278),
235 	REG16(0x274),
236 	REG16(0x270),
237 
238 	0
239 };
240 
241 static const u8 dg2_xcs_offsets[] = {
242 	NOP(1),
243 	LRI(15, POSTED),
244 	REG16(0x244),
245 	REG(0x034),
246 	REG(0x030),
247 	REG(0x038),
248 	REG(0x03c),
249 	REG(0x168),
250 	REG(0x140),
251 	REG(0x110),
252 	REG(0x1c0),
253 	REG(0x1c4),
254 	REG(0x1c8),
255 	REG(0x180),
256 	REG16(0x2b4),
257 	REG(0x120),
258 	REG(0x124),
259 
260 	NOP(1),
261 	LRI(9, POSTED),
262 	REG16(0x3a8),
263 	REG16(0x28c),
264 	REG16(0x288),
265 	REG16(0x284),
266 	REG16(0x280),
267 	REG16(0x27c),
268 	REG16(0x278),
269 	REG16(0x274),
270 	REG16(0x270),
271 
272 	0
273 };
274 
275 static const u8 gen12_rcs_offsets[] = {
276 	NOP(1),
277 	LRI(13, POSTED),
278 	REG16(0x244),
279 	REG(0x034),
280 	REG(0x030),
281 	REG(0x038),
282 	REG(0x03c),
283 	REG(0x168),
284 	REG(0x140),
285 	REG(0x110),
286 	REG(0x1c0),
287 	REG(0x1c4),
288 	REG(0x1c8),
289 	REG(0x180),
290 	REG16(0x2b4),
291 
292 	NOP(5),
293 	LRI(9, POSTED),
294 	REG16(0x3a8),
295 	REG16(0x28c),
296 	REG16(0x288),
297 	REG16(0x284),
298 	REG16(0x280),
299 	REG16(0x27c),
300 	REG16(0x278),
301 	REG16(0x274),
302 	REG16(0x270),
303 
304 	LRI(3, POSTED),
305 	REG(0x1b0),
306 	REG16(0x5a8),
307 	REG16(0x5ac),
308 
309 	NOP(6),
310 	LRI(1, 0),
311 	REG(0x0c8),
312 	NOP(3 + 9 + 1),
313 
314 	LRI(51, POSTED),
315 	REG16(0x588),
316 	REG16(0x588),
317 	REG16(0x588),
318 	REG16(0x588),
319 	REG16(0x588),
320 	REG16(0x588),
321 	REG(0x028),
322 	REG(0x09c),
323 	REG(0x0c0),
324 	REG(0x178),
325 	REG(0x17c),
326 	REG16(0x358),
327 	REG(0x170),
328 	REG(0x150),
329 	REG(0x154),
330 	REG(0x158),
331 	REG16(0x41c),
332 	REG16(0x600),
333 	REG16(0x604),
334 	REG16(0x608),
335 	REG16(0x60c),
336 	REG16(0x610),
337 	REG16(0x614),
338 	REG16(0x618),
339 	REG16(0x61c),
340 	REG16(0x620),
341 	REG16(0x624),
342 	REG16(0x628),
343 	REG16(0x62c),
344 	REG16(0x630),
345 	REG16(0x634),
346 	REG16(0x638),
347 	REG16(0x63c),
348 	REG16(0x640),
349 	REG16(0x644),
350 	REG16(0x648),
351 	REG16(0x64c),
352 	REG16(0x650),
353 	REG16(0x654),
354 	REG16(0x658),
355 	REG16(0x65c),
356 	REG16(0x660),
357 	REG16(0x664),
358 	REG16(0x668),
359 	REG16(0x66c),
360 	REG16(0x670),
361 	REG16(0x674),
362 	REG16(0x678),
363 	REG16(0x67c),
364 	REG(0x068),
365 	REG(0x084),
366 	NOP(1),
367 
368 	0
369 };
370 
371 static const u8 xehp_rcs_offsets[] = {
372 	NOP(1),
373 	LRI(13, POSTED),
374 	REG16(0x244),
375 	REG(0x034),
376 	REG(0x030),
377 	REG(0x038),
378 	REG(0x03c),
379 	REG(0x168),
380 	REG(0x140),
381 	REG(0x110),
382 	REG(0x1c0),
383 	REG(0x1c4),
384 	REG(0x1c8),
385 	REG(0x180),
386 	REG16(0x2b4),
387 
388 	NOP(5),
389 	LRI(9, POSTED),
390 	REG16(0x3a8),
391 	REG16(0x28c),
392 	REG16(0x288),
393 	REG16(0x284),
394 	REG16(0x280),
395 	REG16(0x27c),
396 	REG16(0x278),
397 	REG16(0x274),
398 	REG16(0x270),
399 
400 	LRI(3, POSTED),
401 	REG(0x1b0),
402 	REG16(0x5a8),
403 	REG16(0x5ac),
404 
405 	NOP(6),
406 	LRI(1, 0),
407 	REG(0x0c8),
408 
409 	0
410 };
411 
412 static const u8 dg2_rcs_offsets[] = {
413 	NOP(1),
414 	LRI(15, POSTED),
415 	REG16(0x244),
416 	REG(0x034),
417 	REG(0x030),
418 	REG(0x038),
419 	REG(0x03c),
420 	REG(0x168),
421 	REG(0x140),
422 	REG(0x110),
423 	REG(0x1c0),
424 	REG(0x1c4),
425 	REG(0x1c8),
426 	REG(0x180),
427 	REG16(0x2b4),
428 	REG(0x120),
429 	REG(0x124),
430 
431 	NOP(1),
432 	LRI(9, POSTED),
433 	REG16(0x3a8),
434 	REG16(0x28c),
435 	REG16(0x288),
436 	REG16(0x284),
437 	REG16(0x280),
438 	REG16(0x27c),
439 	REG16(0x278),
440 	REG16(0x274),
441 	REG16(0x270),
442 
443 	LRI(3, POSTED),
444 	REG(0x1b0),
445 	REG16(0x5a8),
446 	REG16(0x5ac),
447 
448 	NOP(6),
449 	LRI(1, 0),
450 	REG(0x0c8),
451 
452 	0
453 };
454 
455 static const u8 mtl_rcs_offsets[] = {
456 	NOP(1),
457 	LRI(15, POSTED),
458 	REG16(0x244),
459 	REG(0x034),
460 	REG(0x030),
461 	REG(0x038),
462 	REG(0x03c),
463 	REG(0x168),
464 	REG(0x140),
465 	REG(0x110),
466 	REG(0x1c0),
467 	REG(0x1c4),
468 	REG(0x1c8),
469 	REG(0x180),
470 	REG16(0x2b4),
471 	REG(0x120),
472 	REG(0x124),
473 
474 	NOP(1),
475 	LRI(9, POSTED),
476 	REG16(0x3a8),
477 	REG16(0x28c),
478 	REG16(0x288),
479 	REG16(0x284),
480 	REG16(0x280),
481 	REG16(0x27c),
482 	REG16(0x278),
483 	REG16(0x274),
484 	REG16(0x270),
485 
486 	NOP(2),
487 	LRI(2, POSTED),
488 	REG16(0x5a8),
489 	REG16(0x5ac),
490 
491 	NOP(6),
492 	LRI(1, 0),
493 	REG(0x0c8),
494 
495 	0
496 };
497 
498 #define XE2_CTX_COMMON \
499 	NOP(1),                 /* [0x00] */ \
500 	LRI(15, POSTED),        /* [0x01] */ \
501 	REG16(0x244),           /* [0x02] CTXT_SR_CTL */ \
502 	REG(0x034),             /* [0x04] RING_BUFFER_HEAD */ \
503 	REG(0x030),             /* [0x06] RING_BUFFER_TAIL */ \
504 	REG(0x038),             /* [0x08] RING_BUFFER_START */ \
505 	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */ \
506 	REG(0x168),             /* [0x0c] BB_ADDR_UDW */ \
507 	REG(0x140),             /* [0x0e] BB_ADDR */ \
508 	REG(0x110),             /* [0x10] BB_STATE */ \
509 	REG(0x1c0),             /* [0x12] BB_PER_CTX_PTR */ \
510 	REG(0x1c4),             /* [0x14] RCS_INDIRECT_CTX */ \
511 	REG(0x1c8),             /* [0x16] RCS_INDIRECT_CTX_OFFSET */ \
512 	REG(0x180),             /* [0x18] CCID */ \
513 	REG16(0x2b4),           /* [0x1a] SEMAPHORE_TOKEN */ \
514 	REG(0x120),             /* [0x1c] PRT_BB_STATE */ \
515 	REG(0x124),             /* [0x1e] PRT_BB_STATE_UDW */ \
516 	\
517 	NOP(1),                 /* [0x20] */ \
518 	LRI(9, POSTED),         /* [0x21] */ \
519 	REG16(0x3a8),           /* [0x22] CTX_TIMESTAMP */ \
520 	REG16(0x3ac),           /* [0x24] CTX_TIMESTAMP_UDW */ \
521 	REG(0x108),             /* [0x26] INDIRECT_RING_STATE */ \
522 	REG16(0x284),           /* [0x28] dummy reg */ \
523 	REG16(0x280),           /* [0x2a] CS_ACC_CTR_THOLD */ \
524 	REG16(0x27c),           /* [0x2c] CS_CTX_SYS_PASID */ \
525 	REG16(0x278),           /* [0x2e] CS_CTX_ASID */ \
526 	REG16(0x274),           /* [0x30] PTBP_UDW */ \
527 	REG16(0x270)            /* [0x32] PTBP_LDW */
528 
529 static const u8 xe2_rcs_offsets[] = {
530 	XE2_CTX_COMMON,
531 
532 	NOP(2),                 /* [0x34] */
533 	LRI(2, POSTED),         /* [0x36] */
534 	REG16(0x5a8),           /* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
535 	REG16(0x5ac),           /* [0x39] PREEMPTION_STATUS */
536 
537 	NOP(6),                 /* [0x41] */
538 	LRI(1, 0),              /* [0x47] */
539 	REG(0x0c8),             /* [0x48] R_PWR_CLK_STATE */
540 
541 	0
542 };
543 
544 static const u8 xe2_bcs_offsets[] = {
545 	XE2_CTX_COMMON,
546 
547 	NOP(4 + 8 + 1),         /* [0x34] */
548 	LRI(2, POSTED),         /* [0x41] */
549 	REG16(0x200),           /* [0x42] BCS_SWCTRL */
550 	REG16(0x204),           /* [0x44] BLIT_CCTL */
551 
552 	0
553 };
554 
555 static const u8 xe2_xcs_offsets[] = {
556 	XE2_CTX_COMMON,
557 
558 	0
559 };
560 
561 static const u8 xe2_indirect_ring_state_offsets[] = {
562 	NOP(1),                 /* [0x00] */
563 	LRI(5, POSTED),         /* [0x01] */
564 	REG(0x034),             /* [0x02] RING_BUFFER_HEAD */
565 	REG(0x030),             /* [0x04] RING_BUFFER_TAIL */
566 	REG(0x038),             /* [0x06] RING_BUFFER_START */
567 	REG(0x048),             /* [0x08] RING_BUFFER_START_UDW */
568 	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */
569 
570 	NOP(5),                 /* [0x0c] */
571 	LRI(9, POSTED),         /* [0x11] */
572 	REG(0x168),             /* [0x12] BB_ADDR_UDW */
573 	REG(0x140),             /* [0x14] BB_ADDR */
574 	REG(0x110),             /* [0x16] BB_STATE */
575 	REG16(0x588),           /* [0x18] BB_STACK_WRITE_PORT */
576 	REG16(0x588),           /* [0x20] BB_STACK_WRITE_PORT */
577 	REG16(0x588),           /* [0x22] BB_STACK_WRITE_PORT */
578 	REG16(0x588),           /* [0x24] BB_STACK_WRITE_PORT */
579 	REG16(0x588),           /* [0x26] BB_STACK_WRITE_PORT */
580 	REG16(0x588),           /* [0x28] BB_STACK_WRITE_PORT */
581 
582 	NOP(12),                 /* [0x00] */
583 
584 	0
585 };
586 
587 #undef REG16
588 #undef REG
589 #undef LRI
590 #undef NOP
591 
592 static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
593 {
594 	if (class == XE_ENGINE_CLASS_RENDER) {
595 		if (GRAPHICS_VER(xe) >= 20)
596 			return xe2_rcs_offsets;
597 		else if (GRAPHICS_VERx100(xe) >= 1270)
598 			return mtl_rcs_offsets;
599 		else if (GRAPHICS_VERx100(xe) >= 1255)
600 			return dg2_rcs_offsets;
601 		else if (GRAPHICS_VERx100(xe) >= 1250)
602 			return xehp_rcs_offsets;
603 		else
604 			return gen12_rcs_offsets;
605 	} else if (class == XE_ENGINE_CLASS_COPY) {
606 		if (GRAPHICS_VER(xe) >= 20)
607 			return xe2_bcs_offsets;
608 		else
609 			return gen12_xcs_offsets;
610 	} else {
611 		if (GRAPHICS_VER(xe) >= 20)
612 			return xe2_xcs_offsets;
613 		else if (GRAPHICS_VERx100(xe) >= 1255)
614 			return dg2_xcs_offsets;
615 		else
616 			return gen12_xcs_offsets;
617 	}
618 }
619 
620 static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
621 {
622 	regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
623 						       CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
624 
625 	if (xe_gt_has_indirect_ring_state(hwe->gt))
626 		regs[CTX_CONTEXT_CONTROL] |=
627 			_MASKED_BIT_ENABLE(CTX_CTRL_INDIRECT_RING_STATE_ENABLE);
628 }
629 
630 static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe)
631 {
632 	struct xe_memirq *memirq = &gt_to_tile(hwe->gt)->memirq;
633 	struct xe_device *xe = gt_to_xe(hwe->gt);
634 	u8 num_regs;
635 
636 	if (!xe_device_uses_memirq(xe))
637 		return;
638 
639 	regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM |
640 					MI_LRI_LRM_CS_MMIO | MI_LRM_USE_GGTT;
641 	regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr;
642 	regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq);
643 
644 	num_regs = xe_device_has_msix(xe) ? 3 : 2;
645 	regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(num_regs) |
646 				       MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED;
647 	regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr;
648 	regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq, hwe);
649 	regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr;
650 	regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq, hwe);
651 
652 	if (xe_device_has_msix(xe)) {
653 		regs[CTX_CS_INT_VEC_REG] = CS_INT_VEC(0).addr;
654 		/* CTX_CS_INT_VEC_DATA will be set in xe_lrc_init */
655 	}
656 }
657 
658 static int lrc_ring_mi_mode(struct xe_hw_engine *hwe)
659 {
660 	struct xe_device *xe = gt_to_xe(hwe->gt);
661 
662 	if (GRAPHICS_VERx100(xe) >= 1250)
663 		return 0x70;
664 	else
665 		return 0x60;
666 }
667 
668 static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe)
669 {
670 	int x;
671 
672 	x = lrc_ring_mi_mode(hwe);
673 	regs[x + 1] &= ~STOP_RING;
674 	regs[x + 1] |= STOP_RING << 16;
675 }
676 
677 static inline bool xe_lrc_has_indirect_ring_state(struct xe_lrc *lrc)
678 {
679 	return lrc->flags & XE_LRC_FLAG_INDIRECT_RING_STATE;
680 }
681 
682 static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
683 {
684 	return 0;
685 }
686 
687 u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
688 {
689 	return lrc->ring.size;
690 }
691 
692 /* Make the magic macros work */
693 #define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset
694 #define __xe_lrc_regs_offset xe_lrc_regs_offset
695 
696 #define LRC_SEQNO_PPHWSP_OFFSET 512
697 #define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
698 #define LRC_CTX_JOB_TIMESTAMP_OFFSET (LRC_START_SEQNO_PPHWSP_OFFSET + 8)
699 #define LRC_ENGINE_ID_PPHWSP_OFFSET 1024
700 #define LRC_PARALLEL_PPHWSP_OFFSET 2048
701 
702 u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
703 {
704 	return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
705 }
706 
707 /**
708  * xe_lrc_reg_size() - Get size of the LRC registers area within queues
709  * @xe: the &xe_device struct instance
710  *
711  * Returns: Size of the LRC registers area for current platform
712  */
713 size_t xe_lrc_reg_size(struct xe_device *xe)
714 {
715 	if (GRAPHICS_VERx100(xe) >= 1250)
716 		return 96 * sizeof(u32);
717 	else
718 		return 80 * sizeof(u32);
719 }
720 
721 size_t xe_lrc_skip_size(struct xe_device *xe)
722 {
723 	return LRC_PPHWSP_SIZE + xe_lrc_reg_size(xe);
724 }
725 
726 static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
727 {
728 	/* The seqno is stored in the driver-defined portion of PPHWSP */
729 	return xe_lrc_pphwsp_offset(lrc) + LRC_SEQNO_PPHWSP_OFFSET;
730 }
731 
732 static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
733 {
734 	/* The start seqno is stored in the driver-defined portion of PPHWSP */
735 	return xe_lrc_pphwsp_offset(lrc) + LRC_START_SEQNO_PPHWSP_OFFSET;
736 }
737 
738 static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc)
739 {
740 	/* This is stored in the driver-defined portion of PPHWSP */
741 	return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET;
742 }
743 
744 static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
745 {
746 	/* The parallel is stored in the driver-defined portion of PPHWSP */
747 	return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
748 }
749 
750 static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc)
751 {
752 	return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET;
753 }
754 
755 static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
756 {
757 	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
758 }
759 
760 static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc)
761 {
762 	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32);
763 }
764 
765 static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
766 {
767 	u32 offset = xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE -
768 		     LRC_INDIRECT_RING_STATE_SIZE;
769 
770 	if (lrc->flags & XE_LRC_FLAG_INDIRECT_CTX)
771 		offset -= LRC_INDIRECT_CTX_BO_SIZE;
772 
773 	return offset;
774 }
775 
776 static inline u32 __xe_lrc_indirect_ctx_offset(struct xe_lrc *lrc)
777 {
778 	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE - LRC_INDIRECT_CTX_BO_SIZE;
779 }
780 
781 static inline u32 __xe_lrc_wa_bb_offset(struct xe_lrc *lrc)
782 {
783 	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE;
784 }
785 
786 #define DECL_MAP_ADDR_HELPERS(elem) \
787 static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \
788 { \
789 	struct iosys_map map = lrc->bo->vmap; \
790 \
791 	xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map));  \
792 	iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \
793 	return map; \
794 } \
795 static inline u32 __maybe_unused __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \
796 { \
797 	return xe_bo_ggtt_addr(lrc->bo) + __xe_lrc_##elem##_offset(lrc); \
798 } \
799 
800 DECL_MAP_ADDR_HELPERS(ring)
801 DECL_MAP_ADDR_HELPERS(pphwsp)
802 DECL_MAP_ADDR_HELPERS(seqno)
803 DECL_MAP_ADDR_HELPERS(regs)
804 DECL_MAP_ADDR_HELPERS(start_seqno)
805 DECL_MAP_ADDR_HELPERS(ctx_job_timestamp)
806 DECL_MAP_ADDR_HELPERS(ctx_timestamp)
807 DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw)
808 DECL_MAP_ADDR_HELPERS(parallel)
809 DECL_MAP_ADDR_HELPERS(indirect_ring)
810 DECL_MAP_ADDR_HELPERS(engine_id)
811 
812 #undef DECL_MAP_ADDR_HELPERS
813 
814 /**
815  * xe_lrc_ctx_timestamp_ggtt_addr() - Get ctx timestamp GGTT address
816  * @lrc: Pointer to the lrc.
817  *
818  * Returns: ctx timestamp GGTT address
819  */
820 u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc)
821 {
822 	return __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
823 }
824 
825 /**
826  * xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address
827  * @lrc: Pointer to the lrc.
828  *
829  * Returns: ctx timestamp udw GGTT address
830  */
831 u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc)
832 {
833 	return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
834 }
835 
836 /**
837  * xe_lrc_ctx_timestamp() - Read ctx timestamp value
838  * @lrc: Pointer to the lrc.
839  *
840  * Returns: ctx timestamp value
841  */
842 u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
843 {
844 	struct xe_device *xe = lrc_to_xe(lrc);
845 	struct iosys_map map;
846 	u32 ldw, udw = 0;
847 
848 	map = __xe_lrc_ctx_timestamp_map(lrc);
849 	ldw = xe_map_read32(xe, &map);
850 
851 	if (xe->info.has_64bit_timestamp) {
852 		map = __xe_lrc_ctx_timestamp_udw_map(lrc);
853 		udw = xe_map_read32(xe, &map);
854 	}
855 
856 	return (u64)udw << 32 | ldw;
857 }
858 
859 /**
860  * xe_lrc_ctx_job_timestamp_ggtt_addr() - Get ctx job timestamp GGTT address
861  * @lrc: Pointer to the lrc.
862  *
863  * Returns: ctx job timestamp GGTT address
864  */
865 u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc)
866 {
867 	return __xe_lrc_ctx_job_timestamp_ggtt_addr(lrc);
868 }
869 
870 /**
871  * xe_lrc_ctx_job_timestamp() - Read ctx job timestamp value
872  * @lrc: Pointer to the lrc.
873  *
874  * Returns: ctx job timestamp value
875  */
876 u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc)
877 {
878 	struct xe_device *xe = lrc_to_xe(lrc);
879 	struct iosys_map map;
880 
881 	map = __xe_lrc_ctx_job_timestamp_map(lrc);
882 	return xe_map_read32(xe, &map);
883 }
884 
885 u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
886 {
887 	return __xe_lrc_pphwsp_ggtt_addr(lrc);
888 }
889 
890 u32 xe_lrc_indirect_ring_ggtt_addr(struct xe_lrc *lrc)
891 {
892 	if (!xe_lrc_has_indirect_ring_state(lrc))
893 		return 0;
894 
895 	return __xe_lrc_indirect_ring_ggtt_addr(lrc);
896 }
897 
898 static u32 xe_lrc_read_indirect_ctx_reg(struct xe_lrc *lrc, int reg_nr)
899 {
900 	struct xe_device *xe = lrc_to_xe(lrc);
901 	struct iosys_map map;
902 
903 	map = __xe_lrc_indirect_ring_map(lrc);
904 	iosys_map_incr(&map, reg_nr * sizeof(u32));
905 	return xe_map_read32(xe, &map);
906 }
907 
908 static void xe_lrc_write_indirect_ctx_reg(struct xe_lrc *lrc,
909 					  int reg_nr, u32 val)
910 {
911 	struct xe_device *xe = lrc_to_xe(lrc);
912 	struct iosys_map map;
913 
914 	map = __xe_lrc_indirect_ring_map(lrc);
915 	iosys_map_incr(&map, reg_nr * sizeof(u32));
916 	xe_map_write32(xe, &map, val);
917 }
918 
919 u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
920 {
921 	struct xe_device *xe = lrc_to_xe(lrc);
922 	struct iosys_map map;
923 
924 	map = __xe_lrc_regs_map(lrc);
925 	iosys_map_incr(&map, reg_nr * sizeof(u32));
926 	return xe_map_read32(xe, &map);
927 }
928 
929 void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
930 {
931 	struct xe_device *xe = lrc_to_xe(lrc);
932 	struct iosys_map map;
933 
934 	map = __xe_lrc_regs_map(lrc);
935 	iosys_map_incr(&map, reg_nr * sizeof(u32));
936 	xe_map_write32(xe, &map, val);
937 }
938 
939 static void *empty_lrc_data(struct xe_hw_engine *hwe)
940 {
941 	struct xe_gt *gt = hwe->gt;
942 	void *data;
943 	u32 *regs;
944 
945 	data = kzalloc(xe_gt_lrc_size(gt, hwe->class), GFP_KERNEL);
946 	if (!data)
947 		return NULL;
948 
949 	/* 1st page: Per-Process HW Status Page (PPHWSP) */
950 	regs = data + LRC_PPHWSP_SIZE;
951 	set_offsets(regs, reg_offsets(gt_to_xe(gt), hwe->class), hwe);
952 	set_context_control(regs, hwe);
953 	set_memory_based_intr(regs, hwe);
954 	reset_stop_ring(regs, hwe);
955 	if (xe_gt_has_indirect_ring_state(gt)) {
956 		regs = data + xe_gt_lrc_size(gt, hwe->class) -
957 		       LRC_INDIRECT_RING_STATE_SIZE;
958 		set_offsets(regs, xe2_indirect_ring_state_offsets, hwe);
959 	}
960 
961 	return data;
962 }
963 
964 /**
965  * xe_default_lrc_update_memirq_regs_with_address - Re-compute GGTT references in default LRC
966  * of given engine.
967  * @hwe: the &xe_hw_engine struct instance
968  */
969 void xe_default_lrc_update_memirq_regs_with_address(struct xe_hw_engine *hwe)
970 {
971 	struct xe_gt *gt = hwe->gt;
972 	u32 *regs;
973 
974 	if (!gt->default_lrc[hwe->class])
975 		return;
976 
977 	regs = gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE;
978 	set_memory_based_intr(regs, hwe);
979 }
980 
981 /**
982  * xe_lrc_update_memirq_regs_with_address - Re-compute GGTT references in mem interrupt data
983  * for given LRC.
984  * @lrc: the &xe_lrc struct instance
985  * @hwe: the &xe_hw_engine struct instance
986  * @regs: scratch buffer to be used as temporary storage
987  */
988 void xe_lrc_update_memirq_regs_with_address(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
989 					    u32 *regs)
990 {
991 	struct xe_gt *gt = hwe->gt;
992 	struct iosys_map map;
993 	size_t regs_len;
994 
995 	if (!xe_device_uses_memirq(gt_to_xe(gt)))
996 		return;
997 
998 	map = __xe_lrc_regs_map(lrc);
999 	regs_len = xe_lrc_reg_size(gt_to_xe(gt));
1000 	xe_map_memcpy_from(gt_to_xe(gt), regs, &map, 0, regs_len);
1001 	set_memory_based_intr(regs, hwe);
1002 	xe_map_memcpy_to(gt_to_xe(gt), &map, 0, regs, regs_len);
1003 }
1004 
1005 static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
1006 {
1007 	u64 desc = xe_vm_pdp4_descriptor(vm, gt_to_tile(lrc->gt));
1008 
1009 	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
1010 	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
1011 }
1012 
1013 static void xe_lrc_finish(struct xe_lrc *lrc)
1014 {
1015 	xe_hw_fence_ctx_finish(&lrc->fence_ctx);
1016 	xe_bo_unpin_map_no_vm(lrc->bo);
1017 }
1018 
1019 /*
1020  * wa_bb_setup_utilization() - Write commands to wa bb to assist
1021  * in calculating active context run ticks.
1022  *
1023  * Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the
1024  * context, but only gets updated when the context switches out. In order to
1025  * check how long a context has been active before it switches out, two things
1026  * are required:
1027  *
1028  * (1) Determine if the context is running:
1029  * To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in
1030  * the LRC. The value chosen is 1 since 0 is the initial value when the LRC is
1031  * initialized. During a query, we just check for this value to determine if the
1032  * context is active. If the context switched out, it would overwrite this
1033  * location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as
1034  * the last part of context restore, so reusing this LRC location will not
1035  * clobber anything.
1036  *
1037  * (2) Calculate the time that the context has been active for:
1038  * The CTX_TIMESTAMP ticks only when the context is active. If a context is
1039  * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization.
1040  * While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific
1041  * engine instance. Since we do not know which instance the context is running
1042  * on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and
1043  * store it in the PPHWSP.
1044  */
1045 #define CONTEXT_ACTIVE 1ULL
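/*
 * Query-side sketch of how a consumer elsewhere in the driver can interpret
 * the values written by this WA BB (hedged illustration, not the literal
 * implementation):
 *
 *	if (xe_lrc_ctx_timestamp(lrc) == CONTEXT_ACTIVE) {
 *		// still running: sample RING_CTX_TIMESTAMP on the engine
 *		// instance recorded at the ENGINE_ID slot in the PPHWSP
 *	} else {
 *		// switched out: the LRC CTX_TIMESTAMP already holds the
 *		// accumulated run ticks from the context save
 *	}
 */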
1046 static ssize_t setup_utilization_wa(struct xe_lrc *lrc,
1047 				    struct xe_hw_engine *hwe,
1048 				    u32 *batch,
1049 				    size_t max_len)
1050 {
1051 	u32 *cmd = batch;
1052 
1053 	if (IS_SRIOV_VF(gt_to_xe(lrc->gt)))
1054 		return 0;
1055 
1056 	if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
1057 		return -ENOSPC;
1058 
1059 	*cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET;
1060 	*cmd++ = ENGINE_ID(0).addr;
1061 	*cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc);
1062 	*cmd++ = 0;
1063 
1064 	*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
1065 	*cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
1066 	*cmd++ = 0;
1067 	*cmd++ = lower_32_bits(CONTEXT_ACTIVE);
1068 
1069 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp) {
1070 		*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
1071 		*cmd++ = __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
1072 		*cmd++ = 0;
1073 		*cmd++ = upper_32_bits(CONTEXT_ACTIVE);
1074 	}
1075 
1076 	return cmd - batch;
1077 }
1078 
1079 static ssize_t setup_timestamp_wa(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
1080 				  u32 *batch, size_t max_len)
1081 {
1082 	const u32 ts_addr = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
1083 	u32 *cmd = batch;
1084 
1085 	if (!XE_GT_WA(lrc->gt, 16010904313) ||
1086 	    !(hwe->class == XE_ENGINE_CLASS_RENDER ||
1087 	      hwe->class == XE_ENGINE_CLASS_COMPUTE ||
1088 	      hwe->class == XE_ENGINE_CLASS_COPY ||
1089 	      hwe->class == XE_ENGINE_CLASS_VIDEO_DECODE ||
1090 	      hwe->class == XE_ENGINE_CLASS_VIDEO_ENHANCE))
1091 		return 0;
1092 
1093 	if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
1094 		return -ENOSPC;
1095 
1096 	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
1097 		 MI_LRM_ASYNC;
1098 	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
1099 	*cmd++ = ts_addr;
1100 	*cmd++ = 0;
1101 
1102 	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
1103 		 MI_LRM_ASYNC;
1104 	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
1105 	*cmd++ = ts_addr;
1106 	*cmd++ = 0;
1107 
1108 	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO;
1109 	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
1110 	*cmd++ = ts_addr;
1111 	*cmd++ = 0;
1112 
1113 	return cmd - batch;
1114 }
1115 
1116 static ssize_t setup_configfs_post_ctx_restore_bb(struct xe_lrc *lrc,
1117 						  struct xe_hw_engine *hwe,
1118 						  u32 *batch, size_t max_len)
1119 {
1120 	struct xe_device *xe = gt_to_xe(lrc->gt);
1121 	const u32 *user_batch;
1122 	u32 *cmd = batch;
1123 	u32 count;
1124 
1125 	count = xe_configfs_get_ctx_restore_post_bb(to_pci_dev(xe->drm.dev),
1126 						    hwe->class, &user_batch);
1127 	if (!count)
1128 		return 0;
1129 
1130 	if (count > max_len)
1131 		return -ENOSPC;
1132 
1133 	/*
1134 	 * This should be used only for tests and validation. Taint the kernel
1135 	 * as anything could be submitted directly in context switches
1136 	 */
1137 	add_taint(TAINT_TEST, LOCKDEP_STILL_OK);
1138 
1139 	memcpy(cmd, user_batch, count * sizeof(u32));
1140 	cmd += count;
1141 
1142 	return cmd - batch;
1143 }
1144 
1145 static ssize_t setup_configfs_mid_ctx_restore_bb(struct xe_lrc *lrc,
1146 						 struct xe_hw_engine *hwe,
1147 						 u32 *batch, size_t max_len)
1148 {
1149 	struct xe_device *xe = gt_to_xe(lrc->gt);
1150 	const u32 *user_batch;
1151 	u32 *cmd = batch;
1152 	u32 count;
1153 
1154 	count = xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
1155 						   hwe->class, &user_batch);
1156 	if (!count)
1157 		return 0;
1158 
1159 	if (count > max_len)
1160 		return -ENOSPC;
1161 
1162 	/*
1163 	 * This should be used only for tests and validation. Taint the kernel
1164 	 * as anything could be submitted directly in context switches
1165 	 */
1166 	add_taint(TAINT_TEST, LOCKDEP_STILL_OK);
1167 
1168 	memcpy(cmd, user_batch, count * sizeof(u32));
1169 	cmd += count;
1170 
1171 	return cmd - batch;
1172 }
1173 
1174 static ssize_t setup_invalidate_state_cache_wa(struct xe_lrc *lrc,
1175 					       struct xe_hw_engine *hwe,
1176 					       u32 *batch, size_t max_len)
1177 {
1178 	u32 *cmd = batch;
1179 
1180 	if (!XE_GT_WA(lrc->gt, 18022495364) ||
1181 	    hwe->class != XE_ENGINE_CLASS_RENDER)
1182 		return 0;
1183 
1184 	if (xe_gt_WARN_ON(lrc->gt, max_len < 3))
1185 		return -ENOSPC;
1186 
1187 	*cmd++ = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1);
1188 	*cmd++ = CS_DEBUG_MODE1(0).addr;
1189 	*cmd++ = _MASKED_BIT_ENABLE(INSTRUCTION_STATE_CACHE_INVALIDATE);
1190 
1191 	return cmd - batch;
1192 }
1193 
1194 struct bo_setup {
1195 	ssize_t (*setup)(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
1196 			 u32 *batch, size_t max_size);
1197 };
1198 
1199 struct bo_setup_state {
1200 	/* Input: */
1201 	struct xe_lrc		*lrc;
1202 	struct xe_hw_engine	*hwe;
1203 	size_t			max_size;
1204 	size_t                  reserve_dw;
1205 	unsigned int		offset;
1206 	const struct bo_setup	*funcs;
1207 	unsigned int		num_funcs;
1208 
1209 	/* State: */
1210 	u32			*buffer;
1211 	u32			*ptr;
1212 	unsigned int		written;
1213 };
1214 
1215 static int setup_bo(struct bo_setup_state *state)
1216 {
1217 	ssize_t remain;
1218 
1219 	if (state->lrc->bo->vmap.is_iomem) {
1220 		xe_gt_assert(state->hwe->gt, state->buffer);
1221 		state->ptr = state->buffer;
1222 	} else {
1223 		state->ptr = state->lrc->bo->vmap.vaddr + state->offset;
1224 	}
1225 
1226 	remain = state->max_size / sizeof(u32);
1227 
1228 	for (size_t i = 0; i < state->num_funcs; i++) {
1229 		ssize_t len = state->funcs[i].setup(state->lrc, state->hwe,
1230 						    state->ptr, remain);
1231 
1232 		remain -= len;
1233 
1234 		/*
1235 		 * Caller has asked for at least reserve_dw to remain unused.
1236 		 */
1237 		if (len < 0 ||
1238 		    xe_gt_WARN_ON(state->lrc->gt, remain < state->reserve_dw))
1239 			goto fail;
1240 
1241 		state->ptr += len;
1242 		state->written += len;
1243 	}
1244 
1245 	return 0;
1246 
1247 fail:
1248 	return -ENOSPC;
1249 }
1250 
1251 static void finish_bo(struct bo_setup_state *state)
1252 {
1253 	if (!state->lrc->bo->vmap.is_iomem)
1254 		return;
1255 
1256 	xe_map_memcpy_to(gt_to_xe(state->lrc->gt), &state->lrc->bo->vmap,
1257 			 state->offset, state->buffer,
1258 			 state->written * sizeof(u32));
1259 }
1260 
1261 /**
1262  * xe_lrc_setup_wa_bb_with_scratch - Execute all wa bb setup callbacks.
1263  * @lrc: the &xe_lrc struct instance
1264  * @hwe: the &xe_hw_engine struct instance
1265  * @scratch: preallocated scratch buffer for temporary storage
1266  * Return: 0 on success, negative error code on failure
1267  */
1268 int xe_lrc_setup_wa_bb_with_scratch(struct xe_lrc *lrc, struct xe_hw_engine *hwe, u32 *scratch)
1269 {
1270 	static const struct bo_setup funcs[] = {
1271 		{ .setup = setup_timestamp_wa },
1272 		{ .setup = setup_invalidate_state_cache_wa },
1273 		{ .setup = setup_utilization_wa },
1274 		{ .setup = setup_configfs_post_ctx_restore_bb },
1275 	};
1276 	struct bo_setup_state state = {
1277 		.lrc = lrc,
1278 		.hwe = hwe,
1279 		.max_size = LRC_WA_BB_SIZE,
1280 		.buffer = scratch,
1281 		.reserve_dw = 1,
1282 		.offset = __xe_lrc_wa_bb_offset(lrc),
1283 		.funcs = funcs,
1284 		.num_funcs = ARRAY_SIZE(funcs),
1285 	};
1286 	int ret;
1287 
1288 	ret = setup_bo(&state);
1289 	if (ret)
1290 		return ret;
1291 
1292 	*state.ptr++ = MI_BATCH_BUFFER_END;
1293 	state.written++;
1294 
1295 	finish_bo(&state);
1296 
1297 	xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
1298 			     xe_bo_ggtt_addr(lrc->bo) + state.offset + 1);
1299 
1300 	return 0;
1301 }
1302 
1303 static int setup_wa_bb(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
1304 {
1305 	u32 *buf = NULL;
1306 	int ret;
1307 
1308 	if (lrc->bo->vmap.is_iomem) {
1309 		buf = kmalloc(LRC_WA_BB_SIZE, GFP_KERNEL);
1310 		if (!buf)
1311 			return -ENOMEM;
1312 	}
1313 
1314 	ret = xe_lrc_setup_wa_bb_with_scratch(lrc, hwe, buf);
1315 
1316 	kfree(buf);
1317 
1318 	return ret;
1319 }
1320 
1321 static int
1322 setup_indirect_ctx(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
1323 {
1324 	static const struct bo_setup rcs_funcs[] = {
1325 		{ .setup = setup_timestamp_wa },
1326 		{ .setup = setup_configfs_mid_ctx_restore_bb },
1327 	};
1328 	static const struct bo_setup xcs_funcs[] = {
1329 		{ .setup = setup_configfs_mid_ctx_restore_bb },
1330 	};
1331 	struct bo_setup_state state = {
1332 		.lrc = lrc,
1333 		.hwe = hwe,
1334 		.max_size = (63 * 64) /* max 63 cachelines */,
1335 		.buffer = NULL,
1336 		.offset = __xe_lrc_indirect_ctx_offset(lrc),
1337 	};
1338 	int ret;
1339 
1340 	if (!(lrc->flags & XE_LRC_FLAG_INDIRECT_CTX))
1341 		return 0;
1342 
1343 	if (hwe->class == XE_ENGINE_CLASS_RENDER ||
1344 	    hwe->class == XE_ENGINE_CLASS_COMPUTE) {
1345 		state.funcs = rcs_funcs;
1346 		state.num_funcs = ARRAY_SIZE(rcs_funcs);
1347 	} else {
1348 		state.funcs = xcs_funcs;
1349 		state.num_funcs = ARRAY_SIZE(xcs_funcs);
1350 	}
1351 
1352 	if (xe_gt_WARN_ON(lrc->gt, !state.funcs))
1353 		return 0;
1354 
1355 	if (lrc->bo->vmap.is_iomem) {
1356 		state.buffer = kmalloc(state.max_size, GFP_KERNEL);
1357 		if (!state.buffer)
1358 			return -ENOMEM;
1359 	}
1360 
1361 	ret = setup_bo(&state);
1362 	if (ret) {
1363 		kfree(state.buffer);
1364 		return ret;
1365 	}
1366 
1367 	/*
1368 	 * Align to 64B cacheline so there's no garbage at the end for CS to
1369 	 * execute: size for indirect ctx must be a multiple of 64.
1370 	 */
1371 	while (state.written & 0xf) {
1372 		*state.ptr++ = MI_NOOP;
1373 		state.written++;
1374 	}
1375 
1376 	finish_bo(&state);
1377 	kfree(state.buffer);
1378 
1379 	/*
1380 	 * Enable INDIRECT_CTX leaving INDIRECT_CTX_OFFSET at its default: it
1381 	 * varies per engine class, but the default is good enough
1382 	 */
1383 	xe_lrc_write_ctx_reg(lrc,
1384 			     CTX_CS_INDIRECT_CTX,
1385 			     (xe_bo_ggtt_addr(lrc->bo) + state.offset) |
1386 			     /* Size in CLs. */
1387 			     (state.written * sizeof(u32) / 64));
1388 
1389 	return 0;
1390 }
1391 
1392 static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
1393 		       struct xe_vm *vm, u32 ring_size, u16 msix_vec,
1394 		       u32 init_flags)
1395 {
1396 	struct xe_gt *gt = hwe->gt;
1397 	const u32 lrc_size = xe_gt_lrc_size(gt, hwe->class);
1398 	u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
1399 	struct xe_tile *tile = gt_to_tile(gt);
1400 	struct xe_device *xe = gt_to_xe(gt);
1401 	struct iosys_map map;
1402 	u32 arb_enable;
1403 	u32 bo_flags;
1404 	int err;
1405 
1406 	kref_init(&lrc->refcount);
1407 	lrc->gt = gt;
1408 	lrc->size = lrc_size;
1409 	lrc->flags = 0;
1410 	lrc->ring.size = ring_size;
1411 	lrc->ring.tail = 0;
1412 
1413 	if (gt_engine_needs_indirect_ctx(gt, hwe->class)) {
1414 		lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
1415 		bo_size += LRC_INDIRECT_CTX_BO_SIZE;
1416 	}
1417 
1418 	if (xe_gt_has_indirect_ring_state(gt))
1419 		lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;
1420 
1421 	bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
1422 		   XE_BO_FLAG_GGTT_INVALIDATE;
1423 
1424 	if ((vm && vm->xef) || init_flags & XE_LRC_CREATE_USER_CTX) /* userspace */
1425 		bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE | XE_BO_FLAG_FORCE_USER_VRAM;
1426 
1427 	lrc->bo = xe_bo_create_pin_map_novm(xe, tile,
1428 					    bo_size,
1429 					    ttm_bo_type_kernel,
1430 					    bo_flags, false);
1431 	if (IS_ERR(lrc->bo))
1432 		return PTR_ERR(lrc->bo);
1433 
1434 	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
1435 			     hwe->fence_irq, hwe->name);
1436 
1437 	/*
1438 	 * Initialize the Per-Process HW Status Page (PPHWSP) and the LRC /
1439 	 * context state to known values. If a primed default_lrc already
1440 	 * exists, just copy it; otherwise this is the early submission used to
1441 	 * record the LRC, so build a new empty one from scratch.
1442 	 */
1443 	map = __xe_lrc_pphwsp_map(lrc);
1444 	if (gt->default_lrc[hwe->class]) {
1445 		xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE);	/* PPHWSP */
1446 		xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
1447 				 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
1448 				 lrc_size - LRC_PPHWSP_SIZE);
1449 	} else {
1450 		void *init_data = empty_lrc_data(hwe);
1451 
1452 		if (!init_data) {
1453 			err = -ENOMEM;
1454 			goto err_lrc_finish;
1455 		}
1456 
1457 		xe_map_memcpy_to(xe, &map, 0, init_data, lrc_size);
1458 		kfree(init_data);
1459 	}
1460 
1461 	if (vm) {
1462 		xe_lrc_set_ppgtt(lrc, vm);
1463 
1464 		if (vm->xef)
1465 			xe_drm_client_add_bo(vm->xef->client, lrc->bo);
1466 	}
1467 
1468 	if (xe_device_has_msix(xe)) {
1469 		xe_lrc_write_ctx_reg(lrc, CTX_INT_STATUS_REPORT_PTR,
1470 				     xe_memirq_status_ptr(&tile->memirq, hwe));
1471 		xe_lrc_write_ctx_reg(lrc, CTX_INT_SRC_REPORT_PTR,
1472 				     xe_memirq_source_ptr(&tile->memirq, hwe));
1473 		xe_lrc_write_ctx_reg(lrc, CTX_CS_INT_VEC_DATA, msix_vec << 16 | msix_vec);
1474 	}
1475 
1476 	if (xe_gt_has_indirect_ring_state(gt)) {
1477 		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
1478 				     __xe_lrc_indirect_ring_ggtt_addr(lrc));
1479 
1480 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
1481 					      __xe_lrc_ring_ggtt_addr(lrc));
1482 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START_UDW, 0);
1483 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, 0);
1484 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, lrc->ring.tail);
1485 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_CTL,
1486 					      RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
1487 	} else {
1488 		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
1489 		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, 0);
1490 		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);
1491 		xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
1492 				     RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
1493 	}
1494 
1495 	if (init_flags & XE_LRC_CREATE_RUNALONE)
1496 		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
1497 				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
1498 				     _MASKED_BIT_ENABLE(CTX_CTRL_RUN_ALONE));
1499 
1500 	if (init_flags & XE_LRC_CREATE_PXP)
1501 		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
1502 				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
1503 				     _MASKED_BIT_ENABLE(CTX_CTRL_PXP_ENABLE));
1504 
1505 	lrc->ctx_timestamp = 0;
1506 	xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0);
1507 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
1508 		xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP_UDW, 0);
1509 
1510 	if (xe->info.has_asid && vm)
1511 		xe_lrc_write_ctx_reg(lrc, CTX_ASID, vm->usm.asid);
1512 
1513 	lrc->desc = LRC_VALID;
1514 	lrc->desc |= FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT);
1515 	/* TODO: Priority */
1516 
1517 	/* While this appears to have something about privileged batches or
1518 	 * some such, it really just means PPGTT mode.
1519 	 */
1520 	if (vm)
1521 		lrc->desc |= LRC_PRIVILEGE;
1522 
1523 	if (GRAPHICS_VERx100(xe) < 1250) {
1524 		lrc->desc |= FIELD_PREP(LRC_ENGINE_INSTANCE, hwe->instance);
1525 		lrc->desc |= FIELD_PREP(LRC_ENGINE_CLASS, hwe->class);
1526 	}
1527 
1528 	arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1529 	xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));
1530 
1531 	map = __xe_lrc_seqno_map(lrc);
1532 	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
1533 
1534 	map = __xe_lrc_start_seqno_map(lrc);
1535 	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
1536 
1537 	err = setup_wa_bb(lrc, hwe);
1538 	if (err)
1539 		goto err_lrc_finish;
1540 
1541 	err = setup_indirect_ctx(lrc, hwe);
1542 	if (err)
1543 		goto err_lrc_finish;
1544 
1545 	return 0;
1546 
1547 err_lrc_finish:
1548 	xe_lrc_finish(lrc);
1549 	return err;
1550 }
1551 
1552 /**
1553  * xe_lrc_create - Create a LRC
1554  * @hwe: Hardware Engine
1555  * @vm: The VM (address space)
1556  * @ring_size: LRC ring size
1557  * @msix_vec: MSI-X interrupt vector (for platforms that support it)
1558  * @flags: LRC initialization flags
1559  *
1560  * Allocate and initialize the Logical Ring Context (LRC).
1561  *
1562  * Return: pointer to the created LRC on success, or an error pointer
1563  * on failure.
1564  */
1565 struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
1566 			     u32 ring_size, u16 msix_vec, u32 flags)
1567 {
1568 	struct xe_lrc *lrc;
1569 	int err;
1570 
1571 	lrc = kzalloc(sizeof(*lrc), GFP_KERNEL);
1572 	if (!lrc)
1573 		return ERR_PTR(-ENOMEM);
1574 
1575 	err = xe_lrc_init(lrc, hwe, vm, ring_size, msix_vec, flags);
1576 	if (err) {
1577 		kfree(lrc);
1578 		return ERR_PTR(err);
1579 	}
1580 
1581 	return lrc;
1582 }
1583 
1584 /**
1585  * xe_lrc_destroy - Destroy the LRC
1586  * @ref: reference to LRC
1587  *
1588  * Called when ref == 0: release resources held by the Logical Ring Context
1589  * (LRC) and free the LRC memory.
1590  */
1591 void xe_lrc_destroy(struct kref *ref)
1592 {
1593 	struct xe_lrc *lrc = container_of(ref, struct xe_lrc, refcount);
1594 
1595 	xe_lrc_finish(lrc);
1596 	kfree(lrc);
1597 }
1598 
1599 /**
1600  * xe_lrc_update_hwctx_regs_with_address - Re-compute GGTT references within given LRC.
1601  * @lrc: the &xe_lrc struct instance
1602  */
1603 void xe_lrc_update_hwctx_regs_with_address(struct xe_lrc *lrc)
1604 {
1605 	if (xe_lrc_has_indirect_ring_state(lrc)) {
1606 		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
1607 				     __xe_lrc_indirect_ring_ggtt_addr(lrc));
1608 
1609 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
1610 					      __xe_lrc_ring_ggtt_addr(lrc));
1611 	} else {
1612 		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
1613 	}
1614 }
1615 
1616 void xe_lrc_set_ring_tail(struct xe_lrc *lrc, u32 tail)
1617 {
1618 	if (xe_lrc_has_indirect_ring_state(lrc))
1619 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, tail);
1620 	else
1621 		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, tail);
1622 }
1623 
1624 u32 xe_lrc_ring_tail(struct xe_lrc *lrc)
1625 {
1626 	if (xe_lrc_has_indirect_ring_state(lrc))
1627 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL) & TAIL_ADDR;
1628 	else
1629 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL) & TAIL_ADDR;
1630 }
1631 
1632 static u32 xe_lrc_ring_start(struct xe_lrc *lrc)
1633 {
1634 	if (xe_lrc_has_indirect_ring_state(lrc))
1635 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START);
1636 	else
1637 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_START);
1638 }
1639 
1640 void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
1641 {
1642 	if (xe_lrc_has_indirect_ring_state(lrc))
1643 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, head);
1644 	else
1645 		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
1646 }
1647 
1648 u32 xe_lrc_ring_head(struct xe_lrc *lrc)
1649 {
1650 	if (xe_lrc_has_indirect_ring_state(lrc))
1651 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD) & HEAD_ADDR;
1652 	else
1653 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
1654 }
1655 
1656 u32 xe_lrc_ring_space(struct xe_lrc *lrc)
1657 {
1658 	const u32 head = xe_lrc_ring_head(lrc);
1659 	const u32 tail = lrc->ring.tail;
1660 	const u32 size = lrc->ring.size;
1661 
1662 	return ((head - tail - 1) & (size - 1)) + 1;
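	/*
	 * Power-of-two modular arithmetic: with head == tail (empty ring)
	 * this yields the full ring size, shrinking towards a minimum of 1
	 * as the tail catches up with the head.
	 */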
1663 }
1664 
1665 static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
1666 				const void *data, size_t size)
1667 {
1668 	struct xe_device *xe = lrc_to_xe(lrc);
1669 
1670 	iosys_map_incr(&ring, lrc->ring.tail);
1671 	xe_map_memcpy_to(xe, &ring, 0, data, size);
1672 	lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
1673 }
1674 
1675 void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
1676 {
1677 	struct xe_device *xe = lrc_to_xe(lrc);
1678 	struct iosys_map ring;
1679 	u32 rhs;
1680 	size_t aligned_size;
1681 
1682 	xe_assert(xe, IS_ALIGNED(size, 4));
1683 	aligned_size = ALIGN(size, 8);
1684 
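	/*
	 * Writes that would run past the end of the ring are split into two
	 * copies below; a trailing MI_NOOP keeps the tail qword (8 byte)
	 * aligned when the caller passed an odd number of dwords.
	 */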
1685 	ring = __xe_lrc_ring_map(lrc);
1686 
1687 	xe_assert(xe, lrc->ring.tail < lrc->ring.size);
1688 	rhs = lrc->ring.size - lrc->ring.tail;
1689 	if (size > rhs) {
1690 		__xe_lrc_write_ring(lrc, ring, data, rhs);
1691 		__xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
1692 	} else {
1693 		__xe_lrc_write_ring(lrc, ring, data, size);
1694 	}
1695 
1696 	if (aligned_size > size) {
1697 		u32 noop = MI_NOOP;
1698 
1699 		__xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
1700 	}
1701 }
1702 
1703 u64 xe_lrc_descriptor(struct xe_lrc *lrc)
1704 {
1705 	return lrc->desc | xe_lrc_ggtt_addr(lrc);
1706 }
1707 
1708 u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
1709 {
1710 	return __xe_lrc_seqno_ggtt_addr(lrc);
1711 }
1712 
1713 /**
1714  * xe_lrc_alloc_seqno_fence() - Allocate an lrc seqno fence.
1715  *
1716  * Allocate but don't initialize an lrc seqno fence.
1717  *
1718  * Return: Pointer to the allocated fence or
1719  * negative error pointer on error.
1720  */
1721 struct dma_fence *xe_lrc_alloc_seqno_fence(void)
1722 {
1723 	return xe_hw_fence_alloc();
1724 }
1725 
1726 /**
1727  * xe_lrc_free_seqno_fence() - Free an lrc seqno fence.
1728  * @fence: Pointer to the fence to free.
1729  *
1730  * Frees an lrc seqno fence that hasn't yet been
1731  * initialized.
1732  */
1733 void xe_lrc_free_seqno_fence(struct dma_fence *fence)
1734 {
1735 	xe_hw_fence_free(fence);
1736 }
1737 
1738 /**
1739  * xe_lrc_init_seqno_fence() - Initialize an lrc seqno fence.
1740  * @lrc: Pointer to the lrc.
1741  * @fence: Pointer to the fence to initialize.
1742  *
1743  * Initializes a pre-allocated lrc seqno fence.
1744  * After initialization, the fence is subject to normal
1745  * dma-fence refcounting.
1746  */
1747 void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence)
1748 {
1749 	xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc));
1750 }
1751 
1752 s32 xe_lrc_seqno(struct xe_lrc *lrc)
1753 {
1754 	struct iosys_map map = __xe_lrc_seqno_map(lrc);
1755 
1756 	return xe_map_read32(lrc_to_xe(lrc), &map);
1757 }
1758 
1759 s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
1760 {
1761 	struct iosys_map map = __xe_lrc_start_seqno_map(lrc);
1762 
1763 	return xe_map_read32(lrc_to_xe(lrc), &map);
1764 }
1765 
1766 u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
1767 {
1768 	return __xe_lrc_start_seqno_ggtt_addr(lrc);
1769 }
1770 
1771 u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
1772 {
1773 	return __xe_lrc_parallel_ggtt_addr(lrc);
1774 }
1775 
1776 struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
1777 {
1778 	return __xe_lrc_parallel_map(lrc);
1779 }
1780 
1781 /**
1782  * xe_lrc_engine_id() - Read engine id value
1783  * @lrc: Pointer to the lrc.
1784  *
1785  * Returns: engine id value
1786  */
1787 static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
1788 {
1789 	struct xe_device *xe = lrc_to_xe(lrc);
1790 	struct iosys_map map;
1791 
1792 	map = __xe_lrc_engine_id_map(lrc);
1793 	return xe_map_read32(xe, &map);
1794 }
1795 
1796 static int instr_dw(u32 cmd_header)
1797 {
1798 	/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
1799 	if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
1800 	    GFXPIPE_SINGLE_DW_CMD(0, 0))
1801 		return 1;
1802 
1803 	/* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
1804 	if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
1805 		return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;
1806 
1807 	/* Most instructions have the # of dwords (minus 2) in 7:0 */
1808 	return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
1809 }
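
/*
 * Worked example of the decode above (illustrative): a GFXPIPE header with
 * 0x4 in the dword-length field (bits 7:0) decodes to 4 + 2 = 6 dwords in
 * total, whereas a GFXPIPE "SINGLE_DW" opcode is always treated as a single
 * dword regardless of what those bits contain.
 */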
1810 
dump_mi_command(struct drm_printer * p,struct xe_gt * gt,u32 * dw,int remaining_dw)1811 static int dump_mi_command(struct drm_printer *p,
1812 			   struct xe_gt *gt,
1813 			   u32 *dw,
1814 			   int remaining_dw)
1815 {
1816 	u32 inst_header = *dw;
1817 	u32 numdw = instr_dw(inst_header);
1818 	u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
1819 	int num_noop;
1820 
1821 	/* First check for commands that don't have/use a '# DW' field */
1822 	switch (inst_header & MI_OPCODE) {
1823 	case MI_NOOP:
1824 		num_noop = 1;
1825 		while (num_noop < remaining_dw &&
1826 		       (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
1827 			num_noop++;
1828 		drm_printf(p, "[%#010x] MI_NOOP (%d dwords)\n", inst_header, num_noop);
1829 		return num_noop;
1830 
1831 	case MI_TOPOLOGY_FILTER:
1832 		drm_printf(p, "[%#010x] MI_TOPOLOGY_FILTER\n", inst_header);
1833 		return 1;
1834 
1835 	case MI_BATCH_BUFFER_END:
1836 		drm_printf(p, "[%#010x] MI_BATCH_BUFFER_END\n", inst_header);
1837 		/* Return 'remaining_dw' to consume the rest of the LRC */
1838 		return remaining_dw;
1839 	}
1840 
1841 	/*
1842 	 * Any remaining commands include a # of dwords.  We should make sure
1843 	 * it doesn't exceed the remaining size of the LRC.
1844 	 */
1845 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
1846 		numdw = remaining_dw;
1847 
1848 	switch (inst_header & MI_OPCODE) {
1849 	case MI_LOAD_REGISTER_IMM:
1850 		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
1851 			   inst_header, (numdw - 1) / 2);
1852 		for (int i = 1; i < numdw; i += 2)
1853 			drm_printf(p, " - %#6x = %#010x\n", dw[i], dw[i + 1]);
1854 		return numdw;
1855 
1856 	case MI_LOAD_REGISTER_MEM & MI_OPCODE:
1857 		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_MEM: %s%s\n",
1858 			   inst_header,
1859 			   dw[0] & MI_LRI_LRM_CS_MMIO ? "CS_MMIO " : "",
1860 			   dw[0] & MI_LRM_USE_GGTT ? "USE_GGTT " : "");
1861 		if (numdw == 4)
1862 			drm_printf(p, " - %#6x = %#010llx\n",
1863 				   dw[1], ((u64)(dw[3]) << 32 | (u64)(dw[2])));
1864 		else
1865 			drm_printf(p, " - %*ph (%s)\n",
1866 				   (int)sizeof(u32) * (numdw - 1), dw + 1,
1867 				   numdw < 4 ? "truncated" : "malformed");
1868 		return numdw;
1869 
1870 	case MI_FORCE_WAKEUP:
1871 		drm_printf(p, "[%#010x] MI_FORCE_WAKEUP\n", inst_header);
1872 		return numdw;
1873 
1874 	default:
1875 		drm_printf(p, "[%#010x] unknown MI opcode %#x, likely %d dwords\n",
1876 			   inst_header, opcode, numdw);
1877 		return numdw;
1878 	}
1879 }
1880 
dump_gfxpipe_command(struct drm_printer * p,struct xe_gt * gt,u32 * dw,int remaining_dw)1881 static int dump_gfxpipe_command(struct drm_printer *p,
1882 				struct xe_gt *gt,
1883 				u32 *dw,
1884 				int remaining_dw)
1885 {
1886 	u32 numdw = instr_dw(*dw);
1887 	u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
1888 	u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
1889 	u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);
1890 
1891 	/*
1892 	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
1893 	 * remaining size of the LRC.
1894 	 */
1895 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
1896 		numdw = remaining_dw;
1897 
1898 	switch (*dw & GFXPIPE_MATCH_MASK) {
1899 #define MATCH(cmd) \
1900 	case cmd: \
1901 		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
1902 		return numdw
1903 #define MATCH3D(cmd) \
1904 	case CMD_##cmd: \
1905 		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
1906 		return numdw
1907 
1908 	MATCH(STATE_BASE_ADDRESS);
1909 	MATCH(STATE_SIP);
1910 	MATCH(GPGPU_CSR_BASE_ADDRESS);
1911 	MATCH(STATE_COMPUTE_MODE);
1912 	MATCH3D(3DSTATE_BTD);
1913 	MATCH(STATE_SYSTEM_MEM_FENCE_ADDRESS);
1914 	MATCH(STATE_CONTEXT_DATA_BASE_ADDRESS);
1915 
1916 	MATCH3D(3DSTATE_VF_STATISTICS);
1917 
1918 	MATCH(PIPELINE_SELECT);
1919 
1920 	MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST);
1921 	MATCH3D(3DSTATE_CLEAR_PARAMS);
1922 	MATCH3D(3DSTATE_DEPTH_BUFFER);
1923 	MATCH3D(3DSTATE_STENCIL_BUFFER);
1924 	MATCH3D(3DSTATE_HIER_DEPTH_BUFFER);
1925 	MATCH3D(3DSTATE_VERTEX_BUFFERS);
1926 	MATCH3D(3DSTATE_VERTEX_ELEMENTS);
1927 	MATCH3D(3DSTATE_INDEX_BUFFER);
1928 	MATCH3D(3DSTATE_VF);
1929 	MATCH3D(3DSTATE_MULTISAMPLE);
1930 	MATCH3D(3DSTATE_CC_STATE_POINTERS);
1931 	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS);
1932 	MATCH3D(3DSTATE_VS);
1933 	MATCH3D(3DSTATE_GS);
1934 	MATCH3D(3DSTATE_CLIP);
1935 	MATCH3D(3DSTATE_SF);
1936 	MATCH3D(3DSTATE_WM);
1937 	MATCH3D(3DSTATE_CONSTANT_VS);
1938 	MATCH3D(3DSTATE_CONSTANT_GS);
1939 	MATCH3D(3DSTATE_CONSTANT_PS);
1940 	MATCH3D(3DSTATE_SAMPLE_MASK);
1941 	MATCH3D(3DSTATE_CONSTANT_HS);
1942 	MATCH3D(3DSTATE_CONSTANT_DS);
1943 	MATCH3D(3DSTATE_HS);
1944 	MATCH3D(3DSTATE_TE);
1945 	MATCH3D(3DSTATE_DS);
1946 	MATCH3D(3DSTATE_STREAMOUT);
1947 	MATCH3D(3DSTATE_SBE);
1948 	MATCH3D(3DSTATE_PS);
1949 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
1950 	MATCH3D(3DSTATE_CPS_POINTERS);
1951 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC);
1952 	MATCH3D(3DSTATE_BLEND_STATE_POINTERS);
1953 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS);
1954 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS);
1955 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS);
1956 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS);
1957 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS);
1958 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS);
1959 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS);
1960 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS);
1961 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS);
1962 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS);
1963 	MATCH3D(3DSTATE_VF_INSTANCING);
1964 	MATCH3D(3DSTATE_VF_SGVS);
1965 	MATCH3D(3DSTATE_VF_TOPOLOGY);
1966 	MATCH3D(3DSTATE_WM_CHROMAKEY);
1967 	MATCH3D(3DSTATE_PS_BLEND);
1968 	MATCH3D(3DSTATE_WM_DEPTH_STENCIL);
1969 	MATCH3D(3DSTATE_PS_EXTRA);
1970 	MATCH3D(3DSTATE_RASTER);
1971 	MATCH3D(3DSTATE_SBE_SWIZ);
1972 	MATCH3D(3DSTATE_WM_HZ_OP);
1973 	MATCH3D(3DSTATE_VF_COMPONENT_PACKING);
1974 	MATCH3D(3DSTATE_VF_SGVS_2);
1975 	MATCH3D(3DSTATE_VFG);
1976 	MATCH3D(3DSTATE_URB_ALLOC_VS);
1977 	MATCH3D(3DSTATE_URB_ALLOC_HS);
1978 	MATCH3D(3DSTATE_URB_ALLOC_DS);
1979 	MATCH3D(3DSTATE_URB_ALLOC_GS);
1980 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_0);
1981 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_1);
1982 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_2);
1983 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_3);
1984 	MATCH3D(3DSTATE_PRIMITIVE_REPLICATION);
1985 	MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO);
1986 	MATCH3D(3DSTATE_AMFS);
1987 	MATCH3D(3DSTATE_DEPTH_BOUNDS);
1988 	MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS);
1989 	MATCH3D(3DSTATE_CONSTANT_TS_POINTER);
1990 	MATCH3D(3DSTATE_MESH_CONTROL);
1991 	MATCH3D(3DSTATE_MESH_DISTRIB);
1992 	MATCH3D(3DSTATE_TASK_REDISTRIB);
1993 	MATCH3D(3DSTATE_MESH_SHADER);
1994 	MATCH3D(3DSTATE_MESH_SHADER_DATA);
1995 	MATCH3D(3DSTATE_TASK_CONTROL);
1996 	MATCH3D(3DSTATE_TASK_SHADER);
1997 	MATCH3D(3DSTATE_TASK_SHADER_DATA);
1998 	MATCH3D(3DSTATE_URB_ALLOC_MESH);
1999 	MATCH3D(3DSTATE_URB_ALLOC_TASK);
2000 	MATCH3D(3DSTATE_CLIP_MESH);
2001 	MATCH3D(3DSTATE_SBE_MESH);
2002 	MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER);
2003 	MATCH3D(3DSTATE_COARSE_PIXEL);
2004 
2005 	MATCH3D(3DSTATE_DRAWING_RECTANGLE);
2006 	MATCH3D(3DSTATE_CHROMA_KEY);
2007 	MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET);
2008 	MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN);
2009 	MATCH3D(3DSTATE_LINE_STIPPLE);
2010 	MATCH3D(3DSTATE_AA_LINE_PARAMETERS);
2011 	MATCH3D(3DSTATE_MONOFILTER_SIZE);
2012 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS);
2013 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS);
2014 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS);
2015 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS);
2016 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS);
2017 	MATCH3D(3DSTATE_SO_DECL_LIST);
2018 	MATCH3D(3DSTATE_SO_BUFFER);
2019 	MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC);
2020 	MATCH3D(3DSTATE_SAMPLE_PATTERN);
2021 	MATCH3D(3DSTATE_3D_MODE);
2022 	MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE);
2023 	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS);
2024 	MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO);
2025 
2026 	default:
2027 		drm_printf(p, "[%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n",
2028 			   *dw, pipeline, opcode, subopcode, numdw);
2029 		return numdw;
2030 	}
2031 }
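
/*
 * Note: MATCH()/MATCH3D() above expand to a plain case label that prints the
 * instruction name plus dword count and returns numdw; they are left defined
 * so that dump_gfx_state_command() below can reuse MATCH() for
 * STATE_WRITE_INLINE.
 */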
2032 
dump_gfx_state_command(struct drm_printer * p,struct xe_gt * gt,u32 * dw,int remaining_dw)2033 static int dump_gfx_state_command(struct drm_printer *p,
2034 				  struct xe_gt *gt,
2035 				  u32 *dw,
2036 				  int remaining_dw)
2037 {
2038 	u32 numdw = instr_dw(*dw);
2039 	u32 opcode = REG_FIELD_GET(GFX_STATE_OPCODE, *dw);
2040 
2041 	/*
2042 	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
2043 	 * remaining size of the LRC.
2044 	 */
2045 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
2046 		numdw = remaining_dw;
2047 
2048 	switch (*dw & (XE_INSTR_GFX_STATE | GFX_STATE_OPCODE)) {
2049 	MATCH(STATE_WRITE_INLINE);
2050 
2051 	default:
2052 		drm_printf(p, "[%#010x] unknown GFX_STATE command (opcode=%#x), likely %d dwords\n",
2053 			   *dw, opcode, numdw);
2054 		return numdw;
2055 	}
2056 }
2057 
xe_lrc_dump_default(struct drm_printer * p,struct xe_gt * gt,enum xe_engine_class hwe_class)2058 void xe_lrc_dump_default(struct drm_printer *p,
2059 			 struct xe_gt *gt,
2060 			 enum xe_engine_class hwe_class)
2061 {
2062 	u32 *dw;
2063 	int remaining_dw, num_dw;
2064 
2065 	if (!gt->default_lrc[hwe_class]) {
2066 		drm_printf(p, "No default LRC for class %d\n", hwe_class);
2067 		return;
2068 	}
2069 
2070 	/*
2071 	 * Skip the beginning of the LRC since it contains the per-process
2072 	 * hardware status page.
2073 	 */
2074 	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
2075 	remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;
2076 
2077 	while (remaining_dw > 0) {
2078 		if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
2079 			num_dw = dump_mi_command(p, gt, dw, remaining_dw);
2080 		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
2081 			num_dw = dump_gfxpipe_command(p, gt, dw, remaining_dw);
2082 		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFX_STATE) {
2083 			num_dw = dump_gfx_state_command(p, gt, dw, remaining_dw);
2084 		} else {
2085 			num_dw = min(instr_dw(*dw), remaining_dw);
2086 			drm_printf(p, "[%#010x] Unknown instruction of type %#x, likely %d dwords\n",
2087 				   *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw),
2088 				   num_dw);
2089 		}
2090 
2091 		dw += num_dw;
2092 		remaining_dw -= num_dw;
2093 	}
2094 }
2095 
2096 struct instr_state {
2097 	u32 instr;
2098 	u16 num_dw;
2099 };
2100 
2101 static const struct instr_state xe_hpg_svg_state[] = {
2102 	{ .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 },
2103 	{ .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 },
2104 	{ .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 },
2105 	{ .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 },
2106 	{ .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 },
2107 	{ .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 },
2108 	{ .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 },
2109 	{ .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 },
2110 	{ .instr = CMD_3DSTATE_VS, .num_dw = 9 },
2111 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 },
2112 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 },
2113 	{ .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 },
2114 	{ .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 },
2115 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 },
2116 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 },
2117 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 },
2118 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 },
2119 	{ .instr = CMD_3DSTATE_CLIP, .num_dw = 4 },
2120 	{ .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 },
2121 	{ .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 },
2122 	{ .instr = CMD_3DSTATE_SF, .num_dw = 4 },
2123 	{ .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 },
2124 	{ .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 },
2125 	{ .instr = CMD_3DSTATE_RASTER, .num_dw = 5 },
2126 	{ .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 },
2127 	{ .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 },
2128 	{ .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 },
2129 	{ .instr = CMD_3DSTATE_HS, .num_dw = 9 },
2130 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 },
2131 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 },
2132 	{ .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 },
2133 	{ .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 },
2134 	{ .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 },
2135 	{ .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 },
2136 	{ .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 },
2137 	{ .instr = CMD_3DSTATE_TE, .num_dw = 5 },
2138 	{ .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 },
2139 	{ .instr = CMD_3DSTATE_DS, .num_dw = 11 },
2140 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
2141 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
2142 	{ .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
2143 	{ .instr = CMD_3DSTATE_GS, .num_dw = 10 },
2144 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
2145 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
2146 	{ .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
2147 	{ .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
2148 	{ .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
2149 	{ .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
2150 	{ .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
2151 	{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
2152 };
2153 
xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue * q,u32 * cs)2154 u32 *xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, u32 *cs)
2155 {
2156 	struct xe_gt *gt = q->hwe->gt;
2157 	struct xe_device *xe = gt_to_xe(gt);
2158 	const struct instr_state *state_table = NULL;
2159 	int state_table_size = 0;
2160 
2161 	/*
2162 	 * Wa_14019789679
2163 	 *
2164 	 * If the driver doesn't explicitly emit the SVG instructions while
2165 	 * setting up the default LRC, the context switch will write 0's
2166 	 * (noops) into the LRC memory rather than the expected instruction
2167 	 * headers.  Application contexts start out as a copy of the default
2168 	 * LRC, and if they also do not emit specific settings for some SVG
2169 	 * state, then on context restore they'll unintentionally inherit
2170 	 * whatever state setting the previous context had programmed into the
2171 	 * hardware (i.e., the lack of a 3DSTATE_* instruction in the LRC will
2172 	 * prevent the hardware from resetting that state back to any specific
2173 	 * value).
2174 	 *
2175 	 * The official workaround only requires emitting 3DSTATE_MESH_CONTROL
2176 	 * since that's a specific state setting that can easily cause GPU
2177 	 * hangs if unintentionally inherited.  However, to be safe, we'll
2178 	 * continue to emit all of the SVG state since it's best not to leak
2179 	 * any of the state between contexts, even if that leakage is harmless.
2180 	 */
2181 	if (XE_GT_WA(gt, 14019789679) && q->hwe->class == XE_ENGINE_CLASS_RENDER) {
2182 		state_table = xe_hpg_svg_state;
2183 		state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
2184 	}
2185 
2186 	if (!state_table) {
2187 		xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
2188 			  GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
2189 		return cs;
2190 	}
2191 
2192 	for (int i = 0; i < state_table_size; i++) {
2193 		u32 instr = state_table[i].instr;
2194 		u16 num_dw = state_table[i].num_dw;
2195 		bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);
2196 
2197 		xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
2198 		xe_gt_assert(gt, num_dw != 0);
2199 		xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));
2200 
2201 		/*
2202 		 * Xe2's SVG context is the same as the one on DG2 / MTL
2203 		 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
2204 		 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
2205 		 * Just make the replacement here rather than defining a
2206 		 * whole separate table for the single trivial change.
2207 		 */
2208 		if (GRAPHICS_VER(xe) >= 20 &&
2209 		    instr == CMD_3DSTATE_DRAWING_RECTANGLE)
2210 			instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;
2211 
2212 		*cs = instr;
2213 		if (!is_single_dw)
2214 			*cs |= (num_dw - 2);
2215 
2216 		cs += num_dw;
2217 	}
2218 
2219 	return cs;
2220 }
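
/*
 * Illustrative example of the emission loop above: CMD_3DSTATE_VS has
 * num_dw = 9 in xe_hpg_svg_state, so its header is emitted with (9 - 2) = 7
 * in the length field and cs then advances past the eight payload dwords,
 * which are left untouched in the buffer; a single-dword instruction would
 * have its header emitted as-is.
 */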
2221 
xe_lrc_snapshot_capture(struct xe_lrc * lrc)2222 struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
2223 {
2224 	struct xe_lrc_snapshot *snapshot = kmalloc(sizeof(*snapshot), GFP_NOWAIT);
2225 
2226 	if (!snapshot)
2227 		return NULL;
2228 
2229 	snapshot->context_desc = xe_lrc_ggtt_addr(lrc);
2230 	snapshot->ring_addr = __xe_lrc_ring_ggtt_addr(lrc);
2231 	snapshot->indirect_context_desc = xe_lrc_indirect_ring_ggtt_addr(lrc);
2232 	snapshot->head = xe_lrc_ring_head(lrc);
2233 	snapshot->tail.internal = lrc->ring.tail;
2234 	snapshot->tail.memory = xe_lrc_ring_tail(lrc);
2235 	snapshot->start = xe_lrc_ring_start(lrc);
2236 	snapshot->start_seqno = xe_lrc_start_seqno(lrc);
2237 	snapshot->seqno = xe_lrc_seqno(lrc);
2238 	snapshot->lrc_bo = xe_bo_get(lrc->bo);
2239 	snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
2240 	snapshot->lrc_size = lrc->size;
2241 	snapshot->lrc_snapshot = NULL;
2242 	snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc));
2243 	snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc);
2244 	return snapshot;
2245 }
2246 
xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot * snapshot)2247 void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot)
2248 {
2249 	struct xe_bo *bo;
2250 	struct iosys_map src;
2251 
2252 	if (!snapshot)
2253 		return;
2254 
2255 	bo = snapshot->lrc_bo;
2256 	snapshot->lrc_bo = NULL;
2257 
2258 	snapshot->lrc_snapshot = kvmalloc(snapshot->lrc_size, GFP_KERNEL);
2259 	if (!snapshot->lrc_snapshot)
2260 		goto put_bo;
2261 
2262 	xe_bo_lock(bo, false);
2263 	if (!ttm_bo_vmap(&bo->ttm, &src)) {
2264 		xe_map_memcpy_from(xe_bo_device(bo),
2265 				   snapshot->lrc_snapshot, &src, snapshot->lrc_offset,
2266 				   snapshot->lrc_size);
2267 		ttm_bo_vunmap(&bo->ttm, &src);
2268 	} else {
2269 		kvfree(snapshot->lrc_snapshot);
2270 		snapshot->lrc_snapshot = NULL;
2271 	}
2272 	xe_bo_unlock(bo);
2273 put_bo:
2274 	xe_bo_put(bo);
2275 }
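
/*
 * The snapshot capture is split in two phases: xe_lrc_snapshot_capture()
 * only records GGTT addresses, ring state and a reference to the backing BO
 * using GFP_NOWAIT, while the potentially blocking work (kvmalloc with
 * GFP_KERNEL, BO lock, vmap and copy of the context image) is deferred to
 * xe_lrc_snapshot_capture_delayed().
 */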
2276 
xe_lrc_snapshot_print(struct xe_lrc_snapshot * snapshot,struct drm_printer * p)2277 void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer *p)
2278 {
2279 	unsigned long i;
2280 
2281 	if (!snapshot)
2282 		return;
2283 
2284 	drm_printf(p, "\tHW Context Desc: 0x%08x\n", snapshot->context_desc);
2285 	drm_printf(p, "\tHW Ring address: 0x%08x\n",
2286 		   snapshot->ring_addr);
2287 	drm_printf(p, "\tHW Indirect Ring State: 0x%08x\n",
2288 		   snapshot->indirect_context_desc);
2289 	drm_printf(p, "\tLRC Head: (memory) %u\n", snapshot->head);
2290 	drm_printf(p, "\tLRC Tail: (internal) %u, (memory) %u\n",
2291 		   snapshot->tail.internal, snapshot->tail.memory);
2292 	drm_printf(p, "\tRing start: (memory) 0x%08x\n", snapshot->start);
2293 	drm_printf(p, "\tStart seqno: (memory) %d\n", snapshot->start_seqno);
2294 	drm_printf(p, "\tSeqno: (memory) %d\n", snapshot->seqno);
2295 	drm_printf(p, "\tTimestamp: 0x%08x\n", snapshot->ctx_timestamp);
2296 	drm_printf(p, "\tJob Timestamp: 0x%08x\n", snapshot->ctx_job_timestamp);
2297 
2298 	if (!snapshot->lrc_snapshot)
2299 		return;
2300 
2301 	drm_printf(p, "\t[HWSP].length: 0x%x\n", LRC_PPHWSP_SIZE);
2302 	drm_puts(p, "\t[HWSP].data: ");
2303 	for (i = 0; i < LRC_PPHWSP_SIZE; i += sizeof(u32)) {
2304 		u32 *val = snapshot->lrc_snapshot + i;
2305 		char dumped[ASCII85_BUFSZ];
2306 
2307 		drm_puts(p, ascii85_encode(*val, dumped));
2308 	}
2309 
2310 	drm_printf(p, "\n\t[HWCTX].length: 0x%lx\n", snapshot->lrc_size - LRC_PPHWSP_SIZE);
2311 	drm_puts(p, "\t[HWCTX].data: ");
2312 	for (; i < snapshot->lrc_size; i += sizeof(u32)) {
2313 		u32 *val = snapshot->lrc_snapshot + i;
2314 		char dumped[ASCII85_BUFSZ];
2315 
2316 		drm_puts(p, ascii85_encode(*val, dumped));
2317 	}
2318 	drm_puts(p, "\n");
2319 }
2320 
xe_lrc_snapshot_free(struct xe_lrc_snapshot * snapshot)2321 void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
2322 {
2323 	if (!snapshot)
2324 		return;
2325 
2326 	kvfree(snapshot->lrc_snapshot);
2327 	if (snapshot->lrc_bo)
2328 		xe_bo_put(snapshot->lrc_bo);
2329 
2330 	kfree(snapshot);
2331 }
2332 
get_ctx_timestamp(struct xe_lrc * lrc,u32 engine_id,u64 * reg_ctx_ts)2333 static int get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id, u64 *reg_ctx_ts)
2334 {
2335 	u16 class = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id);
2336 	u16 instance = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id);
2337 	struct xe_hw_engine *hwe;
2338 	u64 val;
2339 
2340 	hwe = xe_gt_hw_engine(lrc->gt, class, instance, false);
2341 	if (xe_gt_WARN_ONCE(lrc->gt, !hwe || xe_hw_engine_is_reserved(hwe),
2342 			    "Unexpected engine class:instance %d:%d for context utilization\n",
2343 			    class, instance))
2344 		return -1;
2345 
2346 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
2347 		val = xe_mmio_read64_2x32(&hwe->gt->mmio,
2348 					  RING_CTX_TIMESTAMP(hwe->mmio_base));
2349 	else
2350 		val = xe_mmio_read32(&hwe->gt->mmio,
2351 				     RING_CTX_TIMESTAMP(hwe->mmio_base));
2352 
2353 	*reg_ctx_ts = val;
2354 
2355 	return 0;
2356 }
2357 
2358 /**
2359  * xe_lrc_update_timestamp() - Update ctx timestamp
2360  * @lrc: Pointer to the lrc.
2361  * @old_ts: Old timestamp value
2362  *
2363  * Populate @old_ts with the current saved ctx timestamp, read the new ctx
2364  * timestamp and update the saved value. Because the context may still be
2365  * running, the calculation can be slightly racy, so the LRC value is read
2366  * again to ensure the context is still active before returning the timestamp.
2367  *
2368  * Returns: New ctx timestamp value
2369  */
xe_lrc_update_timestamp(struct xe_lrc * lrc,u64 * old_ts)2370 u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts)
2371 {
2372 	u64 lrc_ts, reg_ts;
2373 	u32 engine_id;
2374 
2375 	*old_ts = lrc->ctx_timestamp;
2376 
2377 	lrc_ts = xe_lrc_ctx_timestamp(lrc);
2378 	/* CTX_TIMESTAMP mmio read is invalid on VF, so return the LRC value */
2379 	if (IS_SRIOV_VF(lrc_to_xe(lrc))) {
2380 		lrc->ctx_timestamp = lrc_ts;
2381 		goto done;
2382 	}
2383 
2384 	if (lrc_ts == CONTEXT_ACTIVE) {
2385 		engine_id = xe_lrc_engine_id(lrc);
2386 		if (!get_ctx_timestamp(lrc, engine_id, &reg_ts))
2387 			lrc->ctx_timestamp = reg_ts;
2388 
2389 		/* read lrc again to ensure context is still active */
2390 		lrc_ts = xe_lrc_ctx_timestamp(lrc);
2391 	}
2392 
2393 	/*
2394 	 * If the context switched out, just use the lrc_ts. Note that this needs to
2395 	 * be a separate if condition.
2396 	 */
2397 	if (lrc_ts != CONTEXT_ACTIVE)
2398 		lrc->ctx_timestamp = lrc_ts;
2399 
2400 done:
2401 	trace_xe_lrc_update_timestamp(lrc, *old_ts);
2402 
2403 	return lrc->ctx_timestamp;
2404 }
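
/*
 * The read-again logic above, spelled out: if the LRC reports CONTEXT_ACTIVE,
 * the register value is sampled, but the context may switch out in between;
 * the second xe_lrc_ctx_timestamp() read catches that case, and the saved
 * LRC value (no longer CONTEXT_ACTIVE) then takes precedence over the
 * register sample.
 */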
2405 
2406 /**
2407  * xe_lrc_ring_is_idle() - Check whether the LRC ring is idle
2408  * @lrc: Pointer to the lrc.
2409  *
2410  * Compare LRC ring head and tail to determine if idle.
2411  *
2412  * Return: True if the ring is idle, False otherwise
2413  */
xe_lrc_ring_is_idle(struct xe_lrc * lrc)2414 bool xe_lrc_ring_is_idle(struct xe_lrc *lrc)
2415 {
2416 	return xe_lrc_ring_head(lrc) == xe_lrc_ring_tail(lrc);
2417 }
2418