xref: /linux/drivers/gpu/drm/xe/xe_lrc.c (revision ddb7a62af2e766eabb4ab7080e6ed8d6b8915302)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5 
6 #include "xe_lrc.h"
7 
8 #include <generated/xe_wa_oob.h>
9 
10 #include <linux/ascii85.h>
11 
12 #include "instructions/xe_mi_commands.h"
13 #include "instructions/xe_gfxpipe_commands.h"
14 #include "instructions/xe_gfx_state_commands.h"
15 #include "regs/xe_engine_regs.h"
16 #include "regs/xe_lrc_layout.h"
17 #include "xe_bb.h"
18 #include "xe_bo.h"
19 #include "xe_device.h"
20 #include "xe_drm_client.h"
21 #include "xe_exec_queue_types.h"
22 #include "xe_gt.h"
23 #include "xe_gt_printk.h"
24 #include "xe_hw_fence.h"
25 #include "xe_map.h"
26 #include "xe_memirq.h"
27 #include "xe_mmio.h"
28 #include "xe_sriov.h"
29 #include "xe_trace_lrc.h"
30 #include "xe_vm.h"
31 #include "xe_wa.h"
32 
33 #define LRC_VALID				BIT_ULL(0)
34 #define LRC_PRIVILEGE				BIT_ULL(8)
35 #define LRC_ADDRESSING_MODE			GENMASK_ULL(4, 3)
36 #define LRC_LEGACY_64B_CONTEXT			3
37 
38 #define LRC_ENGINE_CLASS			GENMASK_ULL(63, 61)
39 #define LRC_ENGINE_INSTANCE			GENMASK_ULL(53, 48)
40 
41 #define LRC_PPHWSP_SIZE				SZ_4K
42 #define LRC_INDIRECT_CTX_BO_SIZE		SZ_4K
43 #define LRC_INDIRECT_RING_STATE_SIZE		SZ_4K
44 #define LRC_WA_BB_SIZE				SZ_4K
45 
46 /*
47  * Layout of the LRC and associated data allocated as
48  * lrc->bo:
49  *
50  *   Region                       Size
51  *  +============================+=================================+ <- __xe_lrc_ring_offset()
52  *  | Ring                       | ring_size, see                  |
53  *  |                            | xe_lrc_init()                   |
54  *  +============================+=================================+ <- __xe_lrc_pphwsp_offset()
55  *  | PPHWSP (includes SW state) | 4K                              |
56  *  +----------------------------+---------------------------------+ <- __xe_lrc_regs_offset()
57  *  | Engine Context Image       | n * 4K, see                     |
58  *  |                            | xe_gt_lrc_size()                |
59  *  +----------------------------+---------------------------------+ <- __xe_lrc_indirect_ring_offset()
60  *  | Indirect Ring State Page   | 0 or 4k, see                    |
61  *  |                            | XE_LRC_FLAG_INDIRECT_RING_STATE |
62  *  +============================+=================================+ <- __xe_lrc_indirect_ctx_offset()
63  *  | Indirect Context Page      | 0 or 4k, see                    |
64  *  |                            | XE_LRC_FLAG_INDIRECT_CTX        |
65  *  +============================+=================================+ <- __xe_lrc_wa_bb_offset()
66  *  | WA BB Per Ctx              | 4k                              |
67  *  +============================+=================================+ <- xe_bo_size(lrc->bo)
68  */
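
/*
 * Worked example (illustrative only; the 16K ring size is an assumption):
 * for an Xe2 render engine on a GT with indirect ring state and no
 * indirect context page, xe_gt_lrc_size() is 4K PPHWSP + 3 * 4K engine
 * context image + 4K indirect ring state = 20K, so xe_bo_size(lrc->bo) is
 * 16K ring + 20K + 4K WA BB = 40K. The offsets above then resolve to:
 * ring @ 0, PPHWSP @ 16K, context image @ 20K, indirect ring state @ 32K,
 * WA BB @ 36K.
 */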
69 
70 static struct xe_device *
71 lrc_to_xe(struct xe_lrc *lrc)
72 {
73 	return gt_to_xe(lrc->fence_ctx.gt);
74 }
75 
76 static bool
77 gt_engine_needs_indirect_ctx(struct xe_gt *gt, enum xe_engine_class class)
78 {
79 	return false;
80 }
81 
82 size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
83 {
84 	struct xe_device *xe = gt_to_xe(gt);
85 	size_t size;
86 
87 	/* Per-process HW status page (PPHWSP) */
88 	size = LRC_PPHWSP_SIZE;
89 
90 	/* Engine context image */
91 	switch (class) {
92 	case XE_ENGINE_CLASS_RENDER:
93 		if (GRAPHICS_VER(xe) >= 20)
94 			size += 3 * SZ_4K;
95 		else
96 			size += 13 * SZ_4K;
97 		break;
98 	case XE_ENGINE_CLASS_COMPUTE:
99 		if (GRAPHICS_VER(xe) >= 20)
100 			size += 2 * SZ_4K;
101 		else
102 			size += 13 * SZ_4K;
103 		break;
104 	default:
105 		WARN(1, "Unknown engine class: %d", class);
106 		fallthrough;
107 	case XE_ENGINE_CLASS_COPY:
108 	case XE_ENGINE_CLASS_VIDEO_DECODE:
109 	case XE_ENGINE_CLASS_VIDEO_ENHANCE:
110 	case XE_ENGINE_CLASS_OTHER:
111 		size += 1 * SZ_4K;
112 	}
113 
114 	/* Add indirect ring state page */
115 	if (xe_gt_has_indirect_ring_state(gt))
116 		size += LRC_INDIRECT_RING_STATE_SIZE;
117 
118 	return size;
119 }
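
/*
 * Example (illustrative): on a GT without indirect ring state, a copy or
 * video engine context is 4K PPHWSP + 4K image = 8K; with indirect ring
 * state (e.g. Xe2), a render engine context is 4K + 3 * 4K + 4K = 20K.
 */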
120 
121 /*
122  * The per-platform tables are u8-encoded in @data. Decode @data and set the
123  * addresses' offset and commands in @regs. The following encoding is used
124  * for each byte. There are 2 steps: decoding commands and decoding addresses.
125  *
126  * Commands:
127  * [7]: create NOPs - number of NOPs are set in lower bits
128  * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set
129  *      MI_LRI_FORCE_POSTED
130  * [5:0]: Number of NOPs or registers to set values to in case of
131  *        MI_LOAD_REGISTER_IMM
132  *
133  * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
134  * number of registers. They are set by using the REG/REG16 macros: the former
135  * is used for offsets smaller than 0x200 while the latter is for values bigger
136  * than that. Those macros already set all the bits documented below correctly:
137  *
138  * [7]: When a register offset needs more than 7 bits, set this bit; the
139  *      lower bits follow in additional bytes
140  * [6:0]: Register offset, without considering the engine base.
141  *
142  * This function only tweaks the commands and register offsets. Values are not
143  * filled out.
144  */
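/*
 * Worked example (illustrative), using the start of gen12_xcs_offsets
 * below: NOP(1), LRI(13, POSTED), REG16(0x244), REG(0x034), ... encode to
 * the bytes 0x81, 0x4d, 0x81 0x11, 0x0d, ... and decode as: skip one dword
 * in @regs; emit MI_LOAD_REGISTER_IMM for 13 registers with
 * MI_LRI_FORCE_POSTED; then rebuild each register offset from the 7-bit
 * groups, e.g. 0x81 0x11 -> (0x01 << 7) | 0x11 = 0x91, giving
 * hwe->mmio_base + (0x91 << 2) = base + 0x244, and 0x0d -> base + 0x34.
 * The value dwords are left untouched.
 */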
145 static void set_offsets(u32 *regs,
146 			const u8 *data,
147 			const struct xe_hw_engine *hwe)
148 #define NOP(x) (BIT(7) | (x))
149 #define LRI(count, flags) ((flags) << 6 | (count) | \
150 			   BUILD_BUG_ON_ZERO(count >= BIT(6)))
151 #define POSTED BIT(0)
152 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
153 #define REG16(x) \
154 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
155 	(((x) >> 2) & 0x7f)
156 {
157 	const u32 base = hwe->mmio_base;
158 
159 	while (*data) {
160 		u8 count, flags;
161 
162 		if (*data & BIT(7)) { /* skip */
163 			count = *data++ & ~BIT(7);
164 			regs += count;
165 			continue;
166 		}
167 
168 		count = *data & 0x3f;
169 		flags = *data >> 6;
170 		data++;
171 
172 		*regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
173 		if (flags & POSTED)
174 			*regs |= MI_LRI_FORCE_POSTED;
175 		*regs |= MI_LRI_LRM_CS_MMIO;
176 		regs++;
177 
178 		xe_gt_assert(hwe->gt, count);
179 		do {
180 			u32 offset = 0;
181 			u8 v;
182 
183 			do {
184 				v = *data++;
185 				offset <<= 7;
186 				offset |= v & ~BIT(7);
187 			} while (v & BIT(7));
188 
189 			regs[0] = base + (offset << 2);
190 			regs += 2;
191 		} while (--count);
192 	}
193 
194 	*regs = MI_BATCH_BUFFER_END | BIT(0);
195 }
196 
197 static const u8 gen12_xcs_offsets[] = {
198 	NOP(1),
199 	LRI(13, POSTED),
200 	REG16(0x244),
201 	REG(0x034),
202 	REG(0x030),
203 	REG(0x038),
204 	REG(0x03c),
205 	REG(0x168),
206 	REG(0x140),
207 	REG(0x110),
208 	REG(0x1c0),
209 	REG(0x1c4),
210 	REG(0x1c8),
211 	REG(0x180),
212 	REG16(0x2b4),
213 
214 	NOP(5),
215 	LRI(9, POSTED),
216 	REG16(0x3a8),
217 	REG16(0x28c),
218 	REG16(0x288),
219 	REG16(0x284),
220 	REG16(0x280),
221 	REG16(0x27c),
222 	REG16(0x278),
223 	REG16(0x274),
224 	REG16(0x270),
225 
226 	0
227 };
228 
229 static const u8 dg2_xcs_offsets[] = {
230 	NOP(1),
231 	LRI(15, POSTED),
232 	REG16(0x244),
233 	REG(0x034),
234 	REG(0x030),
235 	REG(0x038),
236 	REG(0x03c),
237 	REG(0x168),
238 	REG(0x140),
239 	REG(0x110),
240 	REG(0x1c0),
241 	REG(0x1c4),
242 	REG(0x1c8),
243 	REG(0x180),
244 	REG16(0x2b4),
245 	REG(0x120),
246 	REG(0x124),
247 
248 	NOP(1),
249 	LRI(9, POSTED),
250 	REG16(0x3a8),
251 	REG16(0x28c),
252 	REG16(0x288),
253 	REG16(0x284),
254 	REG16(0x280),
255 	REG16(0x27c),
256 	REG16(0x278),
257 	REG16(0x274),
258 	REG16(0x270),
259 
260 	0
261 };
262 
263 static const u8 gen12_rcs_offsets[] = {
264 	NOP(1),
265 	LRI(13, POSTED),
266 	REG16(0x244),
267 	REG(0x034),
268 	REG(0x030),
269 	REG(0x038),
270 	REG(0x03c),
271 	REG(0x168),
272 	REG(0x140),
273 	REG(0x110),
274 	REG(0x1c0),
275 	REG(0x1c4),
276 	REG(0x1c8),
277 	REG(0x180),
278 	REG16(0x2b4),
279 
280 	NOP(5),
281 	LRI(9, POSTED),
282 	REG16(0x3a8),
283 	REG16(0x28c),
284 	REG16(0x288),
285 	REG16(0x284),
286 	REG16(0x280),
287 	REG16(0x27c),
288 	REG16(0x278),
289 	REG16(0x274),
290 	REG16(0x270),
291 
292 	LRI(3, POSTED),
293 	REG(0x1b0),
294 	REG16(0x5a8),
295 	REG16(0x5ac),
296 
297 	NOP(6),
298 	LRI(1, 0),
299 	REG(0x0c8),
300 	NOP(3 + 9 + 1),
301 
302 	LRI(51, POSTED),
303 	REG16(0x588),
304 	REG16(0x588),
305 	REG16(0x588),
306 	REG16(0x588),
307 	REG16(0x588),
308 	REG16(0x588),
309 	REG(0x028),
310 	REG(0x09c),
311 	REG(0x0c0),
312 	REG(0x178),
313 	REG(0x17c),
314 	REG16(0x358),
315 	REG(0x170),
316 	REG(0x150),
317 	REG(0x154),
318 	REG(0x158),
319 	REG16(0x41c),
320 	REG16(0x600),
321 	REG16(0x604),
322 	REG16(0x608),
323 	REG16(0x60c),
324 	REG16(0x610),
325 	REG16(0x614),
326 	REG16(0x618),
327 	REG16(0x61c),
328 	REG16(0x620),
329 	REG16(0x624),
330 	REG16(0x628),
331 	REG16(0x62c),
332 	REG16(0x630),
333 	REG16(0x634),
334 	REG16(0x638),
335 	REG16(0x63c),
336 	REG16(0x640),
337 	REG16(0x644),
338 	REG16(0x648),
339 	REG16(0x64c),
340 	REG16(0x650),
341 	REG16(0x654),
342 	REG16(0x658),
343 	REG16(0x65c),
344 	REG16(0x660),
345 	REG16(0x664),
346 	REG16(0x668),
347 	REG16(0x66c),
348 	REG16(0x670),
349 	REG16(0x674),
350 	REG16(0x678),
351 	REG16(0x67c),
352 	REG(0x068),
353 	REG(0x084),
354 	NOP(1),
355 
356 	0
357 };
358 
359 static const u8 xehp_rcs_offsets[] = {
360 	NOP(1),
361 	LRI(13, POSTED),
362 	REG16(0x244),
363 	REG(0x034),
364 	REG(0x030),
365 	REG(0x038),
366 	REG(0x03c),
367 	REG(0x168),
368 	REG(0x140),
369 	REG(0x110),
370 	REG(0x1c0),
371 	REG(0x1c4),
372 	REG(0x1c8),
373 	REG(0x180),
374 	REG16(0x2b4),
375 
376 	NOP(5),
377 	LRI(9, POSTED),
378 	REG16(0x3a8),
379 	REG16(0x28c),
380 	REG16(0x288),
381 	REG16(0x284),
382 	REG16(0x280),
383 	REG16(0x27c),
384 	REG16(0x278),
385 	REG16(0x274),
386 	REG16(0x270),
387 
388 	LRI(3, POSTED),
389 	REG(0x1b0),
390 	REG16(0x5a8),
391 	REG16(0x5ac),
392 
393 	NOP(6),
394 	LRI(1, 0),
395 	REG(0x0c8),
396 
397 	0
398 };
399 
400 static const u8 dg2_rcs_offsets[] = {
401 	NOP(1),
402 	LRI(15, POSTED),
403 	REG16(0x244),
404 	REG(0x034),
405 	REG(0x030),
406 	REG(0x038),
407 	REG(0x03c),
408 	REG(0x168),
409 	REG(0x140),
410 	REG(0x110),
411 	REG(0x1c0),
412 	REG(0x1c4),
413 	REG(0x1c8),
414 	REG(0x180),
415 	REG16(0x2b4),
416 	REG(0x120),
417 	REG(0x124),
418 
419 	NOP(1),
420 	LRI(9, POSTED),
421 	REG16(0x3a8),
422 	REG16(0x28c),
423 	REG16(0x288),
424 	REG16(0x284),
425 	REG16(0x280),
426 	REG16(0x27c),
427 	REG16(0x278),
428 	REG16(0x274),
429 	REG16(0x270),
430 
431 	LRI(3, POSTED),
432 	REG(0x1b0),
433 	REG16(0x5a8),
434 	REG16(0x5ac),
435 
436 	NOP(6),
437 	LRI(1, 0),
438 	REG(0x0c8),
439 
440 	0
441 };
442 
443 static const u8 mtl_rcs_offsets[] = {
444 	NOP(1),
445 	LRI(15, POSTED),
446 	REG16(0x244),
447 	REG(0x034),
448 	REG(0x030),
449 	REG(0x038),
450 	REG(0x03c),
451 	REG(0x168),
452 	REG(0x140),
453 	REG(0x110),
454 	REG(0x1c0),
455 	REG(0x1c4),
456 	REG(0x1c8),
457 	REG(0x180),
458 	REG16(0x2b4),
459 	REG(0x120),
460 	REG(0x124),
461 
462 	NOP(1),
463 	LRI(9, POSTED),
464 	REG16(0x3a8),
465 	REG16(0x28c),
466 	REG16(0x288),
467 	REG16(0x284),
468 	REG16(0x280),
469 	REG16(0x27c),
470 	REG16(0x278),
471 	REG16(0x274),
472 	REG16(0x270),
473 
474 	NOP(2),
475 	LRI(2, POSTED),
476 	REG16(0x5a8),
477 	REG16(0x5ac),
478 
479 	NOP(6),
480 	LRI(1, 0),
481 	REG(0x0c8),
482 
483 	0
484 };
485 
486 #define XE2_CTX_COMMON \
487 	NOP(1),                 /* [0x00] */ \
488 	LRI(15, POSTED),        /* [0x01] */ \
489 	REG16(0x244),           /* [0x02] CTXT_SR_CTL */ \
490 	REG(0x034),             /* [0x04] RING_BUFFER_HEAD */ \
491 	REG(0x030),             /* [0x06] RING_BUFFER_TAIL */ \
492 	REG(0x038),             /* [0x08] RING_BUFFER_START */ \
493 	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */ \
494 	REG(0x168),             /* [0x0c] BB_ADDR_UDW */ \
495 	REG(0x140),             /* [0x0e] BB_ADDR */ \
496 	REG(0x110),             /* [0x10] BB_STATE */ \
497 	REG(0x1c0),             /* [0x12] BB_PER_CTX_PTR */ \
498 	REG(0x1c4),             /* [0x14] RCS_INDIRECT_CTX */ \
499 	REG(0x1c8),             /* [0x16] RCS_INDIRECT_CTX_OFFSET */ \
500 	REG(0x180),             /* [0x18] CCID */ \
501 	REG16(0x2b4),           /* [0x1a] SEMAPHORE_TOKEN */ \
502 	REG(0x120),             /* [0x1c] PRT_BB_STATE */ \
503 	REG(0x124),             /* [0x1e] PRT_BB_STATE_UDW */ \
504 	\
505 	NOP(1),                 /* [0x20] */ \
506 	LRI(9, POSTED),         /* [0x21] */ \
507 	REG16(0x3a8),           /* [0x22] CTX_TIMESTAMP */ \
508 	REG16(0x3ac),           /* [0x24] CTX_TIMESTAMP_UDW */ \
509 	REG(0x108),             /* [0x26] INDIRECT_RING_STATE */ \
510 	REG16(0x284),           /* [0x28] dummy reg */ \
511 	REG16(0x280),           /* [0x2a] CS_ACC_CTR_THOLD */ \
512 	REG16(0x27c),           /* [0x2c] CS_CTX_SYS_PASID */ \
513 	REG16(0x278),           /* [0x2e] CS_CTX_ASID */ \
514 	REG16(0x274),           /* [0x30] PTBP_UDW */ \
515 	REG16(0x270)            /* [0x32] PTBP_LDW */
516 
517 static const u8 xe2_rcs_offsets[] = {
518 	XE2_CTX_COMMON,
519 
520 	NOP(2),                 /* [0x34] */
521 	LRI(2, POSTED),         /* [0x36] */
522 	REG16(0x5a8),           /* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
523 	REG16(0x5ac),           /* [0x39] PREEMPTION_STATUS */
524 
525 	NOP(6),                 /* [0x41] */
526 	LRI(1, 0),              /* [0x47] */
527 	REG(0x0c8),             /* [0x48] R_PWR_CLK_STATE */
528 
529 	0
530 };
531 
532 static const u8 xe2_bcs_offsets[] = {
533 	XE2_CTX_COMMON,
534 
535 	NOP(4 + 8 + 1),         /* [0x34] */
536 	LRI(2, POSTED),         /* [0x41] */
537 	REG16(0x200),           /* [0x42] BCS_SWCTRL */
538 	REG16(0x204),           /* [0x44] BLIT_CCTL */
539 
540 	0
541 };
542 
543 static const u8 xe2_xcs_offsets[] = {
544 	XE2_CTX_COMMON,
545 
546 	0
547 };
548 
549 static const u8 xe2_indirect_ring_state_offsets[] = {
550 	NOP(1),                 /* [0x00] */
551 	LRI(5, POSTED),         /* [0x01] */
552 	REG(0x034),             /* [0x02] RING_BUFFER_HEAD */
553 	REG(0x030),             /* [0x04] RING_BUFFER_TAIL */
554 	REG(0x038),             /* [0x06] RING_BUFFER_START */
555 	REG(0x048),             /* [0x08] RING_BUFFER_START_UDW */
556 	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */
557 
558 	NOP(5),                 /* [0x0c] */
559 	LRI(9, POSTED),         /* [0x11] */
560 	REG(0x168),             /* [0x12] BB_ADDR_UDW */
561 	REG(0x140),             /* [0x14] BB_ADDR */
562 	REG(0x110),             /* [0x16] BB_STATE */
563 	REG16(0x588),           /* [0x18] BB_STACK_WRITE_PORT */
564 	REG16(0x588),           /* [0x20] BB_STACK_WRITE_PORT */
565 	REG16(0x588),           /* [0x22] BB_STACK_WRITE_PORT */
566 	REG16(0x588),           /* [0x24] BB_STACK_WRITE_PORT */
567 	REG16(0x588),           /* [0x26] BB_STACK_WRITE_PORT */
568 	REG16(0x588),           /* [0x28] BB_STACK_WRITE_PORT */
569 
570 	NOP(12),                 /* [0x00] */
571 
572 	0
573 };
574 
575 #undef REG16
576 #undef REG
577 #undef LRI
578 #undef NOP
579 
580 static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
581 {
582 	if (class == XE_ENGINE_CLASS_RENDER) {
583 		if (GRAPHICS_VER(xe) >= 20)
584 			return xe2_rcs_offsets;
585 		else if (GRAPHICS_VERx100(xe) >= 1270)
586 			return mtl_rcs_offsets;
587 		else if (GRAPHICS_VERx100(xe) >= 1255)
588 			return dg2_rcs_offsets;
589 		else if (GRAPHICS_VERx100(xe) >= 1250)
590 			return xehp_rcs_offsets;
591 		else
592 			return gen12_rcs_offsets;
593 	} else if (class == XE_ENGINE_CLASS_COPY) {
594 		if (GRAPHICS_VER(xe) >= 20)
595 			return xe2_bcs_offsets;
596 		else
597 			return gen12_xcs_offsets;
598 	} else {
599 		if (GRAPHICS_VER(xe) >= 20)
600 			return xe2_xcs_offsets;
601 		else if (GRAPHICS_VERx100(xe) >= 1255)
602 			return dg2_xcs_offsets;
603 		else
604 			return gen12_xcs_offsets;
605 	}
606 }
607 
608 static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
609 {
610 	regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
611 						       CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
612 
613 	if (xe_gt_has_indirect_ring_state(hwe->gt))
614 		regs[CTX_CONTEXT_CONTROL] |=
615 			_MASKED_BIT_ENABLE(CTX_CTRL_INDIRECT_RING_STATE_ENABLE);
616 }
617 
618 static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe)
619 {
620 	struct xe_memirq *memirq = &gt_to_tile(hwe->gt)->memirq;
621 	struct xe_device *xe = gt_to_xe(hwe->gt);
622 	u8 num_regs;
623 
624 	if (!xe_device_uses_memirq(xe))
625 		return;
626 
627 	regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM |
628 					MI_LRI_LRM_CS_MMIO | MI_LRM_USE_GGTT;
629 	regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr;
630 	regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq);
631 
632 	num_regs = xe_device_has_msix(xe) ? 3 : 2;
633 	regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(num_regs) |
634 				       MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED;
635 	regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr;
636 	regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq, hwe);
637 	regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr;
638 	regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq, hwe);
639 
640 	if (xe_device_has_msix(xe)) {
641 		regs[CTX_CS_INT_VEC_REG] = CS_INT_VEC(0).addr;
642 		/* CTX_CS_INT_VEC_DATA will be set in xe_lrc_init */
643 	}
644 }
645 
646 static int lrc_ring_mi_mode(struct xe_hw_engine *hwe)
647 {
648 	struct xe_device *xe = gt_to_xe(hwe->gt);
649 
650 	if (GRAPHICS_VERx100(xe) >= 1250)
651 		return 0x70;
652 	else
653 		return 0x60;
654 }
655 
656 static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe)
657 {
658 	int x;
659 
660 	x = lrc_ring_mi_mode(hwe);
661 	regs[x + 1] &= ~STOP_RING;
662 	regs[x + 1] |= STOP_RING << 16;
663 }
664 
665 static inline bool xe_lrc_has_indirect_ring_state(struct xe_lrc *lrc)
666 {
667 	return lrc->flags & XE_LRC_FLAG_INDIRECT_RING_STATE;
668 }
669 
670 static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
671 {
672 	return 0;
673 }
674 
675 u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
676 {
677 	return lrc->ring.size;
678 }
679 
680 /* Make the magic macros work */
681 #define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset
682 #define __xe_lrc_regs_offset xe_lrc_regs_offset
683 
684 #define LRC_SEQNO_PPHWSP_OFFSET 512
685 #define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
686 #define LRC_CTX_JOB_TIMESTAMP_OFFSET (LRC_START_SEQNO_PPHWSP_OFFSET + 8)
687 #define LRC_ENGINE_ID_PPHWSP_OFFSET 1024
688 #define LRC_PARALLEL_PPHWSP_OFFSET 2048
689 
690 u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
691 {
692 	return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
693 }
694 
695 static size_t lrc_reg_size(struct xe_device *xe)
696 {
697 	if (GRAPHICS_VERx100(xe) >= 1250)
698 		return 96 * sizeof(u32);
699 	else
700 		return 80 * sizeof(u32);
701 }
702 
703 size_t xe_lrc_skip_size(struct xe_device *xe)
704 {
705 	return LRC_PPHWSP_SIZE + lrc_reg_size(xe);
706 }
707 
708 static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
709 {
710 	/* The seqno is stored in the driver-defined portion of PPHWSP */
711 	return xe_lrc_pphwsp_offset(lrc) + LRC_SEQNO_PPHWSP_OFFSET;
712 }
713 
714 static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
715 {
716 	/* The start seqno is stored in the driver-defined portion of PPHWSP */
717 	return xe_lrc_pphwsp_offset(lrc) + LRC_START_SEQNO_PPHWSP_OFFSET;
718 }
719 
720 static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc)
721 {
722 	/* This is stored in the driver-defined portion of PPHWSP */
723 	return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET;
724 }
725 
726 static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
727 {
728 	/* The parallel is stored in the driver-defined portion of PPHWSP */
729 	return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
730 }
731 
732 static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc)
733 {
734 	return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET;
735 }
736 
737 static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
738 {
739 	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
740 }
741 
742 static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc)
743 {
744 	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32);
745 }
746 
747 static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
748 {
749 	u32 offset = xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE -
750 		     LRC_INDIRECT_RING_STATE_SIZE;
751 
752 	if (lrc->flags & XE_LRC_FLAG_INDIRECT_CTX)
753 		offset -= LRC_INDIRECT_CTX_BO_SIZE;
754 
755 	return offset;
756 }
757 
758 static inline u32 __xe_lrc_indirect_ctx_offset(struct xe_lrc *lrc)
759 {
760 	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE - LRC_INDIRECT_CTX_BO_SIZE;
761 }
762 
763 static inline u32 __xe_lrc_wa_bb_offset(struct xe_lrc *lrc)
764 {
765 	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE;
766 }
767 
768 #define DECL_MAP_ADDR_HELPERS(elem) \
769 static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \
770 { \
771 	struct iosys_map map = lrc->bo->vmap; \
772 \
773 	xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map));  \
774 	iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \
775 	return map; \
776 } \
777 static inline u32 __maybe_unused __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \
778 { \
779 	return xe_bo_ggtt_addr(lrc->bo) + __xe_lrc_##elem##_offset(lrc); \
780 } \
781 
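/*
 * Example expansion (illustrative): DECL_MAP_ADDR_HELPERS(seqno) defines
 * __xe_lrc_seqno_map(), an iosys_map into lrc->bo at
 * __xe_lrc_seqno_offset(), and __xe_lrc_seqno_ggtt_addr(), the GGTT
 * address of that same location.
 */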
782 DECL_MAP_ADDR_HELPERS(ring)
783 DECL_MAP_ADDR_HELPERS(pphwsp)
784 DECL_MAP_ADDR_HELPERS(seqno)
785 DECL_MAP_ADDR_HELPERS(regs)
786 DECL_MAP_ADDR_HELPERS(start_seqno)
787 DECL_MAP_ADDR_HELPERS(ctx_job_timestamp)
788 DECL_MAP_ADDR_HELPERS(ctx_timestamp)
789 DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw)
790 DECL_MAP_ADDR_HELPERS(parallel)
791 DECL_MAP_ADDR_HELPERS(indirect_ring)
792 DECL_MAP_ADDR_HELPERS(engine_id)
793 
794 #undef DECL_MAP_ADDR_HELPERS
795 
796 /**
797  * xe_lrc_ctx_timestamp_ggtt_addr() - Get ctx timestamp GGTT address
798  * @lrc: Pointer to the lrc.
799  *
800  * Returns: ctx timestamp GGTT address
801  */
802 u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc)
803 {
804 	return __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
805 }
806 
807 /**
808  * xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address
809  * @lrc: Pointer to the lrc.
810  *
811  * Returns: ctx timestamp udw GGTT address
812  */
813 u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc)
814 {
815 	return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
816 }
817 
818 /**
819  * xe_lrc_ctx_timestamp() - Read ctx timestamp value
820  * @lrc: Pointer to the lrc.
821  *
822  * Returns: ctx timestamp value
823  */
824 u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
825 {
826 	struct xe_device *xe = lrc_to_xe(lrc);
827 	struct iosys_map map;
828 	u32 ldw, udw = 0;
829 
830 	map = __xe_lrc_ctx_timestamp_map(lrc);
831 	ldw = xe_map_read32(xe, &map);
832 
833 	if (xe->info.has_64bit_timestamp) {
834 		map = __xe_lrc_ctx_timestamp_udw_map(lrc);
835 		udw = xe_map_read32(xe, &map);
836 	}
837 
838 	return (u64)udw << 32 | ldw;
839 }
840 
841 /**
842  * xe_lrc_ctx_job_timestamp_ggtt_addr() - Get ctx job timestamp GGTT address
843  * @lrc: Pointer to the lrc.
844  *
845  * Returns: ctx job timestamp GGTT address
846  */
847 u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc)
848 {
849 	return __xe_lrc_ctx_job_timestamp_ggtt_addr(lrc);
850 }
851 
852 /**
853  * xe_lrc_ctx_job_timestamp() - Read ctx job timestamp value
854  * @lrc: Pointer to the lrc.
855  *
856  * Returns: ctx job timestamp value
857  */
858 u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc)
859 {
860 	struct xe_device *xe = lrc_to_xe(lrc);
861 	struct iosys_map map;
862 
863 	map = __xe_lrc_ctx_job_timestamp_map(lrc);
864 	return xe_map_read32(xe, &map);
865 }
866 
867 u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
868 {
869 	return __xe_lrc_pphwsp_ggtt_addr(lrc);
870 }
871 
872 u32 xe_lrc_indirect_ring_ggtt_addr(struct xe_lrc *lrc)
873 {
874 	if (!xe_lrc_has_indirect_ring_state(lrc))
875 		return 0;
876 
877 	return __xe_lrc_indirect_ring_ggtt_addr(lrc);
878 }
879 
880 static u32 xe_lrc_read_indirect_ctx_reg(struct xe_lrc *lrc, int reg_nr)
881 {
882 	struct xe_device *xe = lrc_to_xe(lrc);
883 	struct iosys_map map;
884 
885 	map = __xe_lrc_indirect_ring_map(lrc);
886 	iosys_map_incr(&map, reg_nr * sizeof(u32));
887 	return xe_map_read32(xe, &map);
888 }
889 
890 static void xe_lrc_write_indirect_ctx_reg(struct xe_lrc *lrc,
891 					  int reg_nr, u32 val)
892 {
893 	struct xe_device *xe = lrc_to_xe(lrc);
894 	struct iosys_map map;
895 
896 	map = __xe_lrc_indirect_ring_map(lrc);
897 	iosys_map_incr(&map, reg_nr * sizeof(u32));
898 	xe_map_write32(xe, &map, val);
899 }
900 
901 u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
902 {
903 	struct xe_device *xe = lrc_to_xe(lrc);
904 	struct iosys_map map;
905 
906 	map = __xe_lrc_regs_map(lrc);
907 	iosys_map_incr(&map, reg_nr * sizeof(u32));
908 	return xe_map_read32(xe, &map);
909 }
910 
911 void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
912 {
913 	struct xe_device *xe = lrc_to_xe(lrc);
914 	struct iosys_map map;
915 
916 	map = __xe_lrc_regs_map(lrc);
917 	iosys_map_incr(&map, reg_nr * sizeof(u32));
918 	xe_map_write32(xe, &map, val);
919 }
920 
921 static void *empty_lrc_data(struct xe_hw_engine *hwe)
922 {
923 	struct xe_gt *gt = hwe->gt;
924 	void *data;
925 	u32 *regs;
926 
927 	data = kzalloc(xe_gt_lrc_size(gt, hwe->class), GFP_KERNEL);
928 	if (!data)
929 		return NULL;
930 
931 	/* 1st page: Per-Process of HW status Page */
932 	regs = data + LRC_PPHWSP_SIZE;
933 	set_offsets(regs, reg_offsets(gt_to_xe(gt), hwe->class), hwe);
934 	set_context_control(regs, hwe);
935 	set_memory_based_intr(regs, hwe);
936 	reset_stop_ring(regs, hwe);
937 	if (xe_gt_has_indirect_ring_state(gt)) {
938 		regs = data + xe_gt_lrc_size(gt, hwe->class) -
939 		       LRC_INDIRECT_RING_STATE_SIZE;
940 		set_offsets(regs, xe2_indirect_ring_state_offsets, hwe);
941 	}
942 
943 	return data;
944 }
945 
946 static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
947 {
948 	u64 desc = xe_vm_pdp4_descriptor(vm, gt_to_tile(lrc->gt));
949 
950 	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
951 	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
952 }
953 
954 static void xe_lrc_finish(struct xe_lrc *lrc)
955 {
956 	xe_hw_fence_ctx_finish(&lrc->fence_ctx);
957 	xe_bo_unpin_map_no_vm(lrc->bo);
958 }
959 
960 /*
961  * wa_bb_setup_utilization() - Write commands to wa bb to assist
962  * in calculating active context run ticks.
963  *
964  * Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the
965  * context, but only gets updated when the context switches out. In order to
966  * check how long a context has been active before it switches out, two things
967  * are required:
968  *
969  * (1) Determine if the context is running:
970  * To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in
971  * the LRC. The value chosen is 1 since 0 is the initial value when the LRC is
972  * initialized. During a query, we just check for this value to determine if the
973  * context is active. If the context switched out, it would overwrite this
974  * location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as
975  * the last part of context restore, so reusing this LRC location will not
976  * clobber anything.
977  *
978  * (2) Calculate the time that the context has been active for:
979  * The CTX_TIMESTAMP ticks only when the context is active. If a context is
980  * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization.
981  * While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific
982  * engine instance. Since we do not know which instance the context is running
983  * on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and
984  * store it in the PPHSWP.
985  */
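/*
 * Sketch of the consumer side (illustrative; the query itself lives
 * elsewhere in the driver): read the CTX_TIMESTAMP slot of the LRC via
 * xe_lrc_ctx_timestamp(); if it still holds CONTEXT_ACTIVE the context has
 * not switched out, so sample the live CTX_TIMESTAMP MMIO of the engine
 * recorded at LRC_ENGINE_ID_PPHWSP_OFFSET instead of using the saved value.
 */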
986 #define CONTEXT_ACTIVE 1ULL
987 static ssize_t setup_utilization_wa(struct xe_lrc *lrc,
988 				    struct xe_hw_engine *hwe,
989 				    u32 *batch,
990 				    size_t max_len)
991 {
992 	u32 *cmd = batch;
993 
994 	if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
995 		return -ENOSPC;
996 
997 	*cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET;
998 	*cmd++ = ENGINE_ID(0).addr;
999 	*cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc);
1000 	*cmd++ = 0;
1001 
1002 	*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
1003 	*cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
1004 	*cmd++ = 0;
1005 	*cmd++ = lower_32_bits(CONTEXT_ACTIVE);
1006 
1007 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp) {
1008 		*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
1009 		*cmd++ = __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
1010 		*cmd++ = 0;
1011 		*cmd++ = upper_32_bits(CONTEXT_ACTIVE);
1012 	}
1013 
1014 	return cmd - batch;
1015 }
1016 
1017 struct bo_setup {
1018 	ssize_t (*setup)(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
1019 			 u32 *batch, size_t max_size);
1020 };
1021 
1022 struct bo_setup_state {
1023 	/* Input: */
1024 	struct xe_lrc		*lrc;
1025 	struct xe_hw_engine	*hwe;
1026 	size_t			max_size;
1027 	size_t                  reserve_dw;
1028 	unsigned int		offset;
1029 	const struct bo_setup	*funcs;
1030 	unsigned int		num_funcs;
1031 
1032 	/* State: */
1033 	u32			*buffer;
1034 	u32			*ptr;
1035 	unsigned int		written;
1036 };
1037 
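/*
 * setup_bo() stages the generated commands either in a temporary kernel
 * buffer (when the LRC BO is mapped as I/O memory, e.g. VRAM) or directly
 * in the CPU vmap; finish_bo() copies a staged buffer into the BO and
 * frees it.
 */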
1038 static int setup_bo(struct bo_setup_state *state)
1039 {
1040 	ssize_t remain;
1041 
1042 	if (state->lrc->bo->vmap.is_iomem) {
1043 		state->buffer = kmalloc(state->max_size, GFP_KERNEL);
1044 		if (!state->buffer)
1045 			return -ENOMEM;
1046 		state->ptr = state->buffer;
1047 	} else {
1048 		state->ptr = state->lrc->bo->vmap.vaddr + state->offset;
1049 		state->buffer = NULL;
1050 	}
1051 
1052 	remain = state->max_size / sizeof(u32);
1053 
1054 	for (size_t i = 0; i < state->num_funcs; i++) {
1055 		ssize_t len = state->funcs[i].setup(state->lrc, state->hwe,
1056 						    state->ptr, remain);
1057 
1058 		remain -= len;
1059 
1060 		/*
1061 		 * Caller has asked for at least reserve_dw to remain unused.
1062 		 */
1063 		if (len < 0 ||
1064 		    xe_gt_WARN_ON(state->lrc->gt, remain < state->reserve_dw))
1065 			goto fail;
1066 
1067 		state->ptr += len;
1068 		state->written += len;
1069 	}
1070 
1071 	return 0;
1072 
1073 fail:
1074 	kfree(state->buffer);
1075 	return -ENOSPC;
1076 }
1077 
1078 static void finish_bo(struct bo_setup_state *state)
1079 {
1080 	if (!state->buffer)
1081 		return;
1082 
1083 	xe_map_memcpy_to(gt_to_xe(state->lrc->gt), &state->lrc->bo->vmap,
1084 			 state->offset, state->buffer,
1085 			 state->written * sizeof(u32));
1086 	kfree(state->buffer);
1087 }
1088 
1089 static int setup_wa_bb(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
1090 {
1091 	static const struct bo_setup funcs[] = {
1092 		{ .setup = setup_utilization_wa },
1093 	};
1094 	struct bo_setup_state state = {
1095 		.lrc = lrc,
1096 		.hwe = hwe,
1097 		.max_size = LRC_WA_BB_SIZE,
1098 		.reserve_dw = 1,
1099 		.offset = __xe_lrc_wa_bb_offset(lrc),
1100 		.funcs = funcs,
1101 		.num_funcs = ARRAY_SIZE(funcs),
1102 	};
1103 	int ret;
1104 
1105 	ret = setup_bo(&state);
1106 	if (ret)
1107 		return ret;
1108 
1109 	*state.ptr++ = MI_BATCH_BUFFER_END;
1110 	state.written++;
1111 
1112 	finish_bo(&state);
1113 
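	/*
	 * Note: the low bit OR'd into the GGTT address below is assumed to
	 * act as the valid/enable bit of BB_PER_CTX_PTR.
	 */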
1114 	xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
1115 			     xe_bo_ggtt_addr(lrc->bo) + state.offset + 1);
1116 
1117 	return 0;
1118 }
1119 
1120 static int
1121 setup_indirect_ctx(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
1122 {
1123 	static struct bo_setup rcs_funcs[] = {
1124 	};
1125 	struct bo_setup_state state = {
1126 		.lrc = lrc,
1127 		.hwe = hwe,
1128 		.max_size = (63 * 64) /* max 63 cachelines */,
1129 		.offset = __xe_lrc_indirect_ctx_offset(lrc),
1130 	};
1131 	int ret;
1132 
1133 	if (!(lrc->flags & XE_LRC_FLAG_INDIRECT_CTX))
1134 		return 0;
1135 
1136 	if (hwe->class == XE_ENGINE_CLASS_RENDER ||
1137 	    hwe->class == XE_ENGINE_CLASS_COMPUTE) {
1138 		state.funcs = rcs_funcs;
1139 		state.num_funcs = ARRAY_SIZE(rcs_funcs);
1140 	}
1141 
1142 	if (xe_gt_WARN_ON(lrc->gt, !state.funcs))
1143 		return 0;
1144 
1145 	ret = setup_bo(&state);
1146 	if (ret)
1147 		return ret;
1148 
1149 	/*
1150 	 * Align to 64B cacheline so there's no garbage at the end for CS to
1151 	 * execute: size for indirect ctx must be a multiple of 64.
1152 	 */
1153 	while (state.written & 0xf) {
1154 		*state.ptr++ = MI_NOOP;
1155 		state.written++;
1156 	}
1157 
1158 	finish_bo(&state);
1159 
1160 	xe_lrc_write_ctx_reg(lrc,
1161 			     CTX_CS_INDIRECT_CTX,
1162 			     (xe_bo_ggtt_addr(lrc->bo) + state.offset) |
1163 			     /* Size in CLs. */
1164 			     (state.written * sizeof(u32) / 64));
1165 	xe_lrc_write_ctx_reg(lrc,
1166 			     CTX_CS_INDIRECT_CTX_OFFSET,
1167 			     CTX_INDIRECT_CTX_OFFSET_DEFAULT);
1168 
1169 	return 0;
1170 }
1171 
1172 static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
1173 		       struct xe_vm *vm, u32 ring_size, u16 msix_vec,
1174 		       u32 init_flags)
1175 {
1176 	struct xe_gt *gt = hwe->gt;
1177 	const u32 lrc_size = xe_gt_lrc_size(gt, hwe->class);
1178 	u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
1179 	struct xe_tile *tile = gt_to_tile(gt);
1180 	struct xe_device *xe = gt_to_xe(gt);
1181 	struct iosys_map map;
1182 	u32 arb_enable;
1183 	u32 bo_flags;
1184 	int err;
1185 
1186 	kref_init(&lrc->refcount);
1187 	lrc->gt = gt;
1188 	lrc->size = lrc_size;
1189 	lrc->flags = 0;
1190 	lrc->ring.size = ring_size;
1191 	lrc->ring.tail = 0;
1192 
1193 	if (gt_engine_needs_indirect_ctx(gt, hwe->class)) {
1194 		lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
1195 		bo_size += LRC_INDIRECT_CTX_BO_SIZE;
1196 	}
1197 
1198 	if (xe_gt_has_indirect_ring_state(gt))
1199 		lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;
1200 
1201 	bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
1202 		   XE_BO_FLAG_GGTT_INVALIDATE;
1203 	if (vm && vm->xef) /* userspace */
1204 		bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE;
1205 
1206 	lrc->bo = xe_bo_create_pin_map(xe, tile, NULL, bo_size,
1207 				       ttm_bo_type_kernel,
1208 				       bo_flags);
1209 	if (IS_ERR(lrc->bo))
1210 		return PTR_ERR(lrc->bo);
1211 
1212 	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
1213 			     hwe->fence_irq, hwe->name);
1214 
1215 	/*
1216 	 * Init Per-Process of HW status Page, LRC / context state to known
1217 	 * values. If there's already a primed default_lrc, just copy it, otherwise
1218 	 * it's the early submission to record the lrc: build a new empty one from
1219 	 * scratch.
1220 	 */
1221 	map = __xe_lrc_pphwsp_map(lrc);
1222 	if (gt->default_lrc[hwe->class]) {
1223 		xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE);	/* PPHWSP */
1224 		xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
1225 				 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
1226 				 lrc_size - LRC_PPHWSP_SIZE);
1227 	} else {
1228 		void *init_data = empty_lrc_data(hwe);
1229 
1230 		if (!init_data) {
1231 			err = -ENOMEM;
1232 			goto err_lrc_finish;
1233 		}
1234 
1235 		xe_map_memcpy_to(xe, &map, 0, init_data, lrc_size);
1236 		kfree(init_data);
1237 	}
1238 
1239 	if (vm) {
1240 		xe_lrc_set_ppgtt(lrc, vm);
1241 
1242 		if (vm->xef)
1243 			xe_drm_client_add_bo(vm->xef->client, lrc->bo);
1244 	}
1245 
1246 	if (xe_device_has_msix(xe)) {
1247 		xe_lrc_write_ctx_reg(lrc, CTX_INT_STATUS_REPORT_PTR,
1248 				     xe_memirq_status_ptr(&tile->memirq, hwe));
1249 		xe_lrc_write_ctx_reg(lrc, CTX_INT_SRC_REPORT_PTR,
1250 				     xe_memirq_source_ptr(&tile->memirq, hwe));
1251 		xe_lrc_write_ctx_reg(lrc, CTX_CS_INT_VEC_DATA, msix_vec << 16 | msix_vec);
1252 	}
1253 
1254 	if (xe_gt_has_indirect_ring_state(gt)) {
1255 		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
1256 				     __xe_lrc_indirect_ring_ggtt_addr(lrc));
1257 
1258 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
1259 					      __xe_lrc_ring_ggtt_addr(lrc));
1260 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START_UDW, 0);
1261 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, 0);
1262 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, lrc->ring.tail);
1263 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_CTL,
1264 					      RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
1265 	} else {
1266 		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
1267 		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, 0);
1268 		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);
1269 		xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
1270 				     RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
1271 	}
1272 
1273 	if (init_flags & XE_LRC_CREATE_RUNALONE)
1274 		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
1275 				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
1276 				     _MASKED_BIT_ENABLE(CTX_CTRL_RUN_ALONE));
1277 
1278 	if (init_flags & XE_LRC_CREATE_PXP)
1279 		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
1280 				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
1281 				     _MASKED_BIT_ENABLE(CTX_CTRL_PXP_ENABLE));
1282 
1283 	lrc->ctx_timestamp = 0;
1284 	xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0);
1285 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
1286 		xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP_UDW, 0);
1287 
1288 	if (xe->info.has_asid && vm)
1289 		xe_lrc_write_ctx_reg(lrc, CTX_ASID, vm->usm.asid);
1290 
1291 	lrc->desc = LRC_VALID;
1292 	lrc->desc |= FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT);
1293 	/* TODO: Priority */
1294 
1295 	/* While this appears to have something about privileged batches or
1296 	 * some such, it really just means PPGTT mode.
1297 	 */
1298 	if (vm)
1299 		lrc->desc |= LRC_PRIVILEGE;
1300 
1301 	if (GRAPHICS_VERx100(xe) < 1250) {
1302 		lrc->desc |= FIELD_PREP(LRC_ENGINE_INSTANCE, hwe->instance);
1303 		lrc->desc |= FIELD_PREP(LRC_ENGINE_CLASS, hwe->class);
1304 	}
1305 
1306 	arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1307 	xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));
1308 
1309 	map = __xe_lrc_seqno_map(lrc);
1310 	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
1311 
1312 	map = __xe_lrc_start_seqno_map(lrc);
1313 	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
1314 
1315 	err = setup_wa_bb(lrc, hwe);
1316 	if (err)
1317 		goto err_lrc_finish;
1318 
1319 	err = setup_indirect_ctx(lrc, hwe);
1320 	if (err)
1321 		goto err_lrc_finish;
1322 
1323 	return 0;
1324 
1325 err_lrc_finish:
1326 	xe_lrc_finish(lrc);
1327 	return err;
1328 }
1329 
1330 /**
1331  * xe_lrc_create - Create a LRC
1332  * @hwe: Hardware Engine
1333  * @vm: The VM (address space)
1334  * @ring_size: LRC ring size
1335  * @msix_vec: MSI-X interrupt vector (for platforms that support it)
1336  * @flags: LRC initialization flags
1337  *
1338  * Allocate and initialize the Logical Ring Context (LRC).
1339  *
1340  * Return pointer to created LRC upon success and an error pointer
1341  * upon failure.
1342  */
1343 struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
1344 			     u32 ring_size, u16 msix_vec, u32 flags)
1345 {
1346 	struct xe_lrc *lrc;
1347 	int err;
1348 
1349 	lrc = kzalloc(sizeof(*lrc), GFP_KERNEL);
1350 	if (!lrc)
1351 		return ERR_PTR(-ENOMEM);
1352 
1353 	err = xe_lrc_init(lrc, hwe, vm, ring_size, msix_vec, flags);
1354 	if (err) {
1355 		kfree(lrc);
1356 		return ERR_PTR(err);
1357 	}
1358 
1359 	return lrc;
1360 }
1361 
1362 /**
1363  * xe_lrc_destroy - Destroy the LRC
1364  * @ref: reference to LRC
1365  *
1366  * Called when ref == 0, release resources held by the Logical Ring Context
1367  * (LRC) and free the LRC memory.
1368  */
1369 void xe_lrc_destroy(struct kref *ref)
1370 {
1371 	struct xe_lrc *lrc = container_of(ref, struct xe_lrc, refcount);
1372 
1373 	xe_lrc_finish(lrc);
1374 	kfree(lrc);
1375 }
1376 
1377 void xe_lrc_set_ring_tail(struct xe_lrc *lrc, u32 tail)
1378 {
1379 	if (xe_lrc_has_indirect_ring_state(lrc))
1380 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, tail);
1381 	else
1382 		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, tail);
1383 }
1384 
1385 u32 xe_lrc_ring_tail(struct xe_lrc *lrc)
1386 {
1387 	if (xe_lrc_has_indirect_ring_state(lrc))
1388 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL) & TAIL_ADDR;
1389 	else
1390 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL) & TAIL_ADDR;
1391 }
1392 
1393 static u32 xe_lrc_ring_start(struct xe_lrc *lrc)
1394 {
1395 	if (xe_lrc_has_indirect_ring_state(lrc))
1396 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START);
1397 	else
1398 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_START);
1399 }
1400 
1401 void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
1402 {
1403 	if (xe_lrc_has_indirect_ring_state(lrc))
1404 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, head);
1405 	else
1406 		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
1407 }
1408 
1409 u32 xe_lrc_ring_head(struct xe_lrc *lrc)
1410 {
1411 	if (xe_lrc_has_indirect_ring_state(lrc))
1412 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD) & HEAD_ADDR;
1413 	else
1414 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
1415 }
1416 
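/*
 * Bytes that can still be written between the software tail and the
 * hardware head of the power-of-two sized ring; evaluates to the full
 * ring size when head == tail.
 */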
1417 u32 xe_lrc_ring_space(struct xe_lrc *lrc)
1418 {
1419 	const u32 head = xe_lrc_ring_head(lrc);
1420 	const u32 tail = lrc->ring.tail;
1421 	const u32 size = lrc->ring.size;
1422 
1423 	return ((head - tail - 1) & (size - 1)) + 1;
1424 }
1425 
1426 static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
1427 				const void *data, size_t size)
1428 {
1429 	struct xe_device *xe = lrc_to_xe(lrc);
1430 
1431 	iosys_map_incr(&ring, lrc->ring.tail);
1432 	xe_map_memcpy_to(xe, &ring, 0, data, size);
1433 	lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
1434 }
1435 
1436 void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
1437 {
1438 	struct xe_device *xe = lrc_to_xe(lrc);
1439 	struct iosys_map ring;
1440 	u32 rhs;
1441 	size_t aligned_size;
1442 
1443 	xe_assert(xe, IS_ALIGNED(size, 4));
1444 	aligned_size = ALIGN(size, 8);
1445 
1446 	ring = __xe_lrc_ring_map(lrc);
1447 
1448 	xe_assert(xe, lrc->ring.tail < lrc->ring.size);
1449 	rhs = lrc->ring.size - lrc->ring.tail;
1450 	if (size > rhs) {
1451 		__xe_lrc_write_ring(lrc, ring, data, rhs);
1452 		__xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
1453 	} else {
1454 		__xe_lrc_write_ring(lrc, ring, data, size);
1455 	}
1456 
1457 	if (aligned_size > size) {
1458 		u32 noop = MI_NOOP;
1459 
1460 		__xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
1461 	}
1462 }
1463 
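/*
 * The descriptor combines the flags prepared in xe_lrc_init() (LRC_VALID,
 * the legacy 64b addressing mode, LRC_PRIVILEGE when a PPGTT VM is
 * attached and, below graphics IP 12.50, the engine class/instance fields)
 * with the PPHWSP GGTT address from xe_lrc_ggtt_addr().
 */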
1464 u64 xe_lrc_descriptor(struct xe_lrc *lrc)
1465 {
1466 	return lrc->desc | xe_lrc_ggtt_addr(lrc);
1467 }
1468 
1469 u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
1470 {
1471 	return __xe_lrc_seqno_ggtt_addr(lrc);
1472 }
1473 
1474 /**
1475  * xe_lrc_alloc_seqno_fence() - Allocate an lrc seqno fence.
1476  *
1477  * Allocate but don't initialize an lrc seqno fence.
1478  *
1479  * Return: Pointer to the allocated fence or
1480  * negative error pointer on error.
1481  */
1482 struct dma_fence *xe_lrc_alloc_seqno_fence(void)
1483 {
1484 	return xe_hw_fence_alloc();
1485 }
1486 
1487 /**
1488  * xe_lrc_free_seqno_fence() - Free an lrc seqno fence.
1489  * @fence: Pointer to the fence to free.
1490  *
1491  * Frees an lrc seqno fence that hasn't yet been
1492  * initialized.
1493  */
1494 void xe_lrc_free_seqno_fence(struct dma_fence *fence)
1495 {
1496 	xe_hw_fence_free(fence);
1497 }
1498 
1499 /**
1500  * xe_lrc_init_seqno_fence() - Initialize an lrc seqno fence.
1501  * @lrc: Pointer to the lrc.
1502  * @fence: Pointer to the fence to initialize.
1503  *
1504  * Initializes a pre-allocated lrc seqno fence.
1505  * After initialization, the fence is subject to normal
1506  * dma-fence refcounting.
1507  */
1508 void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence)
1509 {
1510 	xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc));
1511 }
1512 
1513 s32 xe_lrc_seqno(struct xe_lrc *lrc)
1514 {
1515 	struct iosys_map map = __xe_lrc_seqno_map(lrc);
1516 
1517 	return xe_map_read32(lrc_to_xe(lrc), &map);
1518 }
1519 
1520 s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
1521 {
1522 	struct iosys_map map = __xe_lrc_start_seqno_map(lrc);
1523 
1524 	return xe_map_read32(lrc_to_xe(lrc), &map);
1525 }
1526 
1527 u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
1528 {
1529 	return __xe_lrc_start_seqno_ggtt_addr(lrc);
1530 }
1531 
1532 u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
1533 {
1534 	return __xe_lrc_parallel_ggtt_addr(lrc);
1535 }
1536 
1537 struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
1538 {
1539 	return __xe_lrc_parallel_map(lrc);
1540 }
1541 
1542 /**
1543  * xe_lrc_engine_id() - Read engine id value
1544  * @lrc: Pointer to the lrc.
1545  *
1546  * Returns: engine id value
1547  */
1548 static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
1549 {
1550 	struct xe_device *xe = lrc_to_xe(lrc);
1551 	struct iosys_map map;
1552 
1553 	map = __xe_lrc_engine_id_map(lrc);
1554 	return xe_map_read32(xe, &map);
1555 }
1556 
1557 static int instr_dw(u32 cmd_header)
1558 {
1559 	/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
1560 	if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
1561 	    GFXPIPE_SINGLE_DW_CMD(0, 0))
1562 		return 1;
1563 
1564 	/* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
1565 	if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
1566 		return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;
1567 
1568 	/* Most instructions have the # of dwords (minus 2) in 7:0 */
1569 	return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
1570 }
1571 
1572 static int dump_mi_command(struct drm_printer *p,
1573 			   struct xe_gt *gt,
1574 			   u32 *dw,
1575 			   int remaining_dw)
1576 {
1577 	u32 inst_header = *dw;
1578 	u32 numdw = instr_dw(inst_header);
1579 	u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
1580 	int num_noop;
1581 
1582 	/* First check for commands that don't have/use a '# DW' field */
1583 	switch (inst_header & MI_OPCODE) {
1584 	case MI_NOOP:
1585 		num_noop = 1;
1586 		while (num_noop < remaining_dw &&
1587 		       (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
1588 			num_noop++;
1589 		drm_printf(p, "[%#010x] MI_NOOP (%d dwords)\n", inst_header, num_noop);
1590 		return num_noop;
1591 
1592 	case MI_TOPOLOGY_FILTER:
1593 		drm_printf(p, "[%#010x] MI_TOPOLOGY_FILTER\n", inst_header);
1594 		return 1;
1595 
1596 	case MI_BATCH_BUFFER_END:
1597 		drm_printf(p, "[%#010x] MI_BATCH_BUFFER_END\n", inst_header);
1598 		/* Return 'remaining_dw' to consume the rest of the LRC */
1599 		return remaining_dw;
1600 	}
1601 
1602 	/*
1603 	 * Any remaining commands include a # of dwords.  We should make sure
1604 	 * it doesn't exceed the remaining size of the LRC.
1605 	 */
1606 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
1607 		numdw = remaining_dw;
1608 
1609 	switch (inst_header & MI_OPCODE) {
1610 	case MI_LOAD_REGISTER_IMM:
1611 		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
1612 			   inst_header, (numdw - 1) / 2);
1613 		for (int i = 1; i < numdw; i += 2)
1614 			drm_printf(p, " - %#6x = %#010x\n", dw[i], dw[i + 1]);
1615 		return numdw;
1616 
1617 	case MI_LOAD_REGISTER_MEM & MI_OPCODE:
1618 		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_MEM: %s%s\n",
1619 			   inst_header,
1620 			   dw[0] & MI_LRI_LRM_CS_MMIO ? "CS_MMIO " : "",
1621 			   dw[0] & MI_LRM_USE_GGTT ? "USE_GGTT " : "");
1622 		if (numdw == 4)
1623 			drm_printf(p, " - %#6x = %#010llx\n",
1624 				   dw[1], ((u64)(dw[3]) << 32 | (u64)(dw[2])));
1625 		else
1626 			drm_printf(p, " - %*ph (%s)\n",
1627 				   (int)sizeof(u32) * (numdw - 1), dw + 1,
1628 				   numdw < 4 ? "truncated" : "malformed");
1629 		return numdw;
1630 
1631 	case MI_FORCE_WAKEUP:
1632 		drm_printf(p, "[%#010x] MI_FORCE_WAKEUP\n", inst_header);
1633 		return numdw;
1634 
1635 	default:
1636 		drm_printf(p, "[%#010x] unknown MI opcode %#x, likely %d dwords\n",
1637 			   inst_header, opcode, numdw);
1638 		return numdw;
1639 	}
1640 }
1641 
1642 static int dump_gfxpipe_command(struct drm_printer *p,
1643 				struct xe_gt *gt,
1644 				u32 *dw,
1645 				int remaining_dw)
1646 {
1647 	u32 numdw = instr_dw(*dw);
1648 	u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
1649 	u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
1650 	u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);
1651 
1652 	/*
1653 	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
1654 	 * remaining size of the LRC.
1655 	 */
1656 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
1657 		numdw = remaining_dw;
1658 
1659 	switch (*dw & GFXPIPE_MATCH_MASK) {
1660 #define MATCH(cmd) \
1661 	case cmd: \
1662 		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
1663 		return numdw
1664 #define MATCH3D(cmd) \
1665 	case CMD_##cmd: \
1666 		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
1667 		return numdw
1668 
1669 	MATCH(STATE_BASE_ADDRESS);
1670 	MATCH(STATE_SIP);
1671 	MATCH(GPGPU_CSR_BASE_ADDRESS);
1672 	MATCH(STATE_COMPUTE_MODE);
1673 	MATCH3D(3DSTATE_BTD);
1674 	MATCH(STATE_SYSTEM_MEM_FENCE_ADDRESS);
1675 	MATCH(STATE_CONTEXT_DATA_BASE_ADDRESS);
1676 
1677 	MATCH3D(3DSTATE_VF_STATISTICS);
1678 
1679 	MATCH(PIPELINE_SELECT);
1680 
1681 	MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST);
1682 	MATCH3D(3DSTATE_CLEAR_PARAMS);
1683 	MATCH3D(3DSTATE_DEPTH_BUFFER);
1684 	MATCH3D(3DSTATE_STENCIL_BUFFER);
1685 	MATCH3D(3DSTATE_HIER_DEPTH_BUFFER);
1686 	MATCH3D(3DSTATE_VERTEX_BUFFERS);
1687 	MATCH3D(3DSTATE_VERTEX_ELEMENTS);
1688 	MATCH3D(3DSTATE_INDEX_BUFFER);
1689 	MATCH3D(3DSTATE_VF);
1690 	MATCH3D(3DSTATE_MULTISAMPLE);
1691 	MATCH3D(3DSTATE_CC_STATE_POINTERS);
1692 	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS);
1693 	MATCH3D(3DSTATE_VS);
1694 	MATCH3D(3DSTATE_GS);
1695 	MATCH3D(3DSTATE_CLIP);
1696 	MATCH3D(3DSTATE_SF);
1697 	MATCH3D(3DSTATE_WM);
1698 	MATCH3D(3DSTATE_CONSTANT_VS);
1699 	MATCH3D(3DSTATE_CONSTANT_GS);
1700 	MATCH3D(3DSTATE_CONSTANT_PS);
1701 	MATCH3D(3DSTATE_SAMPLE_MASK);
1702 	MATCH3D(3DSTATE_CONSTANT_HS);
1703 	MATCH3D(3DSTATE_CONSTANT_DS);
1704 	MATCH3D(3DSTATE_HS);
1705 	MATCH3D(3DSTATE_TE);
1706 	MATCH3D(3DSTATE_DS);
1707 	MATCH3D(3DSTATE_STREAMOUT);
1708 	MATCH3D(3DSTATE_SBE);
1709 	MATCH3D(3DSTATE_PS);
1710 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
1711 	MATCH3D(3DSTATE_CPS_POINTERS);
1712 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC);
1713 	MATCH3D(3DSTATE_BLEND_STATE_POINTERS);
1714 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS);
1715 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS);
1716 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS);
1717 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS);
1718 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS);
1719 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS);
1720 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS);
1721 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS);
1722 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS);
1723 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS);
1724 	MATCH3D(3DSTATE_VF_INSTANCING);
1725 	MATCH3D(3DSTATE_VF_SGVS);
1726 	MATCH3D(3DSTATE_VF_TOPOLOGY);
1727 	MATCH3D(3DSTATE_WM_CHROMAKEY);
1728 	MATCH3D(3DSTATE_PS_BLEND);
1729 	MATCH3D(3DSTATE_WM_DEPTH_STENCIL);
1730 	MATCH3D(3DSTATE_PS_EXTRA);
1731 	MATCH3D(3DSTATE_RASTER);
1732 	MATCH3D(3DSTATE_SBE_SWIZ);
1733 	MATCH3D(3DSTATE_WM_HZ_OP);
1734 	MATCH3D(3DSTATE_VF_COMPONENT_PACKING);
1735 	MATCH3D(3DSTATE_VF_SGVS_2);
1736 	MATCH3D(3DSTATE_VFG);
1737 	MATCH3D(3DSTATE_URB_ALLOC_VS);
1738 	MATCH3D(3DSTATE_URB_ALLOC_HS);
1739 	MATCH3D(3DSTATE_URB_ALLOC_DS);
1740 	MATCH3D(3DSTATE_URB_ALLOC_GS);
1741 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_0);
1742 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_1);
1743 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_2);
1744 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_3);
1745 	MATCH3D(3DSTATE_PRIMITIVE_REPLICATION);
1746 	MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO);
1747 	MATCH3D(3DSTATE_AMFS);
1748 	MATCH3D(3DSTATE_DEPTH_BOUNDS);
1749 	MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS);
1750 	MATCH3D(3DSTATE_CONSTANT_TS_POINTER);
1751 	MATCH3D(3DSTATE_MESH_CONTROL);
1752 	MATCH3D(3DSTATE_MESH_DISTRIB);
1753 	MATCH3D(3DSTATE_TASK_REDISTRIB);
1754 	MATCH3D(3DSTATE_MESH_SHADER);
1755 	MATCH3D(3DSTATE_MESH_SHADER_DATA);
1756 	MATCH3D(3DSTATE_TASK_CONTROL);
1757 	MATCH3D(3DSTATE_TASK_SHADER);
1758 	MATCH3D(3DSTATE_TASK_SHADER_DATA);
1759 	MATCH3D(3DSTATE_URB_ALLOC_MESH);
1760 	MATCH3D(3DSTATE_URB_ALLOC_TASK);
1761 	MATCH3D(3DSTATE_CLIP_MESH);
1762 	MATCH3D(3DSTATE_SBE_MESH);
1763 	MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER);
1764 	MATCH3D(3DSTATE_COARSE_PIXEL);
1765 
1766 	MATCH3D(3DSTATE_DRAWING_RECTANGLE);
1767 	MATCH3D(3DSTATE_CHROMA_KEY);
1768 	MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET);
1769 	MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN);
1770 	MATCH3D(3DSTATE_LINE_STIPPLE);
1771 	MATCH3D(3DSTATE_AA_LINE_PARAMETERS);
1772 	MATCH3D(3DSTATE_MONOFILTER_SIZE);
1773 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS);
1774 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS);
1775 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS);
1776 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS);
1777 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS);
1778 	MATCH3D(3DSTATE_SO_DECL_LIST);
1779 	MATCH3D(3DSTATE_SO_BUFFER);
1780 	MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC);
1781 	MATCH3D(3DSTATE_SAMPLE_PATTERN);
1782 	MATCH3D(3DSTATE_3D_MODE);
1783 	MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE);
1784 	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS);
1785 	MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO);
1786 
1787 	default:
1788 		drm_printf(p, "[%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n",
1789 			   *dw, pipeline, opcode, subopcode, numdw);
1790 		return numdw;
1791 	}
1792 }
1793 
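/*
 * GFX_STATE is a separate command type from GFXPIPE.  Only STATE_WRITE_INLINE
 * is currently decoded by name; anything else falls through to the generic
 * "unknown" print in the function below.
 */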
1794 static int dump_gfx_state_command(struct drm_printer *p,
1795 				  struct xe_gt *gt,
1796 				  u32 *dw,
1797 				  int remaining_dw)
1798 {
1799 	u32 numdw = instr_dw(*dw);
1800 	u32 opcode = REG_FIELD_GET(GFX_STATE_OPCODE, *dw);
1801 
1802 	/*
1803 	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
1804 	 * remaining size of the LRC.
1805 	 */
1806 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
1807 		numdw = remaining_dw;
1808 
1809 	switch (*dw & (XE_INSTR_GFX_STATE | GFX_STATE_OPCODE)) {
1810 	MATCH(STATE_WRITE_INLINE);
1811 
1812 	default:
1813 		drm_printf(p, "[%#010x] unknown GFX_STATE command (opcode=%#x), likely %d dwords\n",
1814 			   *dw, opcode, numdw);
1815 		return numdw;
1816 	}
1817 }
1818 
1819 void xe_lrc_dump_default(struct drm_printer *p,
1820 			 struct xe_gt *gt,
1821 			 enum xe_engine_class hwe_class)
1822 {
1823 	u32 *dw;
1824 	int remaining_dw, num_dw;
1825 
1826 	if (!gt->default_lrc[hwe_class]) {
1827 		drm_printf(p, "No default LRC for class %d\n", hwe_class);
1828 		return;
1829 	}
1830 
1831 	/*
1832 	 * Skip the beginning of the LRC since it contains the per-process
1833 	 * hardware status page.
1834 	 */
1835 	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
1836 	remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;
1837 
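	/*
	 * The image is a stream of MI, GFXPIPE and GFX_STATE instructions.
	 * Each dump helper prints a single instruction and returns the number
	 * of dwords it spans, which is how far the walk advances to reach the
	 * next instruction header.
	 */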
1838 	while (remaining_dw > 0) {
1839 		if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
1840 			num_dw = dump_mi_command(p, gt, dw, remaining_dw);
1841 		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
1842 			num_dw = dump_gfxpipe_command(p, gt, dw, remaining_dw);
1843 		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFX_STATE) {
1844 			num_dw = dump_gfx_state_command(p, gt, dw, remaining_dw);
1845 		} else {
1846 			num_dw = min(instr_dw(*dw), remaining_dw);
1847 			drm_printf(p, "[%#010x] Unknown instruction of type %#x, likely %d dwords\n",
1848 				   *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw),
1849 				   num_dw);
1850 		}
1851 
1852 		dw += num_dw;
1853 		remaining_dw -= num_dw;
1854 	}
1855 }
1856 
1857 struct instr_state {
1858 	u32 instr;
1859 	u16 num_dw;
1860 };
1861 
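/*
 * SVG state instructions and the number of dwords each occupies in the
 * context image.  Consumed by xe_lrc_emit_hwe_state_instructions() below when
 * Wa_14019789679 applies.
 */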
1862 static const struct instr_state xe_hpg_svg_state[] = {
1863 	{ .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 },
1864 	{ .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 },
1865 	{ .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 },
1866 	{ .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 },
1867 	{ .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 },
1868 	{ .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 },
1869 	{ .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 },
1870 	{ .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 },
1871 	{ .instr = CMD_3DSTATE_VS, .num_dw = 9 },
1872 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 },
1873 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 },
1874 	{ .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 },
1875 	{ .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 },
1876 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 },
1877 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 },
1878 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 },
1879 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 },
1880 	{ .instr = CMD_3DSTATE_CLIP, .num_dw = 4 },
1881 	{ .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 },
1882 	{ .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 },
1883 	{ .instr = CMD_3DSTATE_SF, .num_dw = 4 },
1884 	{ .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 },
1885 	{ .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 },
1886 	{ .instr = CMD_3DSTATE_RASTER, .num_dw = 5 },
1887 	{ .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 },
1888 	{ .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 },
1889 	{ .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 },
1890 	{ .instr = CMD_3DSTATE_HS, .num_dw = 9 },
1891 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 },
1892 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 },
1893 	{ .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 },
1894 	{ .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 },
1895 	{ .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 },
1896 	{ .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 },
1897 	{ .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 },
1898 	{ .instr = CMD_3DSTATE_TE, .num_dw = 5 },
1899 	{ .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 },
1900 	{ .instr = CMD_3DSTATE_DS, .num_dw = 11 },
1901 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
1902 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
1903 	{ .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
1904 	{ .instr = CMD_3DSTATE_GS, .num_dw = 10 },
1905 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
1906 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
1907 	{ .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
1908 	{ .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
1909 	{ .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
1910 	{ .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
1911 	{ .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
1912 	{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
1913 };
1914 
1915 u32 *xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, u32 *cs)
1916 {
1917 	struct xe_gt *gt = q->hwe->gt;
1918 	struct xe_device *xe = gt_to_xe(gt);
1919 	const struct instr_state *state_table = NULL;
1920 	int state_table_size = 0;
1921 
1922 	/*
1923 	 * Wa_14019789679
1924 	 *
1925 	 * If the driver doesn't explicitly emit the SVG instructions while
1926 	 * setting up the default LRC, the context switch will write 0's
1927 	 * (noops) into the LRC memory rather than the expected instruction
1928 	 * headers.  Application contexts start out as a copy of the default
1929 	 * LRC, and if they also do not emit specific settings for some SVG
1930 	 * state, then on context restore they'll unintentionally inherit
1931 	 * whatever state setting the previous context had programmed into the
1932 	 * hardware (i.e., the lack of a 3DSTATE_* instruction in the LRC will
1933 	 * prevent the hardware from resetting that state back to any specific
1934 	 * value).
1935 	 *
1936 	 * The official workaround only requires emitting 3DSTATE_MESH_CONTROL
1937 	 * since that's a specific state setting that can easily cause GPU
1938 	 * hangs if unintentionally inherited.  However, to be safe we'll
1939 	 * continue to emit all of the SVG state since it's best not to leak
1940 	 * any of the state between contexts, even if that leakage is harmless.
1941 	 */
1942 	if (XE_WA(gt, 14019789679) && q->hwe->class == XE_ENGINE_CLASS_RENDER) {
1943 		state_table = xe_hpg_svg_state;
1944 		state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
1945 	}
1946 
1947 	if (!state_table) {
1948 		xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
1949 			  GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
1950 		return cs;
1951 	}
1952 
1953 	for (int i = 0; i < state_table_size; i++) {
1954 		u32 instr = state_table[i].instr;
1955 		u16 num_dw = state_table[i].num_dw;
1956 		bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);
1957 
1958 		xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
1959 		xe_gt_assert(gt, num_dw != 0);
1960 		xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));
1961 
1962 		/*
1963 		 * Xe2's SVG context is the same as the one on DG2 / MTL
1964 		 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
1965 		 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
1966 		 * Just make the replacement here rather than defining a
1967 		 * whole separate table for the single trivial change.
1968 		 */
1969 		if (GRAPHICS_VER(xe) >= 20 &&
1970 		    instr == CMD_3DSTATE_DRAWING_RECTANGLE)
1971 			instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;
1972 
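		/*
		 * Only the instruction header is programmed here; the payload
		 * dwords are skipped over by the "cs += num_dw" below.
		 * Multi-dword GFXPIPE headers encode (total dwords - 2) in
		 * their length field, e.g. the 11-dword 3DSTATE_CONSTANT_VS
		 * entry above becomes its opcode OR'd with 9.
		 */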
1973 		*cs = instr;
1974 		if (!is_single_dw)
1975 			*cs |= (num_dw - 2);
1976 
1977 		cs += num_dw;
1978 	}
1979 
1980 	return cs;
1981 }
1982 
1983 struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
1984 {
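	/*
	 * Capture is kept non-blocking (GFP_NOWAIT) since it may run from a
	 * path that cannot sleep; the heavyweight copy of the LRC contents is
	 * deferred to xe_lrc_snapshot_capture_delayed().
	 */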
1985 	struct xe_lrc_snapshot *snapshot = kmalloc(sizeof(*snapshot), GFP_NOWAIT);
1986 
1987 	if (!snapshot)
1988 		return NULL;
1989 
1990 	snapshot->context_desc = xe_lrc_ggtt_addr(lrc);
1991 	snapshot->ring_addr = __xe_lrc_ring_ggtt_addr(lrc);
1992 	snapshot->indirect_context_desc = xe_lrc_indirect_ring_ggtt_addr(lrc);
1993 	snapshot->head = xe_lrc_ring_head(lrc);
1994 	snapshot->tail.internal = lrc->ring.tail;
1995 	snapshot->tail.memory = xe_lrc_ring_tail(lrc);
1996 	snapshot->start = xe_lrc_ring_start(lrc);
1997 	snapshot->start_seqno = xe_lrc_start_seqno(lrc);
1998 	snapshot->seqno = xe_lrc_seqno(lrc);
1999 	snapshot->lrc_bo = xe_bo_get(lrc->bo);
2000 	snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
2001 	snapshot->lrc_size = lrc->size;
2002 	snapshot->lrc_snapshot = NULL;
2003 	snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc));
2004 	snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc);
2005 	return snapshot;
2006 }
2007 
2008 void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot)
2009 {
2010 	struct xe_bo *bo;
2011 	struct iosys_map src;
2012 
2013 	if (!snapshot)
2014 		return;
2015 
2016 	bo = snapshot->lrc_bo;
2017 	snapshot->lrc_bo = NULL;
2018 
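	/*
	 * The deferred copy is allowed to sleep, so a regular GFP_KERNEL
	 * allocation is fine here and the BO can be locked and vmapped for
	 * the memcpy out of the LRC.
	 */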
2019 	snapshot->lrc_snapshot = kvmalloc(snapshot->lrc_size, GFP_KERNEL);
2020 	if (!snapshot->lrc_snapshot)
2021 		goto put_bo;
2022 
2023 	xe_bo_lock(bo, false);
2024 	if (!ttm_bo_vmap(&bo->ttm, &src)) {
2025 		xe_map_memcpy_from(xe_bo_device(bo),
2026 				   snapshot->lrc_snapshot, &src, snapshot->lrc_offset,
2027 				   snapshot->lrc_size);
2028 		ttm_bo_vunmap(&bo->ttm, &src);
2029 	} else {
2030 		kvfree(snapshot->lrc_snapshot);
2031 		snapshot->lrc_snapshot = NULL;
2032 	}
2033 	xe_bo_unlock(bo);
2034 put_bo:
2035 	xe_bo_put(bo);
2036 }
2037 
2038 void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer *p)
2039 {
2040 	unsigned long i;
2041 
2042 	if (!snapshot)
2043 		return;
2044 
2045 	drm_printf(p, "\tHW Context Desc: 0x%08x\n", snapshot->context_desc);
2046 	drm_printf(p, "\tHW Ring address: 0x%08x\n",
2047 		   snapshot->ring_addr);
2048 	drm_printf(p, "\tHW Indirect Ring State: 0x%08x\n",
2049 		   snapshot->indirect_context_desc);
2050 	drm_printf(p, "\tLRC Head: (memory) %u\n", snapshot->head);
2051 	drm_printf(p, "\tLRC Tail: (internal) %u, (memory) %u\n",
2052 		   snapshot->tail.internal, snapshot->tail.memory);
2053 	drm_printf(p, "\tRing start: (memory) 0x%08x\n", snapshot->start);
2054 	drm_printf(p, "\tStart seqno: (memory) %d\n", snapshot->start_seqno);
2055 	drm_printf(p, "\tSeqno: (memory) %d\n", snapshot->seqno);
2056 	drm_printf(p, "\tTimestamp: 0x%08x\n", snapshot->ctx_timestamp);
2057 	drm_printf(p, "\tJob Timestamp: 0x%08x\n", snapshot->ctx_job_timestamp);
2058 
2059 	if (!snapshot->lrc_snapshot)
2060 		return;
2061 
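	/*
	 * The PPHWSP and the context image are emitted as ascii85 so the
	 * snapshot stays printable text; decode the streams to recover the
	 * raw dwords.
	 */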
2062 	drm_printf(p, "\t[HWSP].length: 0x%x\n", LRC_PPHWSP_SIZE);
2063 	drm_puts(p, "\t[HWSP].data: ");
2064 	for (i = 0; i < LRC_PPHWSP_SIZE; i += sizeof(u32)) {
2065 		u32 *val = snapshot->lrc_snapshot + i;
2066 		char dumped[ASCII85_BUFSZ];
2067 
2068 		drm_puts(p, ascii85_encode(*val, dumped));
2069 	}
2070 
2071 	drm_printf(p, "\n\t[HWCTX].length: 0x%lx\n", snapshot->lrc_size - LRC_PPHWSP_SIZE);
2072 	drm_puts(p, "\t[HWCTX].data: ");
2073 	for (; i < snapshot->lrc_size; i += sizeof(u32)) {
2074 		u32 *val = snapshot->lrc_snapshot + i;
2075 		char dumped[ASCII85_BUFSZ];
2076 
2077 		drm_puts(p, ascii85_encode(*val, dumped));
2078 	}
2079 	drm_puts(p, "\n");
2080 }
2081 
2082 void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
2083 {
2084 	if (!snapshot)
2085 		return;
2086 
2087 	kvfree(snapshot->lrc_snapshot);
2088 	if (snapshot->lrc_bo)
2089 		xe_bo_put(snapshot->lrc_bo);
2090 
2091 	kfree(snapshot);
2092 }
2093 
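/*
 * Read the live RING_CTX_TIMESTAMP of the engine identified by @engine_id
 * (class/instance fields decoded below).  Returns -1 if the engine lookup
 * fails, in which case the caller skips the register-based update.
 */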
2094 static int get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id, u64 *reg_ctx_ts)
2095 {
2096 	u16 class = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id);
2097 	u16 instance = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id);
2098 	struct xe_hw_engine *hwe;
2099 	u64 val;
2100 
2101 	hwe = xe_gt_hw_engine(lrc->gt, class, instance, false);
2102 	if (xe_gt_WARN_ONCE(lrc->gt, !hwe || xe_hw_engine_is_reserved(hwe),
2103 			    "Unexpected engine class:instance %d:%d for context utilization\n",
2104 			    class, instance))
2105 		return -1;
2106 
2107 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
2108 		val = xe_mmio_read64_2x32(&hwe->gt->mmio,
2109 					  RING_CTX_TIMESTAMP(hwe->mmio_base));
2110 	else
2111 		val = xe_mmio_read32(&hwe->gt->mmio,
2112 				     RING_CTX_TIMESTAMP(hwe->mmio_base));
2113 
2114 	*reg_ctx_ts = val;
2115 
2116 	return 0;
2117 }
2118 
2119 /**
2120  * xe_lrc_update_timestamp() - Update ctx timestamp
2121  * @lrc: Pointer to the lrc.
2122  * @old_ts: Old timestamp value
2123  *
2124  * Populate @old_ts with the current saved ctx timestamp, read the new ctx
2125  * timestamp and update the saved value. With support for active contexts, the
2126  * calculation may be slightly racy, so follow a read-again logic to ensure the
2127  * context is still active before returning the right timestamp.
2128  *
2129  * Returns: New ctx timestamp value
2130  */
2131 u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts)
2132 {
2133 	u64 lrc_ts, reg_ts;
2134 	u32 engine_id;
2135 
2136 	*old_ts = lrc->ctx_timestamp;
2137 
2138 	lrc_ts = xe_lrc_ctx_timestamp(lrc);
2139 	/* CTX_TIMESTAMP mmio read is invalid on VF, so return the LRC value */
2140 	if (IS_SRIOV_VF(lrc_to_xe(lrc))) {
2141 		lrc->ctx_timestamp = lrc_ts;
2142 		goto done;
2143 	}
2144 
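	/*
	 * CONTEXT_ACTIVE in the saved slot means the context is currently
	 * running, so the up-to-date value has to come from the engine's
	 * RING_CTX_TIMESTAMP register rather than from the LRC.
	 */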
2145 	if (lrc_ts == CONTEXT_ACTIVE) {
2146 		engine_id = xe_lrc_engine_id(lrc);
2147 		if (!get_ctx_timestamp(lrc, engine_id, &reg_ts))
2148 			lrc->ctx_timestamp = reg_ts;
2149 
2150 		/* read lrc again to ensure context is still active */
2151 		lrc_ts = xe_lrc_ctx_timestamp(lrc);
2152 	}
2153 
2154 	/*
2155 	 * If the context switched out, just use lrc_ts. This must be a separate if
2156 	 * (not an else), so a switch-out seen by the re-read above is still handled.
2157 	 */
2158 	if (lrc_ts != CONTEXT_ACTIVE)
2159 		lrc->ctx_timestamp = lrc_ts;
2160 
2161 done:
2162 	trace_xe_lrc_update_timestamp(lrc, *old_ts);
2163 
2164 	return lrc->ctx_timestamp;
2165 }
2166 
2167 /**
2168  * xe_lrc_ring_is_idle() - LRC is idle
2169  * @lrc: Pointer to the lrc.
2170  *
2171  * Compare LRC ring head and tail to determine if idle.
2172  *
2173  * Return: True is ring is idle, False otherwise
2174  */
2175 bool xe_lrc_ring_is_idle(struct xe_lrc *lrc)
2176 {
2177 	return xe_lrc_ring_head(lrc) == xe_lrc_ring_tail(lrc);
2178 }
2179