xref: /linux/drivers/gpu/drm/xe/xe_lrc.c (revision d7b618bc41ee3d44c070212dff93949702ede997)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5 
6 #include "xe_lrc.h"
7 
8 #include <generated/xe_wa_oob.h>
9 
10 #include <linux/ascii85.h>
11 
12 #include "instructions/xe_mi_commands.h"
13 #include "instructions/xe_gfxpipe_commands.h"
14 #include "instructions/xe_gfx_state_commands.h"
15 #include "regs/xe_engine_regs.h"
16 #include "regs/xe_lrc_layout.h"
17 #include "xe_bb.h"
18 #include "xe_bo.h"
19 #include "xe_device.h"
20 #include "xe_drm_client.h"
21 #include "xe_exec_queue_types.h"
22 #include "xe_gt.h"
23 #include "xe_gt_printk.h"
24 #include "xe_hw_fence.h"
25 #include "xe_map.h"
26 #include "xe_memirq.h"
27 #include "xe_mmio.h"
28 #include "xe_sriov.h"
29 #include "xe_trace_lrc.h"
30 #include "xe_vm.h"
31 #include "xe_wa.h"
32 
33 #define LRC_VALID				BIT_ULL(0)
34 #define LRC_PRIVILEGE				BIT_ULL(8)
35 #define LRC_ADDRESSING_MODE			GENMASK_ULL(4, 3)
36 #define LRC_LEGACY_64B_CONTEXT			3
37 
38 #define LRC_ENGINE_CLASS			GENMASK_ULL(63, 61)
39 #define LRC_ENGINE_INSTANCE			GENMASK_ULL(53, 48)
40 
41 #define LRC_PPHWSP_SIZE				SZ_4K
42 #define LRC_INDIRECT_RING_STATE_SIZE		SZ_4K
43 #define LRC_WA_BB_SIZE				SZ_4K
44 
45 static struct xe_device *
46 lrc_to_xe(struct xe_lrc *lrc)
47 {
48 	return gt_to_xe(lrc->fence_ctx.gt);
49 }
50 
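/**
 * xe_gt_lrc_size() - Get the size of an LRC for the given engine class
 * @gt: GT the LRC will be used on
 * @class: engine class
 *
 * The size covers the PPHWSP, the engine-class context image and, on
 * platforms with indirect ring state, the indirect ring state page.
 *
 * Return: LRC size in bytes
 */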
51 size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
52 {
53 	struct xe_device *xe = gt_to_xe(gt);
54 	size_t size;
55 
56 	/* Per-process HW status page (PPHWSP) */
57 	size = LRC_PPHWSP_SIZE;
58 
59 	/* Engine context image */
60 	switch (class) {
61 	case XE_ENGINE_CLASS_RENDER:
62 		if (GRAPHICS_VER(xe) >= 20)
63 			size += 3 * SZ_4K;
64 		else
65 			size += 13 * SZ_4K;
66 		break;
67 	case XE_ENGINE_CLASS_COMPUTE:
68 		if (GRAPHICS_VER(xe) >= 20)
69 			size += 2 * SZ_4K;
70 		else
71 			size += 13 * SZ_4K;
72 		break;
73 	default:
74 		WARN(1, "Unknown engine class: %d", class);
75 		fallthrough;
76 	case XE_ENGINE_CLASS_COPY:
77 	case XE_ENGINE_CLASS_VIDEO_DECODE:
78 	case XE_ENGINE_CLASS_VIDEO_ENHANCE:
79 	case XE_ENGINE_CLASS_OTHER:
80 		size += 1 * SZ_4K;
81 	}
82 
83 	/* Add indirect ring state page */
84 	if (xe_gt_has_indirect_ring_state(gt))
85 		size += LRC_INDIRECT_RING_STATE_SIZE;
86 
87 	return size;
88 }
89 
90 /*
91  * The per-platform tables are u8-encoded in @data. Decode @data and write the
92  * commands and register offsets into @regs. The following encoding is used
93  * for each byte. There are 2 steps: decoding commands and decoding addresses.
94  *
95  * Commands:
96  * [7]: create NOPs - the number of NOPs is set in the lower bits
97  * [6]: When creating an MI_LOAD_REGISTER_IMM command, allows setting
98  *      MI_LRI_FORCE_POSTED
99  * [5:0]: Number of NOPs, or number of registers to set in the case of
100  *        MI_LOAD_REGISTER_IMM
101  *
102  * Addresses: these are decoded after an MI_LOAD_REGISTER_IMM command, one
103  * entry per register for "count" registers. They are set using the REG/REG16
104  * macros: the former is used for offsets smaller than 0x200 while the latter
105  * is for offsets bigger than that. The macros set the bits below correctly:
106  *
107  * [7]: When a register offset does not fit in a single byte, additional
108  *      bytes follow, carrying the lower bits
109  * [6:0]: Register offset, without considering the engine base.
110  *
111  * This function only tweaks the commands and register offsets. Values are not
112  * filled out.
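 *
 * As an illustrative example derived from the macros below: LRI(13, POSTED)
 * encodes as the single byte 0x4d (count 13, POSTED flag in bit 6);
 * REG(0x034) encodes as 0x0d (0x034 >> 2); REG16(0x244) encodes as the two
 * bytes 0x81, 0x11 (bit 7 of the first byte flags a continuation byte, and
 * decoding yields (0x01 << 7) | 0x11 = 0x91, i.e. 0x244 after the << 2 shift).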
113  */
114 static void set_offsets(u32 *regs,
115 			const u8 *data,
116 			const struct xe_hw_engine *hwe)
117 #define NOP(x) (BIT(7) | (x))
118 #define LRI(count, flags) ((flags) << 6 | (count) | \
119 			   BUILD_BUG_ON_ZERO(count >= BIT(6)))
120 #define POSTED BIT(0)
121 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
122 #define REG16(x) \
123 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
124 	(((x) >> 2) & 0x7f)
125 {
126 	const u32 base = hwe->mmio_base;
127 
128 	while (*data) {
129 		u8 count, flags;
130 
131 		if (*data & BIT(7)) { /* skip */
132 			count = *data++ & ~BIT(7);
133 			regs += count;
134 			continue;
135 		}
136 
137 		count = *data & 0x3f;
138 		flags = *data >> 6;
139 		data++;
140 
141 		*regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
142 		if (flags & POSTED)
143 			*regs |= MI_LRI_FORCE_POSTED;
144 		*regs |= MI_LRI_LRM_CS_MMIO;
145 		regs++;
146 
147 		xe_gt_assert(hwe->gt, count);
148 		do {
149 			u32 offset = 0;
150 			u8 v;
151 
152 			do {
153 				v = *data++;
154 				offset <<= 7;
155 				offset |= v & ~BIT(7);
156 			} while (v & BIT(7));
157 
158 			regs[0] = base + (offset << 2);
159 			regs += 2;
160 		} while (--count);
161 	}
162 
163 	*regs = MI_BATCH_BUFFER_END | BIT(0);
164 }
165 
166 static const u8 gen12_xcs_offsets[] = {
167 	NOP(1),
168 	LRI(13, POSTED),
169 	REG16(0x244),
170 	REG(0x034),
171 	REG(0x030),
172 	REG(0x038),
173 	REG(0x03c),
174 	REG(0x168),
175 	REG(0x140),
176 	REG(0x110),
177 	REG(0x1c0),
178 	REG(0x1c4),
179 	REG(0x1c8),
180 	REG(0x180),
181 	REG16(0x2b4),
182 
183 	NOP(5),
184 	LRI(9, POSTED),
185 	REG16(0x3a8),
186 	REG16(0x28c),
187 	REG16(0x288),
188 	REG16(0x284),
189 	REG16(0x280),
190 	REG16(0x27c),
191 	REG16(0x278),
192 	REG16(0x274),
193 	REG16(0x270),
194 
195 	0
196 };
197 
198 static const u8 dg2_xcs_offsets[] = {
199 	NOP(1),
200 	LRI(15, POSTED),
201 	REG16(0x244),
202 	REG(0x034),
203 	REG(0x030),
204 	REG(0x038),
205 	REG(0x03c),
206 	REG(0x168),
207 	REG(0x140),
208 	REG(0x110),
209 	REG(0x1c0),
210 	REG(0x1c4),
211 	REG(0x1c8),
212 	REG(0x180),
213 	REG16(0x2b4),
214 	REG(0x120),
215 	REG(0x124),
216 
217 	NOP(1),
218 	LRI(9, POSTED),
219 	REG16(0x3a8),
220 	REG16(0x28c),
221 	REG16(0x288),
222 	REG16(0x284),
223 	REG16(0x280),
224 	REG16(0x27c),
225 	REG16(0x278),
226 	REG16(0x274),
227 	REG16(0x270),
228 
229 	0
230 };
231 
232 static const u8 gen12_rcs_offsets[] = {
233 	NOP(1),
234 	LRI(13, POSTED),
235 	REG16(0x244),
236 	REG(0x034),
237 	REG(0x030),
238 	REG(0x038),
239 	REG(0x03c),
240 	REG(0x168),
241 	REG(0x140),
242 	REG(0x110),
243 	REG(0x1c0),
244 	REG(0x1c4),
245 	REG(0x1c8),
246 	REG(0x180),
247 	REG16(0x2b4),
248 
249 	NOP(5),
250 	LRI(9, POSTED),
251 	REG16(0x3a8),
252 	REG16(0x28c),
253 	REG16(0x288),
254 	REG16(0x284),
255 	REG16(0x280),
256 	REG16(0x27c),
257 	REG16(0x278),
258 	REG16(0x274),
259 	REG16(0x270),
260 
261 	LRI(3, POSTED),
262 	REG(0x1b0),
263 	REG16(0x5a8),
264 	REG16(0x5ac),
265 
266 	NOP(6),
267 	LRI(1, 0),
268 	REG(0x0c8),
269 	NOP(3 + 9 + 1),
270 
271 	LRI(51, POSTED),
272 	REG16(0x588),
273 	REG16(0x588),
274 	REG16(0x588),
275 	REG16(0x588),
276 	REG16(0x588),
277 	REG16(0x588),
278 	REG(0x028),
279 	REG(0x09c),
280 	REG(0x0c0),
281 	REG(0x178),
282 	REG(0x17c),
283 	REG16(0x358),
284 	REG(0x170),
285 	REG(0x150),
286 	REG(0x154),
287 	REG(0x158),
288 	REG16(0x41c),
289 	REG16(0x600),
290 	REG16(0x604),
291 	REG16(0x608),
292 	REG16(0x60c),
293 	REG16(0x610),
294 	REG16(0x614),
295 	REG16(0x618),
296 	REG16(0x61c),
297 	REG16(0x620),
298 	REG16(0x624),
299 	REG16(0x628),
300 	REG16(0x62c),
301 	REG16(0x630),
302 	REG16(0x634),
303 	REG16(0x638),
304 	REG16(0x63c),
305 	REG16(0x640),
306 	REG16(0x644),
307 	REG16(0x648),
308 	REG16(0x64c),
309 	REG16(0x650),
310 	REG16(0x654),
311 	REG16(0x658),
312 	REG16(0x65c),
313 	REG16(0x660),
314 	REG16(0x664),
315 	REG16(0x668),
316 	REG16(0x66c),
317 	REG16(0x670),
318 	REG16(0x674),
319 	REG16(0x678),
320 	REG16(0x67c),
321 	REG(0x068),
322 	REG(0x084),
323 	NOP(1),
324 
325 	0
326 };
327 
328 static const u8 xehp_rcs_offsets[] = {
329 	NOP(1),
330 	LRI(13, POSTED),
331 	REG16(0x244),
332 	REG(0x034),
333 	REG(0x030),
334 	REG(0x038),
335 	REG(0x03c),
336 	REG(0x168),
337 	REG(0x140),
338 	REG(0x110),
339 	REG(0x1c0),
340 	REG(0x1c4),
341 	REG(0x1c8),
342 	REG(0x180),
343 	REG16(0x2b4),
344 
345 	NOP(5),
346 	LRI(9, POSTED),
347 	REG16(0x3a8),
348 	REG16(0x28c),
349 	REG16(0x288),
350 	REG16(0x284),
351 	REG16(0x280),
352 	REG16(0x27c),
353 	REG16(0x278),
354 	REG16(0x274),
355 	REG16(0x270),
356 
357 	LRI(3, POSTED),
358 	REG(0x1b0),
359 	REG16(0x5a8),
360 	REG16(0x5ac),
361 
362 	NOP(6),
363 	LRI(1, 0),
364 	REG(0x0c8),
365 
366 	0
367 };
368 
369 static const u8 dg2_rcs_offsets[] = {
370 	NOP(1),
371 	LRI(15, POSTED),
372 	REG16(0x244),
373 	REG(0x034),
374 	REG(0x030),
375 	REG(0x038),
376 	REG(0x03c),
377 	REG(0x168),
378 	REG(0x140),
379 	REG(0x110),
380 	REG(0x1c0),
381 	REG(0x1c4),
382 	REG(0x1c8),
383 	REG(0x180),
384 	REG16(0x2b4),
385 	REG(0x120),
386 	REG(0x124),
387 
388 	NOP(1),
389 	LRI(9, POSTED),
390 	REG16(0x3a8),
391 	REG16(0x28c),
392 	REG16(0x288),
393 	REG16(0x284),
394 	REG16(0x280),
395 	REG16(0x27c),
396 	REG16(0x278),
397 	REG16(0x274),
398 	REG16(0x270),
399 
400 	LRI(3, POSTED),
401 	REG(0x1b0),
402 	REG16(0x5a8),
403 	REG16(0x5ac),
404 
405 	NOP(6),
406 	LRI(1, 0),
407 	REG(0x0c8),
408 
409 	0
410 };
411 
412 static const u8 mtl_rcs_offsets[] = {
413 	NOP(1),
414 	LRI(15, POSTED),
415 	REG16(0x244),
416 	REG(0x034),
417 	REG(0x030),
418 	REG(0x038),
419 	REG(0x03c),
420 	REG(0x168),
421 	REG(0x140),
422 	REG(0x110),
423 	REG(0x1c0),
424 	REG(0x1c4),
425 	REG(0x1c8),
426 	REG(0x180),
427 	REG16(0x2b4),
428 	REG(0x120),
429 	REG(0x124),
430 
431 	NOP(1),
432 	LRI(9, POSTED),
433 	REG16(0x3a8),
434 	REG16(0x28c),
435 	REG16(0x288),
436 	REG16(0x284),
437 	REG16(0x280),
438 	REG16(0x27c),
439 	REG16(0x278),
440 	REG16(0x274),
441 	REG16(0x270),
442 
443 	NOP(2),
444 	LRI(2, POSTED),
445 	REG16(0x5a8),
446 	REG16(0x5ac),
447 
448 	NOP(6),
449 	LRI(1, 0),
450 	REG(0x0c8),
451 
452 	0
453 };
454 
455 #define XE2_CTX_COMMON \
456 	NOP(1),                 /* [0x00] */ \
457 	LRI(15, POSTED),        /* [0x01] */ \
458 	REG16(0x244),           /* [0x02] CTXT_SR_CTL */ \
459 	REG(0x034),             /* [0x04] RING_BUFFER_HEAD */ \
460 	REG(0x030),             /* [0x06] RING_BUFFER_TAIL */ \
461 	REG(0x038),             /* [0x08] RING_BUFFER_START */ \
462 	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */ \
463 	REG(0x168),             /* [0x0c] BB_ADDR_UDW */ \
464 	REG(0x140),             /* [0x0e] BB_ADDR */ \
465 	REG(0x110),             /* [0x10] BB_STATE */ \
466 	REG(0x1c0),             /* [0x12] BB_PER_CTX_PTR */ \
467 	REG(0x1c4),             /* [0x14] RCS_INDIRECT_CTX */ \
468 	REG(0x1c8),             /* [0x16] RCS_INDIRECT_CTX_OFFSET */ \
469 	REG(0x180),             /* [0x18] CCID */ \
470 	REG16(0x2b4),           /* [0x1a] SEMAPHORE_TOKEN */ \
471 	REG(0x120),             /* [0x1c] PRT_BB_STATE */ \
472 	REG(0x124),             /* [0x1e] PRT_BB_STATE_UDW */ \
473 	\
474 	NOP(1),                 /* [0x20] */ \
475 	LRI(9, POSTED),         /* [0x21] */ \
476 	REG16(0x3a8),           /* [0x22] CTX_TIMESTAMP */ \
477 	REG16(0x3ac),           /* [0x24] CTX_TIMESTAMP_UDW */ \
478 	REG(0x108),             /* [0x26] INDIRECT_RING_STATE */ \
479 	REG16(0x284),           /* [0x28] dummy reg */ \
480 	REG16(0x280),           /* [0x2a] CS_ACC_CTR_THOLD */ \
481 	REG16(0x27c),           /* [0x2c] CS_CTX_SYS_PASID */ \
482 	REG16(0x278),           /* [0x2e] CS_CTX_ASID */ \
483 	REG16(0x274),           /* [0x30] PTBP_UDW */ \
484 	REG16(0x270)            /* [0x32] PTBP_LDW */
485 
486 static const u8 xe2_rcs_offsets[] = {
487 	XE2_CTX_COMMON,
488 
489 	NOP(2),                 /* [0x34] */
490 	LRI(2, POSTED),         /* [0x36] */
491 	REG16(0x5a8),           /* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
492 	REG16(0x5ac),           /* [0x39] PREEMPTION_STATUS */
493 
494 	NOP(6),                 /* [0x41] */
495 	LRI(1, 0),              /* [0x47] */
496 	REG(0x0c8),             /* [0x48] R_PWR_CLK_STATE */
497 
498 	0
499 };
500 
501 static const u8 xe2_bcs_offsets[] = {
502 	XE2_CTX_COMMON,
503 
504 	NOP(4 + 8 + 1),         /* [0x34] */
505 	LRI(2, POSTED),         /* [0x41] */
506 	REG16(0x200),           /* [0x42] BCS_SWCTRL */
507 	REG16(0x204),           /* [0x44] BLIT_CCTL */
508 
509 	0
510 };
511 
512 static const u8 xe2_xcs_offsets[] = {
513 	XE2_CTX_COMMON,
514 
515 	0
516 };
517 
518 static const u8 xe2_indirect_ring_state_offsets[] = {
519 	NOP(1),                 /* [0x00] */
520 	LRI(5, POSTED),         /* [0x01] */
521 	REG(0x034),             /* [0x02] RING_BUFFER_HEAD */
522 	REG(0x030),             /* [0x04] RING_BUFFER_TAIL */
523 	REG(0x038),             /* [0x06] RING_BUFFER_START */
524 	REG(0x048),             /* [0x08] RING_BUFFER_START_UDW */
525 	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */
526 
527 	NOP(5),                 /* [0x0c] */
528 	LRI(9, POSTED),         /* [0x11] */
529 	REG(0x168),             /* [0x12] BB_ADDR_UDW */
530 	REG(0x140),             /* [0x14] BB_ADDR */
531 	REG(0x110),             /* [0x16] BB_STATE */
532 	REG16(0x588),           /* [0x18] BB_STACK_WRITE_PORT */
533 	REG16(0x588),           /* [0x1a] BB_STACK_WRITE_PORT */
534 	REG16(0x588),           /* [0x1c] BB_STACK_WRITE_PORT */
535 	REG16(0x588),           /* [0x1e] BB_STACK_WRITE_PORT */
536 	REG16(0x588),           /* [0x20] BB_STACK_WRITE_PORT */
537 	REG16(0x588),           /* [0x22] BB_STACK_WRITE_PORT */
538 
539 	NOP(12),                 /* [0x24] */
540 
541 	0
542 };
543 
544 #undef REG16
545 #undef REG
546 #undef LRI
547 #undef NOP
548 
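/* Select the per-platform register offset table for the given engine class */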
549 static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
550 {
551 	if (class == XE_ENGINE_CLASS_RENDER) {
552 		if (GRAPHICS_VER(xe) >= 20)
553 			return xe2_rcs_offsets;
554 		else if (GRAPHICS_VERx100(xe) >= 1270)
555 			return mtl_rcs_offsets;
556 		else if (GRAPHICS_VERx100(xe) >= 1255)
557 			return dg2_rcs_offsets;
558 		else if (GRAPHICS_VERx100(xe) >= 1250)
559 			return xehp_rcs_offsets;
560 		else
561 			return gen12_rcs_offsets;
562 	} else if (class == XE_ENGINE_CLASS_COPY) {
563 		if (GRAPHICS_VER(xe) >= 20)
564 			return xe2_bcs_offsets;
565 		else
566 			return gen12_xcs_offsets;
567 	} else {
568 		if (GRAPHICS_VER(xe) >= 20)
569 			return xe2_xcs_offsets;
570 		else if (GRAPHICS_VERx100(xe) >= 1255)
571 			return dg2_xcs_offsets;
572 		else
573 			return gen12_xcs_offsets;
574 	}
575 }
576 
577 static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
578 {
579 	regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
580 						       CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
581 
582 	if (xe_gt_has_indirect_ring_state(hwe->gt))
583 		regs[CTX_CONTEXT_CONTROL] |=
584 			_MASKED_BIT_ENABLE(CTX_CTRL_INDIRECT_RING_STATE_ENABLE);
585 
586 	/* TODO: Timestamp */
587 }
588 
589 static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe)
590 {
591 	struct xe_memirq *memirq = &gt_to_tile(hwe->gt)->memirq;
592 	struct xe_device *xe = gt_to_xe(hwe->gt);
593 	u8 num_regs;
594 
595 	if (!xe_device_uses_memirq(xe))
596 		return;
597 
598 	regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM |
599 					MI_LRI_LRM_CS_MMIO | MI_LRM_USE_GGTT;
600 	regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr;
601 	regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq);
602 
603 	num_regs = xe_device_has_msix(xe) ? 3 : 2;
604 	regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(num_regs) |
605 				       MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED;
606 	regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr;
607 	regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq, hwe);
608 	regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr;
609 	regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq, hwe);
610 
611 	if (xe_device_has_msix(xe)) {
612 		regs[CTX_CS_INT_VEC_REG] = CS_INT_VEC(0).addr;
613 		/* CTX_CS_INT_VEC_DATA will be set in xe_lrc_init */
614 	}
615 }
616 
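/* Dword index of the RING_MI_MODE register/value pair in the LRC register state */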
617 static int lrc_ring_mi_mode(struct xe_hw_engine *hwe)
618 {
619 	struct xe_device *xe = gt_to_xe(hwe->gt);
620 
621 	if (GRAPHICS_VERx100(xe) >= 1250)
622 		return 0x70;
623 	else
624 		return 0x60;
625 }
626 
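/*
 * Program the masked MI_MODE value in the context image so that STOP_RING is
 * cleared when the context is restored.
 */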
627 static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe)
628 {
629 	int x;
630 
631 	x = lrc_ring_mi_mode(hwe);
632 	regs[x + 1] &= ~STOP_RING;
633 	regs[x + 1] |= STOP_RING << 16;
634 }
635 
636 static inline bool xe_lrc_has_indirect_ring_state(struct xe_lrc *lrc)
637 {
638 	return lrc->flags & XE_LRC_FLAG_INDIRECT_RING_STATE;
639 }
640 
641 static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
642 {
643 	return 0;
644 }
645 
646 u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
647 {
648 	return lrc->ring.size;
649 }
650 
651 /* Make the magic macros work */
652 #define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset
653 #define __xe_lrc_regs_offset xe_lrc_regs_offset
654 
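/* Byte offsets of driver-defined data within the PPHWSP */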
655 #define LRC_SEQNO_PPHWSP_OFFSET 512
656 #define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
657 #define LRC_CTX_JOB_TIMESTAMP_OFFSET (LRC_START_SEQNO_PPHWSP_OFFSET + 8)
658 #define LRC_ENGINE_ID_PPHWSP_OFFSET 1024
659 #define LRC_PARALLEL_PPHWSP_OFFSET 2048
660 
661 u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
662 {
663 	return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
664 }
665 
666 static size_t lrc_reg_size(struct xe_device *xe)
667 {
668 	if (GRAPHICS_VERx100(xe) >= 1250)
669 		return 96 * sizeof(u32);
670 	else
671 		return 80 * sizeof(u32);
672 }
673 
674 size_t xe_lrc_skip_size(struct xe_device *xe)
675 {
676 	return LRC_PPHWSP_SIZE + lrc_reg_size(xe);
677 }
678 
679 static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
680 {
681 	/* The seqno is stored in the driver-defined portion of PPHWSP */
682 	return xe_lrc_pphwsp_offset(lrc) + LRC_SEQNO_PPHWSP_OFFSET;
683 }
684 
685 static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
686 {
687 	/* The start seqno is stored in the driver-defined portion of PPHWSP */
688 	return xe_lrc_pphwsp_offset(lrc) + LRC_START_SEQNO_PPHWSP_OFFSET;
689 }
690 
691 static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc)
692 {
693 	/* This is stored in the driver-defined portion of PPHWSP */
694 	return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET;
695 }
696 
697 static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
698 {
699 	/* The parallel area is stored in the driver-defined portion of PPHWSP */
700 	return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
701 }
702 
703 static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc)
704 {
705 	return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET;
706 }
707 
708 static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
709 {
710 	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
711 }
712 
713 static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc)
714 {
715 	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32);
716 }
717 
718 static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
719 {
720 	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE - LRC_INDIRECT_RING_STATE_SIZE;
721 }
722 
723 static inline u32 __xe_lrc_wa_bb_offset(struct xe_lrc *lrc)
724 {
725 	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE;
726 }
727 
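/*
 * Generate __xe_lrc_<elem>_map() and __xe_lrc_<elem>_ggtt_addr() helpers from
 * the corresponding __xe_lrc_<elem>_offset() function for each LRC sub-area.
 */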
728 #define DECL_MAP_ADDR_HELPERS(elem) \
729 static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \
730 { \
731 	struct iosys_map map = lrc->bo->vmap; \
732 \
733 	xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map));  \
734 	iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \
735 	return map; \
736 } \
737 static inline u32 __maybe_unused __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \
738 { \
739 	return xe_bo_ggtt_addr(lrc->bo) + __xe_lrc_##elem##_offset(lrc); \
740 } \
741 
742 DECL_MAP_ADDR_HELPERS(ring)
743 DECL_MAP_ADDR_HELPERS(pphwsp)
744 DECL_MAP_ADDR_HELPERS(seqno)
745 DECL_MAP_ADDR_HELPERS(regs)
746 DECL_MAP_ADDR_HELPERS(start_seqno)
747 DECL_MAP_ADDR_HELPERS(ctx_job_timestamp)
748 DECL_MAP_ADDR_HELPERS(ctx_timestamp)
749 DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw)
750 DECL_MAP_ADDR_HELPERS(parallel)
751 DECL_MAP_ADDR_HELPERS(indirect_ring)
752 DECL_MAP_ADDR_HELPERS(engine_id)
753 
754 #undef DECL_MAP_ADDR_HELPERS
755 
756 /**
757  * xe_lrc_ctx_timestamp_ggtt_addr() - Get ctx timestamp GGTT address
758  * @lrc: Pointer to the lrc.
759  *
760  * Returns: ctx timestamp GGTT address
761  */
762 u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc)
763 {
764 	return __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
765 }
766 
767 /**
768  * xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address
769  * @lrc: Pointer to the lrc.
770  *
771  * Returns: ctx timestamp udw GGTT address
772  */
773 u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc)
774 {
775 	return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
776 }
777 
778 /**
779  * xe_lrc_ctx_timestamp() - Read ctx timestamp value
780  * @lrc: Pointer to the lrc.
781  *
782  * Returns: ctx timestamp value
783  */
784 u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
785 {
786 	struct xe_device *xe = lrc_to_xe(lrc);
787 	struct iosys_map map;
788 	u32 ldw, udw = 0;
789 
790 	map = __xe_lrc_ctx_timestamp_map(lrc);
791 	ldw = xe_map_read32(xe, &map);
792 
793 	if (xe->info.has_64bit_timestamp) {
794 		map = __xe_lrc_ctx_timestamp_udw_map(lrc);
795 		udw = xe_map_read32(xe, &map);
796 	}
797 
798 	return (u64)udw << 32 | ldw;
799 }
800 
801 /**
802  * xe_lrc_ctx_job_timestamp_ggtt_addr() - Get ctx job timestamp GGTT address
803  * @lrc: Pointer to the lrc.
804  *
805  * Returns: ctx job timestamp GGTT address
806  */
807 u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc)
808 {
809 	return __xe_lrc_ctx_job_timestamp_ggtt_addr(lrc);
810 }
811 
812 /**
813  * xe_lrc_ctx_job_timestamp() - Read ctx job timestamp value
814  * @lrc: Pointer to the lrc.
815  *
816  * Returns: ctx job timestamp value
817  */
818 u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc)
819 {
820 	struct xe_device *xe = lrc_to_xe(lrc);
821 	struct iosys_map map;
822 
823 	map = __xe_lrc_ctx_job_timestamp_map(lrc);
824 	return xe_map_read32(xe, &map);
825 }
826 
827 u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
828 {
829 	return __xe_lrc_pphwsp_ggtt_addr(lrc);
830 }
831 
832 u32 xe_lrc_indirect_ring_ggtt_addr(struct xe_lrc *lrc)
833 {
834 	if (!xe_lrc_has_indirect_ring_state(lrc))
835 		return 0;
836 
837 	return __xe_lrc_indirect_ring_ggtt_addr(lrc);
838 }
839 
840 static u32 xe_lrc_read_indirect_ctx_reg(struct xe_lrc *lrc, int reg_nr)
841 {
842 	struct xe_device *xe = lrc_to_xe(lrc);
843 	struct iosys_map map;
844 
845 	map = __xe_lrc_indirect_ring_map(lrc);
846 	iosys_map_incr(&map, reg_nr * sizeof(u32));
847 	return xe_map_read32(xe, &map);
848 }
849 
850 static void xe_lrc_write_indirect_ctx_reg(struct xe_lrc *lrc,
851 					  int reg_nr, u32 val)
852 {
853 	struct xe_device *xe = lrc_to_xe(lrc);
854 	struct iosys_map map;
855 
856 	map = __xe_lrc_indirect_ring_map(lrc);
857 	iosys_map_incr(&map, reg_nr * sizeof(u32));
858 	xe_map_write32(xe, &map, val);
859 }
860 
861 u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
862 {
863 	struct xe_device *xe = lrc_to_xe(lrc);
864 	struct iosys_map map;
865 
866 	map = __xe_lrc_regs_map(lrc);
867 	iosys_map_incr(&map, reg_nr * sizeof(u32));
868 	return xe_map_read32(xe, &map);
869 }
870 
871 void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
872 {
873 	struct xe_device *xe = lrc_to_xe(lrc);
874 	struct iosys_map map;
875 
876 	map = __xe_lrc_regs_map(lrc);
877 	iosys_map_incr(&map, reg_nr * sizeof(u32));
878 	xe_map_write32(xe, &map, val);
879 }
880 
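/*
 * Build a default LRC image in system memory: a zeroed PPHWSP followed by the
 * register state generated from the per-platform offset tables, plus the
 * indirect ring state page on platforms that use it.
 */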
881 static void *empty_lrc_data(struct xe_hw_engine *hwe)
882 {
883 	struct xe_gt *gt = hwe->gt;
884 	void *data;
885 	u32 *regs;
886 
887 	data = kzalloc(xe_gt_lrc_size(gt, hwe->class), GFP_KERNEL);
888 	if (!data)
889 		return NULL;
890 
891 	/* 1st page: Per-Process HW Status Page (PPHWSP) */
892 	regs = data + LRC_PPHWSP_SIZE;
893 	set_offsets(regs, reg_offsets(gt_to_xe(gt), hwe->class), hwe);
894 	set_context_control(regs, hwe);
895 	set_memory_based_intr(regs, hwe);
896 	reset_stop_ring(regs, hwe);
897 	if (xe_gt_has_indirect_ring_state(gt)) {
898 		regs = data + xe_gt_lrc_size(gt, hwe->class) -
899 		       LRC_INDIRECT_RING_STATE_SIZE;
900 		set_offsets(regs, xe2_indirect_ring_state_offsets, hwe);
901 	}
902 
903 	return data;
904 }
905 
906 static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
907 {
908 	u64 desc = xe_vm_pdp4_descriptor(vm, gt_to_tile(lrc->gt));
909 
910 	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
911 	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
912 }
913 
914 static void xe_lrc_finish(struct xe_lrc *lrc)
915 {
916 	xe_hw_fence_ctx_finish(&lrc->fence_ctx);
917 	xe_bo_unpin_map_no_vm(lrc->bo);
918 }
919 
920 /*
921  * wa_bb_setup_utilization() - Write commands to wa bb to assist
922  * in calculating active context run ticks.
923  *
924  * Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the
925  * context, but only gets updated when the context switches out. In order to
926  * check how long a context has been active before it switches out, two things
927  * are required:
928  *
929  * (1) Determine if the context is running:
930  * To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in
931  * the LRC. The value chosen is 1 since 0 is the initial value when the LRC is
932  * initialized. During a query, we just check for this value to determine if the
933  * context is active. If the context switched out, it would overwrite this
934  * location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as
935  * the last part of context restore, so reusing this LRC location will not
936  * clobber anything.
937  *
938  * (2) Calculate the time that the context has been active for:
939  * The CTX_TIMESTAMP ticks only when the context is active. If a context is
940  * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization.
941  * While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific
942  * engine instance. Since we do not know which instance the context is running
943  * on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and
944  * store it in the PPHWSP.
945  */
946 #define CONTEXT_ACTIVE 1ULL
947 static ssize_t wa_bb_setup_utilization(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
948 				       u32 *batch, size_t max_len)
949 {
950 	u32 *cmd = batch;
951 
952 	if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
953 		return -ENOSPC;
954 
955 	*cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET;
956 	*cmd++ = ENGINE_ID(0).addr;
957 	*cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc);
958 	*cmd++ = 0;
959 
960 	*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
961 	*cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
962 	*cmd++ = 0;
963 	*cmd++ = lower_32_bits(CONTEXT_ACTIVE);
964 
965 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp) {
966 		*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
967 		*cmd++ = __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
968 		*cmd++ = 0;
969 		*cmd++ = upper_32_bits(CONTEXT_ACTIVE);
970 	}
971 
972 	return cmd - batch;
973 }
974 
975 struct wa_bb_setup {
976 	ssize_t (*setup)(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
977 			 u32 *batch, size_t max_size);
978 };
979 
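/*
 * Build the per-context workaround batch buffer in the last page of the LRC
 * BO and point CTX_BB_PER_CTX_PTR at it.
 */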
980 static int setup_wa_bb(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
981 {
982 	const size_t max_size = LRC_WA_BB_SIZE;
983 	static const struct wa_bb_setup funcs[] = {
984 		{ .setup = wa_bb_setup_utilization },
985 	};
986 	ssize_t remain;
987 	u32 *cmd, *buf = NULL;
988 
989 	if (lrc->bo->vmap.is_iomem) {
990 		buf = kmalloc(max_size, GFP_KERNEL);
991 		if (!buf)
992 			return -ENOMEM;
993 		cmd = buf;
994 	} else {
995 		cmd = lrc->bo->vmap.vaddr + __xe_lrc_wa_bb_offset(lrc);
996 	}
997 
998 	remain = max_size / sizeof(*cmd);
999 
1000 	for (size_t i = 0; i < ARRAY_SIZE(funcs); i++) {
1001 		ssize_t len = funcs[i].setup(lrc, hwe, cmd, remain);
1002 
1003 		remain -= len;
1004 
1005 		/*
1006 		 * There should always be at least 1 additional dword for
1007 		 * the end marker
1008 		 */
1009 		if (len < 0 || xe_gt_WARN_ON(lrc->gt, remain < 1))
1010 			goto fail;
1011 
1012 		cmd += len;
1013 	}
1014 
1015 	*cmd++ = MI_BATCH_BUFFER_END;
1016 
1017 	if (buf) {
1018 		xe_map_memcpy_to(gt_to_xe(lrc->gt), &lrc->bo->vmap,
1019 				 __xe_lrc_wa_bb_offset(lrc), buf,
1020 				 (cmd - buf) * sizeof(*cmd));
1021 		kfree(buf);
1022 	}
1023 
1024 	xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR, xe_bo_ggtt_addr(lrc->bo) +
1025 			     __xe_lrc_wa_bb_offset(lrc) + 1);
1026 
1027 	return 0;
1028 
1029 fail:
1030 	kfree(buf);
1031 	return -ENOSPC;
1032 }
1033 
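/*
 * Dword indices of the ASID and access-counter-threshold values in the LRC
 * register state; "+ 1" selects the value slot of the register/value pair
 * (0x2e and 0x2a match the CS_CTX_ASID / CS_ACC_CTR_THOLD positions in the
 * offset tables above).
 */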
1034 #define PVC_CTX_ASID		(0x2e + 1)
1035 #define PVC_CTX_ACC_CTR_THOLD	(0x2a + 1)
1036 
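/*
 * Allocate the LRC backing object (ring + PPHWSP/context image + WA BB),
 * populate the default state and initialize the ring and context registers.
 */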
1037 static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
1038 		       struct xe_vm *vm, u32 ring_size, u16 msix_vec,
1039 		       u32 init_flags)
1040 {
1041 	struct xe_gt *gt = hwe->gt;
1042 	const u32 lrc_size = xe_gt_lrc_size(gt, hwe->class);
1043 	const u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
1044 	struct xe_tile *tile = gt_to_tile(gt);
1045 	struct xe_device *xe = gt_to_xe(gt);
1046 	struct iosys_map map;
1047 	void *init_data = NULL;
1048 	u32 arb_enable;
1049 	u32 bo_flags;
1050 	int err;
1051 
1052 	kref_init(&lrc->refcount);
1053 	lrc->gt = gt;
1054 	lrc->size = lrc_size;
1055 	lrc->flags = 0;
1056 	lrc->ring.size = ring_size;
1057 	lrc->ring.tail = 0;
1058 	if (xe_gt_has_indirect_ring_state(gt))
1059 		lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;
1060 
1061 	bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
1062 		   XE_BO_FLAG_GGTT_INVALIDATE;
1063 	if (vm && vm->xef) /* userspace */
1064 		bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE;
1065 
1066 	/*
1067 	 * FIXME: Perma-pinning LRC as we don't yet support moving GGTT address
1068 	 * via VM bind calls.
1069 	 */
1070 	lrc->bo = xe_bo_create_pin_map(xe, tile, NULL, bo_size,
1071 				       ttm_bo_type_kernel,
1072 				       bo_flags);
1073 	if (IS_ERR(lrc->bo))
1074 		return PTR_ERR(lrc->bo);
1075 
1076 	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
1077 			     hwe->fence_irq, hwe->name);
1078 
1079 	if (!gt->default_lrc[hwe->class]) {
1080 		init_data = empty_lrc_data(hwe);
1081 		if (!init_data) {
1082 			err = -ENOMEM;
1083 			goto err_lrc_finish;
1084 		}
1085 	}
1086 
1087 	/*
1088 	 * Init the Per-Process HW Status Page (PPHWSP) and the LRC / context
1089 	 * state to known values
1090 	 */
1091 	map = __xe_lrc_pphwsp_map(lrc);
1092 	if (!init_data) {
1093 		xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE);	/* PPHWSP */
1094 		xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
1095 				 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
1096 				 lrc_size - LRC_PPHWSP_SIZE);
1097 	} else {
1098 		xe_map_memcpy_to(xe, &map, 0, init_data, lrc_size);
1099 		kfree(init_data);
1100 	}
1101 
1102 	if (vm) {
1103 		xe_lrc_set_ppgtt(lrc, vm);
1104 
1105 		if (vm->xef)
1106 			xe_drm_client_add_bo(vm->xef->client, lrc->bo);
1107 	}
1108 
1109 	if (xe_device_has_msix(xe)) {
1110 		xe_lrc_write_ctx_reg(lrc, CTX_INT_STATUS_REPORT_PTR,
1111 				     xe_memirq_status_ptr(&tile->memirq, hwe));
1112 		xe_lrc_write_ctx_reg(lrc, CTX_INT_SRC_REPORT_PTR,
1113 				     xe_memirq_source_ptr(&tile->memirq, hwe));
1114 		xe_lrc_write_ctx_reg(lrc, CTX_CS_INT_VEC_DATA, msix_vec << 16 | msix_vec);
1115 	}
1116 
1117 	if (xe_gt_has_indirect_ring_state(gt)) {
1118 		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
1119 				     __xe_lrc_indirect_ring_ggtt_addr(lrc));
1120 
1121 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
1122 					      __xe_lrc_ring_ggtt_addr(lrc));
1123 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START_UDW, 0);
1124 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, 0);
1125 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, lrc->ring.tail);
1126 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_CTL,
1127 					      RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
1128 	} else {
1129 		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
1130 		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, 0);
1131 		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);
1132 		xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
1133 				     RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
1134 	}
1135 
1136 	if (init_flags & XE_LRC_CREATE_RUNALONE)
1137 		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
1138 				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
1139 				     _MASKED_BIT_ENABLE(CTX_CTRL_RUN_ALONE));
1140 
1141 	if (init_flags & XE_LRC_CREATE_PXP)
1142 		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
1143 				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
1144 				     _MASKED_BIT_ENABLE(CTX_CTRL_PXP_ENABLE));
1145 
1146 	lrc->ctx_timestamp = 0;
1147 	xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0);
1148 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
1149 		xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP_UDW, 0);
1150 
1151 	if (xe->info.has_asid && vm)
1152 		xe_lrc_write_ctx_reg(lrc, PVC_CTX_ASID, vm->usm.asid);
1153 
1154 	lrc->desc = LRC_VALID;
1155 	lrc->desc |= FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT);
1156 	/* TODO: Priority */
1157 
1158 	/* While this appears to have something about privileged batches or
1159 	 * some such, it really just means PPGTT mode.
1160 	 */
1161 	if (vm)
1162 		lrc->desc |= LRC_PRIVILEGE;
1163 
1164 	if (GRAPHICS_VERx100(xe) < 1250) {
1165 		lrc->desc |= FIELD_PREP(LRC_ENGINE_INSTANCE, hwe->instance);
1166 		lrc->desc |= FIELD_PREP(LRC_ENGINE_CLASS, hwe->class);
1167 	}
1168 
1169 	arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1170 	xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));
1171 
1172 	map = __xe_lrc_seqno_map(lrc);
1173 	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
1174 
1175 	map = __xe_lrc_start_seqno_map(lrc);
1176 	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
1177 
1178 	err = setup_wa_bb(lrc, hwe);
1179 	if (err)
1180 		goto err_lrc_finish;
1181 
1182 	return 0;
1183 
1184 err_lrc_finish:
1185 	xe_lrc_finish(lrc);
1186 	return err;
1187 }
1188 
1189 /**
1190  * xe_lrc_create - Create a LRC
1191  * @hwe: Hardware Engine
1192  * @vm: The VM (address space)
1193  * @ring_size: LRC ring size
1194  * @msix_vec: MSI-X interrupt vector (for platforms that support it)
1195  * @flags: LRC initialization flags
1196  *
1197  * Allocate and initialize the Logical Ring Context (LRC).
1198  *
1199  * Return: Pointer to the created LRC on success or an error pointer
1200  * on failure.
1201  */
1202 struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
1203 			     u32 ring_size, u16 msix_vec, u32 flags)
1204 {
1205 	struct xe_lrc *lrc;
1206 	int err;
1207 
1208 	lrc = kzalloc(sizeof(*lrc), GFP_KERNEL);
1209 	if (!lrc)
1210 		return ERR_PTR(-ENOMEM);
1211 
1212 	err = xe_lrc_init(lrc, hwe, vm, ring_size, msix_vec, flags);
1213 	if (err) {
1214 		kfree(lrc);
1215 		return ERR_PTR(err);
1216 	}
1217 
1218 	return lrc;
1219 }
1220 
1221 /**
1222  * xe_lrc_destroy - Destroy the LRC
1223  * @ref: reference to LRC
1224  *
1225  * Called when ref == 0; releases resources held by the Logical Ring Context
1226  * (LRC) and frees the LRC memory.
1227  */
1228 void xe_lrc_destroy(struct kref *ref)
1229 {
1230 	struct xe_lrc *lrc = container_of(ref, struct xe_lrc, refcount);
1231 
1232 	xe_lrc_finish(lrc);
1233 	kfree(lrc);
1234 }
1235 
1236 void xe_lrc_set_ring_tail(struct xe_lrc *lrc, u32 tail)
1237 {
1238 	if (xe_lrc_has_indirect_ring_state(lrc))
1239 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, tail);
1240 	else
1241 		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, tail);
1242 }
1243 
1244 u32 xe_lrc_ring_tail(struct xe_lrc *lrc)
1245 {
1246 	if (xe_lrc_has_indirect_ring_state(lrc))
1247 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL) & TAIL_ADDR;
1248 	else
1249 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL) & TAIL_ADDR;
1250 }
1251 
1252 static u32 xe_lrc_ring_start(struct xe_lrc *lrc)
1253 {
1254 	if (xe_lrc_has_indirect_ring_state(lrc))
1255 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START);
1256 	else
1257 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_START);
1258 }
1259 
1260 void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
1261 {
1262 	if (xe_lrc_has_indirect_ring_state(lrc))
1263 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, head);
1264 	else
1265 		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
1266 }
1267 
1268 u32 xe_lrc_ring_head(struct xe_lrc *lrc)
1269 {
1270 	if (xe_lrc_has_indirect_ring_state(lrc))
1271 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD) & HEAD_ADDR;
1272 	else
1273 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
1274 }
1275 
1276 u32 xe_lrc_ring_space(struct xe_lrc *lrc)
1277 {
1278 	const u32 head = xe_lrc_ring_head(lrc);
1279 	const u32 tail = lrc->ring.tail;
1280 	const u32 size = lrc->ring.size;
1281 
1282 	return ((head - tail - 1) & (size - 1)) + 1;
1283 }
1284 
1285 static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
1286 				const void *data, size_t size)
1287 {
1288 	struct xe_device *xe = lrc_to_xe(lrc);
1289 
1290 	iosys_map_incr(&ring, lrc->ring.tail);
1291 	xe_map_memcpy_to(xe, &ring, 0, data, size);
1292 	lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
1293 }
1294 
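/*
 * Copy @size bytes of commands into the ring, splitting the copy where it
 * wraps past the end of the ring and padding with an MI_NOOP to keep the tail
 * 8-byte aligned.
 */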
1295 void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
1296 {
1297 	struct xe_device *xe = lrc_to_xe(lrc);
1298 	struct iosys_map ring;
1299 	u32 rhs;
1300 	size_t aligned_size;
1301 
1302 	xe_assert(xe, IS_ALIGNED(size, 4));
1303 	aligned_size = ALIGN(size, 8);
1304 
1305 	ring = __xe_lrc_ring_map(lrc);
1306 
1307 	xe_assert(xe, lrc->ring.tail < lrc->ring.size);
1308 	rhs = lrc->ring.size - lrc->ring.tail;
1309 	if (size > rhs) {
1310 		__xe_lrc_write_ring(lrc, ring, data, rhs);
1311 		__xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
1312 	} else {
1313 		__xe_lrc_write_ring(lrc, ring, data, size);
1314 	}
1315 
1316 	if (aligned_size > size) {
1317 		u32 noop = MI_NOOP;
1318 
1319 		__xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
1320 	}
1321 }
1322 
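/* Combine the static descriptor flags with the GGTT address of the PPHWSP */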
1323 u64 xe_lrc_descriptor(struct xe_lrc *lrc)
1324 {
1325 	return lrc->desc | xe_lrc_ggtt_addr(lrc);
1326 }
1327 
1328 u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
1329 {
1330 	return __xe_lrc_seqno_ggtt_addr(lrc);
1331 }
1332 
1333 /**
1334  * xe_lrc_alloc_seqno_fence() - Allocate an lrc seqno fence.
1335  *
1336  * Allocate but don't initialize an lrc seqno fence.
1337  *
1338  * Return: Pointer to the allocated fence or
1339  * negative error pointer on error.
1340  */
1341 struct dma_fence *xe_lrc_alloc_seqno_fence(void)
1342 {
1343 	return xe_hw_fence_alloc();
1344 }
1345 
1346 /**
1347  * xe_lrc_free_seqno_fence() - Free an lrc seqno fence.
1348  * @fence: Pointer to the fence to free.
1349  *
1350  * Frees an lrc seqno fence that hasn't yet been
1351  * initialized.
1352  */
1353 void xe_lrc_free_seqno_fence(struct dma_fence *fence)
1354 {
1355 	xe_hw_fence_free(fence);
1356 }
1357 
1358 /**
1359  * xe_lrc_init_seqno_fence() - Initialize an lrc seqno fence.
1360  * @lrc: Pointer to the lrc.
1361  * @fence: Pointer to the fence to initialize.
1362  *
1363  * Initializes a pre-allocated lrc seqno fence.
1364  * After initialization, the fence is subject to normal
1365  * dma-fence refcounting.
1366  */
1367 void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence)
1368 {
1369 	xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc));
1370 }
1371 
1372 s32 xe_lrc_seqno(struct xe_lrc *lrc)
1373 {
1374 	struct iosys_map map = __xe_lrc_seqno_map(lrc);
1375 
1376 	return xe_map_read32(lrc_to_xe(lrc), &map);
1377 }
1378 
1379 s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
1380 {
1381 	struct iosys_map map = __xe_lrc_start_seqno_map(lrc);
1382 
1383 	return xe_map_read32(lrc_to_xe(lrc), &map);
1384 }
1385 
1386 u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
1387 {
1388 	return __xe_lrc_start_seqno_ggtt_addr(lrc);
1389 }
1390 
1391 u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
1392 {
1393 	return __xe_lrc_parallel_ggtt_addr(lrc);
1394 }
1395 
1396 struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
1397 {
1398 	return __xe_lrc_parallel_map(lrc);
1399 }
1400 
1401 /**
1402  * xe_lrc_engine_id() - Read engine id value
1403  * @lrc: Pointer to the lrc.
1404  *
1405  * Returns: engine id value
1406  */
1407 static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
1408 {
1409 	struct xe_device *xe = lrc_to_xe(lrc);
1410 	struct iosys_map map;
1411 
1412 	map = __xe_lrc_engine_id_map(lrc);
1413 	return xe_map_read32(xe, &map);
1414 }
1415 
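/* Decode the total dword length of an instruction from its header */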
1416 static int instr_dw(u32 cmd_header)
1417 {
1418 	/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
1419 	if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
1420 	    GFXPIPE_SINGLE_DW_CMD(0, 0))
1421 		return 1;
1422 
1423 	/* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
1424 	if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
1425 		return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;
1426 
1427 	/* Most instructions have the # of dwords (minus 2) in 7:0 */
1428 	return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
1429 }
1430 
1431 static int dump_mi_command(struct drm_printer *p,
1432 			   struct xe_gt *gt,
1433 			   u32 *dw,
1434 			   int remaining_dw)
1435 {
1436 	u32 inst_header = *dw;
1437 	u32 numdw = instr_dw(inst_header);
1438 	u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
1439 	int num_noop;
1440 
1441 	/* First check for commands that don't have/use a '# DW' field */
1442 	switch (inst_header & MI_OPCODE) {
1443 	case MI_NOOP:
1444 		num_noop = 1;
1445 		while (num_noop < remaining_dw &&
1446 		       (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
1447 			num_noop++;
1448 		drm_printf(p, "[%#010x] MI_NOOP (%d dwords)\n", inst_header, num_noop);
1449 		return num_noop;
1450 
1451 	case MI_TOPOLOGY_FILTER:
1452 		drm_printf(p, "[%#010x] MI_TOPOLOGY_FILTER\n", inst_header);
1453 		return 1;
1454 
1455 	case MI_BATCH_BUFFER_END:
1456 		drm_printf(p, "[%#010x] MI_BATCH_BUFFER_END\n", inst_header);
1457 		/* Return 'remaining_dw' to consume the rest of the LRC */
1458 		return remaining_dw;
1459 	}
1460 
1461 	/*
1462 	 * Any remaining commands include a # of dwords.  We should make sure
1463 	 * it doesn't exceed the remaining size of the LRC.
1464 	 */
1465 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
1466 		numdw = remaining_dw;
1467 
1468 	switch (inst_header & MI_OPCODE) {
1469 	case MI_LOAD_REGISTER_IMM:
1470 		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
1471 			   inst_header, (numdw - 1) / 2);
1472 		for (int i = 1; i < numdw; i += 2)
1473 			drm_printf(p, " - %#6x = %#010x\n", dw[i], dw[i + 1]);
1474 		return numdw;
1475 
1476 	case MI_LOAD_REGISTER_MEM & MI_OPCODE:
1477 		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_MEM: %s%s\n",
1478 			   inst_header,
1479 			   dw[0] & MI_LRI_LRM_CS_MMIO ? "CS_MMIO " : "",
1480 			   dw[0] & MI_LRM_USE_GGTT ? "USE_GGTT " : "");
1481 		if (numdw == 4)
1482 			drm_printf(p, " - %#6x = %#010llx\n",
1483 				   dw[1], ((u64)(dw[3]) << 32 | (u64)(dw[2])));
1484 		else
1485 			drm_printf(p, " - %*ph (%s)\n",
1486 				   (int)sizeof(u32) * (numdw - 1), dw + 1,
1487 				   numdw < 4 ? "truncated" : "malformed");
1488 		return numdw;
1489 
1490 	case MI_FORCE_WAKEUP:
1491 		drm_printf(p, "[%#010x] MI_FORCE_WAKEUP\n", inst_header);
1492 		return numdw;
1493 
1494 	default:
1495 		drm_printf(p, "[%#010x] unknown MI opcode %#x, likely %d dwords\n",
1496 			   inst_header, opcode, numdw);
1497 		return numdw;
1498 	}
1499 }
1500 
1501 static int dump_gfxpipe_command(struct drm_printer *p,
1502 				struct xe_gt *gt,
1503 				u32 *dw,
1504 				int remaining_dw)
1505 {
1506 	u32 numdw = instr_dw(*dw);
1507 	u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
1508 	u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
1509 	u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);
1510 
1511 	/*
1512 	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
1513 	 * remaining size of the LRC.
1514 	 */
1515 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
1516 		numdw = remaining_dw;
1517 
1518 	switch (*dw & GFXPIPE_MATCH_MASK) {
1519 #define MATCH(cmd) \
1520 	case cmd: \
1521 		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
1522 		return numdw
1523 #define MATCH3D(cmd) \
1524 	case CMD_##cmd: \
1525 		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
1526 		return numdw
1527 
1528 	MATCH(STATE_BASE_ADDRESS);
1529 	MATCH(STATE_SIP);
1530 	MATCH(GPGPU_CSR_BASE_ADDRESS);
1531 	MATCH(STATE_COMPUTE_MODE);
1532 	MATCH3D(3DSTATE_BTD);
1533 	MATCH(STATE_SYSTEM_MEM_FENCE_ADDRESS);
1534 	MATCH(STATE_CONTEXT_DATA_BASE_ADDRESS);
1535 
1536 	MATCH3D(3DSTATE_VF_STATISTICS);
1537 
1538 	MATCH(PIPELINE_SELECT);
1539 
1540 	MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST);
1541 	MATCH3D(3DSTATE_CLEAR_PARAMS);
1542 	MATCH3D(3DSTATE_DEPTH_BUFFER);
1543 	MATCH3D(3DSTATE_STENCIL_BUFFER);
1544 	MATCH3D(3DSTATE_HIER_DEPTH_BUFFER);
1545 	MATCH3D(3DSTATE_VERTEX_BUFFERS);
1546 	MATCH3D(3DSTATE_VERTEX_ELEMENTS);
1547 	MATCH3D(3DSTATE_INDEX_BUFFER);
1548 	MATCH3D(3DSTATE_VF);
1549 	MATCH3D(3DSTATE_MULTISAMPLE);
1550 	MATCH3D(3DSTATE_CC_STATE_POINTERS);
1551 	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS);
1552 	MATCH3D(3DSTATE_VS);
1553 	MATCH3D(3DSTATE_GS);
1554 	MATCH3D(3DSTATE_CLIP);
1555 	MATCH3D(3DSTATE_SF);
1556 	MATCH3D(3DSTATE_WM);
1557 	MATCH3D(3DSTATE_CONSTANT_VS);
1558 	MATCH3D(3DSTATE_CONSTANT_GS);
1559 	MATCH3D(3DSTATE_CONSTANT_PS);
1560 	MATCH3D(3DSTATE_SAMPLE_MASK);
1561 	MATCH3D(3DSTATE_CONSTANT_HS);
1562 	MATCH3D(3DSTATE_CONSTANT_DS);
1563 	MATCH3D(3DSTATE_HS);
1564 	MATCH3D(3DSTATE_TE);
1565 	MATCH3D(3DSTATE_DS);
1566 	MATCH3D(3DSTATE_STREAMOUT);
1567 	MATCH3D(3DSTATE_SBE);
1568 	MATCH3D(3DSTATE_PS);
1569 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
1570 	MATCH3D(3DSTATE_CPS_POINTERS);
1571 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC);
1572 	MATCH3D(3DSTATE_BLEND_STATE_POINTERS);
1573 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS);
1574 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS);
1575 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS);
1576 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS);
1577 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS);
1578 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS);
1579 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS);
1580 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS);
1581 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS);
1582 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS);
1583 	MATCH3D(3DSTATE_VF_INSTANCING);
1584 	MATCH3D(3DSTATE_VF_SGVS);
1585 	MATCH3D(3DSTATE_VF_TOPOLOGY);
1586 	MATCH3D(3DSTATE_WM_CHROMAKEY);
1587 	MATCH3D(3DSTATE_PS_BLEND);
1588 	MATCH3D(3DSTATE_WM_DEPTH_STENCIL);
1589 	MATCH3D(3DSTATE_PS_EXTRA);
1590 	MATCH3D(3DSTATE_RASTER);
1591 	MATCH3D(3DSTATE_SBE_SWIZ);
1592 	MATCH3D(3DSTATE_WM_HZ_OP);
1593 	MATCH3D(3DSTATE_VF_COMPONENT_PACKING);
1594 	MATCH3D(3DSTATE_VF_SGVS_2);
1595 	MATCH3D(3DSTATE_VFG);
1596 	MATCH3D(3DSTATE_URB_ALLOC_VS);
1597 	MATCH3D(3DSTATE_URB_ALLOC_HS);
1598 	MATCH3D(3DSTATE_URB_ALLOC_DS);
1599 	MATCH3D(3DSTATE_URB_ALLOC_GS);
1600 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_0);
1601 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_1);
1602 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_2);
1603 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_3);
1604 	MATCH3D(3DSTATE_PRIMITIVE_REPLICATION);
1605 	MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO);
1606 	MATCH3D(3DSTATE_AMFS);
1607 	MATCH3D(3DSTATE_DEPTH_BOUNDS);
1608 	MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS);
1609 	MATCH3D(3DSTATE_CONSTANT_TS_POINTER);
1610 	MATCH3D(3DSTATE_MESH_CONTROL);
1611 	MATCH3D(3DSTATE_MESH_DISTRIB);
1612 	MATCH3D(3DSTATE_TASK_REDISTRIB);
1613 	MATCH3D(3DSTATE_MESH_SHADER);
1614 	MATCH3D(3DSTATE_MESH_SHADER_DATA);
1615 	MATCH3D(3DSTATE_TASK_CONTROL);
1616 	MATCH3D(3DSTATE_TASK_SHADER);
1617 	MATCH3D(3DSTATE_TASK_SHADER_DATA);
1618 	MATCH3D(3DSTATE_URB_ALLOC_MESH);
1619 	MATCH3D(3DSTATE_URB_ALLOC_TASK);
1620 	MATCH3D(3DSTATE_CLIP_MESH);
1621 	MATCH3D(3DSTATE_SBE_MESH);
1622 	MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER);
1623 	MATCH3D(3DSTATE_COARSE_PIXEL);
1624 
1625 	MATCH3D(3DSTATE_DRAWING_RECTANGLE);
1626 	MATCH3D(3DSTATE_CHROMA_KEY);
1627 	MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET);
1628 	MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN);
1629 	MATCH3D(3DSTATE_LINE_STIPPLE);
1630 	MATCH3D(3DSTATE_AA_LINE_PARAMETERS);
1631 	MATCH3D(3DSTATE_MONOFILTER_SIZE);
1632 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS);
1633 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS);
1634 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS);
1635 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS);
1636 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS);
1637 	MATCH3D(3DSTATE_SO_DECL_LIST);
1638 	MATCH3D(3DSTATE_SO_BUFFER);
1639 	MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC);
1640 	MATCH3D(3DSTATE_SAMPLE_PATTERN);
1641 	MATCH3D(3DSTATE_3D_MODE);
1642 	MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE);
1643 	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS);
1644 	MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO);
1645 
1646 	default:
1647 		drm_printf(p, "[%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n",
1648 			   *dw, pipeline, opcode, subopcode, numdw);
1649 		return numdw;
1650 	}
1651 }
1652 
1653 static int dump_gfx_state_command(struct drm_printer *p,
1654 				  struct xe_gt *gt,
1655 				  u32 *dw,
1656 				  int remaining_dw)
1657 {
1658 	u32 numdw = instr_dw(*dw);
1659 	u32 opcode = REG_FIELD_GET(GFX_STATE_OPCODE, *dw);
1660 
1661 	/*
1662 	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
1663 	 * remaining size of the LRC.
1664 	 */
1665 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
1666 		numdw = remaining_dw;
1667 
1668 	switch (*dw & (XE_INSTR_GFX_STATE | GFX_STATE_OPCODE)) {
1669 	MATCH(STATE_WRITE_INLINE);
1670 
1671 	default:
1672 		drm_printf(p, "[%#010x] unknown GFX_STATE command (opcode=%#x), likely %d dwords\n",
1673 			   *dw, opcode, numdw);
1674 		return numdw;
1675 	}
1676 }
1677 
1678 void xe_lrc_dump_default(struct drm_printer *p,
1679 			 struct xe_gt *gt,
1680 			 enum xe_engine_class hwe_class)
1681 {
1682 	u32 *dw;
1683 	int remaining_dw, num_dw;
1684 
1685 	if (!gt->default_lrc[hwe_class]) {
1686 		drm_printf(p, "No default LRC for class %d\n", hwe_class);
1687 		return;
1688 	}
1689 
1690 	/*
1691 	 * Skip the beginning of the LRC since it contains the per-process
1692 	 * hardware status page.
1693 	 */
1694 	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
1695 	remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;
1696 
1697 	while (remaining_dw > 0) {
1698 		if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
1699 			num_dw = dump_mi_command(p, gt, dw, remaining_dw);
1700 		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
1701 			num_dw = dump_gfxpipe_command(p, gt, dw, remaining_dw);
1702 		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFX_STATE) {
1703 			num_dw = dump_gfx_state_command(p, gt, dw, remaining_dw);
1704 		} else {
1705 			num_dw = min(instr_dw(*dw), remaining_dw);
1706 			drm_printf(p, "[%#10x] Unknown instruction of type %#x, likely %d dwords\n",
1707 				   *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw),
1708 				   num_dw);
1709 		}
1710 
1711 		dw += num_dw;
1712 		remaining_dw -= num_dw;
1713 	}
1714 }
1715 
1716 struct instr_state {
1717 	u32 instr;
1718 	u16 num_dw;
1719 };
1720 
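/*
 * SVG (3D pipeline) state instructions re-emitted for Wa_14019789679; see
 * xe_lrc_emit_hwe_state_instructions() below.
 */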
1721 static const struct instr_state xe_hpg_svg_state[] = {
1722 	{ .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 },
1723 	{ .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 },
1724 	{ .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 },
1725 	{ .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 },
1726 	{ .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 },
1727 	{ .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 },
1728 	{ .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 },
1729 	{ .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 },
1730 	{ .instr = CMD_3DSTATE_VS, .num_dw = 9 },
1731 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 },
1732 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 },
1733 	{ .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 },
1734 	{ .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 },
1735 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 },
1736 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 },
1737 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 },
1738 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 },
1739 	{ .instr = CMD_3DSTATE_CLIP, .num_dw = 4 },
1740 	{ .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 },
1741 	{ .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 },
1742 	{ .instr = CMD_3DSTATE_SF, .num_dw = 4 },
1743 	{ .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 },
1744 	{ .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 },
1745 	{ .instr = CMD_3DSTATE_RASTER, .num_dw = 5 },
1746 	{ .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 },
1747 	{ .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 },
1748 	{ .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 },
1749 	{ .instr = CMD_3DSTATE_HS, .num_dw = 9 },
1750 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 },
1751 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 },
1752 	{ .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 },
1753 	{ .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 },
1754 	{ .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 },
1755 	{ .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 },
1756 	{ .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 },
1757 	{ .instr = CMD_3DSTATE_TE, .num_dw = 5 },
1758 	{ .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 },
1759 	{ .instr = CMD_3DSTATE_DS, .num_dw = 11 },
1760 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
1761 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
1762 	{ .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
1763 	{ .instr = CMD_3DSTATE_GS, .num_dw = 10 },
1764 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
1765 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
1766 	{ .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
1767 	{ .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
1768 	{ .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
1769 	{ .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
1770 	{ .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
1771 	{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
1772 };
1773 
1774 void xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, struct xe_bb *bb)
1775 {
1776 	struct xe_gt *gt = q->hwe->gt;
1777 	struct xe_device *xe = gt_to_xe(gt);
1778 	const struct instr_state *state_table = NULL;
1779 	int state_table_size = 0;
1780 
1781 	/*
1782 	 * Wa_14019789679
1783 	 *
1784 	 * If the driver doesn't explicitly emit the SVG instructions while
1785 	 * setting up the default LRC, the context switch will write 0's
1786 	 * (noops) into the LRC memory rather than the expected instruction
1787 	 * headers.  Application contexts start out as a copy of the default
1788 	 * LRC, and if they also do not emit specific settings for some SVG
1789 	 * state, then on context restore they'll unintentionally inherit
1790 	 * whatever state setting the previous context had programmed into the
1791 	 * hardware (i.e., the lack of a 3DSTATE_* instruction in the LRC will
1792 	 * prevent the hardware from resetting that state back to any specific
1793 	 * value).
1794 	 *
1795 	 * The official workaround only requires emitting 3DSTATE_MESH_CONTROL
1796 	 * since that's a specific state setting that can easily cause GPU
1797 	 * hangs if unintentionally inherited.  However to be safe we'll
1798 	 * continue to emit all of the SVG state since it's best not to leak
1799 	 * any of the state between contexts, even if that leakage is harmless.
1800 	 */
1801 	if (XE_WA(gt, 14019789679) && q->hwe->class == XE_ENGINE_CLASS_RENDER) {
1802 		state_table = xe_hpg_svg_state;
1803 		state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
1804 	}
1805 
1806 	if (!state_table) {
1807 		xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
1808 			  GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
1809 		return;
1810 	}
1811 
1812 	for (int i = 0; i < state_table_size; i++) {
1813 		u32 instr = state_table[i].instr;
1814 		u16 num_dw = state_table[i].num_dw;
1815 		bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);
1816 
1817 		xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
1818 		xe_gt_assert(gt, num_dw != 0);
1819 		xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));
1820 
1821 		/*
1822 		 * Xe2's SVG context is the same as the one on DG2 / MTL
1823 		 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
1824 		 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
1825 		 * Just make the replacement here rather than defining a
1826 		 * whole separate table for the single trivial change.
1827 		 */
1828 		if (GRAPHICS_VER(xe) >= 20 &&
1829 		    instr == CMD_3DSTATE_DRAWING_RECTANGLE)
1830 			instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;
1831 
1832 		bb->cs[bb->len] = instr;
1833 		if (!is_single_dw)
1834 			bb->cs[bb->len] |= (num_dw - 2);
1835 
1836 		bb->len += num_dw;
1837 	}
1838 }
1839 
1840 struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
1841 {
1842 	struct xe_lrc_snapshot *snapshot = kmalloc(sizeof(*snapshot), GFP_NOWAIT);
1843 
1844 	if (!snapshot)
1845 		return NULL;
1846 
1847 	snapshot->context_desc = xe_lrc_ggtt_addr(lrc);
1848 	snapshot->ring_addr = __xe_lrc_ring_ggtt_addr(lrc);
1849 	snapshot->indirect_context_desc = xe_lrc_indirect_ring_ggtt_addr(lrc);
1850 	snapshot->head = xe_lrc_ring_head(lrc);
1851 	snapshot->tail.internal = lrc->ring.tail;
1852 	snapshot->tail.memory = xe_lrc_ring_tail(lrc);
1853 	snapshot->start = xe_lrc_ring_start(lrc);
1854 	snapshot->start_seqno = xe_lrc_start_seqno(lrc);
1855 	snapshot->seqno = xe_lrc_seqno(lrc);
1856 	snapshot->lrc_bo = xe_bo_get(lrc->bo);
1857 	snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
1858 	snapshot->lrc_size = lrc->size;
1859 	snapshot->lrc_snapshot = NULL;
1860 	snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc));
1861 	snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc);
1862 	return snapshot;
1863 }
1864 
1865 void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot)
1866 {
1867 	struct xe_bo *bo;
1868 	struct iosys_map src;
1869 
1870 	if (!snapshot)
1871 		return;
1872 
1873 	bo = snapshot->lrc_bo;
1874 	snapshot->lrc_bo = NULL;
1875 
1876 	snapshot->lrc_snapshot = kvmalloc(snapshot->lrc_size, GFP_KERNEL);
1877 	if (!snapshot->lrc_snapshot)
1878 		goto put_bo;
1879 
1880 	xe_bo_lock(bo, false);
1881 	if (!ttm_bo_vmap(&bo->ttm, &src)) {
1882 		xe_map_memcpy_from(xe_bo_device(bo),
1883 				   snapshot->lrc_snapshot, &src, snapshot->lrc_offset,
1884 				   snapshot->lrc_size);
1885 		ttm_bo_vunmap(&bo->ttm, &src);
1886 	} else {
1887 		kvfree(snapshot->lrc_snapshot);
1888 		snapshot->lrc_snapshot = NULL;
1889 	}
1890 	xe_bo_unlock(bo);
1891 put_bo:
1892 	xe_bo_put(bo);
1893 }
1894 
1895 void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer *p)
1896 {
1897 	unsigned long i;
1898 
1899 	if (!snapshot)
1900 		return;
1901 
1902 	drm_printf(p, "\tHW Context Desc: 0x%08x\n", snapshot->context_desc);
1903 	drm_printf(p, "\tHW Ring address: 0x%08x\n",
1904 		   snapshot->ring_addr);
1905 	drm_printf(p, "\tHW Indirect Ring State: 0x%08x\n",
1906 		   snapshot->indirect_context_desc);
1907 	drm_printf(p, "\tLRC Head: (memory) %u\n", snapshot->head);
1908 	drm_printf(p, "\tLRC Tail: (internal) %u, (memory) %u\n",
1909 		   snapshot->tail.internal, snapshot->tail.memory);
1910 	drm_printf(p, "\tRing start: (memory) 0x%08x\n", snapshot->start);
1911 	drm_printf(p, "\tStart seqno: (memory) %d\n", snapshot->start_seqno);
1912 	drm_printf(p, "\tSeqno: (memory) %d\n", snapshot->seqno);
1913 	drm_printf(p, "\tTimestamp: 0x%08x\n", snapshot->ctx_timestamp);
1914 	drm_printf(p, "\tJob Timestamp: 0x%08x\n", snapshot->ctx_job_timestamp);
1915 
1916 	if (!snapshot->lrc_snapshot)
1917 		return;
1918 
1919 	drm_printf(p, "\t[HWSP].length: 0x%x\n", LRC_PPHWSP_SIZE);
1920 	drm_puts(p, "\t[HWSP].data: ");
1921 	for (i = 0; i < LRC_PPHWSP_SIZE; i += sizeof(u32)) {
1922 		u32 *val = snapshot->lrc_snapshot + i;
1923 		char dumped[ASCII85_BUFSZ];
1924 
1925 		drm_puts(p, ascii85_encode(*val, dumped));
1926 	}
1927 
1928 	drm_printf(p, "\n\t[HWCTX].length: 0x%lx\n", snapshot->lrc_size - LRC_PPHWSP_SIZE);
1929 	drm_puts(p, "\t[HWCTX].data: ");
1930 	for (; i < snapshot->lrc_size; i += sizeof(u32)) {
1931 		u32 *val = snapshot->lrc_snapshot + i;
1932 		char dumped[ASCII85_BUFSZ];
1933 
1934 		drm_puts(p, ascii85_encode(*val, dumped));
1935 	}
1936 	drm_puts(p, "\n");
1937 }
1938 
1939 void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
1940 {
1941 	if (!snapshot)
1942 		return;
1943 
1944 	kvfree(snapshot->lrc_snapshot);
1945 	if (snapshot->lrc_bo)
1946 		xe_bo_put(snapshot->lrc_bo);
1947 
1948 	kfree(snapshot);
1949 }
1950 
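/*
 * Read the CTX_TIMESTAMP MMIO of the engine identified by @engine_id (as
 * stored by the WA BB) into @reg_ctx_ts.
 */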
1951 static int get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id, u64 *reg_ctx_ts)
1952 {
1953 	u16 class = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id);
1954 	u16 instance = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id);
1955 	struct xe_hw_engine *hwe;
1956 	u64 val;
1957 
1958 	hwe = xe_gt_hw_engine(lrc->gt, class, instance, false);
1959 	if (xe_gt_WARN_ONCE(lrc->gt, !hwe || xe_hw_engine_is_reserved(hwe),
1960 			    "Unexpected engine class:instance %d:%d for context utilization\n",
1961 			    class, instance))
1962 		return -1;
1963 
1964 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
1965 		val = xe_mmio_read64_2x32(&hwe->gt->mmio,
1966 					  RING_CTX_TIMESTAMP(hwe->mmio_base));
1967 	else
1968 		val = xe_mmio_read32(&hwe->gt->mmio,
1969 				     RING_CTX_TIMESTAMP(hwe->mmio_base));
1970 
1971 	*reg_ctx_ts = val;
1972 
1973 	return 0;
1974 }
1975 
1976 /**
1977  * xe_lrc_update_timestamp() - Update ctx timestamp
1978  * @lrc: Pointer to the lrc.
1979  * @old_ts: Old timestamp value
1980  *
1981  * Populate @old_ts with the current saved ctx timestamp, read the new ctx
1982  * timestamp and update the saved value. With support for active contexts,
1983  * the calculation may be slightly racy, so re-read to ensure the context is
1984  * still active before returning the right timestamp.
1985  *
1986  * Returns: New ctx timestamp value
1987  */
1988 u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts)
1989 {
1990 	u64 lrc_ts, reg_ts;
1991 	u32 engine_id;
1992 
1993 	*old_ts = lrc->ctx_timestamp;
1994 
1995 	lrc_ts = xe_lrc_ctx_timestamp(lrc);
1996 	/* CTX_TIMESTAMP mmio read is invalid on VF, so return the LRC value */
1997 	if (IS_SRIOV_VF(lrc_to_xe(lrc))) {
1998 		lrc->ctx_timestamp = lrc_ts;
1999 		goto done;
2000 	}
2001 
2002 	if (lrc_ts == CONTEXT_ACTIVE) {
2003 		engine_id = xe_lrc_engine_id(lrc);
2004 		if (!get_ctx_timestamp(lrc, engine_id, &reg_ts))
2005 			lrc->ctx_timestamp = reg_ts;
2006 
2007 		/* read lrc again to ensure context is still active */
2008 		lrc_ts = xe_lrc_ctx_timestamp(lrc);
2009 	}
2010 
2011 	/*
2012 	 * If the context switched out, just use the lrc_ts. Note that this needs to
2013 	 * be a separate if condition.
2014 	 */
2015 	if (lrc_ts != CONTEXT_ACTIVE)
2016 		lrc->ctx_timestamp = lrc_ts;
2017 
2018 done:
2019 	trace_xe_lrc_update_timestamp(lrc, *old_ts);
2020 
2021 	return lrc->ctx_timestamp;
2022 }
2023 
2024 /**
2025  * xe_lrc_ring_is_idle() - LRC is idle
2026  * @lrc: Pointer to the lrc.
2027  *
2028  * Compare LRC ring head and tail to determine if idle.
2029  *
2030  * Return: True if the ring is idle, False otherwise
2031  */
2032 bool xe_lrc_ring_is_idle(struct xe_lrc *lrc)
2033 {
2034 	return xe_lrc_ring_head(lrc) == xe_lrc_ring_tail(lrc);
2035 }
2036