1 // SPDX-License-Identifier: MIT
2 /*
3 * Copyright © 2021 Intel Corporation
4 */
5
6 #include "xe_lrc.h"
7
8 #include <generated/xe_wa_oob.h>
9
10 #include <linux/ascii85.h>
11 #include <linux/panic.h>
12
13 #include "instructions/xe_mi_commands.h"
14 #include "instructions/xe_gfxpipe_commands.h"
15 #include "instructions/xe_gfx_state_commands.h"
16 #include "regs/xe_engine_regs.h"
17 #include "regs/xe_lrc_layout.h"
18 #include "xe_bb.h"
19 #include "xe_bo.h"
20 #include "xe_configfs.h"
21 #include "xe_device.h"
22 #include "xe_drm_client.h"
23 #include "xe_exec_queue_types.h"
24 #include "xe_gt.h"
25 #include "xe_gt_printk.h"
26 #include "xe_hw_fence.h"
27 #include "xe_map.h"
28 #include "xe_memirq.h"
29 #include "xe_mmio.h"
30 #include "xe_sriov.h"
31 #include "xe_trace_lrc.h"
32 #include "xe_vm.h"
33 #include "xe_wa.h"
34
35 #define LRC_VALID BIT_ULL(0)
36 #define LRC_PRIVILEGE BIT_ULL(8)
37 #define LRC_ADDRESSING_MODE GENMASK_ULL(4, 3)
38 #define LRC_LEGACY_64B_CONTEXT 3
39
40 #define LRC_ENGINE_CLASS GENMASK_ULL(63, 61)
41 #define LRC_ENGINE_INSTANCE GENMASK_ULL(53, 48)
42
43 #define LRC_PPHWSP_SIZE SZ_4K
44 #define LRC_INDIRECT_CTX_BO_SIZE SZ_4K
45 #define LRC_INDIRECT_RING_STATE_SIZE SZ_4K
46
47 #define LRC_PRIORITY GENMASK_ULL(10, 9)
48 #define LRC_PRIORITY_LOW 0
49 #define LRC_PRIORITY_NORMAL 1
50 #define LRC_PRIORITY_HIGH 2
51
52 /*
53 * Layout of the LRC and associated data allocated as
54 * lrc->bo:
55 *
56 * Region Size
57 * +============================+=================================+ <- __xe_lrc_ring_offset()
58 * | Ring | ring_size, see |
59 * | | xe_lrc_init() |
60 * +============================+=================================+ <- __xe_lrc_pphwsp_offset()
61 * | PPHWSP (includes SW state) | 4K |
62 * +----------------------------+---------------------------------+ <- __xe_lrc_regs_offset()
63 * | Engine Context Image | n * 4K, see |
64 * | | xe_gt_lrc_size() |
65 * +----------------------------+---------------------------------+ <- __xe_lrc_indirect_ring_offset()
66 * | Indirect Ring State Page | 0 or 4k, see |
67 * | | XE_LRC_FLAG_INDIRECT_RING_STATE |
68 * +============================+=================================+ <- __xe_lrc_indirect_ctx_offset()
69 * | Indirect Context Page | 0 or 4k, see |
70 * | | XE_LRC_FLAG_INDIRECT_CTX |
71 * +============================+=================================+ <- __xe_lrc_wa_bb_offset()
72 * | WA BB Per Ctx | 4k |
73 * +============================+=================================+ <- xe_bo_size(lrc->bo)
74 */
75
76 static struct xe_device *
lrc_to_xe(struct xe_lrc * lrc)77 lrc_to_xe(struct xe_lrc *lrc)
78 {
79 return gt_to_xe(lrc->fence_ctx.gt);
80 }
81
82 static bool
gt_engine_needs_indirect_ctx(struct xe_gt * gt,enum xe_engine_class class)83 gt_engine_needs_indirect_ctx(struct xe_gt *gt, enum xe_engine_class class)
84 {
85 struct xe_device *xe = gt_to_xe(gt);
86
87 if (XE_GT_WA(gt, 16010904313) &&
88 (class == XE_ENGINE_CLASS_RENDER ||
89 class == XE_ENGINE_CLASS_COMPUTE))
90 return true;
91
92 if (xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
93 class, NULL))
94 return true;
95
96 return false;
97 }
98
99 /**
100 * xe_gt_lrc_hang_replay_size() - Hang replay size
101 * @gt: The GT
102 * @class: Hardware engine class
103 *
104 * Determine size of GPU hang replay state for a GT and hardware engine class.
105 *
106 * Return: Size of GPU hang replay size
107 */
xe_gt_lrc_hang_replay_size(struct xe_gt * gt,enum xe_engine_class class)108 size_t xe_gt_lrc_hang_replay_size(struct xe_gt *gt, enum xe_engine_class class)
109 {
110 struct xe_device *xe = gt_to_xe(gt);
111 size_t size = 0;
112
113 /* Engine context image */
114 switch (class) {
115 case XE_ENGINE_CLASS_RENDER:
116 if (GRAPHICS_VER(xe) >= 20)
117 size += 3 * SZ_4K;
118 else
119 size += 13 * SZ_4K;
120 break;
121 case XE_ENGINE_CLASS_COMPUTE:
122 if (GRAPHICS_VER(xe) >= 20)
123 size += 2 * SZ_4K;
124 else
125 size += 13 * SZ_4K;
126 break;
127 default:
128 WARN(1, "Unknown engine class: %d", class);
129 fallthrough;
130 case XE_ENGINE_CLASS_COPY:
131 case XE_ENGINE_CLASS_VIDEO_DECODE:
132 case XE_ENGINE_CLASS_VIDEO_ENHANCE:
133 case XE_ENGINE_CLASS_OTHER:
134 size += 1 * SZ_4K;
135 }
136
137 return size;
138 }
139
xe_gt_lrc_size(struct xe_gt * gt,enum xe_engine_class class)140 size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
141 {
142 size_t size = xe_gt_lrc_hang_replay_size(gt, class);
143
144 /* Add indirect ring state page */
145 if (xe_gt_has_indirect_ring_state(gt))
146 size += LRC_INDIRECT_RING_STATE_SIZE;
147
148 return size + LRC_PPHWSP_SIZE;
149 }
150
151 /*
152 * The per-platform tables are u8-encoded in @data. Decode @data and set the
153 * addresses' offset and commands in @regs. The following encoding is used
154 * for each byte. There are 2 steps: decoding commands and decoding addresses.
155 *
156 * Commands:
157 * [7]: create NOPs - number of NOPs are set in lower bits
158 * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set
159 * MI_LRI_FORCE_POSTED
160 * [5:0]: Number of NOPs or registers to set values to in case of
161 * MI_LOAD_REGISTER_IMM
162 *
163 * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
164 * number of registers. They are set by using the REG/REG16 macros: the former
165 * is used for offsets smaller than 0x200 while the latter is for values bigger
166 * than that. Those macros already set all the bits documented below correctly:
167 *
168 * [7]: When a register offset needs more than 6 bits, use additional bytes, to
169 * follow, for the lower bits
170 * [6:0]: Register offset, without considering the engine base.
171 *
172 * This function only tweaks the commands and register offsets. Values are not
173 * filled out.
174 */
static void set_offsets(u32 *regs,
			const u8 *data,
			const struct xe_hw_engine *hwe)
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | \
			   BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
{
	const u32 base = hwe->mmio_base;

	/* A zero byte terminates the encoded table */
	while (*data) {
		u8 count, flags;

		if (*data & BIT(7)) { /* skip */
			count = *data++ & ~BIT(7);
			regs += count;
			continue;
		}

		/* LRI header byte: [5:0] register count, bit 6 = POSTED */
		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		*regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		*regs |= MI_LRI_LRM_CS_MMIO;
		regs++;

		xe_gt_assert(hwe->gt, count);
		do {
			u32 offset = 0;
			u8 v;

			/* Variable-length offset: bit 7 marks a continuation byte */
			do {
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			regs[0] = base + (offset << 2);
			regs += 2; /* skip past the (unfilled) value dword */
		} while (--count);
	}

	*regs = MI_BATCH_BUFFER_END | BIT(0);
}
226
/*
 * Encoded context-image register offsets (see set_offsets() above) for
 * Xe_LP (gen12) non-render engines.
 */
static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	0
};

/*
 * DG2+ non-render engines: as gen12_xcs_offsets with two additional
 * registers (0x120, 0x124) in the first LRI group.
 */
static const u8 dg2_xcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	0
};
292
/*
 * Encoded context-image register offsets (see set_offsets() above) for
 * the Xe_LP (gen12) render engine.
 */
static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),
	NOP(3 + 9 + 1),

	LRI(51, POSTED),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),
	REG(0x084),
	NOP(1),

	0
};
388
/* Xe_HP (version 12.50) render engine context-image offsets. */
static const u8 xehp_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};

/*
 * DG2 (version 12.55+) render engine: first LRI group gains registers
 * 0x120/0x124 relative to xehp_rcs_offsets.
 */
static const u8 dg2_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};
472
/*
 * MTL (version 12.70+) render engine: like dg2_rcs_offsets but the
 * third group drops register 0x1b0 (LRI(2) after NOP(2)).
 */
static const u8 mtl_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(2),
	LRI(2, POSTED),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};
515
/*
 * Entries common to all Xe2+ engine context images. Bracketed values
 * are dword indices within the decoded register state (each REG/REG16
 * entry produces an address/value dword pair).
 */
#define XE2_CTX_COMMON \
	NOP(1),			/* [0x00] */ \
	LRI(15, POSTED),	/* [0x01] */ \
	REG16(0x244),		/* [0x02] CTXT_SR_CTL */ \
	REG(0x034),		/* [0x04] RING_BUFFER_HEAD */ \
	REG(0x030),		/* [0x06] RING_BUFFER_TAIL */ \
	REG(0x038),		/* [0x08] RING_BUFFER_START */ \
	REG(0x03c),		/* [0x0a] RING_BUFFER_CONTROL */ \
	REG(0x168),		/* [0x0c] BB_ADDR_UDW */ \
	REG(0x140),		/* [0x0e] BB_ADDR */ \
	REG(0x110),		/* [0x10] BB_STATE */ \
	REG(0x1c0),		/* [0x12] BB_PER_CTX_PTR */ \
	REG(0x1c4),		/* [0x14] RCS_INDIRECT_CTX */ \
	REG(0x1c8),		/* [0x16] RCS_INDIRECT_CTX_OFFSET */ \
	REG(0x180),		/* [0x18] CCID */ \
	REG16(0x2b4),		/* [0x1a] SEMAPHORE_TOKEN */ \
	REG(0x120),		/* [0x1c] PRT_BB_STATE */ \
	REG(0x124),		/* [0x1e] PRT_BB_STATE_UDW */ \
	\
	NOP(1),			/* [0x20] */ \
	LRI(9, POSTED),		/* [0x21] */ \
	REG16(0x3a8),		/* [0x22] CTX_TIMESTAMP */ \
	REG16(0x3ac),		/* [0x24] CTX_TIMESTAMP_UDW */ \
	REG(0x108),		/* [0x26] INDIRECT_RING_STATE */ \
	REG16(0x284),		/* [0x28] dummy reg */ \
	REG16(0x280),		/* [0x2a] CS_ACC_CTR_THOLD */ \
	REG16(0x27c),		/* [0x2c] CS_CTX_SYS_PASID */ \
	REG16(0x278),		/* [0x2e] CS_CTX_ASID */ \
	REG16(0x274),		/* [0x30] PTBP_UDW */ \
	REG16(0x270)		/* [0x32] PTBP_LDW */

/* Xe2+ render engine context-image offsets. */
static const u8 xe2_rcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(2),			/* [0x34] */
	LRI(2, POSTED),		/* [0x36] */
	REG16(0x5a8),		/* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
	REG16(0x5ac),		/* [0x39] PREEMPTION_STATUS */

	NOP(6),			/* [0x41] */
	LRI(1, 0),		/* [0x47] */
	REG(0x0c8),		/* [0x48] R_PWR_CLK_STATE */

	0
};

/* Xe2+ blitter engine context-image offsets. */
static const u8 xe2_bcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(4 + 8 + 1),		/* [0x34] */
	LRI(2, POSTED),		/* [0x41] */
	REG16(0x200),		/* [0x42] BCS_SWCTRL */
	REG16(0x204),		/* [0x44] BLIT_CCTL */

	0
};

/* Xe2+ video/other engine context-image offsets: common part only. */
static const u8 xe2_xcs_offsets[] = {
	XE2_CTX_COMMON,

	0
};
578
/*
 * Xe2+ indirect ring state page register offsets. Consecutive REG16
 * entries occupy two dwords each, so the second BB_STACK_WRITE_PORT
 * sits at [0x1a], not [0x20] as a previous comment claimed.
 */
static const u8 xe2_indirect_ring_state_offsets[] = {
	NOP(1),			/* [0x00] */
	LRI(5, POSTED),		/* [0x01] */
	REG(0x034),		/* [0x02] RING_BUFFER_HEAD */
	REG(0x030),		/* [0x04] RING_BUFFER_TAIL */
	REG(0x038),		/* [0x06] RING_BUFFER_START */
	REG(0x048),		/* [0x08] RING_BUFFER_START_UDW */
	REG(0x03c),		/* [0x0a] RING_BUFFER_CONTROL */

	NOP(5),			/* [0x0c] */
	LRI(9, POSTED),		/* [0x11] */
	REG(0x168),		/* [0x12] BB_ADDR_UDW */
	REG(0x140),		/* [0x14] BB_ADDR */
	REG(0x110),		/* [0x16] BB_STATE */
	REG16(0x588),		/* [0x18] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x1a] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x1c] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x1e] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x20] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x22] BB_STACK_WRITE_PORT */

	NOP(12),		/* [0x24] */

	0
};

#undef REG16
#undef REG
#undef LRI
#undef NOP
609
reg_offsets(struct xe_device * xe,enum xe_engine_class class)610 static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
611 {
612 if (class == XE_ENGINE_CLASS_RENDER) {
613 if (GRAPHICS_VER(xe) >= 20)
614 return xe2_rcs_offsets;
615 else if (GRAPHICS_VERx100(xe) >= 1270)
616 return mtl_rcs_offsets;
617 else if (GRAPHICS_VERx100(xe) >= 1255)
618 return dg2_rcs_offsets;
619 else if (GRAPHICS_VERx100(xe) >= 1250)
620 return xehp_rcs_offsets;
621 else
622 return gen12_rcs_offsets;
623 } else if (class == XE_ENGINE_CLASS_COPY) {
624 if (GRAPHICS_VER(xe) >= 20)
625 return xe2_bcs_offsets;
626 else
627 return gen12_xcs_offsets;
628 } else {
629 if (GRAPHICS_VER(xe) >= 20)
630 return xe2_xcs_offsets;
631 else if (GRAPHICS_VERx100(xe) >= 1255)
632 return dg2_xcs_offsets;
633 else
634 return gen12_xcs_offsets;
635 }
636 }
637
/* Program CTX_CONTEXT_CONTROL with the masked-write enable bits. */
static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
{
	u32 ctl = CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
		  CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;

	if (xe_gt_has_indirect_ring_state(hwe->gt))
		ctl |= CTX_CTRL_INDIRECT_RING_STATE_ENABLE;

	regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(ctl);
}
647
/*
 * Program the LRC register state so engine interrupts are delivered via
 * memory-based interrupt (memirq) pages instead of MMIO. No-op when the
 * device does not use memirq.
 */
static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe)
{
	struct xe_memirq *memirq = &gt_to_tile(hwe->gt)->memirq;
	struct xe_device *xe = gt_to_xe(hwe->gt);
	u8 num_regs;

	if (!xe_device_uses_memirq(xe))
		return;

	/* LRM loading RING_IMR from the memirq enable page */
	regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM |
					MI_LRI_LRM_CS_MMIO | MI_LRM_USE_GGTT;
	regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr;
	regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq);

	/* With MSI-X, one extra register (CS_INT_VEC) joins the LRI below */
	num_regs = xe_device_has_msix(xe) ? 3 : 2;
	regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(num_regs) |
				       MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED;
	regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr;
	regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq, hwe);
	regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr;
	regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq, hwe);

	if (xe_device_has_msix(xe)) {
		regs[CTX_CS_INT_VEC_REG] = CS_INT_VEC(0).addr;
		/* CTX_CS_INT_VEC_DATA will be set in xe_lrc_init */
	}
}
675
lrc_ring_mi_mode(struct xe_hw_engine * hwe)676 static int lrc_ring_mi_mode(struct xe_hw_engine *hwe)
677 {
678 struct xe_device *xe = gt_to_xe(hwe->gt);
679
680 if (GRAPHICS_VERx100(xe) >= 1250)
681 return 0x70;
682 else
683 return 0x60;
684 }
685
/* Clear STOP_RING via a masked write: value bit 0, mask bit set. */
static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe)
{
	int x = lrc_ring_mi_mode(hwe);

	regs[x + 1] = (regs[x + 1] & ~STOP_RING) | (STOP_RING << 16);
}
694
xe_lrc_has_indirect_ring_state(struct xe_lrc * lrc)695 static inline bool xe_lrc_has_indirect_ring_state(struct xe_lrc *lrc)
696 {
697 return lrc->flags & XE_LRC_FLAG_INDIRECT_RING_STATE;
698 }
699
/* The ring always starts at offset 0 of lrc->bo (see layout above). */
static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
{
	return 0;
}
704
/* The PPHWSP immediately follows the ring within lrc->bo. */
u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
{
	return lrc->ring.size;
}
709
710 /* Make the magic macros work */
711 #define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset
712 #define __xe_lrc_regs_offset xe_lrc_regs_offset
713
714 #define LRC_SEQNO_PPHWSP_OFFSET 512
715 #define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
716 #define LRC_CTX_JOB_TIMESTAMP_OFFSET (LRC_START_SEQNO_PPHWSP_OFFSET + 8)
717 #define LRC_ENGINE_ID_PPHWSP_OFFSET 1024
718 #define LRC_PARALLEL_PPHWSP_OFFSET 2048
719
/**
 * xe_lrc_regs_offset() - Offset of the LRC register state within lrc->bo
 * @lrc: Pointer to the lrc.
 *
 * Returns: offset of the engine context image, immediately past the PPHWSP
 */
u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
{
	return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
}
724
725 /**
726 * xe_lrc_reg_size() - Get size of the LRC registers area within queues
727 * @xe: the &xe_device struct instance
728 *
729 * Returns: Size of the LRC registers area for current platform
730 */
xe_lrc_reg_size(struct xe_device * xe)731 size_t xe_lrc_reg_size(struct xe_device *xe)
732 {
733 if (GRAPHICS_VERx100(xe) >= 1250)
734 return 96 * sizeof(u32);
735 else
736 return 80 * sizeof(u32);
737 }
738
/* Bytes to skip from the start of an LRC to get past PPHWSP + registers. */
size_t xe_lrc_skip_size(struct xe_device *xe)
{
	return LRC_PPHWSP_SIZE + xe_lrc_reg_size(xe);
}
743
static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
{
	/* The seqno is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_SEQNO_PPHWSP_OFFSET;
}

static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
{
	/* The start seqno is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_START_SEQNO_PPHWSP_OFFSET;
}

static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc)
{
	/* This is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET;
}

static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
{
	/* The parallel is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
}

static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc)
{
	/* Engine ID scratch also lives in the driver-defined PPHWSP area */
	return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET;
}
772
static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
{
	/* CTX_TIMESTAMP lives at a fixed dword index in the register state */
	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
}

static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc)
{
	/* Upper dword of the context timestamp (64-bit platforms only) */
	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32);
}
782
__xe_lrc_indirect_ring_offset(struct xe_lrc * lrc)783 static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
784 {
785 u32 offset = xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE -
786 LRC_INDIRECT_RING_STATE_SIZE;
787
788 if (lrc->flags & XE_LRC_FLAG_INDIRECT_CTX)
789 offset -= LRC_INDIRECT_CTX_BO_SIZE;
790
791 return offset;
792 }
793
static inline u32 __xe_lrc_indirect_ctx_offset(struct xe_lrc *lrc)
{
	/* Indirect context page sits just before the trailing WA BB */
	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE - LRC_INDIRECT_CTX_BO_SIZE;
}

static inline u32 __xe_lrc_wa_bb_offset(struct xe_lrc *lrc)
{
	/* The per-context WA BB occupies the final bytes of lrc->bo */
	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE;
}
803
/*
 * For each LRC sub-region <elem> that has an __xe_lrc_<elem>_offset()
 * helper, generate __xe_lrc_<elem>_map() (iosys_map into lrc->bo) and
 * __xe_lrc_<elem>_ggtt_addr() accessors.
 */
#define DECL_MAP_ADDR_HELPERS(elem) \
static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \
{ \
	struct iosys_map map = lrc->bo->vmap; \
\
	xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map));  \
	iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \
	return map; \
} \
static inline u32 __maybe_unused __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \
{ \
	return xe_bo_ggtt_addr(lrc->bo) + __xe_lrc_##elem##_offset(lrc); \
} \

DECL_MAP_ADDR_HELPERS(ring)
DECL_MAP_ADDR_HELPERS(pphwsp)
DECL_MAP_ADDR_HELPERS(seqno)
DECL_MAP_ADDR_HELPERS(regs)
DECL_MAP_ADDR_HELPERS(start_seqno)
DECL_MAP_ADDR_HELPERS(ctx_job_timestamp)
DECL_MAP_ADDR_HELPERS(ctx_timestamp)
DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw)
DECL_MAP_ADDR_HELPERS(parallel)
DECL_MAP_ADDR_HELPERS(indirect_ring)
DECL_MAP_ADDR_HELPERS(engine_id)

#undef DECL_MAP_ADDR_HELPERS
831
/**
 * xe_lrc_ctx_timestamp_ggtt_addr() - Get ctx timestamp GGTT address
 * @lrc: Pointer to the lrc.
 *
 * Returns: GGTT address of the CTX_TIMESTAMP dword in the LRC register state
 */
u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
}

/**
 * xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address
 * @lrc: Pointer to the lrc.
 *
 * Returns: GGTT address of the CTX_TIMESTAMP_UDW dword in the LRC register state
 */
u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
}
853
854 /**
855 * xe_lrc_ctx_timestamp() - Read ctx timestamp value
856 * @lrc: Pointer to the lrc.
857 *
858 * Returns: ctx timestamp value
859 */
xe_lrc_ctx_timestamp(struct xe_lrc * lrc)860 static u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
861 {
862 struct xe_device *xe = lrc_to_xe(lrc);
863 struct iosys_map map;
864 u32 ldw, udw = 0;
865
866 map = __xe_lrc_ctx_timestamp_map(lrc);
867 ldw = xe_map_read32(xe, &map);
868
869 if (xe->info.has_64bit_timestamp) {
870 map = __xe_lrc_ctx_timestamp_udw_map(lrc);
871 udw = xe_map_read32(xe, &map);
872 }
873
874 return (u64)udw << 32 | ldw;
875 }
876
/**
 * xe_lrc_ctx_job_timestamp_ggtt_addr() - Get ctx job timestamp GGTT address
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx job timestamp GGTT address
 */
u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_ctx_job_timestamp_ggtt_addr(lrc);
}
887
888 /**
889 * xe_lrc_ctx_job_timestamp() - Read ctx job timestamp value
890 * @lrc: Pointer to the lrc.
891 *
892 * Returns: ctx timestamp job value
893 */
xe_lrc_ctx_job_timestamp(struct xe_lrc * lrc)894 u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc)
895 {
896 struct xe_device *xe = lrc_to_xe(lrc);
897 struct iosys_map map;
898
899 map = __xe_lrc_ctx_job_timestamp_map(lrc);
900 return xe_map_read32(xe, &map);
901 }
902
/* The LRC's GGTT address starts at its PPHWSP (the ring precedes it). */
u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_pphwsp_ggtt_addr(lrc);
}

/* GGTT address of the indirect ring state page, or 0 when not present. */
u32 xe_lrc_indirect_ring_ggtt_addr(struct xe_lrc *lrc)
{
	if (!xe_lrc_has_indirect_ring_state(lrc))
		return 0;

	return __xe_lrc_indirect_ring_ggtt_addr(lrc);
}
915
xe_lrc_read_indirect_ctx_reg(struct xe_lrc * lrc,int reg_nr)916 static u32 xe_lrc_read_indirect_ctx_reg(struct xe_lrc *lrc, int reg_nr)
917 {
918 struct xe_device *xe = lrc_to_xe(lrc);
919 struct iosys_map map;
920
921 map = __xe_lrc_indirect_ring_map(lrc);
922 iosys_map_incr(&map, reg_nr * sizeof(u32));
923 return xe_map_read32(xe, &map);
924 }
925
/* Write one dword into the indirect ring state page. */
static void xe_lrc_write_indirect_ctx_reg(struct xe_lrc *lrc,
					  int reg_nr, u32 val)
{
	struct iosys_map map = __xe_lrc_indirect_ring_map(lrc);

	iosys_map_incr(&map, reg_nr * sizeof(u32));
	xe_map_write32(lrc_to_xe(lrc), &map, val);
}
936
xe_lrc_read_ctx_reg(struct xe_lrc * lrc,int reg_nr)937 u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
938 {
939 struct xe_device *xe = lrc_to_xe(lrc);
940 struct iosys_map map;
941
942 map = __xe_lrc_regs_map(lrc);
943 iosys_map_incr(&map, reg_nr * sizeof(u32));
944 return xe_map_read32(xe, &map);
945 }
946
/* Write one dword into the LRC register state at index @reg_nr. */
void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
{
	struct iosys_map map = __xe_lrc_regs_map(lrc);

	iosys_map_incr(&map, reg_nr * sizeof(u32));
	xe_map_write32(lrc_to_xe(lrc), &map, val);
}
956
/*
 * Build a zeroed default LRC image for @hwe: PPHWSP followed by the
 * register state, plus the indirect ring state page when the GT has
 * one. Returns a kzalloc'd buffer (caller owns it) or NULL on OOM.
 */
static void *empty_lrc_data(struct xe_hw_engine *hwe)
{
	struct xe_gt *gt = hwe->gt;
	void *data;
	u32 *regs;

	data = kzalloc(xe_gt_lrc_size(gt, hwe->class), GFP_KERNEL);
	if (!data)
		return NULL;

	/* 1st page: Per-Process of HW status Page */
	regs = data + LRC_PPHWSP_SIZE;
	set_offsets(regs, reg_offsets(gt_to_xe(gt), hwe->class), hwe);
	set_context_control(regs, hwe);
	set_memory_based_intr(regs, hwe);
	reset_stop_ring(regs, hwe);
	if (xe_gt_has_indirect_ring_state(gt)) {
		/* Indirect ring state is the last page of the image */
		regs = data + xe_gt_lrc_size(gt, hwe->class) -
		       LRC_INDIRECT_RING_STATE_SIZE;
		set_offsets(regs, xe2_indirect_ring_state_offsets, hwe);
	}

	return data;
}
981
982 /**
983 * xe_default_lrc_update_memirq_regs_with_address - Re-compute GGTT references in default LRC
984 * of given engine.
985 * @hwe: the &xe_hw_engine struct instance
986 */
xe_default_lrc_update_memirq_regs_with_address(struct xe_hw_engine * hwe)987 void xe_default_lrc_update_memirq_regs_with_address(struct xe_hw_engine *hwe)
988 {
989 struct xe_gt *gt = hwe->gt;
990 u32 *regs;
991
992 if (!gt->default_lrc[hwe->class])
993 return;
994
995 regs = gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE;
996 set_memory_based_intr(regs, hwe);
997 }
998
999 /**
1000 * xe_lrc_update_memirq_regs_with_address - Re-compute GGTT references in mem interrupt data
1001 * for given LRC.
1002 * @lrc: the &xe_lrc struct instance
1003 * @hwe: the &xe_hw_engine struct instance
1004 * @regs: scratch buffer to be used as temporary storage
1005 */
void xe_lrc_update_memirq_regs_with_address(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
					    u32 *regs)
{
	struct xe_device *xe = gt_to_xe(hwe->gt);
	struct iosys_map map;
	size_t regs_len;

	if (!xe_device_uses_memirq(xe))
		return;

	/* Copy the register state out, patch the memirq pointers, copy back */
	map = __xe_lrc_regs_map(lrc);
	regs_len = xe_lrc_reg_size(xe);
	xe_map_memcpy_from(xe, regs, &map, 0, regs_len);
	set_memory_based_intr(regs, hwe);
	xe_map_memcpy_to(xe, &map, 0, regs, regs_len);
}
1022
xe_lrc_set_ppgtt(struct xe_lrc * lrc,struct xe_vm * vm)1023 static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
1024 {
1025 u64 desc = xe_vm_pdp4_descriptor(vm, gt_to_tile(lrc->gt));
1026
1027 xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
1028 xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
1029 }
1030
/* Tear down the fence context and release the LRC's backing BO. */
static void xe_lrc_finish(struct xe_lrc *lrc)
{
	xe_hw_fence_ctx_finish(&lrc->fence_ctx);
	xe_bo_unpin_map_no_vm(lrc->bo);
}
1036
1037 /*
1038 * wa_bb_setup_utilization() - Write commands to wa bb to assist
1039 * in calculating active context run ticks.
1040 *
1041 * Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the
1042 * context, but only gets updated when the context switches out. In order to
1043 * check how long a context has been active before it switches out, two things
1044 * are required:
1045 *
1046 * (1) Determine if the context is running:
1047 * To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in
1048 * the LRC. The value chosen is 1 since 0 is the initial value when the LRC is
1049 * initialized. During a query, we just check for this value to determine if the
1050 * context is active. If the context switched out, it would overwrite this
1051 * location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as
1052 * the last part of context restore, so reusing this LRC location will not
1053 * clobber anything.
1054 *
1055 * (2) Calculate the time that the context has been active for:
1056 * The CTX_TIMESTAMP ticks only when the context is active. If a context is
1057 * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization.
1058 * While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific
1059 * engine instance. Since we do not know which instance the context is running
1060 * on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and
1061 * store it in the PPHSWP.
1062 */
#define CONTEXT_ACTIVE 1ULL
static ssize_t setup_utilization_wa(struct xe_lrc *lrc,
				    struct xe_hw_engine *hwe,
				    u32 *batch,
				    size_t max_len)
{
	u32 *cmd = batch;

	/* VFs don't emit this section of the WA BB */
	if (IS_SRIOV_VF(gt_to_xe(lrc->gt)))
		return 0;

	/* Worst case is 12 dwords (with the 64-bit timestamp block below) */
	if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
		return -ENOSPC;

	/* Record which engine instance the context is scheduled on */
	*cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET;
	*cmd++ = ENGINE_ID(0).addr;
	*cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc);
	*cmd++ = 0;

	/* Mark the context "active" by seeding CTX_TIMESTAMP with 1 */
	*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
	*cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
	*cmd++ = 0;
	*cmd++ = lower_32_bits(CONTEXT_ACTIVE);

	if (lrc_to_xe(lrc)->info.has_64bit_timestamp) {
		*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
		*cmd++ = __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
		*cmd++ = 0;
		*cmd++ = upper_32_bits(CONTEXT_ACTIVE);
	}

	/* Number of dwords written */
	return cmd - batch;
}
1096
/*
 * WA 16010904313 BB section: reload RING_CTX_TIMESTAMP from the LRC's
 * saved CTX_TIMESTAMP on context restore. Three back-to-back LRM loads
 * are emitted (two async, one synchronous) — presumably required by the
 * workaround description; confirm against the WA database entry.
 *
 * Returns the number of dwords written, 0 when the WA does not apply,
 * or -ENOSPC if @max_len dwords is too small.
 */
static ssize_t setup_timestamp_wa(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
				  u32 *batch, size_t max_len)
{
	const u32 ts_addr = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
	u32 *cmd = batch;

	if (!XE_GT_WA(lrc->gt, 16010904313) ||
	    !(hwe->class == XE_ENGINE_CLASS_RENDER ||
	      hwe->class == XE_ENGINE_CLASS_COMPUTE ||
	      hwe->class == XE_ENGINE_CLASS_COPY ||
	      hwe->class == XE_ENGINE_CLASS_VIDEO_DECODE ||
	      hwe->class == XE_ENGINE_CLASS_VIDEO_ENHANCE))
		return 0;

	if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
		return -ENOSPC;

	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
		 MI_LRM_ASYNC;
	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
	*cmd++ = ts_addr;
	*cmd++ = 0;

	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
		 MI_LRM_ASYNC;
	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
	*cmd++ = ts_addr;
	*cmd++ = 0;

	/* Final load is synchronous (no MI_LRM_ASYNC) */
	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO;
	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
	*cmd++ = ts_addr;
	*cmd++ = 0;

	return cmd - batch;
}
1133
setup_configfs_post_ctx_restore_bb(struct xe_lrc * lrc,struct xe_hw_engine * hwe,u32 * batch,size_t max_len)1134 static ssize_t setup_configfs_post_ctx_restore_bb(struct xe_lrc *lrc,
1135 struct xe_hw_engine *hwe,
1136 u32 *batch, size_t max_len)
1137 {
1138 struct xe_device *xe = gt_to_xe(lrc->gt);
1139 const u32 *user_batch;
1140 u32 *cmd = batch;
1141 u32 count;
1142
1143 count = xe_configfs_get_ctx_restore_post_bb(to_pci_dev(xe->drm.dev),
1144 hwe->class, &user_batch);
1145 if (!count)
1146 return 0;
1147
1148 if (count > max_len)
1149 return -ENOSPC;
1150
1151 /*
1152 * This should be used only for tests and validation. Taint the kernel
1153 * as anything could be submitted directly in context switches
1154 */
1155 add_taint(TAINT_TEST, LOCKDEP_STILL_OK);
1156
1157 memcpy(cmd, user_batch, count * sizeof(u32));
1158 cmd += count;
1159
1160 return cmd - batch;
1161 }
1162
setup_configfs_mid_ctx_restore_bb(struct xe_lrc * lrc,struct xe_hw_engine * hwe,u32 * batch,size_t max_len)1163 static ssize_t setup_configfs_mid_ctx_restore_bb(struct xe_lrc *lrc,
1164 struct xe_hw_engine *hwe,
1165 u32 *batch, size_t max_len)
1166 {
1167 struct xe_device *xe = gt_to_xe(lrc->gt);
1168 const u32 *user_batch;
1169 u32 *cmd = batch;
1170 u32 count;
1171
1172 count = xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
1173 hwe->class, &user_batch);
1174 if (!count)
1175 return 0;
1176
1177 if (count > max_len)
1178 return -ENOSPC;
1179
1180 /*
1181 * This should be used only for tests and validation. Taint the kernel
1182 * as anything could be submitted directly in context switches
1183 */
1184 add_taint(TAINT_TEST, LOCKDEP_STILL_OK);
1185
1186 memcpy(cmd, user_batch, count * sizeof(u32));
1187 cmd += count;
1188
1189 return cmd - batch;
1190 }
1191
setup_invalidate_state_cache_wa(struct xe_lrc * lrc,struct xe_hw_engine * hwe,u32 * batch,size_t max_len)1192 static ssize_t setup_invalidate_state_cache_wa(struct xe_lrc *lrc,
1193 struct xe_hw_engine *hwe,
1194 u32 *batch, size_t max_len)
1195 {
1196 u32 *cmd = batch;
1197
1198 if (!XE_GT_WA(lrc->gt, 18022495364) ||
1199 hwe->class != XE_ENGINE_CLASS_RENDER)
1200 return 0;
1201
1202 if (xe_gt_WARN_ON(lrc->gt, max_len < 3))
1203 return -ENOSPC;
1204
1205 *cmd++ = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1);
1206 *cmd++ = CS_DEBUG_MODE2(0).addr;
1207 *cmd++ = _MASKED_BIT_ENABLE(INSTRUCTION_STATE_CACHE_INVALIDATE);
1208
1209 return cmd - batch;
1210 }
1211
/*
 * A single population step for a WA BB / indirect-ctx region. @setup writes
 * at most @max_size dwords of commands into @batch and returns the number of
 * dwords written (0 if not applicable) or a negative error code.
 */
struct bo_setup {
	ssize_t (*setup)(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
			 u32 *batch, size_t max_size);
};
1216
/* Shared state threaded through setup_bo()/finish_bo() */
struct bo_setup_state {
	/* Input: */
	struct xe_lrc *lrc;		/* LRC whose BO region is being filled */
	struct xe_hw_engine *hwe;	/* engine the LRC is built for */
	size_t max_size;		/* region budget, in bytes */
	size_t reserve_dw;		/* dwords the caller wants left unused */
	unsigned int offset;		/* byte offset of the region in lrc->bo */
	const struct bo_setup *funcs;	/* setup callbacks, run in order */
	unsigned int num_funcs;

	/* State: */
	u32 *buffer;			/* bounce buffer, iomem-mapped BOs only */
	u32 *ptr;			/* next dword to be written */
	unsigned int written;		/* total dwords emitted so far */
};
1232
setup_bo(struct bo_setup_state * state)1233 static int setup_bo(struct bo_setup_state *state)
1234 {
1235 ssize_t remain;
1236
1237 if (state->lrc->bo->vmap.is_iomem) {
1238 xe_gt_assert(state->hwe->gt, state->buffer);
1239 state->ptr = state->buffer;
1240 } else {
1241 state->ptr = state->lrc->bo->vmap.vaddr + state->offset;
1242 }
1243
1244 remain = state->max_size / sizeof(u32);
1245
1246 for (size_t i = 0; i < state->num_funcs; i++) {
1247 ssize_t len = state->funcs[i].setup(state->lrc, state->hwe,
1248 state->ptr, remain);
1249
1250 remain -= len;
1251
1252 /*
1253 * Caller has asked for at least reserve_dw to remain unused.
1254 */
1255 if (len < 0 ||
1256 xe_gt_WARN_ON(state->lrc->gt, remain < state->reserve_dw))
1257 goto fail;
1258
1259 state->ptr += len;
1260 state->written += len;
1261 }
1262
1263 return 0;
1264
1265 fail:
1266 return -ENOSPC;
1267 }
1268
finish_bo(struct bo_setup_state * state)1269 static void finish_bo(struct bo_setup_state *state)
1270 {
1271 if (!state->lrc->bo->vmap.is_iomem)
1272 return;
1273
1274 xe_map_memcpy_to(gt_to_xe(state->lrc->gt), &state->lrc->bo->vmap,
1275 state->offset, state->buffer,
1276 state->written * sizeof(u32));
1277 }
1278
1279 /**
1280 * xe_lrc_setup_wa_bb_with_scratch - Execute all wa bb setup callbacks.
1281 * @lrc: the &xe_lrc struct instance
1282 * @hwe: the &xe_hw_engine struct instance
1283 * @scratch: preallocated scratch buffer for temporary storage
1284 * Return: 0 on success, negative error code on failure
1285 */
xe_lrc_setup_wa_bb_with_scratch(struct xe_lrc * lrc,struct xe_hw_engine * hwe,u32 * scratch)1286 int xe_lrc_setup_wa_bb_with_scratch(struct xe_lrc *lrc, struct xe_hw_engine *hwe, u32 *scratch)
1287 {
1288 static const struct bo_setup funcs[] = {
1289 { .setup = setup_timestamp_wa },
1290 { .setup = setup_invalidate_state_cache_wa },
1291 { .setup = setup_utilization_wa },
1292 { .setup = setup_configfs_post_ctx_restore_bb },
1293 };
1294 struct bo_setup_state state = {
1295 .lrc = lrc,
1296 .hwe = hwe,
1297 .max_size = LRC_WA_BB_SIZE,
1298 .buffer = scratch,
1299 .reserve_dw = 1,
1300 .offset = __xe_lrc_wa_bb_offset(lrc),
1301 .funcs = funcs,
1302 .num_funcs = ARRAY_SIZE(funcs),
1303 };
1304 int ret;
1305
1306 ret = setup_bo(&state);
1307 if (ret)
1308 return ret;
1309
1310 *state.ptr++ = MI_BATCH_BUFFER_END;
1311 state.written++;
1312
1313 finish_bo(&state);
1314
1315 xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
1316 xe_bo_ggtt_addr(lrc->bo) + state.offset + 1);
1317
1318 return 0;
1319 }
1320
setup_wa_bb(struct xe_lrc * lrc,struct xe_hw_engine * hwe)1321 static int setup_wa_bb(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
1322 {
1323 u32 *buf = NULL;
1324 int ret;
1325
1326 if (lrc->bo->vmap.is_iomem) {
1327 buf = kmalloc(LRC_WA_BB_SIZE, GFP_KERNEL);
1328 if (!buf)
1329 return -ENOMEM;
1330 }
1331
1332 ret = xe_lrc_setup_wa_bb_with_scratch(lrc, hwe, buf);
1333
1334 kfree(buf);
1335
1336 return ret;
1337 }
1338
/*
 * Populate the indirect context region and enable it via CTX_CS_INDIRECT_CTX.
 * No-op unless the LRC was created with XE_LRC_FLAG_INDIRECT_CTX.
 *
 * Return: 0 on success (including the no-op case), negative error code on
 * allocation or setup failure.
 */
static int
setup_indirect_ctx(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
{
	static const struct bo_setup rcs_funcs[] = {
		{ .setup = setup_timestamp_wa },
		{ .setup = setup_configfs_mid_ctx_restore_bb },
	};
	static const struct bo_setup xcs_funcs[] = {
		{ .setup = setup_configfs_mid_ctx_restore_bb },
	};
	struct bo_setup_state state = {
		.lrc = lrc,
		.hwe = hwe,
		/*
		 * NOTE(review): 63 presumably matches the maximum value of
		 * the size-in-cachelines field of CTX_CS_INDIRECT_CTX -
		 * confirm against the register definition.
		 */
		.max_size = (63 * 64) /* max 63 cachelines */,
		.buffer = NULL,
		.offset = __xe_lrc_indirect_ctx_offset(lrc),
	};
	int ret;

	if (!(lrc->flags & XE_LRC_FLAG_INDIRECT_CTX))
		return 0;

	/* Render/compute get the timestamp WA in addition to the user BB */
	if (hwe->class == XE_ENGINE_CLASS_RENDER ||
	    hwe->class == XE_ENGINE_CLASS_COMPUTE) {
		state.funcs = rcs_funcs;
		state.num_funcs = ARRAY_SIZE(rcs_funcs);
	} else {
		state.funcs = xcs_funcs;
		state.num_funcs = ARRAY_SIZE(xcs_funcs);
	}

	if (xe_gt_WARN_ON(lrc->gt, !state.funcs))
		return 0;

	/* iomem-mapped BOs need a system-memory bounce buffer */
	if (lrc->bo->vmap.is_iomem) {
		state.buffer = kmalloc(state.max_size, GFP_KERNEL);
		if (!state.buffer)
			return -ENOMEM;
	}

	ret = setup_bo(&state);
	if (ret) {
		kfree(state.buffer);
		return ret;
	}

	/*
	 * Align to 64B cacheline so there's no garbage at the end for CS to
	 * execute: size for indirect ctx must be a multiple of 64.
	 * (0xf: pad to a multiple of 16 dwords == 64 bytes.)
	 */
	while (state.written & 0xf) {
		*state.ptr++ = MI_NOOP;
		state.written++;
	}

	finish_bo(&state);
	kfree(state.buffer);

	/*
	 * Enable INDIRECT_CTX leaving INDIRECT_CTX_OFFSET at its default: it
	 * varies per engine class, but the default is good enough
	 */
	xe_lrc_write_ctx_reg(lrc,
			     CTX_CS_INDIRECT_CTX,
			     (xe_bo_ggtt_addr(lrc->bo) + state.offset) |
			     /* Size in CLs. */
			     (state.written * sizeof(u32) / 64));

	return 0;
}
1409
xe_multi_queue_prio_to_lrc(struct xe_lrc * lrc,enum xe_multi_queue_priority priority)1410 static u8 xe_multi_queue_prio_to_lrc(struct xe_lrc *lrc, enum xe_multi_queue_priority priority)
1411 {
1412 struct xe_device *xe = gt_to_xe(lrc->gt);
1413
1414 xe_assert(xe, (priority >= XE_MULTI_QUEUE_PRIORITY_LOW &&
1415 priority <= XE_MULTI_QUEUE_PRIORITY_HIGH));
1416
1417 /* xe_multi_queue_priority is directly mapped to LRC priority values */
1418 return priority;
1419 }
1420
1421 /**
1422 * xe_lrc_set_multi_queue_priority() - Set multi queue priority in LRC
1423 * @lrc: Logical Ring Context
1424 * @priority: Multi queue priority of the exec queue
1425 *
1426 * Convert @priority to LRC multi queue priority and update the @lrc descriptor
1427 */
xe_lrc_set_multi_queue_priority(struct xe_lrc * lrc,enum xe_multi_queue_priority priority)1428 void xe_lrc_set_multi_queue_priority(struct xe_lrc *lrc, enum xe_multi_queue_priority priority)
1429 {
1430 lrc->desc &= ~LRC_PRIORITY;
1431 lrc->desc |= FIELD_PREP(LRC_PRIORITY, xe_multi_queue_prio_to_lrc(lrc, priority));
1432 }
1433
/*
 * One-time initialization of an LRC: allocate the backing BO (ring + PPHWSP +
 * engine context image + WA BB, plus optional indirect-ctx region), seed the
 * context image, and program the ring registers, descriptor and WA batches.
 *
 * On any failure after BO creation the partially initialized LRC is torn
 * down via xe_lrc_finish().
 */
static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
		       struct xe_vm *vm, void *replay_state, u32 ring_size,
		       u16 msix_vec,
		       u32 init_flags)
{
	struct xe_gt *gt = hwe->gt;
	const u32 lrc_size = xe_gt_lrc_size(gt, hwe->class);
	u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
	struct xe_tile *tile = gt_to_tile(gt);
	struct xe_device *xe = gt_to_xe(gt);
	struct iosys_map map;
	u32 arb_enable;
	u32 bo_flags;
	int err;

	kref_init(&lrc->refcount);
	lrc->gt = gt;
	lrc->replay_size = xe_gt_lrc_hang_replay_size(gt, hwe->class);
	lrc->size = lrc_size;
	lrc->flags = 0;
	lrc->ring.size = ring_size;
	lrc->ring.tail = 0;

	if (gt_engine_needs_indirect_ctx(gt, hwe->class)) {
		lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
		bo_size += LRC_INDIRECT_CTX_BO_SIZE;
	}

	if (xe_gt_has_indirect_ring_state(gt))
		lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;

	bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
		   XE_BO_FLAG_GGTT_INVALIDATE;

	if ((vm && vm->xef) || init_flags & XE_LRC_CREATE_USER_CTX) /* userspace */
		bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE | XE_BO_FLAG_FORCE_USER_VRAM;

	lrc->bo = xe_bo_create_pin_map_novm(xe, tile,
					    bo_size,
					    ttm_bo_type_kernel,
					    bo_flags, false);
	if (IS_ERR(lrc->bo))
		return PTR_ERR(lrc->bo);

	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
			     hwe->fence_irq, hwe->name);

	/*
	 * Init Per-Process of HW status Page, LRC / context state to known
	 * values. If there's already a primed default_lrc, just copy it, otherwise
	 * it's the early submission to record the lrc: build a new empty one from
	 * scratch.
	 */
	map = __xe_lrc_pphwsp_map(lrc);
	if (gt->default_lrc[hwe->class] || replay_state) {
		/*
		 * NOTE(review): this branch dereferences
		 * gt->default_lrc[hwe->class] even when taken only because
		 * replay_state is set - presumably a replay implies the
		 * default LRC has already been recorded; confirm it cannot
		 * be NULL here.
		 */
		xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE);	/* PPHWSP */
		xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
				 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
				 lrc_size - LRC_PPHWSP_SIZE);
		/* Replay state overrides the default image (PPHWSP excluded) */
		if (replay_state)
			xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
					 replay_state, lrc->replay_size);
	} else {
		void *init_data = empty_lrc_data(hwe);

		if (!init_data) {
			err = -ENOMEM;
			goto err_lrc_finish;
		}

		xe_map_memcpy_to(xe, &map, 0, init_data, lrc_size);
		kfree(init_data);
	}

	if (vm) {
		xe_lrc_set_ppgtt(lrc, vm);

		if (vm->xef)
			xe_drm_client_add_bo(vm->xef->client, lrc->bo);
	}

	/* Memory-based interrupt reporting when MSI-X is in use */
	if (xe_device_has_msix(xe)) {
		xe_lrc_write_ctx_reg(lrc, CTX_INT_STATUS_REPORT_PTR,
				     xe_memirq_status_ptr(&tile->memirq, hwe));
		xe_lrc_write_ctx_reg(lrc, CTX_INT_SRC_REPORT_PTR,
				     xe_memirq_source_ptr(&tile->memirq, hwe));
		xe_lrc_write_ctx_reg(lrc, CTX_CS_INT_VEC_DATA, msix_vec << 16 | msix_vec);
	}

	/* Ring registers live either in the indirect ring state or the LRC */
	if (xe_gt_has_indirect_ring_state(gt)) {
		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
				     __xe_lrc_indirect_ring_ggtt_addr(lrc));

		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
					      __xe_lrc_ring_ggtt_addr(lrc));
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START_UDW, 0);
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, 0);
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, lrc->ring.tail);
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_CTL,
					      RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
	} else {
		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, 0);
		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);
		xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
				     RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
	}

	if (init_flags & XE_LRC_CREATE_RUNALONE)
		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
				     _MASKED_BIT_ENABLE(CTX_CTRL_RUN_ALONE));

	if (init_flags & XE_LRC_CREATE_PXP)
		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
				     _MASKED_BIT_ENABLE(CTX_CTRL_PXP_ENABLE));

	lrc->ctx_timestamp = 0;
	xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0);
	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
		xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP_UDW, 0);

	if (xe->info.has_asid && vm)
		xe_lrc_write_ctx_reg(lrc, CTX_ASID, vm->usm.asid);

	lrc->desc = LRC_VALID;
	lrc->desc |= FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT);
	/* TODO: Priority */

	/*
	 * While this appears to have something about privileged batches or
	 * some such, it really just means PPGTT mode.
	 */
	if (vm)
		lrc->desc |= LRC_PRIVILEGE;

	/* Pre-Xe_HP platforms carry engine class/instance in the descriptor */
	if (GRAPHICS_VERx100(xe) < 1250) {
		lrc->desc |= FIELD_PREP(LRC_ENGINE_INSTANCE, hwe->instance);
		lrc->desc |= FIELD_PREP(LRC_ENGINE_CLASS, hwe->class);
	}

	/* Start the ring with arbitration enabled */
	arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
	xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));

	/* Seed both seqno slots to "last completed" */
	map = __xe_lrc_seqno_map(lrc);
	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);

	map = __xe_lrc_start_seqno_map(lrc);
	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);

	err = setup_wa_bb(lrc, hwe);
	if (err)
		goto err_lrc_finish;

	err = setup_indirect_ctx(lrc, hwe);
	if (err)
		goto err_lrc_finish;

	return 0;

err_lrc_finish:
	xe_lrc_finish(lrc);
	return err;
}
1598
1599 /**
1600 * xe_lrc_create - Create a LRC
1601 * @hwe: Hardware Engine
1602 * @vm: The VM (address space)
1603 * @replay_state: GPU hang replay state
1604 * @ring_size: LRC ring size
1605 * @msix_vec: MSI-X interrupt vector (for platforms that support it)
1606 * @flags: LRC initialization flags
1607 *
1608 * Allocate and initialize the Logical Ring Context (LRC).
1609 *
1610 * Return pointer to created LRC upon success and an error pointer
1611 * upon failure.
1612 */
xe_lrc_create(struct xe_hw_engine * hwe,struct xe_vm * vm,void * replay_state,u32 ring_size,u16 msix_vec,u32 flags)1613 struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
1614 void *replay_state, u32 ring_size, u16 msix_vec, u32 flags)
1615 {
1616 struct xe_lrc *lrc;
1617 int err;
1618
1619 lrc = kzalloc_obj(*lrc);
1620 if (!lrc)
1621 return ERR_PTR(-ENOMEM);
1622
1623 err = xe_lrc_init(lrc, hwe, vm, replay_state, ring_size, msix_vec, flags);
1624 if (err) {
1625 kfree(lrc);
1626 return ERR_PTR(err);
1627 }
1628
1629 return lrc;
1630 }
1631
1632 /**
1633 * xe_lrc_destroy - Destroy the LRC
1634 * @ref: reference to LRC
1635 *
1636 * Called when ref == 0, release resources held by the Logical Ring Context
1637 * (LRC) and free the LRC memory.
1638 */
xe_lrc_destroy(struct kref * ref)1639 void xe_lrc_destroy(struct kref *ref)
1640 {
1641 struct xe_lrc *lrc = container_of(ref, struct xe_lrc, refcount);
1642
1643 xe_lrc_finish(lrc);
1644 kfree(lrc);
1645 }
1646
1647 /**
1648 * xe_lrc_update_hwctx_regs_with_address - Re-compute GGTT references within given LRC.
1649 * @lrc: the &xe_lrc struct instance
1650 */
xe_lrc_update_hwctx_regs_with_address(struct xe_lrc * lrc)1651 void xe_lrc_update_hwctx_regs_with_address(struct xe_lrc *lrc)
1652 {
1653 if (xe_lrc_has_indirect_ring_state(lrc)) {
1654 xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
1655 __xe_lrc_indirect_ring_ggtt_addr(lrc));
1656
1657 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
1658 __xe_lrc_ring_ggtt_addr(lrc));
1659 } else {
1660 xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
1661 }
1662 }
1663
/* Program the ring tail, in whichever state page holds the ring registers */
void xe_lrc_set_ring_tail(struct xe_lrc *lrc, u32 tail)
{
	if (!xe_lrc_has_indirect_ring_state(lrc))
		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, tail);
	else
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, tail);
}
1671
xe_lrc_ring_tail(struct xe_lrc * lrc)1672 u32 xe_lrc_ring_tail(struct xe_lrc *lrc)
1673 {
1674 if (xe_lrc_has_indirect_ring_state(lrc))
1675 return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL) & TAIL_ADDR;
1676 else
1677 return xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL) & TAIL_ADDR;
1678 }
1679
xe_lrc_ring_start(struct xe_lrc * lrc)1680 static u32 xe_lrc_ring_start(struct xe_lrc *lrc)
1681 {
1682 if (xe_lrc_has_indirect_ring_state(lrc))
1683 return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START);
1684 else
1685 return xe_lrc_read_ctx_reg(lrc, CTX_RING_START);
1686 }
1687
/* Program the ring head, in whichever state page holds the ring registers */
void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
{
	if (!xe_lrc_has_indirect_ring_state(lrc))
		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
	else
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, head);
}
1695
xe_lrc_ring_head(struct xe_lrc * lrc)1696 u32 xe_lrc_ring_head(struct xe_lrc *lrc)
1697 {
1698 if (xe_lrc_has_indirect_ring_state(lrc))
1699 return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD) & HEAD_ADDR;
1700 else
1701 return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
1702 }
1703
xe_lrc_ring_space(struct xe_lrc * lrc)1704 u32 xe_lrc_ring_space(struct xe_lrc *lrc)
1705 {
1706 const u32 head = xe_lrc_ring_head(lrc);
1707 const u32 tail = lrc->ring.tail;
1708 const u32 size = lrc->ring.size;
1709
1710 return ((head - tail - 1) & (size - 1)) + 1;
1711 }
1712
/*
 * Copy @size bytes of @data into the ring at the current tail and advance
 * the tail modulo the (power-of-two) ring size. @ring is taken by value so
 * the iosys_map increment stays local to this call. Callers must ensure the
 * copy does not cross the ring's end (see xe_lrc_write_ring()).
 */
static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
				const void *data, size_t size)
{
	struct xe_device *xe = lrc_to_xe(lrc);

	iosys_map_incr(&ring, lrc->ring.tail);
	xe_map_memcpy_to(xe, &ring, 0, data, size);
	lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
}
1722
/**
 * xe_lrc_write_ring() - Copy command dwords into the LRC ring.
 * @lrc: Logical Ring Context
 * @data: commands to write
 * @size: size of @data in bytes; must be a multiple of 4
 *
 * Splits the copy across the ring wrap point when necessary, and appends a
 * single MI_NOOP when @size is not qword-aligned so the tail stays 8-byte
 * aligned.
 */
void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map ring;
	u32 rhs;
	size_t aligned_size;

	xe_assert(xe, IS_ALIGNED(size, 4));
	aligned_size = ALIGN(size, 8);

	ring = __xe_lrc_ring_map(lrc);

	xe_assert(xe, lrc->ring.tail < lrc->ring.size);
	/* Bytes available before the end of the ring buffer */
	rhs = lrc->ring.size - lrc->ring.tail;
	if (size > rhs) {
		/* Wraps: write the tail portion, then the rest from offset 0 */
		__xe_lrc_write_ring(lrc, ring, data, rhs);
		__xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
	} else {
		__xe_lrc_write_ring(lrc, ring, data, size);
	}

	if (aligned_size > size) {
		u32 noop = MI_NOOP;

		__xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
	}
}
1750
xe_lrc_descriptor(struct xe_lrc * lrc)1751 u64 xe_lrc_descriptor(struct xe_lrc *lrc)
1752 {
1753 return lrc->desc | xe_lrc_ggtt_addr(lrc);
1754 }
1755
/**
 * xe_lrc_seqno_ggtt_addr() - GGTT address of the LRC's fence seqno slot.
 * @lrc: Logical Ring Context
 *
 * Return: GGTT address of the seqno dword.
 */
u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_seqno_ggtt_addr(lrc);
}
1760
1761 /**
1762 * xe_lrc_alloc_seqno_fence() - Allocate an lrc seqno fence.
1763 *
1764 * Allocate but don't initialize an lrc seqno fence.
1765 *
1766 * Return: Pointer to the allocated fence or
1767 * negative error pointer on error.
1768 */
xe_lrc_alloc_seqno_fence(void)1769 struct dma_fence *xe_lrc_alloc_seqno_fence(void)
1770 {
1771 return xe_hw_fence_alloc();
1772 }
1773
1774 /**
1775 * xe_lrc_free_seqno_fence() - Free an lrc seqno fence.
1776 * @fence: Pointer to the fence to free.
1777 *
1778 * Frees an lrc seqno fence that hasn't yet been
1779 * initialized.
1780 */
xe_lrc_free_seqno_fence(struct dma_fence * fence)1781 void xe_lrc_free_seqno_fence(struct dma_fence *fence)
1782 {
1783 xe_hw_fence_free(fence);
1784 }
1785
1786 /**
1787 * xe_lrc_init_seqno_fence() - Initialize an lrc seqno fence.
1788 * @lrc: Pointer to the lrc.
1789 * @fence: Pointer to the fence to initialize.
1790 *
1791 * Initializes a pre-allocated lrc seqno fence.
1792 * After initialization, the fence is subject to normal
1793 * dma-fence refcounting.
1794 */
xe_lrc_init_seqno_fence(struct xe_lrc * lrc,struct dma_fence * fence)1795 void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence)
1796 {
1797 xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc));
1798 }
1799
xe_lrc_seqno(struct xe_lrc * lrc)1800 s32 xe_lrc_seqno(struct xe_lrc *lrc)
1801 {
1802 struct iosys_map map = __xe_lrc_seqno_map(lrc);
1803
1804 return xe_map_read32(lrc_to_xe(lrc), &map);
1805 }
1806
xe_lrc_start_seqno(struct xe_lrc * lrc)1807 s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
1808 {
1809 struct iosys_map map = __xe_lrc_start_seqno_map(lrc);
1810
1811 return xe_map_read32(lrc_to_xe(lrc), &map);
1812 }
1813
/**
 * xe_lrc_start_seqno_ggtt_addr() - GGTT address of the start-seqno slot.
 * @lrc: Logical Ring Context
 *
 * Return: GGTT address of the start-seqno dword.
 */
u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_start_seqno_ggtt_addr(lrc);
}
1818
/**
 * xe_lrc_parallel_ggtt_addr() - GGTT address of the parallel-submit area.
 * @lrc: Logical Ring Context
 *
 * Return: GGTT address of the parallel region in the LRC BO.
 */
u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_parallel_ggtt_addr(lrc);
}
1823
/**
 * xe_lrc_parallel_map() - CPU mapping of the parallel-submit area.
 * @lrc: Logical Ring Context
 *
 * Return: &iosys_map covering the parallel region in the LRC BO.
 */
struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
{
	return __xe_lrc_parallel_map(lrc);
}
1828
1829 /**
1830 * xe_lrc_engine_id() - Read engine id value
1831 * @lrc: Pointer to the lrc.
1832 *
1833 * Returns: context id value
1834 */
xe_lrc_engine_id(struct xe_lrc * lrc)1835 static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
1836 {
1837 struct xe_device *xe = lrc_to_xe(lrc);
1838 struct iosys_map map;
1839
1840 map = __xe_lrc_engine_id_map(lrc);
1841 return xe_map_read32(xe, &map);
1842 }
1843
/* Decode the total dword length of a command from its header dword */
static int instr_dw(u32 cmd_header)
{
	u32 match;

	/* GFXPIPE "SINGLE_DW" opcodes encode no length: always one dword */
	match = cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE);
	if (match == GFXPIPE_SINGLE_DW_CMD(0, 0))
		return 1;

	/* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
	if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
		return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;

	/* Most instructions have the # of dwords (minus 2) in 7:0 */
	return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
}
1858
/*
 * Pretty-print one MI command (or a run of MI_NOOPs) starting at @dw.
 *
 * Return: number of dwords consumed; never more than @remaining_dw.
 * MI_BATCH_BUFFER_END returns @remaining_dw to stop the caller's walk.
 */
static int dump_mi_command(struct drm_printer *p,
			   struct xe_gt *gt,
			   u32 *dw,
			   int remaining_dw)
{
	u32 inst_header = *dw;
	u32 numdw = instr_dw(inst_header);
	u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
	int num_noop;

	/* First check for commands that don't have/use a '# DW' field */
	switch (inst_header & MI_OPCODE) {
	case MI_NOOP:
		/* Collapse consecutive NOOPs into one output line */
		num_noop = 1;
		while (num_noop < remaining_dw &&
		       (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
			num_noop++;
		drm_printf(p, "[%#010x] MI_NOOP (%d dwords)\n", inst_header, num_noop);
		return num_noop;

	case MI_TOPOLOGY_FILTER:
		drm_printf(p, "[%#010x] MI_TOPOLOGY_FILTER\n", inst_header);
		return 1;

	case MI_BATCH_BUFFER_END:
		drm_printf(p, "[%#010x] MI_BATCH_BUFFER_END\n", inst_header);
		/* Return 'remaining_dw' to consume the rest of the LRC */
		return remaining_dw;
	}

	/*
	 * Any remaining commands include a # of dwords. We should make sure
	 * it doesn't exceed the remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (inst_header & MI_OPCODE) {
	case MI_LOAD_REGISTER_IMM:
		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
			   inst_header, (numdw - 1) / 2);
		for (int i = 1; i < numdw; i += 2)
			drm_printf(p, " - %#6x = %#010x\n", dw[i], dw[i + 1]);
		return numdw;

	case MI_LOAD_REGISTER_MEM & MI_OPCODE:
		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_MEM: %s%s\n",
			   inst_header,
			   dw[0] & MI_LRI_LRM_CS_MMIO ? "CS_MMIO " : "",
			   dw[0] & MI_LRM_USE_GGTT ? "USE_GGTT " : "");
		/* Well-formed LRM is reg + 64-bit address = 4 dwords total */
		if (numdw == 4)
			drm_printf(p, " - %#6x = %#010llx\n",
				   dw[1], ((u64)(dw[3]) << 32 | (u64)(dw[2])));
		else
			drm_printf(p, " - %*ph (%s)\n",
				   (int)sizeof(u32) * (numdw - 1), dw + 1,
				   numdw < 4 ? "truncated" : "malformed");
		return numdw;

	case MI_FORCE_WAKEUP:
		drm_printf(p, "[%#010x] MI_FORCE_WAKEUP\n", inst_header);
		return numdw;

	default:
		drm_printf(p, "[%#010x] unknown MI opcode %#x, likely %d dwords\n",
			   inst_header, opcode, numdw);
		return numdw;
	}
}
1928
/*
 * Pretty-print one GFXPIPE command starting at @dw.
 *
 * Return: number of dwords consumed; never more than @remaining_dw.
 */
static int dump_gfxpipe_command(struct drm_printer *p,
				struct xe_gt *gt,
				u32 *dw,
				int remaining_dw)
{
	u32 numdw = instr_dw(*dw);
	u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
	u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
	u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);

	/*
	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
	 * remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (*dw & GFXPIPE_MATCH_MASK) {
/*
 * MATCH/MATCH3D print the command name and consume its dwords. They are
 * intentionally not #undef'd: dump_gfx_state_command() below reuses MATCH.
 */
#define MATCH(cmd) \
	case cmd: \
		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
		return numdw
#define MATCH3D(cmd) \
	case CMD_##cmd: \
		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
		return numdw

	MATCH(STATE_BASE_ADDRESS);
	MATCH(STATE_SIP);
	MATCH(GPGPU_CSR_BASE_ADDRESS);
	MATCH(STATE_COMPUTE_MODE);
	MATCH3D(3DSTATE_BTD);
	MATCH(STATE_SYSTEM_MEM_FENCE_ADDRESS);
	MATCH(STATE_CONTEXT_DATA_BASE_ADDRESS);

	MATCH3D(3DSTATE_VF_STATISTICS);

	MATCH(PIPELINE_SELECT);

	MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST);
	MATCH3D(3DSTATE_CLEAR_PARAMS);
	MATCH3D(3DSTATE_DEPTH_BUFFER);
	MATCH3D(3DSTATE_STENCIL_BUFFER);
	MATCH3D(3DSTATE_HIER_DEPTH_BUFFER);
	MATCH3D(3DSTATE_VERTEX_BUFFERS);
	MATCH3D(3DSTATE_VERTEX_ELEMENTS);
	MATCH3D(3DSTATE_INDEX_BUFFER);
	MATCH3D(3DSTATE_VF);
	MATCH3D(3DSTATE_MULTISAMPLE);
	MATCH3D(3DSTATE_CC_STATE_POINTERS);
	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS);
	MATCH3D(3DSTATE_VS);
	MATCH3D(3DSTATE_GS);
	MATCH3D(3DSTATE_CLIP);
	MATCH3D(3DSTATE_SF);
	MATCH3D(3DSTATE_WM);
	MATCH3D(3DSTATE_CONSTANT_VS);
	MATCH3D(3DSTATE_CONSTANT_GS);
	MATCH3D(3DSTATE_CONSTANT_PS);
	MATCH3D(3DSTATE_SAMPLE_MASK);
	MATCH3D(3DSTATE_CONSTANT_HS);
	MATCH3D(3DSTATE_CONSTANT_DS);
	MATCH3D(3DSTATE_HS);
	MATCH3D(3DSTATE_TE);
	MATCH3D(3DSTATE_DS);
	MATCH3D(3DSTATE_STREAMOUT);
	MATCH3D(3DSTATE_SBE);
	MATCH3D(3DSTATE_PS);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
	MATCH3D(3DSTATE_CPS_POINTERS);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC);
	MATCH3D(3DSTATE_BLEND_STATE_POINTERS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS);
	MATCH3D(3DSTATE_VF_INSTANCING);
	MATCH3D(3DSTATE_VF_SGVS);
	MATCH3D(3DSTATE_VF_TOPOLOGY);
	MATCH3D(3DSTATE_WM_CHROMAKEY);
	MATCH3D(3DSTATE_PS_BLEND);
	MATCH3D(3DSTATE_WM_DEPTH_STENCIL);
	MATCH3D(3DSTATE_PS_EXTRA);
	MATCH3D(3DSTATE_RASTER);
	MATCH3D(3DSTATE_SBE_SWIZ);
	MATCH3D(3DSTATE_WM_HZ_OP);
	MATCH3D(3DSTATE_VF_COMPONENT_PACKING);
	MATCH3D(3DSTATE_VF_SGVS_2);
	MATCH3D(3DSTATE_VFG);
	MATCH3D(3DSTATE_URB_ALLOC_VS);
	MATCH3D(3DSTATE_URB_ALLOC_HS);
	MATCH3D(3DSTATE_URB_ALLOC_DS);
	MATCH3D(3DSTATE_URB_ALLOC_GS);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_0);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_1);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_2);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_3);
	MATCH3D(3DSTATE_PRIMITIVE_REPLICATION);
	MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO);
	MATCH3D(3DSTATE_AMFS);
	MATCH3D(3DSTATE_DEPTH_BOUNDS);
	MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS);
	MATCH3D(3DSTATE_CONSTANT_TS_POINTER);
	MATCH3D(3DSTATE_MESH_CONTROL);
	MATCH3D(3DSTATE_MESH_DISTRIB);
	MATCH3D(3DSTATE_TASK_REDISTRIB);
	MATCH3D(3DSTATE_MESH_SHADER);
	MATCH3D(3DSTATE_MESH_SHADER_DATA);
	MATCH3D(3DSTATE_TASK_CONTROL);
	MATCH3D(3DSTATE_TASK_SHADER);
	MATCH3D(3DSTATE_TASK_SHADER_DATA);
	MATCH3D(3DSTATE_URB_ALLOC_MESH);
	MATCH3D(3DSTATE_URB_ALLOC_TASK);
	MATCH3D(3DSTATE_CLIP_MESH);
	MATCH3D(3DSTATE_SBE_MESH);
	MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER);
	MATCH3D(3DSTATE_COARSE_PIXEL);

	MATCH3D(3DSTATE_DRAWING_RECTANGLE);
	MATCH3D(3DSTATE_CHROMA_KEY);
	MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET);
	MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN);
	MATCH3D(3DSTATE_LINE_STIPPLE);
	MATCH3D(3DSTATE_AA_LINE_PARAMETERS);
	MATCH3D(3DSTATE_MONOFILTER_SIZE);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS);
	MATCH3D(3DSTATE_SO_DECL_LIST);
	MATCH3D(3DSTATE_SO_BUFFER);
	MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC);
	MATCH3D(3DSTATE_SAMPLE_PATTERN);
	MATCH3D(3DSTATE_3D_MODE);
	MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE);
	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS);
	MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO);

	default:
		drm_printf(p, "[%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n",
			   *dw, pipeline, opcode, subopcode, numdw);
		return numdw;
	}
}
2080
/*
 * Pretty-print one GFX_STATE command starting at @dw. Reuses the MATCH
 * macro defined in dump_gfxpipe_command() above.
 *
 * Return: number of dwords consumed; never more than @remaining_dw.
 */
static int dump_gfx_state_command(struct drm_printer *p,
				  struct xe_gt *gt,
				  u32 *dw,
				  int remaining_dw)
{
	u32 numdw = instr_dw(*dw);
	u32 opcode = REG_FIELD_GET(GFX_STATE_OPCODE, *dw);

	/*
	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
	 * remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (*dw & (XE_INSTR_GFX_STATE | GFX_STATE_OPCODE)) {
	MATCH(STATE_WRITE_INLINE);

	default:
		drm_printf(p, "[%#010x] unknown GFX_STATE command (opcode=%#x), likely %d dwords\n",
			   *dw, opcode, numdw);
		return numdw;
	}
}
2105
xe_lrc_dump_default(struct drm_printer * p,struct xe_gt * gt,enum xe_engine_class hwe_class)2106 void xe_lrc_dump_default(struct drm_printer *p,
2107 struct xe_gt *gt,
2108 enum xe_engine_class hwe_class)
2109 {
2110 u32 *dw;
2111 int remaining_dw, num_dw;
2112
2113 if (!gt->default_lrc[hwe_class]) {
2114 drm_printf(p, "No default LRC for class %d\n", hwe_class);
2115 return;
2116 }
2117
2118 /*
2119 * Skip the beginning of the LRC since it contains the per-process
2120 * hardware status page.
2121 */
2122 dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
2123 remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;
2124
2125 while (remaining_dw > 0) {
2126 if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
2127 num_dw = dump_mi_command(p, gt, dw, remaining_dw);
2128 } else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
2129 num_dw = dump_gfxpipe_command(p, gt, dw, remaining_dw);
2130 } else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFX_STATE) {
2131 num_dw = dump_gfx_state_command(p, gt, dw, remaining_dw);
2132 } else {
2133 num_dw = min(instr_dw(*dw), remaining_dw);
2134 drm_printf(p, "[%#10x] Unknown instruction of type %#x, likely %d dwords\n",
2135 *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw),
2136 num_dw);
2137 }
2138
2139 dw += num_dw;
2140 remaining_dw -= num_dw;
2141 }
2142 }
2143
/**
 * struct instr_state - descriptor for one GFXPIPE state instruction
 * @instr: raw instruction header dword (a CMD_3DSTATE_* opcode)
 * @num_dw: total instruction length in dwords, including the header
 *          (the emitter advances by @num_dw and programs a length
 *          field of @num_dw - 2 for multi-dword instructions)
 */
struct instr_state {
	u32 instr;
	u16 num_dw;
};
2148
/*
 * SVG (geometry/tessellation/mesh) state instructions emitted into render
 * engine LRCs for Wa_14019789679 — see xe_lrc_emit_hwe_state_instructions().
 * num_dw is the total instruction length in dwords, including the header.
 */
static const struct instr_state xe_hpg_svg_state[] = {
	{ .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 },
	{ .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_VS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_CLIP, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SF, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_RASTER, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_HS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 },
	{ .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TE, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_GS, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
};
2201
/**
 * xe_lrc_emit_hwe_state_instructions - Emit non-register engine state
 * @q: exec queue whose engine determines which state (if any) to emit
 * @cs: command stream write pointer
 *
 * Emits the SVG state instruction headers required by Wa_14019789679 on
 * render engines; a no-op on other engines or when the workaround does
 * not apply.
 *
 * Returns: the advanced command stream pointer.
 */
u32 *xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, u32 *cs)
{
	struct xe_gt *gt = q->hwe->gt;
	struct xe_device *xe = gt_to_xe(gt);

	/*
	 * Wa_14019789679
	 *
	 * The default LRC must contain explicit SVG instruction headers;
	 * otherwise the context switch writes 0's (noops) into the LRC
	 * memory instead of the expected headers. Application contexts
	 * start out as a copy of the default LRC, so a context that does
	 * not program some SVG state would then unintentionally inherit
	 * whatever the previously running context had programmed into the
	 * hardware (nothing in the LRC would reset that state).
	 *
	 * Strictly, the workaround only requires 3DSTATE_MESH_CONTROL,
	 * which can easily hang the GPU if inherited. To be safe we emit
	 * the full SVG state so that no state at all leaks between
	 * contexts, even if that leakage would be harmless.
	 */
	if (!XE_GT_WA(gt, 14019789679) || q->hwe->class != XE_ENGINE_CLASS_RENDER) {
		xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
			  GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
		return cs;
	}

	for (size_t idx = 0; idx < ARRAY_SIZE(xe_hpg_svg_state); idx++) {
		u32 instr = xe_hpg_svg_state[idx].instr;
		u16 num_dw = xe_hpg_svg_state[idx].num_dw;
		bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);

		xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
		xe_gt_assert(gt, num_dw != 0);
		xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));

		/*
		 * Xe2's SVG context is the same as the one on DG2 / MTL
		 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
		 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
		 * Patch that single opcode here rather than keeping a whole
		 * separate table for one trivial difference.
		 */
		if (GRAPHICS_VER(xe) >= 20 &&
		    instr == CMD_3DSTATE_DRAWING_RECTANGLE)
			instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;

		/* GFXPIPE length field of a multi-dword instruction is len - 2 */
		cs[0] = is_single_dw ? instr : instr | (num_dw - 2);
		cs += num_dw;
	}

	return cs;
}
2269
/**
 * xe_lrc_snapshot_capture() - Capture a snapshot of LRC state
 * @lrc: Pointer to the lrc.
 *
 * Records the ring/context addresses, head/tail, seqnos and timestamps of
 * @lrc and takes an extra reference on its backing BO. Allocation uses
 * GFP_NOWAIT (no sleeping); copying the actual LRC contents is deferred to
 * xe_lrc_snapshot_capture_delayed().
 *
 * Return: the new snapshot, to be freed with xe_lrc_snapshot_free(), or
 * NULL if the allocation failed.
 */
struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
{
	struct xe_lrc_snapshot *snapshot = kmalloc_obj(*snapshot, GFP_NOWAIT);

	if (!snapshot)
		return NULL;

	snapshot->context_desc = xe_lrc_ggtt_addr(lrc);
	snapshot->ring_addr = __xe_lrc_ring_ggtt_addr(lrc);
	snapshot->indirect_context_desc = xe_lrc_indirect_ring_ggtt_addr(lrc);
	snapshot->head = xe_lrc_ring_head(lrc);
	snapshot->tail.internal = lrc->ring.tail;
	snapshot->tail.memory = xe_lrc_ring_tail(lrc);
	snapshot->start = xe_lrc_ring_start(lrc);
	snapshot->start_seqno = xe_lrc_start_seqno(lrc);
	snapshot->seqno = xe_lrc_seqno(lrc);
	/* Extra BO ref; dropped by capture_delayed() or snapshot_free() */
	snapshot->lrc_bo = xe_bo_get(lrc->bo);
	snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
	snapshot->lrc_size = lrc->size;
	snapshot->replay_offset = 0;
	snapshot->replay_size = lrc->replay_size;
	/* Filled in later by xe_lrc_snapshot_capture_delayed() */
	snapshot->lrc_snapshot = NULL;
	snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc));
	snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc);
	return snapshot;
}
2296
xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot * snapshot)2297 void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot)
2298 {
2299 struct xe_bo *bo;
2300 struct iosys_map src;
2301
2302 if (!snapshot)
2303 return;
2304
2305 bo = snapshot->lrc_bo;
2306 snapshot->lrc_bo = NULL;
2307
2308 snapshot->lrc_snapshot = kvmalloc(snapshot->lrc_size, GFP_KERNEL);
2309 if (!snapshot->lrc_snapshot)
2310 goto put_bo;
2311
2312 xe_bo_lock(bo, false);
2313 if (!ttm_bo_vmap(&bo->ttm, &src)) {
2314 xe_map_memcpy_from(xe_bo_device(bo),
2315 snapshot->lrc_snapshot, &src, snapshot->lrc_offset,
2316 snapshot->lrc_size);
2317 ttm_bo_vunmap(&bo->ttm, &src);
2318 } else {
2319 kvfree(snapshot->lrc_snapshot);
2320 snapshot->lrc_snapshot = NULL;
2321 }
2322 xe_bo_unlock(bo);
2323 put_bo:
2324 xe_bo_put(bo);
2325 }
2326
xe_lrc_snapshot_print(struct xe_lrc_snapshot * snapshot,struct drm_printer * p)2327 void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer *p)
2328 {
2329 unsigned long i;
2330
2331 if (!snapshot)
2332 return;
2333
2334 drm_printf(p, "\tHW Context Desc: 0x%08x\n", snapshot->context_desc);
2335 drm_printf(p, "\tHW Ring address: 0x%08x\n",
2336 snapshot->ring_addr);
2337 drm_printf(p, "\tHW Indirect Ring State: 0x%08x\n",
2338 snapshot->indirect_context_desc);
2339 drm_printf(p, "\tLRC Head: (memory) %u\n", snapshot->head);
2340 drm_printf(p, "\tLRC Tail: (internal) %u, (memory) %u\n",
2341 snapshot->tail.internal, snapshot->tail.memory);
2342 drm_printf(p, "\tRing start: (memory) 0x%08x\n", snapshot->start);
2343 drm_printf(p, "\tStart seqno: (memory) %d\n", snapshot->start_seqno);
2344 drm_printf(p, "\tSeqno: (memory) %d\n", snapshot->seqno);
2345 drm_printf(p, "\tTimestamp: 0x%08x\n", snapshot->ctx_timestamp);
2346 drm_printf(p, "\tJob Timestamp: 0x%08x\n", snapshot->ctx_job_timestamp);
2347
2348 if (!snapshot->lrc_snapshot)
2349 return;
2350
2351 drm_printf(p, "\t[HWSP].length: 0x%x\n", LRC_PPHWSP_SIZE);
2352 drm_puts(p, "\t[HWSP].data: ");
2353 for (i = 0; i < LRC_PPHWSP_SIZE; i += sizeof(u32)) {
2354 u32 *val = snapshot->lrc_snapshot + i;
2355 char dumped[ASCII85_BUFSZ];
2356
2357 drm_puts(p, ascii85_encode(*val, dumped));
2358 }
2359
2360 drm_printf(p, "\n\t[HWCTX].length: 0x%lx\n", snapshot->lrc_size - LRC_PPHWSP_SIZE);
2361 drm_printf(p, "\n\t[HWCTX].replay_offset: 0x%lx\n", snapshot->replay_offset);
2362 drm_printf(p, "\n\t[HWCTX].replay_length: 0x%lx\n", snapshot->replay_size);
2363
2364 drm_puts(p, "\t[HWCTX].data: ");
2365 for (; i < snapshot->lrc_size; i += sizeof(u32)) {
2366 u32 *val = snapshot->lrc_snapshot + i;
2367 char dumped[ASCII85_BUFSZ];
2368
2369 drm_puts(p, ascii85_encode(*val, dumped));
2370 }
2371 drm_puts(p, "\n");
2372 }
2373
/**
 * xe_lrc_snapshot_free() - Free an LRC snapshot
 * @snapshot: snapshot to free, may be NULL
 *
 * Releases the copied LRC contents, drops the BO reference if it is still
 * held (i.e. xe_lrc_snapshot_capture_delayed() never consumed it), and
 * frees @snapshot itself.
 */
void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
{
	if (!snapshot)
		return;

	kvfree(snapshot->lrc_snapshot);
	/* lrc_bo is only still set if the delayed capture did not run */
	if (snapshot->lrc_bo)
		xe_bo_put(snapshot->lrc_bo);

	kfree(snapshot);
}
2385
/*
 * Read the CTX_TIMESTAMP register of the engine encoded in @engine_id and
 * store it in @reg_ctx_ts. Returns 0 on success, -1 if the engine lookup
 * fails (with a one-time warning).
 */
static int get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id, u64 *reg_ctx_ts)
{
	u16 class = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id);
	u16 instance = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id);
	struct xe_hw_engine *hwe = xe_gt_hw_engine(lrc->gt, class, instance, false);

	if (xe_gt_WARN_ONCE(lrc->gt, !hwe || xe_hw_engine_is_reserved(hwe),
			    "Unexpected engine class:instance %d:%d for context utilization\n",
			    class, instance))
		return -1;

	/* The register is 64-bit only on platforms that advertise it */
	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
		*reg_ctx_ts = xe_mmio_read64_2x32(&hwe->gt->mmio,
						  RING_CTX_TIMESTAMP(hwe->mmio_base));
	else
		*reg_ctx_ts = xe_mmio_read32(&hwe->gt->mmio,
					     RING_CTX_TIMESTAMP(hwe->mmio_base));

	return 0;
}
2410
2411 /**
2412 * xe_lrc_timestamp() - Current ctx timestamp
2413 * @lrc: Pointer to the lrc.
2414 *
2415 * Return latest ctx timestamp. With support for active contexts, the
2416 * calculation may bb slightly racy, so follow a read-again logic to ensure that
2417 * the context is still active before returning the right timestamp.
2418 *
2419 * Returns: New ctx timestamp value
2420 */
xe_lrc_timestamp(struct xe_lrc * lrc)2421 u64 xe_lrc_timestamp(struct xe_lrc *lrc)
2422 {
2423 u64 lrc_ts, reg_ts, new_ts;
2424 u32 engine_id;
2425
2426 lrc_ts = xe_lrc_ctx_timestamp(lrc);
2427 /* CTX_TIMESTAMP mmio read is invalid on VF, so return the LRC value */
2428 if (IS_SRIOV_VF(lrc_to_xe(lrc))) {
2429 new_ts = lrc_ts;
2430 goto done;
2431 }
2432
2433 if (lrc_ts == CONTEXT_ACTIVE) {
2434 engine_id = xe_lrc_engine_id(lrc);
2435 if (!get_ctx_timestamp(lrc, engine_id, ®_ts))
2436 new_ts = reg_ts;
2437
2438 /* read lrc again to ensure context is still active */
2439 lrc_ts = xe_lrc_ctx_timestamp(lrc);
2440 }
2441
2442 /*
2443 * If context switched out, just use the lrc_ts. Note that this needs to
2444 * be a separate if condition.
2445 */
2446 if (lrc_ts != CONTEXT_ACTIVE)
2447 new_ts = lrc_ts;
2448
2449 done:
2450 return new_ts;
2451 }
2452
2453 /**
2454 * xe_lrc_update_timestamp() - Update ctx timestamp
2455 * @lrc: Pointer to the lrc.
2456 * @old_ts: Old timestamp value
2457 *
2458 * Populate @old_ts current saved ctx timestamp, read new ctx timestamp and
2459 * update saved value.
2460 *
2461 * Returns: New ctx timestamp value
2462 */
xe_lrc_update_timestamp(struct xe_lrc * lrc,u64 * old_ts)2463 u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts)
2464 {
2465 *old_ts = lrc->ctx_timestamp;
2466 lrc->ctx_timestamp = xe_lrc_timestamp(lrc);
2467
2468 trace_xe_lrc_update_timestamp(lrc, *old_ts);
2469
2470 return lrc->ctx_timestamp;
2471 }
2472
/**
 * xe_lrc_ring_is_idle() - LRC is idle
 * @lrc: Pointer to the lrc.
 *
 * Compare LRC ring head and tail to determine if idle.
 *
 * Return: True if ring is idle, False otherwise
 */
bool xe_lrc_ring_is_idle(struct xe_lrc *lrc)
{
	return xe_lrc_ring_head(lrc) == xe_lrc_ring_tail(lrc);
}
2485