1 // SPDX-License-Identifier: MIT
2 /*
3 * Copyright © 2021 Intel Corporation
4 */
5
6 #include "xe_lrc.h"
7
8 #include <generated/xe_wa_oob.h>
9
10 #include <linux/ascii85.h>
11 #include <linux/panic.h>
12
13 #include "instructions/xe_mi_commands.h"
14 #include "instructions/xe_gfxpipe_commands.h"
15 #include "instructions/xe_gfx_state_commands.h"
16 #include "regs/xe_engine_regs.h"
17 #include "regs/xe_lrc_layout.h"
18 #include "xe_bb.h"
19 #include "xe_bo.h"
20 #include "xe_configfs.h"
21 #include "xe_device.h"
22 #include "xe_drm_client.h"
23 #include "xe_exec_queue_types.h"
24 #include "xe_gt.h"
25 #include "xe_gt_printk.h"
26 #include "xe_hw_fence.h"
27 #include "xe_map.h"
28 #include "xe_memirq.h"
29 #include "xe_mmio.h"
30 #include "xe_sriov.h"
31 #include "xe_trace_lrc.h"
32 #include "xe_vm.h"
33 #include "xe_wa.h"
34
35 #define LRC_VALID BIT_ULL(0)
36 #define LRC_PRIVILEGE BIT_ULL(8)
37 #define LRC_ADDRESSING_MODE GENMASK_ULL(4, 3)
38 #define LRC_LEGACY_64B_CONTEXT 3
39
40 #define LRC_ENGINE_CLASS GENMASK_ULL(63, 61)
41 #define LRC_ENGINE_INSTANCE GENMASK_ULL(53, 48)
42
43 #define LRC_PPHWSP_SIZE SZ_4K
44 #define LRC_INDIRECT_CTX_BO_SIZE SZ_4K
45 #define LRC_INDIRECT_RING_STATE_SIZE SZ_4K
46
47 /*
48 * Layout of the LRC and associated data allocated as
49 * lrc->bo:
50 *
51 * Region Size
52 * +============================+=================================+ <- __xe_lrc_ring_offset()
53 * | Ring | ring_size, see |
54 * | | xe_lrc_init() |
55 * +============================+=================================+ <- __xe_lrc_pphwsp_offset()
56 * | PPHWSP (includes SW state) | 4K |
57 * +----------------------------+---------------------------------+ <- __xe_lrc_regs_offset()
58 * | Engine Context Image | n * 4K, see |
59 * | | xe_gt_lrc_size() |
60 * +----------------------------+---------------------------------+ <- __xe_lrc_indirect_ring_offset()
61 * | Indirect Ring State Page | 0 or 4k, see |
62 * | | XE_LRC_FLAG_INDIRECT_RING_STATE |
63 * +============================+=================================+ <- __xe_lrc_indirect_ctx_offset()
64 * | Indirect Context Page | 0 or 4k, see |
65 * | | XE_LRC_FLAG_INDIRECT_CTX |
66 * +============================+=================================+ <- __xe_lrc_wa_bb_offset()
67 * | WA BB Per Ctx | 4k |
68 * +============================+=================================+ <- xe_bo_size(lrc->bo)
69 */
70
71 static struct xe_device *
72 lrc_to_xe(struct xe_lrc *lrc)
73 {
74 return gt_to_xe(lrc->fence_ctx.gt);
75 }
76
77 static bool
78 gt_engine_needs_indirect_ctx(struct xe_gt *gt, enum xe_engine_class class)
79 {
80 struct xe_device *xe = gt_to_xe(gt);
81
82 if (XE_GT_WA(gt, 16010904313) &&
83 (class == XE_ENGINE_CLASS_RENDER ||
84 class == XE_ENGINE_CLASS_COMPUTE))
85 return true;
86
87 if (xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
88 class, NULL))
89 return true;
90
91 return false;
92 }
93
94 size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
95 {
96 struct xe_device *xe = gt_to_xe(gt);
97 size_t size;
98
99 /* Per-process HW status page (PPHWSP) */
100 size = LRC_PPHWSP_SIZE;
101
102 /* Engine context image */
103 switch (class) {
104 case XE_ENGINE_CLASS_RENDER:
105 if (GRAPHICS_VER(xe) >= 20)
106 size += 3 * SZ_4K;
107 else
108 size += 13 * SZ_4K;
109 break;
110 case XE_ENGINE_CLASS_COMPUTE:
111 if (GRAPHICS_VER(xe) >= 20)
112 size += 2 * SZ_4K;
113 else
114 size += 13 * SZ_4K;
115 break;
116 default:
117 WARN(1, "Unknown engine class: %d", class);
118 fallthrough;
119 case XE_ENGINE_CLASS_COPY:
120 case XE_ENGINE_CLASS_VIDEO_DECODE:
121 case XE_ENGINE_CLASS_VIDEO_ENHANCE:
122 case XE_ENGINE_CLASS_OTHER:
123 size += 1 * SZ_4K;
124 }
125
126 /* Add indirect ring state page */
127 if (xe_gt_has_indirect_ring_state(gt))
128 size += LRC_INDIRECT_RING_STATE_SIZE;
129
130 return size;
131 }
132
133 /*
134 * The per-platform tables are u8-encoded in @data. Decode @data and set the
135 * addresses' offset and commands in @regs. The following encoding is used
136 * for each byte. There are 2 steps: decoding commands and decoding addresses.
137 *
138 * Commands:
139 * [7]: create NOPs - number of NOPs are set in lower bits
140 * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set
141 * MI_LRI_FORCE_POSTED
142 * [5:0]: Number of NOPs or registers to set values to in case of
143 * MI_LOAD_REGISTER_IMM
144 *
145 * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
146 * number of registers. They are set by using the REG/REG16 macros: the former
147 * is used for offsets smaller than 0x200 while the latter is for values bigger
148 * than that. Those macros already set all the bits documented below correctly:
149 *
150 * [7]: When a register offset needs more than 6 bits, use additional bytes, to
151 * follow, for the lower bits
152 * [6:0]: Register offset, without considering the engine base.
153 *
154 * This function only tweaks the commands and register offsets. Values are not
155 * filled out.
156 */
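/*
 * Illustrative decode example (not one of the platform tables below):
 * the sequence NOP(1), LRI(2, POSTED), REG16(0x244), REG(0x034) skips
 * one dword of @regs, then emits MI_LOAD_REGISTER_IMM |
 * MI_LRI_NUM_REGS(2) | MI_LRI_FORCE_POSTED | MI_LRI_LRM_CS_MMIO.
 * REG16(0x244) encodes as the two bytes 0x81, 0x11, which reassemble to
 * offset 0x91 and thus address base + (0x91 << 2) = base + 0x244;
 * REG(0x034) is the single byte 0x0d, giving base + 0x34. The value
 * dword following each address is left untouched.
 */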
157 static void set_offsets(u32 *regs,
158 const u8 *data,
159 const struct xe_hw_engine *hwe)
160 #define NOP(x) (BIT(7) | (x))
161 #define LRI(count, flags) ((flags) << 6 | (count) | \
162 BUILD_BUG_ON_ZERO(count >= BIT(6)))
163 #define POSTED BIT(0)
164 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
165 #define REG16(x) \
166 (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
167 (((x) >> 2) & 0x7f)
168 {
169 const u32 base = hwe->mmio_base;
170
171 while (*data) {
172 u8 count, flags;
173
174 if (*data & BIT(7)) { /* skip */
175 count = *data++ & ~BIT(7);
176 regs += count;
177 continue;
178 }
179
180 count = *data & 0x3f;
181 flags = *data >> 6;
182 data++;
183
184 *regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
185 if (flags & POSTED)
186 *regs |= MI_LRI_FORCE_POSTED;
187 *regs |= MI_LRI_LRM_CS_MMIO;
188 regs++;
189
190 xe_gt_assert(hwe->gt, count);
191 do {
192 u32 offset = 0;
193 u8 v;
194
195 do {
196 v = *data++;
197 offset <<= 7;
198 offset |= v & ~BIT(7);
199 } while (v & BIT(7));
200
201 regs[0] = base + (offset << 2);
202 regs += 2;
203 } while (--count);
204 }
205
206 *regs = MI_BATCH_BUFFER_END | BIT(0);
207 }
208
209 static const u8 gen12_xcs_offsets[] = {
210 NOP(1),
211 LRI(13, POSTED),
212 REG16(0x244),
213 REG(0x034),
214 REG(0x030),
215 REG(0x038),
216 REG(0x03c),
217 REG(0x168),
218 REG(0x140),
219 REG(0x110),
220 REG(0x1c0),
221 REG(0x1c4),
222 REG(0x1c8),
223 REG(0x180),
224 REG16(0x2b4),
225
226 NOP(5),
227 LRI(9, POSTED),
228 REG16(0x3a8),
229 REG16(0x28c),
230 REG16(0x288),
231 REG16(0x284),
232 REG16(0x280),
233 REG16(0x27c),
234 REG16(0x278),
235 REG16(0x274),
236 REG16(0x270),
237
238 0
239 };
240
241 static const u8 dg2_xcs_offsets[] = {
242 NOP(1),
243 LRI(15, POSTED),
244 REG16(0x244),
245 REG(0x034),
246 REG(0x030),
247 REG(0x038),
248 REG(0x03c),
249 REG(0x168),
250 REG(0x140),
251 REG(0x110),
252 REG(0x1c0),
253 REG(0x1c4),
254 REG(0x1c8),
255 REG(0x180),
256 REG16(0x2b4),
257 REG(0x120),
258 REG(0x124),
259
260 NOP(1),
261 LRI(9, POSTED),
262 REG16(0x3a8),
263 REG16(0x28c),
264 REG16(0x288),
265 REG16(0x284),
266 REG16(0x280),
267 REG16(0x27c),
268 REG16(0x278),
269 REG16(0x274),
270 REG16(0x270),
271
272 0
273 };
274
275 static const u8 gen12_rcs_offsets[] = {
276 NOP(1),
277 LRI(13, POSTED),
278 REG16(0x244),
279 REG(0x034),
280 REG(0x030),
281 REG(0x038),
282 REG(0x03c),
283 REG(0x168),
284 REG(0x140),
285 REG(0x110),
286 REG(0x1c0),
287 REG(0x1c4),
288 REG(0x1c8),
289 REG(0x180),
290 REG16(0x2b4),
291
292 NOP(5),
293 LRI(9, POSTED),
294 REG16(0x3a8),
295 REG16(0x28c),
296 REG16(0x288),
297 REG16(0x284),
298 REG16(0x280),
299 REG16(0x27c),
300 REG16(0x278),
301 REG16(0x274),
302 REG16(0x270),
303
304 LRI(3, POSTED),
305 REG(0x1b0),
306 REG16(0x5a8),
307 REG16(0x5ac),
308
309 NOP(6),
310 LRI(1, 0),
311 REG(0x0c8),
312 NOP(3 + 9 + 1),
313
314 LRI(51, POSTED),
315 REG16(0x588),
316 REG16(0x588),
317 REG16(0x588),
318 REG16(0x588),
319 REG16(0x588),
320 REG16(0x588),
321 REG(0x028),
322 REG(0x09c),
323 REG(0x0c0),
324 REG(0x178),
325 REG(0x17c),
326 REG16(0x358),
327 REG(0x170),
328 REG(0x150),
329 REG(0x154),
330 REG(0x158),
331 REG16(0x41c),
332 REG16(0x600),
333 REG16(0x604),
334 REG16(0x608),
335 REG16(0x60c),
336 REG16(0x610),
337 REG16(0x614),
338 REG16(0x618),
339 REG16(0x61c),
340 REG16(0x620),
341 REG16(0x624),
342 REG16(0x628),
343 REG16(0x62c),
344 REG16(0x630),
345 REG16(0x634),
346 REG16(0x638),
347 REG16(0x63c),
348 REG16(0x640),
349 REG16(0x644),
350 REG16(0x648),
351 REG16(0x64c),
352 REG16(0x650),
353 REG16(0x654),
354 REG16(0x658),
355 REG16(0x65c),
356 REG16(0x660),
357 REG16(0x664),
358 REG16(0x668),
359 REG16(0x66c),
360 REG16(0x670),
361 REG16(0x674),
362 REG16(0x678),
363 REG16(0x67c),
364 REG(0x068),
365 REG(0x084),
366 NOP(1),
367
368 0
369 };
370
371 static const u8 xehp_rcs_offsets[] = {
372 NOP(1),
373 LRI(13, POSTED),
374 REG16(0x244),
375 REG(0x034),
376 REG(0x030),
377 REG(0x038),
378 REG(0x03c),
379 REG(0x168),
380 REG(0x140),
381 REG(0x110),
382 REG(0x1c0),
383 REG(0x1c4),
384 REG(0x1c8),
385 REG(0x180),
386 REG16(0x2b4),
387
388 NOP(5),
389 LRI(9, POSTED),
390 REG16(0x3a8),
391 REG16(0x28c),
392 REG16(0x288),
393 REG16(0x284),
394 REG16(0x280),
395 REG16(0x27c),
396 REG16(0x278),
397 REG16(0x274),
398 REG16(0x270),
399
400 LRI(3, POSTED),
401 REG(0x1b0),
402 REG16(0x5a8),
403 REG16(0x5ac),
404
405 NOP(6),
406 LRI(1, 0),
407 REG(0x0c8),
408
409 0
410 };
411
412 static const u8 dg2_rcs_offsets[] = {
413 NOP(1),
414 LRI(15, POSTED),
415 REG16(0x244),
416 REG(0x034),
417 REG(0x030),
418 REG(0x038),
419 REG(0x03c),
420 REG(0x168),
421 REG(0x140),
422 REG(0x110),
423 REG(0x1c0),
424 REG(0x1c4),
425 REG(0x1c8),
426 REG(0x180),
427 REG16(0x2b4),
428 REG(0x120),
429 REG(0x124),
430
431 NOP(1),
432 LRI(9, POSTED),
433 REG16(0x3a8),
434 REG16(0x28c),
435 REG16(0x288),
436 REG16(0x284),
437 REG16(0x280),
438 REG16(0x27c),
439 REG16(0x278),
440 REG16(0x274),
441 REG16(0x270),
442
443 LRI(3, POSTED),
444 REG(0x1b0),
445 REG16(0x5a8),
446 REG16(0x5ac),
447
448 NOP(6),
449 LRI(1, 0),
450 REG(0x0c8),
451
452 0
453 };
454
455 static const u8 mtl_rcs_offsets[] = {
456 NOP(1),
457 LRI(15, POSTED),
458 REG16(0x244),
459 REG(0x034),
460 REG(0x030),
461 REG(0x038),
462 REG(0x03c),
463 REG(0x168),
464 REG(0x140),
465 REG(0x110),
466 REG(0x1c0),
467 REG(0x1c4),
468 REG(0x1c8),
469 REG(0x180),
470 REG16(0x2b4),
471 REG(0x120),
472 REG(0x124),
473
474 NOP(1),
475 LRI(9, POSTED),
476 REG16(0x3a8),
477 REG16(0x28c),
478 REG16(0x288),
479 REG16(0x284),
480 REG16(0x280),
481 REG16(0x27c),
482 REG16(0x278),
483 REG16(0x274),
484 REG16(0x270),
485
486 NOP(2),
487 LRI(2, POSTED),
488 REG16(0x5a8),
489 REG16(0x5ac),
490
491 NOP(6),
492 LRI(1, 0),
493 REG(0x0c8),
494
495 0
496 };
497
498 #define XE2_CTX_COMMON \
499 NOP(1), /* [0x00] */ \
500 LRI(15, POSTED), /* [0x01] */ \
501 REG16(0x244), /* [0x02] CTXT_SR_CTL */ \
502 REG(0x034), /* [0x04] RING_BUFFER_HEAD */ \
503 REG(0x030), /* [0x06] RING_BUFFER_TAIL */ \
504 REG(0x038), /* [0x08] RING_BUFFER_START */ \
505 REG(0x03c), /* [0x0a] RING_BUFFER_CONTROL */ \
506 REG(0x168), /* [0x0c] BB_ADDR_UDW */ \
507 REG(0x140), /* [0x0e] BB_ADDR */ \
508 REG(0x110), /* [0x10] BB_STATE */ \
509 REG(0x1c0), /* [0x12] BB_PER_CTX_PTR */ \
510 REG(0x1c4), /* [0x14] RCS_INDIRECT_CTX */ \
511 REG(0x1c8), /* [0x16] RCS_INDIRECT_CTX_OFFSET */ \
512 REG(0x180), /* [0x18] CCID */ \
513 REG16(0x2b4), /* [0x1a] SEMAPHORE_TOKEN */ \
514 REG(0x120), /* [0x1c] PRT_BB_STATE */ \
515 REG(0x124), /* [0x1e] PRT_BB_STATE_UDW */ \
516 \
517 NOP(1), /* [0x20] */ \
518 LRI(9, POSTED), /* [0x21] */ \
519 REG16(0x3a8), /* [0x22] CTX_TIMESTAMP */ \
520 REG16(0x3ac), /* [0x24] CTX_TIMESTAMP_UDW */ \
521 REG(0x108), /* [0x26] INDIRECT_RING_STATE */ \
522 REG16(0x284), /* [0x28] dummy reg */ \
523 REG16(0x280), /* [0x2a] CS_ACC_CTR_THOLD */ \
524 REG16(0x27c), /* [0x2c] CS_CTX_SYS_PASID */ \
525 REG16(0x278), /* [0x2e] CS_CTX_ASID */ \
526 REG16(0x274), /* [0x30] PTBP_UDW */ \
527 REG16(0x270) /* [0x32] PTBP_LDW */
528
529 static const u8 xe2_rcs_offsets[] = {
530 XE2_CTX_COMMON,
531
532 NOP(2), /* [0x34] */
533 LRI(2, POSTED), /* [0x36] */
534 REG16(0x5a8), /* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
535 REG16(0x5ac), /* [0x39] PREEMPTION_STATUS */
536
537 NOP(6), /* [0x41] */
538 LRI(1, 0), /* [0x47] */
539 REG(0x0c8), /* [0x48] R_PWR_CLK_STATE */
540
541 0
542 };
543
544 static const u8 xe2_bcs_offsets[] = {
545 XE2_CTX_COMMON,
546
547 NOP(4 + 8 + 1), /* [0x34] */
548 LRI(2, POSTED), /* [0x41] */
549 REG16(0x200), /* [0x42] BCS_SWCTRL */
550 REG16(0x204), /* [0x44] BLIT_CCTL */
551
552 0
553 };
554
555 static const u8 xe2_xcs_offsets[] = {
556 XE2_CTX_COMMON,
557
558 0
559 };
560
561 static const u8 xe2_indirect_ring_state_offsets[] = {
562 NOP(1), /* [0x00] */
563 LRI(5, POSTED), /* [0x01] */
564 REG(0x034), /* [0x02] RING_BUFFER_HEAD */
565 REG(0x030), /* [0x04] RING_BUFFER_TAIL */
566 REG(0x038), /* [0x06] RING_BUFFER_START */
567 REG(0x048), /* [0x08] RING_BUFFER_START_UDW */
568 REG(0x03c), /* [0x0a] RING_BUFFER_CONTROL */
569
570 NOP(5), /* [0x0c] */
571 LRI(9, POSTED), /* [0x11] */
572 REG(0x168), /* [0x12] BB_ADDR_UDW */
573 REG(0x140), /* [0x14] BB_ADDR */
574 REG(0x110), /* [0x16] BB_STATE */
575 REG16(0x588), /* [0x18] BB_STACK_WRITE_PORT */
576 REG16(0x588), /* [0x20] BB_STACK_WRITE_PORT */
577 REG16(0x588), /* [0x22] BB_STACK_WRITE_PORT */
578 REG16(0x588), /* [0x24] BB_STACK_WRITE_PORT */
579 REG16(0x588), /* [0x26] BB_STACK_WRITE_PORT */
580 REG16(0x588), /* [0x28] BB_STACK_WRITE_PORT */
581
582 NOP(12), /* [0x00] */
583
584 0
585 };
586
587 #undef REG16
588 #undef REG
589 #undef LRI
590 #undef NOP
591
592 static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
593 {
594 if (class == XE_ENGINE_CLASS_RENDER) {
595 if (GRAPHICS_VER(xe) >= 20)
596 return xe2_rcs_offsets;
597 else if (GRAPHICS_VERx100(xe) >= 1270)
598 return mtl_rcs_offsets;
599 else if (GRAPHICS_VERx100(xe) >= 1255)
600 return dg2_rcs_offsets;
601 else if (GRAPHICS_VERx100(xe) >= 1250)
602 return xehp_rcs_offsets;
603 else
604 return gen12_rcs_offsets;
605 } else if (class == XE_ENGINE_CLASS_COPY) {
606 if (GRAPHICS_VER(xe) >= 20)
607 return xe2_bcs_offsets;
608 else
609 return gen12_xcs_offsets;
610 } else {
611 if (GRAPHICS_VER(xe) >= 20)
612 return xe2_xcs_offsets;
613 else if (GRAPHICS_VERx100(xe) >= 1255)
614 return dg2_xcs_offsets;
615 else
616 return gen12_xcs_offsets;
617 }
618 }
619
620 static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
621 {
622 regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
623 CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
624
625 if (xe_gt_has_indirect_ring_state(hwe->gt))
626 regs[CTX_CONTEXT_CONTROL] |=
627 _MASKED_BIT_ENABLE(CTX_CTRL_INDIRECT_RING_STATE_ENABLE);
628 }
629
630 static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe)
631 {
632 struct xe_memirq *memirq = &gt_to_tile(hwe->gt)->memirq;
633 struct xe_device *xe = gt_to_xe(hwe->gt);
634 u8 num_regs;
635
636 if (!xe_device_uses_memirq(xe))
637 return;
638
639 regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM |
640 MI_LRI_LRM_CS_MMIO | MI_LRM_USE_GGTT;
641 regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr;
642 regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq);
643
644 num_regs = xe_device_has_msix(xe) ? 3 : 2;
645 regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(num_regs) |
646 MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED;
647 regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr;
648 regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq, hwe);
649 regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr;
650 regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq, hwe);
651
652 if (xe_device_has_msix(xe)) {
653 regs[CTX_CS_INT_VEC_REG] = CS_INT_VEC(0).addr;
654 /* CTX_CS_INT_VEC_DATA will be set in xe_lrc_init */
655 }
656 }
657
658 static int lrc_ring_mi_mode(struct xe_hw_engine *hwe)
659 {
660 struct xe_device *xe = gt_to_xe(hwe->gt);
661
662 if (GRAPHICS_VERx100(xe) >= 1250)
663 return 0x70;
664 else
665 return 0x60;
666 }
667
668 static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe)
669 {
670 int x;
671
672 x = lrc_ring_mi_mode(hwe);
673 regs[x + 1] &= ~STOP_RING;
674 regs[x + 1] |= STOP_RING << 16;
675 }
676
677 static inline bool xe_lrc_has_indirect_ring_state(struct xe_lrc *lrc)
678 {
679 return lrc->flags & XE_LRC_FLAG_INDIRECT_RING_STATE;
680 }
681
682 static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
683 {
684 return 0;
685 }
686
687 u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
688 {
689 return lrc->ring.size;
690 }
691
692 /* Make the magic macros work */
693 #define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset
694 #define __xe_lrc_regs_offset xe_lrc_regs_offset
695
696 #define LRC_SEQNO_PPHWSP_OFFSET 512
697 #define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
698 #define LRC_CTX_JOB_TIMESTAMP_OFFSET (LRC_START_SEQNO_PPHWSP_OFFSET + 8)
699 #define LRC_ENGINE_ID_PPHWSP_OFFSET 1024
700 #define LRC_PARALLEL_PPHWSP_OFFSET 2048
701
702 u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
703 {
704 return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
705 }
706
707 /**
708 * xe_lrc_reg_size() - Get size of the LRC registers area within queues
709 * @xe: the &xe_device struct instance
710 *
711 * Returns: Size of the LRC registers area for current platform
712 */
713 size_t xe_lrc_reg_size(struct xe_device *xe)
714 {
715 if (GRAPHICS_VERx100(xe) >= 1250)
716 return 96 * sizeof(u32);
717 else
718 return 80 * sizeof(u32);
719 }
720
721 size_t xe_lrc_skip_size(struct xe_device *xe)
722 {
723 return LRC_PPHWSP_SIZE + xe_lrc_reg_size(xe);
724 }
725
726 static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
727 {
728 /* The seqno is stored in the driver-defined portion of PPHWSP */
729 return xe_lrc_pphwsp_offset(lrc) + LRC_SEQNO_PPHWSP_OFFSET;
730 }
731
732 static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
733 {
734 /* The start seqno is stored in the driver-defined portion of PPHWSP */
735 return xe_lrc_pphwsp_offset(lrc) + LRC_START_SEQNO_PPHWSP_OFFSET;
736 }
737
738 static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc)
739 {
740 /* This is stored in the driver-defined portion of PPHWSP */
741 return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET;
742 }
743
744 static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
745 {
746 /* The parallel is stored in the driver-defined portion of PPHWSP */
747 return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
748 }
749
750 static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc)
751 {
752 return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET;
753 }
754
755 static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
756 {
757 return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
758 }
759
760 static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc)
761 {
762 return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32);
763 }
764
765 static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
766 {
767 u32 offset = xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE -
768 LRC_INDIRECT_RING_STATE_SIZE;
769
770 if (lrc->flags & XE_LRC_FLAG_INDIRECT_CTX)
771 offset -= LRC_INDIRECT_CTX_BO_SIZE;
772
773 return offset;
774 }
775
776 static inline u32 __xe_lrc_indirect_ctx_offset(struct xe_lrc *lrc)
777 {
778 return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE - LRC_INDIRECT_CTX_BO_SIZE;
779 }
780
781 static inline u32 __xe_lrc_wa_bb_offset(struct xe_lrc *lrc)
782 {
783 return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE;
784 }
785
786 #define DECL_MAP_ADDR_HELPERS(elem) \
787 static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \
788 { \
789 struct iosys_map map = lrc->bo->vmap; \
790 \
791 xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map)); \
792 iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \
793 return map; \
794 } \
795 static inline u32 __maybe_unused __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \
796 { \
797 return xe_bo_ggtt_addr(lrc->bo) + __xe_lrc_##elem##_offset(lrc); \
798 } \
799
800 DECL_MAP_ADDR_HELPERS(ring)
801 DECL_MAP_ADDR_HELPERS(pphwsp)
802 DECL_MAP_ADDR_HELPERS(seqno)
803 DECL_MAP_ADDR_HELPERS(regs)
804 DECL_MAP_ADDR_HELPERS(start_seqno)
805 DECL_MAP_ADDR_HELPERS(ctx_job_timestamp)
806 DECL_MAP_ADDR_HELPERS(ctx_timestamp)
807 DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw)
808 DECL_MAP_ADDR_HELPERS(parallel)
809 DECL_MAP_ADDR_HELPERS(indirect_ring)
810 DECL_MAP_ADDR_HELPERS(engine_id)
811
812 #undef DECL_MAP_ADDR_HELPERS
813
814 /**
815 * xe_lrc_ctx_timestamp_ggtt_addr() - Get ctx timestamp GGTT address
816 * @lrc: Pointer to the lrc.
817 *
818 * Returns: ctx timestamp GGTT address
819 */
820 u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc)
821 {
822 return __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
823 }
824
825 /**
826 * xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address
827 * @lrc: Pointer to the lrc.
828 *
829 * Returns: ctx timestamp udw GGTT address
830 */
831 u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc)
832 {
833 return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
834 }
835
836 /**
837 * xe_lrc_ctx_timestamp() - Read ctx timestamp value
838 * @lrc: Pointer to the lrc.
839 *
840 * Returns: ctx timestamp value
841 */
842 u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
843 {
844 struct xe_device *xe = lrc_to_xe(lrc);
845 struct iosys_map map;
846 u32 ldw, udw = 0;
847
848 map = __xe_lrc_ctx_timestamp_map(lrc);
849 ldw = xe_map_read32(xe, &map);
850
851 if (xe->info.has_64bit_timestamp) {
852 map = __xe_lrc_ctx_timestamp_udw_map(lrc);
853 udw = xe_map_read32(xe, &map);
854 }
855
856 return (u64)udw << 32 | ldw;
857 }
858
859 /**
860 * xe_lrc_ctx_job_timestamp_ggtt_addr() - Get ctx job timestamp GGTT address
861 * @lrc: Pointer to the lrc.
862 *
863 * Returns: ctx timestamp job GGTT address
864 */
865 u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc)
866 {
867 return __xe_lrc_ctx_job_timestamp_ggtt_addr(lrc);
868 }
869
870 /**
871 * xe_lrc_ctx_job_timestamp() - Read ctx job timestamp value
872 * @lrc: Pointer to the lrc.
873 *
874 * Returns: ctx timestamp job value
875 */
876 u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc)
877 {
878 struct xe_device *xe = lrc_to_xe(lrc);
879 struct iosys_map map;
880
881 map = __xe_lrc_ctx_job_timestamp_map(lrc);
882 return xe_map_read32(xe, &map);
883 }
884
885 u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
886 {
887 return __xe_lrc_pphwsp_ggtt_addr(lrc);
888 }
889
890 u32 xe_lrc_indirect_ring_ggtt_addr(struct xe_lrc *lrc)
891 {
892 if (!xe_lrc_has_indirect_ring_state(lrc))
893 return 0;
894
895 return __xe_lrc_indirect_ring_ggtt_addr(lrc);
896 }
897
898 static u32 xe_lrc_read_indirect_ctx_reg(struct xe_lrc *lrc, int reg_nr)
899 {
900 struct xe_device *xe = lrc_to_xe(lrc);
901 struct iosys_map map;
902
903 map = __xe_lrc_indirect_ring_map(lrc);
904 iosys_map_incr(&map, reg_nr * sizeof(u32));
905 return xe_map_read32(xe, &map);
906 }
907
908 static void xe_lrc_write_indirect_ctx_reg(struct xe_lrc *lrc,
909 int reg_nr, u32 val)
910 {
911 struct xe_device *xe = lrc_to_xe(lrc);
912 struct iosys_map map;
913
914 map = __xe_lrc_indirect_ring_map(lrc);
915 iosys_map_incr(&map, reg_nr * sizeof(u32));
916 xe_map_write32(xe, &map, val);
917 }
918
919 u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
920 {
921 struct xe_device *xe = lrc_to_xe(lrc);
922 struct iosys_map map;
923
924 map = __xe_lrc_regs_map(lrc);
925 iosys_map_incr(&map, reg_nr * sizeof(u32));
926 return xe_map_read32(xe, &map);
927 }
928
929 void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
930 {
931 struct xe_device *xe = lrc_to_xe(lrc);
932 struct iosys_map map;
933
934 map = __xe_lrc_regs_map(lrc);
935 iosys_map_incr(&map, reg_nr * sizeof(u32));
936 xe_map_write32(xe, &map, val);
937 }
938
939 static void *empty_lrc_data(struct xe_hw_engine *hwe)
940 {
941 struct xe_gt *gt = hwe->gt;
942 void *data;
943 u32 *regs;
944
945 data = kzalloc(xe_gt_lrc_size(gt, hwe->class), GFP_KERNEL);
946 if (!data)
947 return NULL;
948
949 /* 1st page: Per-Process of HW status Page */
950 regs = data + LRC_PPHWSP_SIZE;
951 set_offsets(regs, reg_offsets(gt_to_xe(gt), hwe->class), hwe);
952 set_context_control(regs, hwe);
953 set_memory_based_intr(regs, hwe);
954 reset_stop_ring(regs, hwe);
955 if (xe_gt_has_indirect_ring_state(gt)) {
956 regs = data + xe_gt_lrc_size(gt, hwe->class) -
957 LRC_INDIRECT_RING_STATE_SIZE;
958 set_offsets(regs, xe2_indirect_ring_state_offsets, hwe);
959 }
960
961 return data;
962 }
963
964 /**
965 * xe_default_lrc_update_memirq_regs_with_address - Re-compute GGTT references in default LRC
966 * of given engine.
967 * @hwe: the &xe_hw_engine struct instance
968 */
969 void xe_default_lrc_update_memirq_regs_with_address(struct xe_hw_engine *hwe)
970 {
971 struct xe_gt *gt = hwe->gt;
972 u32 *regs;
973
974 if (!gt->default_lrc[hwe->class])
975 return;
976
977 regs = gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE;
978 set_memory_based_intr(regs, hwe);
979 }
980
981 /**
982 * xe_lrc_update_memirq_regs_with_address - Re-compute GGTT references in mem interrupt data
983 * for given LRC.
984 * @lrc: the &xe_lrc struct instance
985 * @hwe: the &xe_hw_engine struct instance
986 * @regs: scratch buffer to be used as temporary storage
987 */
988 void xe_lrc_update_memirq_regs_with_address(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
989 u32 *regs)
990 {
991 struct xe_gt *gt = hwe->gt;
992 struct iosys_map map;
993 size_t regs_len;
994
995 if (!xe_device_uses_memirq(gt_to_xe(gt)))
996 return;
997
998 map = __xe_lrc_regs_map(lrc);
999 regs_len = xe_lrc_reg_size(gt_to_xe(gt));
1000 xe_map_memcpy_from(gt_to_xe(gt), regs, &map, 0, regs_len);
1001 set_memory_based_intr(regs, hwe);
1002 xe_map_memcpy_to(gt_to_xe(gt), &map, 0, regs, regs_len);
1003 }
1004
1005 static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
1006 {
1007 u64 desc = xe_vm_pdp4_descriptor(vm, gt_to_tile(lrc->gt));
1008
1009 xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
1010 xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
1011 }
1012
1013 static void xe_lrc_finish(struct xe_lrc *lrc)
1014 {
1015 xe_hw_fence_ctx_finish(&lrc->fence_ctx);
1016 xe_bo_unpin_map_no_vm(lrc->bo);
1017 }
1018
1019 /*
1020 * wa_bb_setup_utilization() - Write commands to wa bb to assist
1021 * in calculating active context run ticks.
1022 *
1023 * Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the
1024 * context, but only gets updated when the context switches out. In order to
1025 * check how long a context has been active before it switches out, two things
1026 * are required:
1027 *
1028 * (1) Determine if the context is running:
1029 * To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in
1030 * the LRC. The value chosen is 1 since 0 is the initial value when the LRC is
1031 * initialized. During a query, we just check for this value to determine if the
1032 * context is active. If the context switched out, it would overwrite this
1033 * location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as
1034 * the last part of context restore, so reusing this LRC location will not
1035 * clobber anything.
1036 *
1037 * (2) Calculate the time that the context has been active for:
1038 * The CTX_TIMESTAMP ticks only when the context is active. If a context is
1039 * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization.
1040 * While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific
1041 * engine instance. Since we do not know which instance the context is running
1042 * on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and
1043 * store it in the PPHSWP.
1044 */
1045 #define CONTEXT_ACTIVE 1ULL
1046 static ssize_t setup_utilization_wa(struct xe_lrc *lrc,
1047 struct xe_hw_engine *hwe,
1048 u32 *batch,
1049 size_t max_len)
1050 {
1051 u32 *cmd = batch;
1052
1053 if (IS_SRIOV_VF(gt_to_xe(lrc->gt)))
1054 return 0;
1055
1056 if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
1057 return -ENOSPC;
1058
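/*
 * Worst case below (on a platform with a 64-bit timestamp) is one
 * 4-dword MI_STORE_REGISTER_MEM plus two 4-dword MI_STORE_DATA_IMM
 * writes, i.e. the 12 dwords checked for above.
 */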
1059 *cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET;
1060 *cmd++ = ENGINE_ID(0).addr;
1061 *cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc);
1062 *cmd++ = 0;
1063
1064 *cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
1065 *cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
1066 *cmd++ = 0;
1067 *cmd++ = lower_32_bits(CONTEXT_ACTIVE);
1068
1069 if (lrc_to_xe(lrc)->info.has_64bit_timestamp) {
1070 *cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
1071 *cmd++ = __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
1072 *cmd++ = 0;
1073 *cmd++ = upper_32_bits(CONTEXT_ACTIVE);
1074 }
1075
1076 return cmd - batch;
1077 }
1078
1079 static ssize_t setup_timestamp_wa(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
1080 u32 *batch, size_t max_len)
1081 {
1082 const u32 ts_addr = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
1083 u32 *cmd = batch;
1084
1085 if (!XE_GT_WA(lrc->gt, 16010904313) ||
1086 !(hwe->class == XE_ENGINE_CLASS_RENDER ||
1087 hwe->class == XE_ENGINE_CLASS_COMPUTE ||
1088 hwe->class == XE_ENGINE_CLASS_COPY ||
1089 hwe->class == XE_ENGINE_CLASS_VIDEO_DECODE ||
1090 hwe->class == XE_ENGINE_CLASS_VIDEO_ENHANCE))
1091 return 0;
1092
1093 if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
1094 return -ENOSPC;
1095
1096 *cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
1097 MI_LRM_ASYNC;
1098 *cmd++ = RING_CTX_TIMESTAMP(0).addr;
1099 *cmd++ = ts_addr;
1100 *cmd++ = 0;
1101
1102 *cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
1103 MI_LRM_ASYNC;
1104 *cmd++ = RING_CTX_TIMESTAMP(0).addr;
1105 *cmd++ = ts_addr;
1106 *cmd++ = 0;
1107
1108 *cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO;
1109 *cmd++ = RING_CTX_TIMESTAMP(0).addr;
1110 *cmd++ = ts_addr;
1111 *cmd++ = 0;
1112
1113 return cmd - batch;
1114 }
1115
1116 static ssize_t setup_configfs_post_ctx_restore_bb(struct xe_lrc *lrc,
1117 struct xe_hw_engine *hwe,
1118 u32 *batch, size_t max_len)
1119 {
1120 struct xe_device *xe = gt_to_xe(lrc->gt);
1121 const u32 *user_batch;
1122 u32 *cmd = batch;
1123 u32 count;
1124
1125 count = xe_configfs_get_ctx_restore_post_bb(to_pci_dev(xe->drm.dev),
1126 hwe->class, &user_batch);
1127 if (!count)
1128 return 0;
1129
1130 if (count > max_len)
1131 return -ENOSPC;
1132
1133 /*
1134 * This should be used only for tests and validation. Taint the kernel
1135 * as anything could be submitted directly in context switches
1136 */
1137 add_taint(TAINT_TEST, LOCKDEP_STILL_OK);
1138
1139 memcpy(cmd, user_batch, count * sizeof(u32));
1140 cmd += count;
1141
1142 return cmd - batch;
1143 }
1144
1145 static ssize_t setup_configfs_mid_ctx_restore_bb(struct xe_lrc *lrc,
1146 struct xe_hw_engine *hwe,
1147 u32 *batch, size_t max_len)
1148 {
1149 struct xe_device *xe = gt_to_xe(lrc->gt);
1150 const u32 *user_batch;
1151 u32 *cmd = batch;
1152 u32 count;
1153
1154 count = xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
1155 hwe->class, &user_batch);
1156 if (!count)
1157 return 0;
1158
1159 if (count > max_len)
1160 return -ENOSPC;
1161
1162 /*
1163 * This should be used only for tests and validation. Taint the kernel
1164 * as anything could be submitted directly in context switches
1165 */
1166 add_taint(TAINT_TEST, LOCKDEP_STILL_OK);
1167
1168 memcpy(cmd, user_batch, count * sizeof(u32));
1169 cmd += count;
1170
1171 return cmd - batch;
1172 }
1173
1174 static ssize_t setup_invalidate_state_cache_wa(struct xe_lrc *lrc,
1175 struct xe_hw_engine *hwe,
1176 u32 *batch, size_t max_len)
1177 {
1178 u32 *cmd = batch;
1179
1180 if (!XE_GT_WA(lrc->gt, 18022495364) ||
1181 hwe->class != XE_ENGINE_CLASS_RENDER)
1182 return 0;
1183
1184 if (xe_gt_WARN_ON(lrc->gt, max_len < 3))
1185 return -ENOSPC;
1186
1187 *cmd++ = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1);
1188 *cmd++ = CS_DEBUG_MODE1(0).addr;
1189 *cmd++ = _MASKED_BIT_ENABLE(INSTRUCTION_STATE_CACHE_INVALIDATE);
1190
1191 return cmd - batch;
1192 }
1193
1194 struct bo_setup {
1195 ssize_t (*setup)(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
1196 u32 *batch, size_t max_size);
1197 };
1198
1199 struct bo_setup_state {
1200 /* Input: */
1201 struct xe_lrc *lrc;
1202 struct xe_hw_engine *hwe;
1203 size_t max_size;
1204 size_t reserve_dw;
1205 unsigned int offset;
1206 const struct bo_setup *funcs;
1207 unsigned int num_funcs;
1208
1209 /* State: */
1210 u32 *buffer;
1211 u32 *ptr;
1212 unsigned int written;
1213 };
1214
1215 static int setup_bo(struct bo_setup_state *state)
1216 {
1217 ssize_t remain;
1218
1219 if (state->lrc->bo->vmap.is_iomem) {
1220 xe_gt_assert(state->hwe->gt, state->buffer);
1221 state->ptr = state->buffer;
1222 } else {
1223 state->ptr = state->lrc->bo->vmap.vaddr + state->offset;
1224 }
1225
1226 remain = state->max_size / sizeof(u32);
1227
1228 for (size_t i = 0; i < state->num_funcs; i++) {
1229 ssize_t len = state->funcs[i].setup(state->lrc, state->hwe,
1230 state->ptr, remain);
1231
1232 remain -= len;
1233
1234 /*
1235 * Caller has asked for at least reserve_dw to remain unused.
1236 */
1237 if (len < 0 ||
1238 xe_gt_WARN_ON(state->lrc->gt, remain < state->reserve_dw))
1239 goto fail;
1240
1241 state->ptr += len;
1242 state->written += len;
1243 }
1244
1245 return 0;
1246
1247 fail:
1248 return -ENOSPC;
1249 }
1250
1251 static void finish_bo(struct bo_setup_state *state)
1252 {
1253 if (!state->lrc->bo->vmap.is_iomem)
1254 return;
1255
1256 xe_map_memcpy_to(gt_to_xe(state->lrc->gt), &state->lrc->bo->vmap,
1257 state->offset, state->buffer,
1258 state->written * sizeof(u32));
1259 }
1260
1261 /**
1262 * xe_lrc_setup_wa_bb_with_scratch - Execute all wa bb setup callbacks.
1263 * @lrc: the &xe_lrc struct instance
1264 * @hwe: the &xe_hw_engine struct instance
1265 * @scratch: preallocated scratch buffer for temporary storage
1266 * Return: 0 on success, negative error code on failure
1267 */
1268 int xe_lrc_setup_wa_bb_with_scratch(struct xe_lrc *lrc, struct xe_hw_engine *hwe, u32 *scratch)
1269 {
1270 static const struct bo_setup funcs[] = {
1271 { .setup = setup_timestamp_wa },
1272 { .setup = setup_invalidate_state_cache_wa },
1273 { .setup = setup_utilization_wa },
1274 { .setup = setup_configfs_post_ctx_restore_bb },
1275 };
1276 struct bo_setup_state state = {
1277 .lrc = lrc,
1278 .hwe = hwe,
1279 .max_size = LRC_WA_BB_SIZE,
1280 .buffer = scratch,
1281 .reserve_dw = 1,
1282 .offset = __xe_lrc_wa_bb_offset(lrc),
1283 .funcs = funcs,
1284 .num_funcs = ARRAY_SIZE(funcs),
1285 };
1286 int ret;
1287
1288 ret = setup_bo(&state);
1289 if (ret)
1290 return ret;
1291
1292 *state.ptr++ = MI_BATCH_BUFFER_END;
1293 state.written++;
1294
1295 finish_bo(&state);
1296
1297 xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
1298 xe_bo_ggtt_addr(lrc->bo) + state.offset + 1);
1299
1300 return 0;
1301 }
1302
1303 static int setup_wa_bb(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
1304 {
1305 u32 *buf = NULL;
1306 int ret;
1307
1308 if (lrc->bo->vmap.is_iomem) {
1309 buf = kmalloc(LRC_WA_BB_SIZE, GFP_KERNEL);
1310 if (!buf)
1311 return -ENOMEM;
1312 }
1313
1314 ret = xe_lrc_setup_wa_bb_with_scratch(lrc, hwe, buf);
1315
1316 kfree(buf);
1317
1318 return ret;
1319 }
1320
1321 static int
1322 setup_indirect_ctx(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
1323 {
1324 static const struct bo_setup rcs_funcs[] = {
1325 { .setup = setup_timestamp_wa },
1326 { .setup = setup_configfs_mid_ctx_restore_bb },
1327 };
1328 static const struct bo_setup xcs_funcs[] = {
1329 { .setup = setup_configfs_mid_ctx_restore_bb },
1330 };
1331 struct bo_setup_state state = {
1332 .lrc = lrc,
1333 .hwe = hwe,
1334 .max_size = (63 * 64) /* max 63 cachelines */,
1335 .buffer = NULL,
1336 .offset = __xe_lrc_indirect_ctx_offset(lrc),
1337 };
1338 int ret;
1339
1340 if (!(lrc->flags & XE_LRC_FLAG_INDIRECT_CTX))
1341 return 0;
1342
1343 if (hwe->class == XE_ENGINE_CLASS_RENDER ||
1344 hwe->class == XE_ENGINE_CLASS_COMPUTE) {
1345 state.funcs = rcs_funcs;
1346 state.num_funcs = ARRAY_SIZE(rcs_funcs);
1347 } else {
1348 state.funcs = xcs_funcs;
1349 state.num_funcs = ARRAY_SIZE(xcs_funcs);
1350 }
1351
1352 if (xe_gt_WARN_ON(lrc->gt, !state.funcs))
1353 return 0;
1354
1355 if (lrc->bo->vmap.is_iomem) {
1356 state.buffer = kmalloc(state.max_size, GFP_KERNEL);
1357 if (!state.buffer)
1358 return -ENOMEM;
1359 }
1360
1361 ret = setup_bo(&state);
1362 if (ret) {
1363 kfree(state.buffer);
1364 return ret;
1365 }
1366
1367 /*
1368 * Align to 64B cacheline so there's no garbage at the end for CS to
1369 * execute: size for indirect ctx must be a multiple of 64.
1370 */
1371 while (state.written & 0xf) {
1372 *state.ptr++ = MI_NOOP;
1373 state.written++;
1374 }
1375
1376 finish_bo(&state);
1377 kfree(state.buffer);
1378
1379 /*
1380 * Enable INDIRECT_CTX leaving INDIRECT_CTX_OFFSET at its default: it
1381 * varies per engine class, but the default is good enough
1382 */
1383 xe_lrc_write_ctx_reg(lrc,
1384 CTX_CS_INDIRECT_CTX,
1385 (xe_bo_ggtt_addr(lrc->bo) + state.offset) |
1386 /* Size in CLs. */
1387 (state.written * sizeof(u32) / 64));
1388
1389 return 0;
1390 }
1391
1392 static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
1393 struct xe_vm *vm, u32 ring_size, u16 msix_vec,
1394 u32 init_flags)
1395 {
1396 struct xe_gt *gt = hwe->gt;
1397 const u32 lrc_size = xe_gt_lrc_size(gt, hwe->class);
1398 u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
1399 struct xe_tile *tile = gt_to_tile(gt);
1400 struct xe_device *xe = gt_to_xe(gt);
1401 struct iosys_map map;
1402 u32 arb_enable;
1403 u32 bo_flags;
1404 int err;
1405
1406 kref_init(&lrc->refcount);
1407 lrc->gt = gt;
1408 lrc->size = lrc_size;
1409 lrc->flags = 0;
1410 lrc->ring.size = ring_size;
1411 lrc->ring.tail = 0;
1412
1413 if (gt_engine_needs_indirect_ctx(gt, hwe->class)) {
1414 lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
1415 bo_size += LRC_INDIRECT_CTX_BO_SIZE;
1416 }
1417
1418 if (xe_gt_has_indirect_ring_state(gt))
1419 lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;
1420
1421 bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
1422 XE_BO_FLAG_GGTT_INVALIDATE;
1423
1424 if ((vm && vm->xef) || init_flags & XE_LRC_CREATE_USER_CTX) /* userspace */
1425 bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE | XE_BO_FLAG_FORCE_USER_VRAM;
1426
1427 lrc->bo = xe_bo_create_pin_map_novm(xe, tile,
1428 bo_size,
1429 ttm_bo_type_kernel,
1430 bo_flags, false);
1431 if (IS_ERR(lrc->bo))
1432 return PTR_ERR(lrc->bo);
1433
1434 xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
1435 hwe->fence_irq, hwe->name);
1436
1437 /*
1438 * Init Per-Process of HW status Page, LRC / context state to known
1439 * values. If there's already a primed default_lrc, just copy it, otherwise
1440 * it's the early submission to record the lrc: build a new empty one from
1441 * scratch.
1442 */
1443 map = __xe_lrc_pphwsp_map(lrc);
1444 if (gt->default_lrc[hwe->class]) {
1445 xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE); /* PPHWSP */
1446 xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
1447 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
1448 lrc_size - LRC_PPHWSP_SIZE);
1449 } else {
1450 void *init_data = empty_lrc_data(hwe);
1451
1452 if (!init_data) {
1453 err = -ENOMEM;
1454 goto err_lrc_finish;
1455 }
1456
1457 xe_map_memcpy_to(xe, &map, 0, init_data, lrc_size);
1458 kfree(init_data);
1459 }
1460
1461 if (vm) {
1462 xe_lrc_set_ppgtt(lrc, vm);
1463
1464 if (vm->xef)
1465 xe_drm_client_add_bo(vm->xef->client, lrc->bo);
1466 }
1467
1468 if (xe_device_has_msix(xe)) {
1469 xe_lrc_write_ctx_reg(lrc, CTX_INT_STATUS_REPORT_PTR,
1470 xe_memirq_status_ptr(&tile->memirq, hwe));
1471 xe_lrc_write_ctx_reg(lrc, CTX_INT_SRC_REPORT_PTR,
1472 xe_memirq_source_ptr(&tile->memirq, hwe));
1473 xe_lrc_write_ctx_reg(lrc, CTX_CS_INT_VEC_DATA, msix_vec << 16 | msix_vec);
1474 }
1475
1476 if (xe_gt_has_indirect_ring_state(gt)) {
1477 xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
1478 __xe_lrc_indirect_ring_ggtt_addr(lrc));
1479
1480 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
1481 __xe_lrc_ring_ggtt_addr(lrc));
1482 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START_UDW, 0);
1483 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, 0);
1484 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, lrc->ring.tail);
1485 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_CTL,
1486 RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
1487 } else {
1488 xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
1489 xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, 0);
1490 xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);
1491 xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
1492 RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
1493 }
1494
1495 if (init_flags & XE_LRC_CREATE_RUNALONE)
1496 xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
1497 xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
1498 _MASKED_BIT_ENABLE(CTX_CTRL_RUN_ALONE));
1499
1500 if (init_flags & XE_LRC_CREATE_PXP)
1501 xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
1502 xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
1503 _MASKED_BIT_ENABLE(CTX_CTRL_PXP_ENABLE));
1504
1505 lrc->ctx_timestamp = 0;
1506 xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0);
1507 if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
1508 xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP_UDW, 0);
1509
1510 if (xe->info.has_asid && vm)
1511 xe_lrc_write_ctx_reg(lrc, CTX_ASID, vm->usm.asid);
1512
1513 lrc->desc = LRC_VALID;
1514 lrc->desc |= FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT);
1515 /* TODO: Priority */
1516
1517 /* While this appears to have something about privileged batches or
1518 * some such, it really just means PPGTT mode.
1519 */
1520 if (vm)
1521 lrc->desc |= LRC_PRIVILEGE;
1522
1523 if (GRAPHICS_VERx100(xe) < 1250) {
1524 lrc->desc |= FIELD_PREP(LRC_ENGINE_INSTANCE, hwe->instance);
1525 lrc->desc |= FIELD_PREP(LRC_ENGINE_CLASS, hwe->class);
1526 }
1527
1528 arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1529 xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));
1530
1531 map = __xe_lrc_seqno_map(lrc);
1532 xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
1533
1534 map = __xe_lrc_start_seqno_map(lrc);
1535 xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
1536
1537 err = setup_wa_bb(lrc, hwe);
1538 if (err)
1539 goto err_lrc_finish;
1540
1541 err = setup_indirect_ctx(lrc, hwe);
1542 if (err)
1543 goto err_lrc_finish;
1544
1545 return 0;
1546
1547 err_lrc_finish:
1548 xe_lrc_finish(lrc);
1549 return err;
1550 }
1551
1552 /**
1553 * xe_lrc_create - Create a LRC
1554 * @hwe: Hardware Engine
1555 * @vm: The VM (address space)
1556 * @ring_size: LRC ring size
1557 * @msix_vec: MSI-X interrupt vector (for platforms that support it)
1558 * @flags: LRC initialization flags
1559 *
1560 * Allocate and initialize the Logical Ring Context (LRC).
1561 *
1562 * Return pointer to created LRC upon success and an error pointer
1563 * upon failure.
1564 */
1565 struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
1566 u32 ring_size, u16 msix_vec, u32 flags)
1567 {
1568 struct xe_lrc *lrc;
1569 int err;
1570
1571 lrc = kzalloc(sizeof(*lrc), GFP_KERNEL);
1572 if (!lrc)
1573 return ERR_PTR(-ENOMEM);
1574
1575 err = xe_lrc_init(lrc, hwe, vm, ring_size, msix_vec, flags);
1576 if (err) {
1577 kfree(lrc);
1578 return ERR_PTR(err);
1579 }
1580
1581 return lrc;
1582 }
1583
1584 /**
1585 * xe_lrc_destroy - Destroy the LRC
1586 * @ref: reference to LRC
1587 *
1588 * Called when ref == 0, release resources held by the Logical Ring Context
1589 * (LRC) and free the LRC memory.
1590 */
1591 void xe_lrc_destroy(struct kref *ref)
1592 {
1593 struct xe_lrc *lrc = container_of(ref, struct xe_lrc, refcount);
1594
1595 xe_lrc_finish(lrc);
1596 kfree(lrc);
1597 }
1598
1599 /**
1600 * xe_lrc_update_hwctx_regs_with_address - Re-compute GGTT references within given LRC.
1601 * @lrc: the &xe_lrc struct instance
1602 */
1603 void xe_lrc_update_hwctx_regs_with_address(struct xe_lrc *lrc)
1604 {
1605 if (xe_lrc_has_indirect_ring_state(lrc)) {
1606 xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
1607 __xe_lrc_indirect_ring_ggtt_addr(lrc));
1608
1609 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
1610 __xe_lrc_ring_ggtt_addr(lrc));
1611 } else {
1612 xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
1613 }
1614 }
1615
1616 void xe_lrc_set_ring_tail(struct xe_lrc *lrc, u32 tail)
1617 {
1618 if (xe_lrc_has_indirect_ring_state(lrc))
1619 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, tail);
1620 else
1621 xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, tail);
1622 }
1623
1624 u32 xe_lrc_ring_tail(struct xe_lrc *lrc)
1625 {
1626 if (xe_lrc_has_indirect_ring_state(lrc))
1627 return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL) & TAIL_ADDR;
1628 else
1629 return xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL) & TAIL_ADDR;
1630 }
1631
1632 static u32 xe_lrc_ring_start(struct xe_lrc *lrc)
1633 {
1634 if (xe_lrc_has_indirect_ring_state(lrc))
1635 return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START);
1636 else
1637 return xe_lrc_read_ctx_reg(lrc, CTX_RING_START);
1638 }
1639
1640 void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
1641 {
1642 if (xe_lrc_has_indirect_ring_state(lrc))
1643 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, head);
1644 else
1645 xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
1646 }
1647
1648 u32 xe_lrc_ring_head(struct xe_lrc *lrc)
1649 {
1650 if (xe_lrc_has_indirect_ring_state(lrc))
1651 return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD) & HEAD_ADDR;
1652 else
1653 return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
1654 }
1655
1656 u32 xe_lrc_ring_space(struct xe_lrc *lrc)
1657 {
1658 const u32 head = xe_lrc_ring_head(lrc);
1659 const u32 tail = lrc->ring.tail;
1660 const u32 size = lrc->ring.size;
1661
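/* The mask-based wrap-around below assumes ring.size is a power of two. */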
1662 return ((head - tail - 1) & (size - 1)) + 1;
1663 }
1664
1665 static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
1666 const void *data, size_t size)
1667 {
1668 struct xe_device *xe = lrc_to_xe(lrc);
1669
1670 iosys_map_incr(&ring, lrc->ring.tail);
1671 xe_map_memcpy_to(xe, &ring, 0, data, size);
1672 lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
1673 }
1674
1675 void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
1676 {
1677 struct xe_device *xe = lrc_to_xe(lrc);
1678 struct iosys_map ring;
1679 u32 rhs;
1680 size_t aligned_size;
1681
1682 xe_assert(xe, IS_ALIGNED(size, 4));
1683 aligned_size = ALIGN(size, 8);
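/*
 * size is dword-aligned (asserted above), so rounding up to 8 means at
 * most one extra MI_NOOP is appended below to keep the tail qword-aligned.
 */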
1684
1685 ring = __xe_lrc_ring_map(lrc);
1686
1687 xe_assert(xe, lrc->ring.tail < lrc->ring.size);
1688 rhs = lrc->ring.size - lrc->ring.tail;
1689 if (size > rhs) {
1690 __xe_lrc_write_ring(lrc, ring, data, rhs);
1691 __xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
1692 } else {
1693 __xe_lrc_write_ring(lrc, ring, data, size);
1694 }
1695
1696 if (aligned_size > size) {
1697 u32 noop = MI_NOOP;
1698
1699 __xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
1700 }
1701 }
1702
1703 u64 xe_lrc_descriptor(struct xe_lrc *lrc)
1704 {
1705 return lrc->desc | xe_lrc_ggtt_addr(lrc);
1706 }
1707
1708 u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
1709 {
1710 return __xe_lrc_seqno_ggtt_addr(lrc);
1711 }
1712
1713 /**
1714 * xe_lrc_alloc_seqno_fence() - Allocate an lrc seqno fence.
1715 *
1716 * Allocate but don't initialize an lrc seqno fence.
1717 *
1718 * Return: Pointer to the allocated fence or
1719 * negative error pointer on error.
1720 */
1721 struct dma_fence *xe_lrc_alloc_seqno_fence(void)
1722 {
1723 return xe_hw_fence_alloc();
1724 }
1725
1726 /**
1727 * xe_lrc_free_seqno_fence() - Free an lrc seqno fence.
1728 * @fence: Pointer to the fence to free.
1729 *
1730 * Frees an lrc seqno fence that hasn't yet been
1731 * initialized.
1732 */
1733 void xe_lrc_free_seqno_fence(struct dma_fence *fence)
1734 {
1735 xe_hw_fence_free(fence);
1736 }
1737
1738 /**
1739 * xe_lrc_init_seqno_fence() - Initialize an lrc seqno fence.
1740 * @lrc: Pointer to the lrc.
1741 * @fence: Pointer to the fence to initialize.
1742 *
1743 * Initializes a pre-allocated lrc seqno fence.
1744 * After initialization, the fence is subject to normal
1745 * dma-fence refcounting.
1746 */
1747 void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence)
1748 {
1749 xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc));
1750 }
1751
1752 s32 xe_lrc_seqno(struct xe_lrc *lrc)
1753 {
1754 struct iosys_map map = __xe_lrc_seqno_map(lrc);
1755
1756 return xe_map_read32(lrc_to_xe(lrc), &map);
1757 }
1758
1759 s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
1760 {
1761 struct iosys_map map = __xe_lrc_start_seqno_map(lrc);
1762
1763 return xe_map_read32(lrc_to_xe(lrc), &map);
1764 }
1765
1766 u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
1767 {
1768 return __xe_lrc_start_seqno_ggtt_addr(lrc);
1769 }
1770
1771 u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
1772 {
1773 return __xe_lrc_parallel_ggtt_addr(lrc);
1774 }
1775
1776 struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
1777 {
1778 return __xe_lrc_parallel_map(lrc);
1779 }
1780
1781 /**
1782 * xe_lrc_engine_id() - Read engine id value
1783 * @lrc: Pointer to the lrc.
1784 *
1785 * Returns: context id value
1786 */
1787 static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
1788 {
1789 struct xe_device *xe = lrc_to_xe(lrc);
1790 struct iosys_map map;
1791
1792 map = __xe_lrc_engine_id_map(lrc);
1793 return xe_map_read32(xe, &map);
1794 }
1795
1796 static int instr_dw(u32 cmd_header)
1797 {
1798 /* GFXPIPE "SINGLE_DW" opcodes are a single dword */
1799 if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
1800 GFXPIPE_SINGLE_DW_CMD(0, 0))
1801 return 1;
1802
1803 /* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
1804 if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
1805 return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;
1806
1807 /* Most instructions have the # of dwords (minus 2) in 7:0 */
1808 return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
1809 }
1810
1811 static int dump_mi_command(struct drm_printer *p,
1812 struct xe_gt *gt,
1813 u32 *dw,
1814 int remaining_dw)
1815 {
1816 u32 inst_header = *dw;
1817 u32 numdw = instr_dw(inst_header);
1818 u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
1819 int num_noop;
1820
1821 /* First check for commands that don't have/use a '# DW' field */
1822 switch (inst_header & MI_OPCODE) {
1823 case MI_NOOP:
1824 num_noop = 1;
1825 while (num_noop < remaining_dw &&
1826 (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
1827 num_noop++;
1828 drm_printf(p, "[%#010x] MI_NOOP (%d dwords)\n", inst_header, num_noop);
1829 return num_noop;
1830
1831 case MI_TOPOLOGY_FILTER:
1832 drm_printf(p, "[%#010x] MI_TOPOLOGY_FILTER\n", inst_header);
1833 return 1;
1834
1835 case MI_BATCH_BUFFER_END:
1836 drm_printf(p, "[%#010x] MI_BATCH_BUFFER_END\n", inst_header);
1837 /* Return 'remaining_dw' to consume the rest of the LRC */
1838 return remaining_dw;
1839 }
1840
1841 /*
1842 * Any remaining commands include a # of dwords. We should make sure
1843 * it doesn't exceed the remaining size of the LRC.
1844 */
1845 if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
1846 numdw = remaining_dw;
1847
1848 switch (inst_header & MI_OPCODE) {
1849 case MI_LOAD_REGISTER_IMM:
1850 drm_printf(p, "[%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
1851 inst_header, (numdw - 1) / 2);
1852 for (int i = 1; i < numdw; i += 2)
1853 drm_printf(p, " - %#6x = %#010x\n", dw[i], dw[i + 1]);
1854 return numdw;
1855
1856 case MI_LOAD_REGISTER_MEM & MI_OPCODE:
1857 drm_printf(p, "[%#010x] MI_LOAD_REGISTER_MEM: %s%s\n",
1858 inst_header,
1859 dw[0] & MI_LRI_LRM_CS_MMIO ? "CS_MMIO " : "",
1860 dw[0] & MI_LRM_USE_GGTT ? "USE_GGTT " : "");
1861 if (numdw == 4)
1862 drm_printf(p, " - %#6x = %#010llx\n",
1863 dw[1], ((u64)(dw[3]) << 32 | (u64)(dw[2])));
1864 else
1865 drm_printf(p, " - %*ph (%s)\n",
1866 (int)sizeof(u32) * (numdw - 1), dw + 1,
1867 numdw < 4 ? "truncated" : "malformed");
1868 return numdw;
1869
1870 case MI_FORCE_WAKEUP:
1871 drm_printf(p, "[%#010x] MI_FORCE_WAKEUP\n", inst_header);
1872 return numdw;
1873
1874 default:
1875 drm_printf(p, "[%#010x] unknown MI opcode %#x, likely %d dwords\n",
1876 inst_header, opcode, numdw);
1877 return numdw;
1878 }
1879 }
1880
1881 static int dump_gfxpipe_command(struct drm_printer *p,
1882 struct xe_gt *gt,
1883 u32 *dw,
1884 int remaining_dw)
1885 {
1886 u32 numdw = instr_dw(*dw);
1887 u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
1888 u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
1889 u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);
1890
1891 /*
1892 * Make sure we haven't mis-parsed a number of dwords that exceeds the
1893 * remaining size of the LRC.
1894 */
1895 if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
1896 numdw = remaining_dw;
1897
1898 switch (*dw & GFXPIPE_MATCH_MASK) {
1899 #define MATCH(cmd) \
1900 case cmd: \
1901 drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
1902 return numdw
1903 #define MATCH3D(cmd) \
1904 case CMD_##cmd: \
1905 drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
1906 return numdw
1907
1908 MATCH(STATE_BASE_ADDRESS);
1909 MATCH(STATE_SIP);
1910 MATCH(GPGPU_CSR_BASE_ADDRESS);
1911 MATCH(STATE_COMPUTE_MODE);
1912 MATCH3D(3DSTATE_BTD);
1913 MATCH(STATE_SYSTEM_MEM_FENCE_ADDRESS);
1914 MATCH(STATE_CONTEXT_DATA_BASE_ADDRESS);
1915
1916 MATCH3D(3DSTATE_VF_STATISTICS);
1917
1918 MATCH(PIPELINE_SELECT);
1919
1920 MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST);
1921 MATCH3D(3DSTATE_CLEAR_PARAMS);
1922 MATCH3D(3DSTATE_DEPTH_BUFFER);
1923 MATCH3D(3DSTATE_STENCIL_BUFFER);
1924 MATCH3D(3DSTATE_HIER_DEPTH_BUFFER);
1925 MATCH3D(3DSTATE_VERTEX_BUFFERS);
1926 MATCH3D(3DSTATE_VERTEX_ELEMENTS);
1927 MATCH3D(3DSTATE_INDEX_BUFFER);
1928 MATCH3D(3DSTATE_VF);
1929 MATCH3D(3DSTATE_MULTISAMPLE);
1930 MATCH3D(3DSTATE_CC_STATE_POINTERS);
1931 MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS);
1932 MATCH3D(3DSTATE_VS);
1933 MATCH3D(3DSTATE_GS);
1934 MATCH3D(3DSTATE_CLIP);
1935 MATCH3D(3DSTATE_SF);
1936 MATCH3D(3DSTATE_WM);
1937 MATCH3D(3DSTATE_CONSTANT_VS);
1938 MATCH3D(3DSTATE_CONSTANT_GS);
1939 MATCH3D(3DSTATE_CONSTANT_PS);
1940 MATCH3D(3DSTATE_SAMPLE_MASK);
1941 MATCH3D(3DSTATE_CONSTANT_HS);
1942 MATCH3D(3DSTATE_CONSTANT_DS);
1943 MATCH3D(3DSTATE_HS);
1944 MATCH3D(3DSTATE_TE);
1945 MATCH3D(3DSTATE_DS);
1946 MATCH3D(3DSTATE_STREAMOUT);
1947 MATCH3D(3DSTATE_SBE);
1948 MATCH3D(3DSTATE_PS);
1949 MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
1950 MATCH3D(3DSTATE_CPS_POINTERS);
1951 MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC);
1952 MATCH3D(3DSTATE_BLEND_STATE_POINTERS);
1953 MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS);
1954 MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS);
1955 MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS);
1956 MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS);
1957 MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS);
1958 MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS);
1959 MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS);
1960 MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS);
1961 MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS);
1962 MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS);
1963 MATCH3D(3DSTATE_VF_INSTANCING);
1964 MATCH3D(3DSTATE_VF_SGVS);
1965 MATCH3D(3DSTATE_VF_TOPOLOGY);
1966 MATCH3D(3DSTATE_WM_CHROMAKEY);
1967 MATCH3D(3DSTATE_PS_BLEND);
1968 MATCH3D(3DSTATE_WM_DEPTH_STENCIL);
1969 MATCH3D(3DSTATE_PS_EXTRA);
1970 MATCH3D(3DSTATE_RASTER);
1971 MATCH3D(3DSTATE_SBE_SWIZ);
1972 MATCH3D(3DSTATE_WM_HZ_OP);
1973 MATCH3D(3DSTATE_VF_COMPONENT_PACKING);
1974 MATCH3D(3DSTATE_VF_SGVS_2);
1975 MATCH3D(3DSTATE_VFG);
1976 MATCH3D(3DSTATE_URB_ALLOC_VS);
1977 MATCH3D(3DSTATE_URB_ALLOC_HS);
1978 MATCH3D(3DSTATE_URB_ALLOC_DS);
1979 MATCH3D(3DSTATE_URB_ALLOC_GS);
1980 MATCH3D(3DSTATE_SO_BUFFER_INDEX_0);
1981 MATCH3D(3DSTATE_SO_BUFFER_INDEX_1);
1982 MATCH3D(3DSTATE_SO_BUFFER_INDEX_2);
1983 MATCH3D(3DSTATE_SO_BUFFER_INDEX_3);
1984 MATCH3D(3DSTATE_PRIMITIVE_REPLICATION);
1985 MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO);
1986 MATCH3D(3DSTATE_AMFS);
1987 MATCH3D(3DSTATE_DEPTH_BOUNDS);
1988 MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS);
1989 MATCH3D(3DSTATE_CONSTANT_TS_POINTER);
1990 MATCH3D(3DSTATE_MESH_CONTROL);
1991 MATCH3D(3DSTATE_MESH_DISTRIB);
1992 MATCH3D(3DSTATE_TASK_REDISTRIB);
1993 MATCH3D(3DSTATE_MESH_SHADER);
1994 MATCH3D(3DSTATE_MESH_SHADER_DATA);
1995 MATCH3D(3DSTATE_TASK_CONTROL);
1996 MATCH3D(3DSTATE_TASK_SHADER);
1997 MATCH3D(3DSTATE_TASK_SHADER_DATA);
1998 MATCH3D(3DSTATE_URB_ALLOC_MESH);
1999 MATCH3D(3DSTATE_URB_ALLOC_TASK);
2000 MATCH3D(3DSTATE_CLIP_MESH);
2001 MATCH3D(3DSTATE_SBE_MESH);
2002 MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER);
2003 MATCH3D(3DSTATE_COARSE_PIXEL);
2004
2005 MATCH3D(3DSTATE_DRAWING_RECTANGLE);
2006 MATCH3D(3DSTATE_CHROMA_KEY);
2007 MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET);
2008 MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN);
2009 MATCH3D(3DSTATE_LINE_STIPPLE);
2010 MATCH3D(3DSTATE_AA_LINE_PARAMETERS);
2011 MATCH3D(3DSTATE_MONOFILTER_SIZE);
2012 MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS);
2013 MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS);
2014 MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS);
2015 MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS);
2016 MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS);
2017 MATCH3D(3DSTATE_SO_DECL_LIST);
2018 MATCH3D(3DSTATE_SO_BUFFER);
2019 MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC);
2020 MATCH3D(3DSTATE_SAMPLE_PATTERN);
2021 MATCH3D(3DSTATE_3D_MODE);
2022 MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE);
2023 MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS);
2024 MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO);
2025
2026 default:
2027 drm_printf(p, "[%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n",
2028 *dw, pipeline, opcode, subopcode, numdw);
2029 return numdw;
2030 }
2031 }
2032
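/*
 * Decode and print a single GFX_STATE instruction, reusing the MATCH() case
 * helper defined in dump_gfxpipe_command().  Returns the number of dwords
 * consumed.
 */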
static int dump_gfx_state_command(struct drm_printer *p,
				  struct xe_gt *gt,
				  u32 *dw,
				  int remaining_dw)
{
	u32 numdw = instr_dw(*dw);
	u32 opcode = REG_FIELD_GET(GFX_STATE_OPCODE, *dw);

	/*
	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
	 * remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (*dw & (XE_INSTR_GFX_STATE | GFX_STATE_OPCODE)) {
	MATCH(STATE_WRITE_INLINE);

	default:
		drm_printf(p, "[%#010x] unknown GFX_STATE command (opcode=%#x), likely %d dwords\n",
			   *dw, opcode, numdw);
		return numdw;
	}
}

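/**
 * xe_lrc_dump_default() - Dump the default LRC image for an engine class
 * @p: Printer to emit the decoded dump to
 * @gt: GT owning the default LRC
 * @hwe_class: Engine class whose default LRC should be dumped
 *
 * Walks the saved default LRC image instruction by instruction (skipping the
 * PPHWSP at the start) and prints a decoded listing of each command.
 */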
void xe_lrc_dump_default(struct drm_printer *p,
			 struct xe_gt *gt,
			 enum xe_engine_class hwe_class)
{
	u32 *dw;
	int remaining_dw, num_dw;

	if (!gt->default_lrc[hwe_class]) {
		drm_printf(p, "No default LRC for class %d\n", hwe_class);
		return;
	}

	/*
	 * Skip the beginning of the LRC since it contains the per-process
	 * hardware status page.
	 */
	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
	remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;

	while (remaining_dw > 0) {
		if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
			num_dw = dump_mi_command(p, gt, dw, remaining_dw);
		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
			num_dw = dump_gfxpipe_command(p, gt, dw, remaining_dw);
		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFX_STATE) {
			num_dw = dump_gfx_state_command(p, gt, dw, remaining_dw);
		} else {
			num_dw = min(instr_dw(*dw), remaining_dw);
			drm_printf(p, "[%#10x] Unknown instruction of type %#x, likely %d dwords\n",
				   *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw),
				   num_dw);
		}

		dw += num_dw;
		remaining_dw -= num_dw;
	}
}

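/*
 * An instruction header and the number of dwords that instruction occupies;
 * used below to describe the non-register (SVG) state emitted into the LRC.
 */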
struct instr_state {
	u32 instr;
	u16 num_dw;
};

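/*
 * Table of SVG state instructions (and their dword counts) emitted while
 * setting up the default LRC for render engines; see the Wa_14019789679
 * comment in xe_lrc_emit_hwe_state_instructions() below.
 */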
static const struct instr_state xe_hpg_svg_state[] = {
	{ .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 },
	{ .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_VS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_CLIP, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SF, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_RASTER, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_HS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 },
	{ .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TE, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_GS, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
};

u32 *xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, u32 *cs)
{
	struct xe_gt *gt = q->hwe->gt;
	struct xe_device *xe = gt_to_xe(gt);
	const struct instr_state *state_table = NULL;
	int state_table_size = 0;

	/*
	 * Wa_14019789679
	 *
	 * If the driver doesn't explicitly emit the SVG instructions while
	 * setting up the default LRC, the context switch will write 0's
	 * (noops) into the LRC memory rather than the expected instruction
	 * headers. Application contexts start out as a copy of the default
	 * LRC, and if they also do not emit specific settings for some SVG
	 * state, then on context restore they'll unintentionally inherit
	 * whatever state setting the previous context had programmed into the
	 * hardware (i.e., the lack of a 3DSTATE_* instruction in the LRC will
	 * prevent the hardware from resetting that state back to any specific
	 * value).
	 *
	 * The official workaround only requires emitting 3DSTATE_MESH_CONTROL
	 * since that's a specific state setting that can easily cause GPU
	 * hangs if unintentionally inherited. However to be safe we'll
	 * continue to emit all of the SVG state since it's best not to leak
	 * any of the state between contexts, even if that leakage is harmless.
	 */
	if (XE_GT_WA(gt, 14019789679) && q->hwe->class == XE_ENGINE_CLASS_RENDER) {
		state_table = xe_hpg_svg_state;
		state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
	}

	if (!state_table) {
		xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
			  GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
		return cs;
	}

	for (int i = 0; i < state_table_size; i++) {
		u32 instr = state_table[i].instr;
		u16 num_dw = state_table[i].num_dw;
		bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);

		xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
		xe_gt_assert(gt, num_dw != 0);
		xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));

		/*
		 * Xe2's SVG context is the same as the one on DG2 / MTL
		 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
		 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
		 * Just make the replacement here rather than defining a
		 * whole separate table for the single trivial change.
		 */
		if (GRAPHICS_VER(xe) >= 20 &&
		    instr == CMD_3DSTATE_DRAWING_RECTANGLE)
			instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;

		/*
		 * Emit just the instruction header (including its dword
		 * count); the payload dwords that follow are left untouched.
		 */
		*cs = instr;
		if (!is_single_dw)
			*cs |= (num_dw - 2);

		cs += num_dw;
	}

	return cs;
}

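/**
 * xe_lrc_snapshot_capture() - Capture a lightweight snapshot of an LRC
 * @lrc: Pointer to the lrc.
 *
 * Records the LRC's GGTT addresses, ring head/tail, seqno and timestamp state
 * and takes a reference on the LRC BO without copying its contents. The
 * allocation uses GFP_NOWAIT, so no blocking allocation is performed here;
 * copying the LRC memory itself is deferred to
 * xe_lrc_snapshot_capture_delayed().
 *
 * Return: Pointer to the new snapshot, or NULL on allocation failure.
 */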
struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
{
	struct xe_lrc_snapshot *snapshot = kmalloc(sizeof(*snapshot), GFP_NOWAIT);

	if (!snapshot)
		return NULL;

	snapshot->context_desc = xe_lrc_ggtt_addr(lrc);
	snapshot->ring_addr = __xe_lrc_ring_ggtt_addr(lrc);
	snapshot->indirect_context_desc = xe_lrc_indirect_ring_ggtt_addr(lrc);
	snapshot->head = xe_lrc_ring_head(lrc);
	snapshot->tail.internal = lrc->ring.tail;
	snapshot->tail.memory = xe_lrc_ring_tail(lrc);
	snapshot->start = xe_lrc_ring_start(lrc);
	snapshot->start_seqno = xe_lrc_start_seqno(lrc);
	snapshot->seqno = xe_lrc_seqno(lrc);
	snapshot->lrc_bo = xe_bo_get(lrc->bo);
	snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
	snapshot->lrc_size = lrc->size;
	snapshot->lrc_snapshot = NULL;
	snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc));
	snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc);
	return snapshot;
}

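/**
 * xe_lrc_snapshot_capture_delayed() - Copy the LRC contents into a snapshot
 * @snapshot: Snapshot previously created by xe_lrc_snapshot_capture()
 *
 * Maps the LRC BO and copies its contents (from the PPHWSP onwards) into a
 * kernel buffer, then drops the snapshot's BO reference. If the allocation or
 * mapping fails, the snapshot is left without LRC data.
 */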
void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot)
{
	struct xe_bo *bo;
	struct iosys_map src;

	if (!snapshot)
		return;

	bo = snapshot->lrc_bo;
	snapshot->lrc_bo = NULL;

	snapshot->lrc_snapshot = kvmalloc(snapshot->lrc_size, GFP_KERNEL);
	if (!snapshot->lrc_snapshot)
		goto put_bo;

	xe_bo_lock(bo, false);
	if (!ttm_bo_vmap(&bo->ttm, &src)) {
		xe_map_memcpy_from(xe_bo_device(bo),
				   snapshot->lrc_snapshot, &src, snapshot->lrc_offset,
				   snapshot->lrc_size);
		ttm_bo_vunmap(&bo->ttm, &src);
	} else {
		kvfree(snapshot->lrc_snapshot);
		snapshot->lrc_snapshot = NULL;
	}
	xe_bo_unlock(bo);
put_bo:
	xe_bo_put(bo);
}

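/**
 * xe_lrc_snapshot_print() - Print a previously captured LRC snapshot
 * @snapshot: Snapshot to print
 * @p: Printer to emit the output to
 *
 * Prints the captured descriptor, ring and timestamp state, followed by the
 * PPHWSP and context image contents as ASCII85-encoded data if they were
 * captured by xe_lrc_snapshot_capture_delayed().
 */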
void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer *p)
{
	unsigned long i;

	if (!snapshot)
		return;

	drm_printf(p, "\tHW Context Desc: 0x%08x\n", snapshot->context_desc);
	drm_printf(p, "\tHW Ring address: 0x%08x\n",
		   snapshot->ring_addr);
	drm_printf(p, "\tHW Indirect Ring State: 0x%08x\n",
		   snapshot->indirect_context_desc);
	drm_printf(p, "\tLRC Head: (memory) %u\n", snapshot->head);
	drm_printf(p, "\tLRC Tail: (internal) %u, (memory) %u\n",
		   snapshot->tail.internal, snapshot->tail.memory);
	drm_printf(p, "\tRing start: (memory) 0x%08x\n", snapshot->start);
	drm_printf(p, "\tStart seqno: (memory) %d\n", snapshot->start_seqno);
	drm_printf(p, "\tSeqno: (memory) %d\n", snapshot->seqno);
	drm_printf(p, "\tTimestamp: 0x%08x\n", snapshot->ctx_timestamp);
	drm_printf(p, "\tJob Timestamp: 0x%08x\n", snapshot->ctx_job_timestamp);

	if (!snapshot->lrc_snapshot)
		return;

	drm_printf(p, "\t[HWSP].length: 0x%x\n", LRC_PPHWSP_SIZE);
	drm_puts(p, "\t[HWSP].data: ");
	for (i = 0; i < LRC_PPHWSP_SIZE; i += sizeof(u32)) {
		u32 *val = snapshot->lrc_snapshot + i;
		char dumped[ASCII85_BUFSZ];

		drm_puts(p, ascii85_encode(*val, dumped));
	}

	drm_printf(p, "\n\t[HWCTX].length: 0x%lx\n", snapshot->lrc_size - LRC_PPHWSP_SIZE);
	drm_puts(p, "\t[HWCTX].data: ");
	for (; i < snapshot->lrc_size; i += sizeof(u32)) {
		u32 *val = snapshot->lrc_snapshot + i;
		char dumped[ASCII85_BUFSZ];

		drm_puts(p, ascii85_encode(*val, dumped));
	}
	drm_puts(p, "\n");
}

void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
{
	if (!snapshot)
		return;

	kvfree(snapshot->lrc_snapshot);
	if (snapshot->lrc_bo)
		xe_bo_put(snapshot->lrc_bo);

	kfree(snapshot);
}

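/*
 * Read the current CTX_TIMESTAMP register for the engine identified by
 * @engine_id.  Returns 0 on success, -1 if no matching (non-reserved) hw
 * engine can be found.
 */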
static int get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id, u64 *reg_ctx_ts)
{
	u16 class = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id);
	u16 instance = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id);
	struct xe_hw_engine *hwe;
	u64 val;

	hwe = xe_gt_hw_engine(lrc->gt, class, instance, false);
	if (xe_gt_WARN_ONCE(lrc->gt, !hwe || xe_hw_engine_is_reserved(hwe),
			    "Unexpected engine class:instance %d:%d for context utilization\n",
			    class, instance))
		return -1;

	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
		val = xe_mmio_read64_2x32(&hwe->gt->mmio,
					  RING_CTX_TIMESTAMP(hwe->mmio_base));
	else
		val = xe_mmio_read32(&hwe->gt->mmio,
				     RING_CTX_TIMESTAMP(hwe->mmio_base));

	*reg_ctx_ts = val;

	return 0;
}

/**
 * xe_lrc_update_timestamp() - Update ctx timestamp
 * @lrc: Pointer to the lrc.
 * @old_ts: Old timestamp value
 *
 * Populate @old_ts with the current saved ctx timestamp, read the new ctx
 * timestamp and update the saved value. With support for active contexts, the
 * calculation may be slightly racy, so a read-again check is used to ensure
 * the context is still active before returning the timestamp.
 *
 * Return: New ctx timestamp value
 */
u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts)
{
	u64 lrc_ts, reg_ts;
	u32 engine_id;

	*old_ts = lrc->ctx_timestamp;

	lrc_ts = xe_lrc_ctx_timestamp(lrc);
	/* CTX_TIMESTAMP mmio read is invalid on VF, so return the LRC value */
	if (IS_SRIOV_VF(lrc_to_xe(lrc))) {
		lrc->ctx_timestamp = lrc_ts;
		goto done;
	}

	if (lrc_ts == CONTEXT_ACTIVE) {
		engine_id = xe_lrc_engine_id(lrc);
		if (!get_ctx_timestamp(lrc, engine_id, &reg_ts))
			lrc->ctx_timestamp = reg_ts;

		/* read lrc again to ensure context is still active */
		lrc_ts = xe_lrc_ctx_timestamp(lrc);
	}

	/*
	 * If context switched out, just use the lrc_ts. Note that this needs to
	 * be a separate if condition.
	 */
	if (lrc_ts != CONTEXT_ACTIVE)
		lrc->ctx_timestamp = lrc_ts;

done:
	trace_xe_lrc_update_timestamp(lrc, *old_ts);

	return lrc->ctx_timestamp;
}

/**
 * xe_lrc_ring_is_idle() - LRC is idle
 * @lrc: Pointer to the lrc.
 *
 * Compare LRC ring head and tail to determine if idle.
 *
 * Return: True if the ring is idle, False otherwise
 */
bool xe_lrc_ring_is_idle(struct xe_lrc *lrc)
{
	return xe_lrc_ring_head(lrc) == xe_lrc_ring_tail(lrc);
}
