1 // SPDX-License-Identifier: MIT
2 /*
3 * Copyright © 2021 Intel Corporation
4 */
5
6 #include "xe_lrc.h"
7
8 #include <generated/xe_wa_oob.h>
9
10 #include <linux/ascii85.h>
11
12 #include "instructions/xe_mi_commands.h"
13 #include "instructions/xe_gfxpipe_commands.h"
14 #include "instructions/xe_gfx_state_commands.h"
15 #include "regs/xe_engine_regs.h"
16 #include "regs/xe_lrc_layout.h"
17 #include "xe_bb.h"
18 #include "xe_bo.h"
19 #include "xe_device.h"
20 #include "xe_drm_client.h"
21 #include "xe_exec_queue_types.h"
22 #include "xe_gt.h"
23 #include "xe_gt_printk.h"
24 #include "xe_hw_fence.h"
25 #include "xe_map.h"
26 #include "xe_memirq.h"
27 #include "xe_mmio.h"
28 #include "xe_sriov.h"
29 #include "xe_trace_lrc.h"
30 #include "xe_vm.h"
31 #include "xe_wa.h"
32
33 #define LRC_VALID BIT_ULL(0)
34 #define LRC_PRIVILEGE BIT_ULL(8)
35 #define LRC_ADDRESSING_MODE GENMASK_ULL(4, 3)
36 #define LRC_LEGACY_64B_CONTEXT 3
37
38 #define LRC_ENGINE_CLASS GENMASK_ULL(63, 61)
39 #define LRC_ENGINE_INSTANCE GENMASK_ULL(53, 48)
40
41 #define LRC_PPHWSP_SIZE SZ_4K
42 #define LRC_INDIRECT_RING_STATE_SIZE SZ_4K
43
44 static struct xe_device *
lrc_to_xe(struct xe_lrc *lrc)
46 {
47 return gt_to_xe(lrc->fence_ctx.gt);
48 }
49
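/**
 * xe_gt_lrc_size() - Size of an LRC image for the given engine class
 * @gt: GT the LRC will be used on
 * @class: engine class
 *
 * The size covers the PPHWSP, the engine context image and, when the
 * platform supports it, the indirect ring state page.
 *
 * Return: LRC size in bytes
 */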
size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
51 {
52 struct xe_device *xe = gt_to_xe(gt);
53 size_t size;
54
55 /* Per-process HW status page (PPHWSP) */
56 size = LRC_PPHWSP_SIZE;
57
58 /* Engine context image */
59 switch (class) {
60 case XE_ENGINE_CLASS_RENDER:
61 if (GRAPHICS_VER(xe) >= 20)
62 size += 3 * SZ_4K;
63 else
64 size += 13 * SZ_4K;
65 break;
66 case XE_ENGINE_CLASS_COMPUTE:
67 if (GRAPHICS_VER(xe) >= 20)
68 size += 2 * SZ_4K;
69 else
70 size += 13 * SZ_4K;
71 break;
72 default:
73 WARN(1, "Unknown engine class: %d", class);
74 fallthrough;
75 case XE_ENGINE_CLASS_COPY:
76 case XE_ENGINE_CLASS_VIDEO_DECODE:
77 case XE_ENGINE_CLASS_VIDEO_ENHANCE:
78 case XE_ENGINE_CLASS_OTHER:
79 size += 1 * SZ_4K;
80 }
81
82 /* Add indirect ring state page */
83 if (xe_gt_has_indirect_ring_state(gt))
84 size += LRC_INDIRECT_RING_STATE_SIZE;
85
86 return size;
87 }
88
89 /*
90 * The per-platform tables are u8-encoded in @data. Decode @data and set the
91 * addresses' offset and commands in @regs. The following encoding is used
92 * for each byte. There are 2 steps: decoding commands and decoding addresses.
93 *
94 * Commands:
95 * [7]: create NOPs - number of NOPs are set in lower bits
96 * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set
97 * MI_LRI_FORCE_POSTED
98 * [5:0]: Number of NOPs or registers to set values to in case of
99 * MI_LOAD_REGISTER_IMM
100 *
101 * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
102 * number of registers. They are set by using the REG/REG16 macros: the former
103 * is used for offsets smaller than 0x200 while the latter is for values bigger
104 * than that. Those macros already set all the bits documented below correctly:
105 *
106 * [7]: When a register offset needs more than 6 bits, use additional bytes, to
107 * follow, for the lower bits
108 * [6:0]: Register offset, without considering the engine base.
109 *
110 * This function only tweaks the commands and register offsets. Values are not
111 * filled out.
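 *
 * As a purely illustrative example, a table starting with
 *   NOP(1), LRI(13, POSTED), REG16(0x244), REG(0x034), ...
 * encodes to the bytes 0x81, 0x4d, 0x81, 0x11, 0x0d, ...: 0x81 skips one
 * dword, 0x4d emits an MI_LOAD_REGISTER_IMM for 13 registers with
 * MI_LRI_FORCE_POSTED, the pair 0x81 0x11 decodes to register offset 0x244
 * (two bytes because the encoded offset does not fit in a single byte), and
 * 0x0d decodes to register offset 0x034.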
112 */
static void set_offsets(u32 *regs,
114 const u8 *data,
115 const struct xe_hw_engine *hwe)
116 #define NOP(x) (BIT(7) | (x))
117 #define LRI(count, flags) ((flags) << 6 | (count) | \
118 BUILD_BUG_ON_ZERO(count >= BIT(6)))
119 #define POSTED BIT(0)
120 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
121 #define REG16(x) \
122 (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
123 (((x) >> 2) & 0x7f)
124 {
125 const u32 base = hwe->mmio_base;
126
127 while (*data) {
128 u8 count, flags;
129
130 if (*data & BIT(7)) { /* skip */
131 count = *data++ & ~BIT(7);
132 regs += count;
133 continue;
134 }
135
136 count = *data & 0x3f;
137 flags = *data >> 6;
138 data++;
139
140 *regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
141 if (flags & POSTED)
142 *regs |= MI_LRI_FORCE_POSTED;
143 *regs |= MI_LRI_LRM_CS_MMIO;
144 regs++;
145
146 xe_gt_assert(hwe->gt, count);
147 do {
148 u32 offset = 0;
149 u8 v;
150
151 do {
152 v = *data++;
153 offset <<= 7;
154 offset |= v & ~BIT(7);
155 } while (v & BIT(7));
156
157 regs[0] = base + (offset << 2);
158 regs += 2;
159 } while (--count);
160 }
161
162 *regs = MI_BATCH_BUFFER_END | BIT(0);
163 }
164
165 static const u8 gen12_xcs_offsets[] = {
166 NOP(1),
167 LRI(13, POSTED),
168 REG16(0x244),
169 REG(0x034),
170 REG(0x030),
171 REG(0x038),
172 REG(0x03c),
173 REG(0x168),
174 REG(0x140),
175 REG(0x110),
176 REG(0x1c0),
177 REG(0x1c4),
178 REG(0x1c8),
179 REG(0x180),
180 REG16(0x2b4),
181
182 NOP(5),
183 LRI(9, POSTED),
184 REG16(0x3a8),
185 REG16(0x28c),
186 REG16(0x288),
187 REG16(0x284),
188 REG16(0x280),
189 REG16(0x27c),
190 REG16(0x278),
191 REG16(0x274),
192 REG16(0x270),
193
194 0
195 };
196
197 static const u8 dg2_xcs_offsets[] = {
198 NOP(1),
199 LRI(15, POSTED),
200 REG16(0x244),
201 REG(0x034),
202 REG(0x030),
203 REG(0x038),
204 REG(0x03c),
205 REG(0x168),
206 REG(0x140),
207 REG(0x110),
208 REG(0x1c0),
209 REG(0x1c4),
210 REG(0x1c8),
211 REG(0x180),
212 REG16(0x2b4),
213 REG(0x120),
214 REG(0x124),
215
216 NOP(1),
217 LRI(9, POSTED),
218 REG16(0x3a8),
219 REG16(0x28c),
220 REG16(0x288),
221 REG16(0x284),
222 REG16(0x280),
223 REG16(0x27c),
224 REG16(0x278),
225 REG16(0x274),
226 REG16(0x270),
227
228 0
229 };
230
231 static const u8 gen12_rcs_offsets[] = {
232 NOP(1),
233 LRI(13, POSTED),
234 REG16(0x244),
235 REG(0x034),
236 REG(0x030),
237 REG(0x038),
238 REG(0x03c),
239 REG(0x168),
240 REG(0x140),
241 REG(0x110),
242 REG(0x1c0),
243 REG(0x1c4),
244 REG(0x1c8),
245 REG(0x180),
246 REG16(0x2b4),
247
248 NOP(5),
249 LRI(9, POSTED),
250 REG16(0x3a8),
251 REG16(0x28c),
252 REG16(0x288),
253 REG16(0x284),
254 REG16(0x280),
255 REG16(0x27c),
256 REG16(0x278),
257 REG16(0x274),
258 REG16(0x270),
259
260 LRI(3, POSTED),
261 REG(0x1b0),
262 REG16(0x5a8),
263 REG16(0x5ac),
264
265 NOP(6),
266 LRI(1, 0),
267 REG(0x0c8),
268 NOP(3 + 9 + 1),
269
270 LRI(51, POSTED),
271 REG16(0x588),
272 REG16(0x588),
273 REG16(0x588),
274 REG16(0x588),
275 REG16(0x588),
276 REG16(0x588),
277 REG(0x028),
278 REG(0x09c),
279 REG(0x0c0),
280 REG(0x178),
281 REG(0x17c),
282 REG16(0x358),
283 REG(0x170),
284 REG(0x150),
285 REG(0x154),
286 REG(0x158),
287 REG16(0x41c),
288 REG16(0x600),
289 REG16(0x604),
290 REG16(0x608),
291 REG16(0x60c),
292 REG16(0x610),
293 REG16(0x614),
294 REG16(0x618),
295 REG16(0x61c),
296 REG16(0x620),
297 REG16(0x624),
298 REG16(0x628),
299 REG16(0x62c),
300 REG16(0x630),
301 REG16(0x634),
302 REG16(0x638),
303 REG16(0x63c),
304 REG16(0x640),
305 REG16(0x644),
306 REG16(0x648),
307 REG16(0x64c),
308 REG16(0x650),
309 REG16(0x654),
310 REG16(0x658),
311 REG16(0x65c),
312 REG16(0x660),
313 REG16(0x664),
314 REG16(0x668),
315 REG16(0x66c),
316 REG16(0x670),
317 REG16(0x674),
318 REG16(0x678),
319 REG16(0x67c),
320 REG(0x068),
321 REG(0x084),
322 NOP(1),
323
324 0
325 };
326
327 static const u8 xehp_rcs_offsets[] = {
328 NOP(1),
329 LRI(13, POSTED),
330 REG16(0x244),
331 REG(0x034),
332 REG(0x030),
333 REG(0x038),
334 REG(0x03c),
335 REG(0x168),
336 REG(0x140),
337 REG(0x110),
338 REG(0x1c0),
339 REG(0x1c4),
340 REG(0x1c8),
341 REG(0x180),
342 REG16(0x2b4),
343
344 NOP(5),
345 LRI(9, POSTED),
346 REG16(0x3a8),
347 REG16(0x28c),
348 REG16(0x288),
349 REG16(0x284),
350 REG16(0x280),
351 REG16(0x27c),
352 REG16(0x278),
353 REG16(0x274),
354 REG16(0x270),
355
356 LRI(3, POSTED),
357 REG(0x1b0),
358 REG16(0x5a8),
359 REG16(0x5ac),
360
361 NOP(6),
362 LRI(1, 0),
363 REG(0x0c8),
364
365 0
366 };
367
368 static const u8 dg2_rcs_offsets[] = {
369 NOP(1),
370 LRI(15, POSTED),
371 REG16(0x244),
372 REG(0x034),
373 REG(0x030),
374 REG(0x038),
375 REG(0x03c),
376 REG(0x168),
377 REG(0x140),
378 REG(0x110),
379 REG(0x1c0),
380 REG(0x1c4),
381 REG(0x1c8),
382 REG(0x180),
383 REG16(0x2b4),
384 REG(0x120),
385 REG(0x124),
386
387 NOP(1),
388 LRI(9, POSTED),
389 REG16(0x3a8),
390 REG16(0x28c),
391 REG16(0x288),
392 REG16(0x284),
393 REG16(0x280),
394 REG16(0x27c),
395 REG16(0x278),
396 REG16(0x274),
397 REG16(0x270),
398
399 LRI(3, POSTED),
400 REG(0x1b0),
401 REG16(0x5a8),
402 REG16(0x5ac),
403
404 NOP(6),
405 LRI(1, 0),
406 REG(0x0c8),
407
408 0
409 };
410
411 static const u8 mtl_rcs_offsets[] = {
412 NOP(1),
413 LRI(15, POSTED),
414 REG16(0x244),
415 REG(0x034),
416 REG(0x030),
417 REG(0x038),
418 REG(0x03c),
419 REG(0x168),
420 REG(0x140),
421 REG(0x110),
422 REG(0x1c0),
423 REG(0x1c4),
424 REG(0x1c8),
425 REG(0x180),
426 REG16(0x2b4),
427 REG(0x120),
428 REG(0x124),
429
430 NOP(1),
431 LRI(9, POSTED),
432 REG16(0x3a8),
433 REG16(0x28c),
434 REG16(0x288),
435 REG16(0x284),
436 REG16(0x280),
437 REG16(0x27c),
438 REG16(0x278),
439 REG16(0x274),
440 REG16(0x270),
441
442 NOP(2),
443 LRI(2, POSTED),
444 REG16(0x5a8),
445 REG16(0x5ac),
446
447 NOP(6),
448 LRI(1, 0),
449 REG(0x0c8),
450
451 0
452 };
453
454 #define XE2_CTX_COMMON \
455 NOP(1), /* [0x00] */ \
456 LRI(15, POSTED), /* [0x01] */ \
457 REG16(0x244), /* [0x02] CTXT_SR_CTL */ \
458 REG(0x034), /* [0x04] RING_BUFFER_HEAD */ \
459 REG(0x030), /* [0x06] RING_BUFFER_TAIL */ \
460 REG(0x038), /* [0x08] RING_BUFFER_START */ \
461 REG(0x03c), /* [0x0a] RING_BUFFER_CONTROL */ \
462 REG(0x168), /* [0x0c] BB_ADDR_UDW */ \
463 REG(0x140), /* [0x0e] BB_ADDR */ \
464 REG(0x110), /* [0x10] BB_STATE */ \
465 REG(0x1c0), /* [0x12] BB_PER_CTX_PTR */ \
466 REG(0x1c4), /* [0x14] RCS_INDIRECT_CTX */ \
467 REG(0x1c8), /* [0x16] RCS_INDIRECT_CTX_OFFSET */ \
468 REG(0x180), /* [0x18] CCID */ \
469 REG16(0x2b4), /* [0x1a] SEMAPHORE_TOKEN */ \
470 REG(0x120), /* [0x1c] PRT_BB_STATE */ \
471 REG(0x124), /* [0x1e] PRT_BB_STATE_UDW */ \
472 \
473 NOP(1), /* [0x20] */ \
474 LRI(9, POSTED), /* [0x21] */ \
475 REG16(0x3a8), /* [0x22] CTX_TIMESTAMP */ \
476 REG16(0x3ac), /* [0x24] CTX_TIMESTAMP_UDW */ \
477 REG(0x108), /* [0x26] INDIRECT_RING_STATE */ \
478 REG16(0x284), /* [0x28] dummy reg */ \
479 REG16(0x280), /* [0x2a] CS_ACC_CTR_THOLD */ \
480 REG16(0x27c), /* [0x2c] CS_CTX_SYS_PASID */ \
481 REG16(0x278), /* [0x2e] CS_CTX_ASID */ \
482 REG16(0x274), /* [0x30] PTBP_UDW */ \
483 REG16(0x270) /* [0x32] PTBP_LDW */
484
485 static const u8 xe2_rcs_offsets[] = {
486 XE2_CTX_COMMON,
487
488 NOP(2), /* [0x34] */
489 LRI(2, POSTED), /* [0x36] */
490 REG16(0x5a8), /* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
491 REG16(0x5ac), /* [0x39] PREEMPTION_STATUS */
492
493 NOP(6), /* [0x41] */
494 LRI(1, 0), /* [0x47] */
495 REG(0x0c8), /* [0x48] R_PWR_CLK_STATE */
496
497 0
498 };
499
500 static const u8 xe2_bcs_offsets[] = {
501 XE2_CTX_COMMON,
502
503 NOP(4 + 8 + 1), /* [0x34] */
504 LRI(2, POSTED), /* [0x41] */
505 REG16(0x200), /* [0x42] BCS_SWCTRL */
506 REG16(0x204), /* [0x44] BLIT_CCTL */
507
508 0
509 };
510
511 static const u8 xe2_xcs_offsets[] = {
512 XE2_CTX_COMMON,
513
514 0
515 };
516
517 static const u8 xe2_indirect_ring_state_offsets[] = {
518 NOP(1), /* [0x00] */
519 LRI(5, POSTED), /* [0x01] */
520 REG(0x034), /* [0x02] RING_BUFFER_HEAD */
521 REG(0x030), /* [0x04] RING_BUFFER_TAIL */
522 REG(0x038), /* [0x06] RING_BUFFER_START */
523 REG(0x048), /* [0x08] RING_BUFFER_START_UDW */
524 REG(0x03c), /* [0x0a] RING_BUFFER_CONTROL */
525
526 NOP(5), /* [0x0c] */
527 LRI(9, POSTED), /* [0x11] */
528 REG(0x168), /* [0x12] BB_ADDR_UDW */
529 REG(0x140), /* [0x14] BB_ADDR */
530 REG(0x110), /* [0x16] BB_STATE */
531 REG16(0x588), /* [0x18] BB_STACK_WRITE_PORT */
532 REG16(0x588), /* [0x20] BB_STACK_WRITE_PORT */
533 REG16(0x588), /* [0x22] BB_STACK_WRITE_PORT */
534 REG16(0x588), /* [0x24] BB_STACK_WRITE_PORT */
535 REG16(0x588), /* [0x26] BB_STACK_WRITE_PORT */
536 REG16(0x588), /* [0x28] BB_STACK_WRITE_PORT */
537
538 NOP(12), /* [0x00] */
539
540 0
541 };
542
543 #undef REG16
544 #undef REG
545 #undef LRI
546 #undef NOP
547
static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
549 {
550 if (class == XE_ENGINE_CLASS_RENDER) {
551 if (GRAPHICS_VER(xe) >= 20)
552 return xe2_rcs_offsets;
553 else if (GRAPHICS_VERx100(xe) >= 1270)
554 return mtl_rcs_offsets;
555 else if (GRAPHICS_VERx100(xe) >= 1255)
556 return dg2_rcs_offsets;
557 else if (GRAPHICS_VERx100(xe) >= 1250)
558 return xehp_rcs_offsets;
559 else
560 return gen12_rcs_offsets;
561 } else if (class == XE_ENGINE_CLASS_COPY) {
562 if (GRAPHICS_VER(xe) >= 20)
563 return xe2_bcs_offsets;
564 else
565 return gen12_xcs_offsets;
566 } else {
567 if (GRAPHICS_VER(xe) >= 20)
568 return xe2_xcs_offsets;
569 else if (GRAPHICS_VERx100(xe) >= 1255)
570 return dg2_xcs_offsets;
571 else
572 return gen12_xcs_offsets;
573 }
574 }
575
static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
577 {
578 regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
579 CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
580
581 if (xe_gt_has_indirect_ring_state(hwe->gt))
582 regs[CTX_CONTEXT_CONTROL] |=
583 _MASKED_BIT_ENABLE(CTX_CTRL_INDIRECT_RING_STATE_ENABLE);
584
585 /* TODO: Timestamp */
586 }
587
static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe)
589 {
struct xe_memirq *memirq = &gt_to_tile(hwe->gt)->memirq;
591 struct xe_device *xe = gt_to_xe(hwe->gt);
592 u8 num_regs;
593
594 if (!xe_device_uses_memirq(xe))
595 return;
596
597 regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM |
598 MI_LRI_LRM_CS_MMIO | MI_LRM_USE_GGTT;
599 regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr;
600 regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq);
601
602 num_regs = xe_device_has_msix(xe) ? 3 : 2;
603 regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(num_regs) |
604 MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED;
605 regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr;
606 regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq, hwe);
607 regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr;
608 regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq, hwe);
609
610 if (xe_device_has_msix(xe)) {
611 regs[CTX_CS_INT_VEC_REG] = CS_INT_VEC(0).addr;
612 /* CTX_CS_INT_VEC_DATA will be set in xe_lrc_init */
613 }
614 }
615
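/*
 * Dword offset of the RING_MI_MODE register/value pair within the LRC
 * register state; reset_stop_ring() uses it to clear the STOP_RING bit
 * via a masked write of regs[offset + 1].
 */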
static int lrc_ring_mi_mode(struct xe_hw_engine *hwe)
617 {
618 struct xe_device *xe = gt_to_xe(hwe->gt);
619
620 if (GRAPHICS_VERx100(xe) >= 1250)
621 return 0x70;
622 else
623 return 0x60;
624 }
625
static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe)
627 {
628 int x;
629
630 x = lrc_ring_mi_mode(hwe);
631 regs[x + 1] &= ~STOP_RING;
632 regs[x + 1] |= STOP_RING << 16;
633 }
634
static inline bool xe_lrc_has_indirect_ring_state(struct xe_lrc *lrc)
636 {
637 return lrc->flags & XE_LRC_FLAG_INDIRECT_RING_STATE;
638 }
639
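/*
 * Sub-allocation layout of the LRC BO, as implied by the offset helpers
 * below: the ring buffer starts at offset 0, followed by the per-process
 * HW status page (PPHWSP) and then the context register state; when
 * supported, the indirect ring state page occupies the last 4K of the
 * object.
 */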
static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
641 {
642 return 0;
643 }
644
u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
646 {
647 return lrc->ring.size;
648 }
649
650 /* Make the magic macros work */
651 #define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset
652 #define __xe_lrc_regs_offset xe_lrc_regs_offset
653
654 #define LRC_SEQNO_PPHWSP_OFFSET 512
655 #define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
656 #define LRC_CTX_JOB_TIMESTAMP_OFFSET (LRC_START_SEQNO_PPHWSP_OFFSET + 8)
657 #define LRC_PARALLEL_PPHWSP_OFFSET 2048
658 #define LRC_ENGINE_ID_PPHWSP_OFFSET 2096
659
u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
661 {
662 return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
663 }
664
static size_t lrc_reg_size(struct xe_device *xe)
666 {
667 if (GRAPHICS_VERx100(xe) >= 1250)
668 return 96 * sizeof(u32);
669 else
670 return 80 * sizeof(u32);
671 }
672
size_t xe_lrc_skip_size(struct xe_device *xe)
674 {
675 return LRC_PPHWSP_SIZE + lrc_reg_size(xe);
676 }
677
static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
679 {
680 /* The seqno is stored in the driver-defined portion of PPHWSP */
681 return xe_lrc_pphwsp_offset(lrc) + LRC_SEQNO_PPHWSP_OFFSET;
682 }
683
static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
685 {
686 /* The start seqno is stored in the driver-defined portion of PPHWSP */
687 return xe_lrc_pphwsp_offset(lrc) + LRC_START_SEQNO_PPHWSP_OFFSET;
688 }
689
static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc)
691 {
692 /* This is stored in the driver-defined portion of PPHWSP */
693 return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET;
694 }
695
static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
697 {
698 /* The parallel is stored in the driver-defined portion of PPHWSP */
699 return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
700 }
701
static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc)
703 {
704 return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET;
705 }
706
static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
708 {
709 return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
710 }
711
static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc)
713 {
714 return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32);
715 }
716
static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
718 {
719 /* Indirect ring state page is at the very end of LRC */
720 return lrc->size - LRC_INDIRECT_RING_STATE_SIZE;
721 }
722
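/*
 * For each LRC sub-region <elem>, generate __xe_lrc_<elem>_map(), which
 * returns an iosys_map pointing at that region within the backing BO, and
 * __xe_lrc_<elem>_ggtt_addr(), which returns the region's GGTT address.
 */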
723 #define DECL_MAP_ADDR_HELPERS(elem) \
724 static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \
725 { \
726 struct iosys_map map = lrc->bo->vmap; \
727 \
728 xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map)); \
729 iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \
730 return map; \
731 } \
732 static inline u32 __maybe_unused __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \
733 { \
734 return xe_bo_ggtt_addr(lrc->bo) + __xe_lrc_##elem##_offset(lrc); \
735 } \
736
737 DECL_MAP_ADDR_HELPERS(ring)
DECL_MAP_ADDR_HELPERS(pphwsp)
739 DECL_MAP_ADDR_HELPERS(seqno)
740 DECL_MAP_ADDR_HELPERS(regs)
741 DECL_MAP_ADDR_HELPERS(start_seqno)
742 DECL_MAP_ADDR_HELPERS(ctx_job_timestamp)
743 DECL_MAP_ADDR_HELPERS(ctx_timestamp)
744 DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw)
745 DECL_MAP_ADDR_HELPERS(parallel)
746 DECL_MAP_ADDR_HELPERS(indirect_ring)
747 DECL_MAP_ADDR_HELPERS(engine_id)
748
749 #undef DECL_MAP_ADDR_HELPERS
750
751 /**
752 * xe_lrc_ctx_timestamp_ggtt_addr() - Get ctx timestamp GGTT address
753 * @lrc: Pointer to the lrc.
754 *
755 * Returns: ctx timestamp GGTT address
756 */
757 u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc)
758 {
759 return __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
760 }
761
762 /**
763 * xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address
764 * @lrc: Pointer to the lrc.
765 *
766 * Returns: ctx timestamp udw GGTT address
767 */
u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc)
769 {
770 return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
771 }
772
773 /**
774 * xe_lrc_ctx_timestamp() - Read ctx timestamp value
775 * @lrc: Pointer to the lrc.
776 *
777 * Returns: ctx timestamp value
778 */
u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
780 {
781 struct xe_device *xe = lrc_to_xe(lrc);
782 struct iosys_map map;
783 u32 ldw, udw = 0;
784
785 map = __xe_lrc_ctx_timestamp_map(lrc);
786 ldw = xe_map_read32(xe, &map);
787
788 if (xe->info.has_64bit_timestamp) {
789 map = __xe_lrc_ctx_timestamp_udw_map(lrc);
790 udw = xe_map_read32(xe, &map);
791 }
792
793 return (u64)udw << 32 | ldw;
794 }
795
796 /**
797 * xe_lrc_ctx_job_timestamp_ggtt_addr() - Get ctx job timestamp GGTT address
798 * @lrc: Pointer to the lrc.
799 *
 * Returns: ctx job timestamp GGTT address
801 */
u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc)
803 {
804 return __xe_lrc_ctx_job_timestamp_ggtt_addr(lrc);
805 }
806
807 /**
808 * xe_lrc_ctx_job_timestamp() - Read ctx job timestamp value
809 * @lrc: Pointer to the lrc.
810 *
 * Returns: ctx job timestamp value
812 */
u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc)
814 {
815 struct xe_device *xe = lrc_to_xe(lrc);
816 struct iosys_map map;
817
818 map = __xe_lrc_ctx_job_timestamp_map(lrc);
819 return xe_map_read32(xe, &map);
820 }
821
u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
823 {
824 return __xe_lrc_pphwsp_ggtt_addr(lrc);
825 }
826
u32 xe_lrc_indirect_ring_ggtt_addr(struct xe_lrc *lrc)
828 {
829 if (!xe_lrc_has_indirect_ring_state(lrc))
830 return 0;
831
832 return __xe_lrc_indirect_ring_ggtt_addr(lrc);
833 }
834
static u32 xe_lrc_read_indirect_ctx_reg(struct xe_lrc *lrc, int reg_nr)
836 {
837 struct xe_device *xe = lrc_to_xe(lrc);
838 struct iosys_map map;
839
840 map = __xe_lrc_indirect_ring_map(lrc);
841 iosys_map_incr(&map, reg_nr * sizeof(u32));
842 return xe_map_read32(xe, &map);
843 }
844
static void xe_lrc_write_indirect_ctx_reg(struct xe_lrc *lrc,
846 int reg_nr, u32 val)
847 {
848 struct xe_device *xe = lrc_to_xe(lrc);
849 struct iosys_map map;
850
851 map = __xe_lrc_indirect_ring_map(lrc);
852 iosys_map_incr(&map, reg_nr * sizeof(u32));
853 xe_map_write32(xe, &map, val);
854 }
855
u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
857 {
858 struct xe_device *xe = lrc_to_xe(lrc);
859 struct iosys_map map;
860
861 map = __xe_lrc_regs_map(lrc);
862 iosys_map_incr(&map, reg_nr * sizeof(u32));
863 return xe_map_read32(xe, &map);
864 }
865
void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
867 {
868 struct xe_device *xe = lrc_to_xe(lrc);
869 struct iosys_map map;
870
871 map = __xe_lrc_regs_map(lrc);
872 iosys_map_incr(&map, reg_nr * sizeof(u32));
873 xe_map_write32(xe, &map, val);
874 }
875
static void *empty_lrc_data(struct xe_hw_engine *hwe)
877 {
878 struct xe_gt *gt = hwe->gt;
879 void *data;
880 u32 *regs;
881
882 data = kzalloc(xe_gt_lrc_size(gt, hwe->class), GFP_KERNEL);
883 if (!data)
884 return NULL;
885
/* 1st page: Per-Process HW Status Page (PPHWSP) */
887 regs = data + LRC_PPHWSP_SIZE;
888 set_offsets(regs, reg_offsets(gt_to_xe(gt), hwe->class), hwe);
889 set_context_control(regs, hwe);
890 set_memory_based_intr(regs, hwe);
891 reset_stop_ring(regs, hwe);
892 if (xe_gt_has_indirect_ring_state(gt)) {
893 regs = data + xe_gt_lrc_size(gt, hwe->class) -
894 LRC_INDIRECT_RING_STATE_SIZE;
895 set_offsets(regs, xe2_indirect_ring_state_offsets, hwe);
896 }
897
898 return data;
899 }
900
static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
902 {
903 u64 desc = xe_vm_pdp4_descriptor(vm, gt_to_tile(lrc->gt));
904
905 xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
906 xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
907 }
908
static void xe_lrc_finish(struct xe_lrc *lrc)
910 {
911 xe_hw_fence_ctx_finish(&lrc->fence_ctx);
912 xe_bo_unpin_map_no_vm(lrc->bo);
913 xe_bo_unpin_map_no_vm(lrc->bb_per_ctx_bo);
914 }
915
916 /*
917 * xe_lrc_setup_utilization() - Setup wa bb to assist in calculating active
918 * context run ticks.
919 * @lrc: Pointer to the lrc.
920 *
921 * Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the
922 * context, but only gets updated when the context switches out. In order to
923 * check how long a context has been active before it switches out, two things
924 * are required:
925 *
926 * (1) Determine if the context is running:
927 * To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in
928 * the LRC. The value chosen is 1 since 0 is the initial value when the LRC is
929 * initialized. During a query, we just check for this value to determine if the
930 * context is active. If the context switched out, it would overwrite this
931 * location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as
932 * the last part of context restore, so reusing this LRC location will not
933 * clobber anything.
934 *
935 * (2) Calculate the time that the context has been active for:
936 * The CTX_TIMESTAMP ticks only when the context is active. If a context is
937 * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization.
938 * While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific
939 * engine instance. Since we do not know which instance the context is running
940 * on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and
 * store it in the PPHWSP.
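 *
 * The WA BB emitted below therefore consists of: an MI_STORE_REGISTER_MEM
 * of ENGINE_ID into the PPHWSP, one or two MI_STORE_DATA_IMM writes seeding
 * CTX_TIMESTAMP (and its UDW on platforms with a 64-bit timestamp) with
 * CONTEXT_ACTIVE, and a terminating MI_BATCH_BUFFER_END.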
942 */
943 #define CONTEXT_ACTIVE 1ULL
static int xe_lrc_setup_utilization(struct xe_lrc *lrc)
945 {
946 u32 *cmd, *buf = NULL;
947
948 if (lrc->bb_per_ctx_bo->vmap.is_iomem) {
949 buf = kmalloc(lrc->bb_per_ctx_bo->size, GFP_KERNEL);
950 if (!buf)
951 return -ENOMEM;
952 cmd = buf;
953 } else {
954 cmd = lrc->bb_per_ctx_bo->vmap.vaddr;
955 }
956
957 *cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET;
958 *cmd++ = ENGINE_ID(0).addr;
959 *cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc);
960 *cmd++ = 0;
961
962 *cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
963 *cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
964 *cmd++ = 0;
965 *cmd++ = lower_32_bits(CONTEXT_ACTIVE);
966
967 if (lrc_to_xe(lrc)->info.has_64bit_timestamp) {
968 *cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
969 *cmd++ = __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
970 *cmd++ = 0;
971 *cmd++ = upper_32_bits(CONTEXT_ACTIVE);
972 }
973
974 *cmd++ = MI_BATCH_BUFFER_END;
975
976 if (buf) {
977 xe_map_memcpy_to(gt_to_xe(lrc->gt), &lrc->bb_per_ctx_bo->vmap, 0,
978 buf, (cmd - buf) * sizeof(*cmd));
979 kfree(buf);
980 }
981
982 xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
983 xe_bo_ggtt_addr(lrc->bb_per_ctx_bo) | 1);
984
985 return 0;
986 }
987
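/*
 * Dword offsets into the LRC register state; the "+ 1" selects the value
 * slot of the corresponding LRI register/value pair.
 */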
988 #define PVC_CTX_ASID (0x2e + 1)
989 #define PVC_CTX_ACC_CTR_THOLD (0x2a + 1)
990
static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
992 struct xe_vm *vm, u32 ring_size, u16 msix_vec,
993 u32 init_flags)
994 {
995 struct xe_gt *gt = hwe->gt;
996 struct xe_tile *tile = gt_to_tile(gt);
997 struct xe_device *xe = gt_to_xe(gt);
998 struct iosys_map map;
999 void *init_data = NULL;
1000 u32 arb_enable;
1001 u32 lrc_size;
1002 u32 bo_flags;
1003 int err;
1004
1005 kref_init(&lrc->refcount);
1006 lrc->gt = gt;
1007 lrc->flags = 0;
1008 lrc_size = ring_size + xe_gt_lrc_size(gt, hwe->class);
1009 if (xe_gt_has_indirect_ring_state(gt))
1010 lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;
1011
1012 bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
1013 XE_BO_FLAG_GGTT_INVALIDATE;
1014 if (vm && vm->xef) /* userspace */
1015 bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE;
1016
1017 /*
1018 * FIXME: Perma-pinning LRC as we don't yet support moving GGTT address
1019 * via VM bind calls.
1020 */
1021 lrc->bo = xe_bo_create_pin_map(xe, tile, NULL, lrc_size,
1022 ttm_bo_type_kernel,
1023 bo_flags);
1024 if (IS_ERR(lrc->bo))
1025 return PTR_ERR(lrc->bo);
1026
1027 lrc->bb_per_ctx_bo = xe_bo_create_pin_map(xe, tile, NULL, SZ_4K,
1028 ttm_bo_type_kernel,
1029 bo_flags);
1030 if (IS_ERR(lrc->bb_per_ctx_bo)) {
1031 err = PTR_ERR(lrc->bb_per_ctx_bo);
1032 goto err_lrc_finish;
1033 }
1034
1035 lrc->size = lrc_size;
1036 lrc->ring.size = ring_size;
1037 lrc->ring.tail = 0;
1038
1039 xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
1040 hwe->fence_irq, hwe->name);
1041
1042 if (!gt->default_lrc[hwe->class]) {
1043 init_data = empty_lrc_data(hwe);
1044 if (!init_data) {
1045 err = -ENOMEM;
1046 goto err_lrc_finish;
1047 }
1048 }
1049
1050 /*
 * Init the Per-Process HW Status Page (PPHWSP) and the LRC / context state
 * to known values
1053 */
1054 map = __xe_lrc_pphwsp_map(lrc);
1055 if (!init_data) {
1056 xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE); /* PPHWSP */
1057 xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
1058 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
1059 xe_gt_lrc_size(gt, hwe->class) - LRC_PPHWSP_SIZE);
1060 } else {
1061 xe_map_memcpy_to(xe, &map, 0, init_data,
1062 xe_gt_lrc_size(gt, hwe->class));
1063 kfree(init_data);
1064 }
1065
1066 if (vm) {
1067 xe_lrc_set_ppgtt(lrc, vm);
1068
1069 if (vm->xef)
1070 xe_drm_client_add_bo(vm->xef->client, lrc->bo);
1071 }
1072
1073 if (xe_device_has_msix(xe)) {
1074 xe_lrc_write_ctx_reg(lrc, CTX_INT_STATUS_REPORT_PTR,
1075 xe_memirq_status_ptr(&tile->memirq, hwe));
1076 xe_lrc_write_ctx_reg(lrc, CTX_INT_SRC_REPORT_PTR,
1077 xe_memirq_source_ptr(&tile->memirq, hwe));
1078 xe_lrc_write_ctx_reg(lrc, CTX_CS_INT_VEC_DATA, msix_vec << 16 | msix_vec);
1079 }
1080
1081 if (xe_gt_has_indirect_ring_state(gt)) {
1082 xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
1083 __xe_lrc_indirect_ring_ggtt_addr(lrc));
1084
1085 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
1086 __xe_lrc_ring_ggtt_addr(lrc));
1087 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START_UDW, 0);
1088 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, 0);
1089 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, lrc->ring.tail);
1090 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_CTL,
1091 RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
1092 } else {
1093 xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
1094 xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, 0);
1095 xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);
1096 xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
1097 RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
1098 }
1099
1100 if (init_flags & XE_LRC_CREATE_RUNALONE)
1101 xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
1102 xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
1103 _MASKED_BIT_ENABLE(CTX_CTRL_RUN_ALONE));
1104
1105 if (init_flags & XE_LRC_CREATE_PXP)
1106 xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
1107 xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
1108 _MASKED_BIT_ENABLE(CTX_CTRL_PXP_ENABLE));
1109
1110 lrc->ctx_timestamp = 0;
1111 xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0);
1112 if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
1113 xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP_UDW, 0);
1114
1115 if (xe->info.has_asid && vm)
1116 xe_lrc_write_ctx_reg(lrc, PVC_CTX_ASID, vm->usm.asid);
1117
1118 lrc->desc = LRC_VALID;
1119 lrc->desc |= FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT);
1120 /* TODO: Priority */
1121
1122 /* While this appears to have something about privileged batches or
1123 * some such, it really just means PPGTT mode.
1124 */
1125 if (vm)
1126 lrc->desc |= LRC_PRIVILEGE;
1127
1128 if (GRAPHICS_VERx100(xe) < 1250) {
1129 lrc->desc |= FIELD_PREP(LRC_ENGINE_INSTANCE, hwe->instance);
1130 lrc->desc |= FIELD_PREP(LRC_ENGINE_CLASS, hwe->class);
1131 }
1132
1133 arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1134 xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));
1135
1136 map = __xe_lrc_seqno_map(lrc);
1137 xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
1138
1139 map = __xe_lrc_start_seqno_map(lrc);
1140 xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
1141
1142 err = xe_lrc_setup_utilization(lrc);
1143 if (err)
1144 goto err_lrc_finish;
1145
1146 return 0;
1147
1148 err_lrc_finish:
1149 xe_lrc_finish(lrc);
1150 return err;
1151 }
1152
1153 /**
1154 * xe_lrc_create - Create a LRC
1155 * @hwe: Hardware Engine
1156 * @vm: The VM (address space)
1157 * @ring_size: LRC ring size
1158 * @msix_vec: MSI-X interrupt vector (for platforms that support it)
1159 * @flags: LRC initialization flags
1160 *
1161 * Allocate and initialize the Logical Ring Context (LRC).
1162 *
1163 * Return pointer to created LRC upon success and an error pointer
1164 * upon failure.
1165 */
struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
1167 u32 ring_size, u16 msix_vec, u32 flags)
1168 {
1169 struct xe_lrc *lrc;
1170 int err;
1171
1172 lrc = kzalloc(sizeof(*lrc), GFP_KERNEL);
1173 if (!lrc)
1174 return ERR_PTR(-ENOMEM);
1175
1176 err = xe_lrc_init(lrc, hwe, vm, ring_size, msix_vec, flags);
1177 if (err) {
1178 kfree(lrc);
1179 return ERR_PTR(err);
1180 }
1181
1182 return lrc;
1183 }
1184
1185 /**
1186 * xe_lrc_destroy - Destroy the LRC
1187 * @ref: reference to LRC
1188 *
1189 * Called when ref == 0, release resources held by the Logical Ring Context
1190 * (LRC) and free the LRC memory.
1191 */
void xe_lrc_destroy(struct kref *ref)
1193 {
1194 struct xe_lrc *lrc = container_of(ref, struct xe_lrc, refcount);
1195
1196 xe_lrc_finish(lrc);
1197 kfree(lrc);
1198 }
1199
void xe_lrc_set_ring_tail(struct xe_lrc *lrc, u32 tail)
1201 {
1202 if (xe_lrc_has_indirect_ring_state(lrc))
1203 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, tail);
1204 else
1205 xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, tail);
1206 }
1207
u32 xe_lrc_ring_tail(struct xe_lrc *lrc)
1209 {
1210 if (xe_lrc_has_indirect_ring_state(lrc))
1211 return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL) & TAIL_ADDR;
1212 else
1213 return xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL) & TAIL_ADDR;
1214 }
1215
static u32 xe_lrc_ring_start(struct xe_lrc *lrc)
1217 {
1218 if (xe_lrc_has_indirect_ring_state(lrc))
1219 return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START);
1220 else
1221 return xe_lrc_read_ctx_reg(lrc, CTX_RING_START);
1222 }
1223
void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
1225 {
1226 if (xe_lrc_has_indirect_ring_state(lrc))
1227 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, head);
1228 else
1229 xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
1230 }
1231
u32 xe_lrc_ring_head(struct xe_lrc *lrc)
1233 {
1234 if (xe_lrc_has_indirect_ring_state(lrc))
1235 return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD) & HEAD_ADDR;
1236 else
1237 return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
1238 }
1239
u32 xe_lrc_ring_space(struct xe_lrc *lrc)
1241 {
1242 const u32 head = xe_lrc_ring_head(lrc);
1243 const u32 tail = lrc->ring.tail;
1244 const u32 size = lrc->ring.size;
1245
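/*
 * The ring size is a power of two, so the mask handles wrap-around; an
 * empty ring (head == tail) reports the full size as free.
 */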
1246 return ((head - tail - 1) & (size - 1)) + 1;
1247 }
1248
static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
1250 const void *data, size_t size)
1251 {
1252 struct xe_device *xe = lrc_to_xe(lrc);
1253
1254 iosys_map_incr(&ring, lrc->ring.tail);
1255 xe_map_memcpy_to(xe, &ring, 0, data, size);
1256 lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
1257 }
1258
void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
1260 {
1261 struct xe_device *xe = lrc_to_xe(lrc);
1262 struct iosys_map ring;
1263 u32 rhs;
1264 size_t aligned_size;
1265
1266 xe_assert(xe, IS_ALIGNED(size, 4));
1267 aligned_size = ALIGN(size, 8);
1268
1269 ring = __xe_lrc_ring_map(lrc);
1270
1271 xe_assert(xe, lrc->ring.tail < lrc->ring.size);
1272 rhs = lrc->ring.size - lrc->ring.tail;
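/*
 * If the payload does not fit before the end of the ring, split the copy
 * at the wrap point and continue from the start of the ring.
 */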
1273 if (size > rhs) {
1274 __xe_lrc_write_ring(lrc, ring, data, rhs);
1275 __xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
1276 } else {
1277 __xe_lrc_write_ring(lrc, ring, data, size);
1278 }
1279
1280 if (aligned_size > size) {
1281 u32 noop = MI_NOOP;
1282
1283 __xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
1284 }
1285 }
1286
u64 xe_lrc_descriptor(struct xe_lrc *lrc)
1288 {
1289 return lrc->desc | xe_lrc_ggtt_addr(lrc);
1290 }
1291
u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
1293 {
1294 return __xe_lrc_seqno_ggtt_addr(lrc);
1295 }
1296
1297 /**
1298 * xe_lrc_alloc_seqno_fence() - Allocate an lrc seqno fence.
1299 *
1300 * Allocate but don't initialize an lrc seqno fence.
1301 *
1302 * Return: Pointer to the allocated fence or
1303 * negative error pointer on error.
1304 */
struct dma_fence *xe_lrc_alloc_seqno_fence(void)
1306 {
1307 return xe_hw_fence_alloc();
1308 }
1309
1310 /**
1311 * xe_lrc_free_seqno_fence() - Free an lrc seqno fence.
1312 * @fence: Pointer to the fence to free.
1313 *
1314 * Frees an lrc seqno fence that hasn't yet been
1315 * initialized.
1316 */
void xe_lrc_free_seqno_fence(struct dma_fence *fence)
1318 {
1319 xe_hw_fence_free(fence);
1320 }
1321
1322 /**
1323 * xe_lrc_init_seqno_fence() - Initialize an lrc seqno fence.
1324 * @lrc: Pointer to the lrc.
1325 * @fence: Pointer to the fence to initialize.
1326 *
1327 * Initializes a pre-allocated lrc seqno fence.
1328 * After initialization, the fence is subject to normal
1329 * dma-fence refcounting.
1330 */
void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence)
1332 {
1333 xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc));
1334 }
1335
s32 xe_lrc_seqno(struct xe_lrc *lrc)
1337 {
1338 struct iosys_map map = __xe_lrc_seqno_map(lrc);
1339
1340 return xe_map_read32(lrc_to_xe(lrc), &map);
1341 }
1342
s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
1344 {
1345 struct iosys_map map = __xe_lrc_start_seqno_map(lrc);
1346
1347 return xe_map_read32(lrc_to_xe(lrc), &map);
1348 }
1349
u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
1351 {
1352 return __xe_lrc_start_seqno_ggtt_addr(lrc);
1353 }
1354
u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
1356 {
1357 return __xe_lrc_parallel_ggtt_addr(lrc);
1358 }
1359
struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
1361 {
1362 return __xe_lrc_parallel_map(lrc);
1363 }
1364
1365 /**
1366 * xe_lrc_engine_id() - Read engine id value
1367 * @lrc: Pointer to the lrc.
1368 *
 * Returns: engine id value
1370 */
static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
1372 {
1373 struct xe_device *xe = lrc_to_xe(lrc);
1374 struct iosys_map map;
1375
1376 map = __xe_lrc_engine_id_map(lrc);
1377 return xe_map_read32(xe, &map);
1378 }
1379
static int instr_dw(u32 cmd_header)
1381 {
1382 /* GFXPIPE "SINGLE_DW" opcodes are a single dword */
1383 if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
1384 GFXPIPE_SINGLE_DW_CMD(0, 0))
1385 return 1;
1386
1387 /* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
1388 if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
1389 return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;
1390
1391 /* Most instructions have the # of dwords (minus 2) in 7:0 */
1392 return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
1393 }
1394
static int dump_mi_command(struct drm_printer *p,
1396 struct xe_gt *gt,
1397 u32 *dw,
1398 int remaining_dw)
1399 {
1400 u32 inst_header = *dw;
1401 u32 numdw = instr_dw(inst_header);
1402 u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
1403 int num_noop;
1404
1405 /* First check for commands that don't have/use a '# DW' field */
1406 switch (inst_header & MI_OPCODE) {
1407 case MI_NOOP:
1408 num_noop = 1;
1409 while (num_noop < remaining_dw &&
1410 (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
1411 num_noop++;
1412 drm_printf(p, "[%#010x] MI_NOOP (%d dwords)\n", inst_header, num_noop);
1413 return num_noop;
1414
1415 case MI_TOPOLOGY_FILTER:
1416 drm_printf(p, "[%#010x] MI_TOPOLOGY_FILTER\n", inst_header);
1417 return 1;
1418
1419 case MI_BATCH_BUFFER_END:
1420 drm_printf(p, "[%#010x] MI_BATCH_BUFFER_END\n", inst_header);
1421 /* Return 'remaining_dw' to consume the rest of the LRC */
1422 return remaining_dw;
1423 }
1424
1425 /*
1426 * Any remaining commands include a # of dwords. We should make sure
1427 * it doesn't exceed the remaining size of the LRC.
1428 */
1429 if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
1430 numdw = remaining_dw;
1431
1432 switch (inst_header & MI_OPCODE) {
1433 case MI_LOAD_REGISTER_IMM:
1434 drm_printf(p, "[%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
1435 inst_header, (numdw - 1) / 2);
1436 for (int i = 1; i < numdw; i += 2)
1437 drm_printf(p, " - %#6x = %#010x\n", dw[i], dw[i + 1]);
1438 return numdw;
1439
1440 case MI_LOAD_REGISTER_MEM & MI_OPCODE:
1441 drm_printf(p, "[%#010x] MI_LOAD_REGISTER_MEM: %s%s\n",
1442 inst_header,
1443 dw[0] & MI_LRI_LRM_CS_MMIO ? "CS_MMIO " : "",
1444 dw[0] & MI_LRM_USE_GGTT ? "USE_GGTT " : "");
1445 if (numdw == 4)
1446 drm_printf(p, " - %#6x = %#010llx\n",
1447 dw[1], ((u64)(dw[3]) << 32 | (u64)(dw[2])));
1448 else
1449 drm_printf(p, " - %*ph (%s)\n",
1450 (int)sizeof(u32) * (numdw - 1), dw + 1,
1451 numdw < 4 ? "truncated" : "malformed");
1452 return numdw;
1453
1454 case MI_FORCE_WAKEUP:
1455 drm_printf(p, "[%#010x] MI_FORCE_WAKEUP\n", inst_header);
1456 return numdw;
1457
1458 default:
1459 drm_printf(p, "[%#010x] unknown MI opcode %#x, likely %d dwords\n",
1460 inst_header, opcode, numdw);
1461 return numdw;
1462 }
1463 }
1464
static int dump_gfxpipe_command(struct drm_printer *p,
1466 struct xe_gt *gt,
1467 u32 *dw,
1468 int remaining_dw)
1469 {
1470 u32 numdw = instr_dw(*dw);
1471 u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
1472 u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
1473 u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);
1474
1475 /*
1476 * Make sure we haven't mis-parsed a number of dwords that exceeds the
1477 * remaining size of the LRC.
1478 */
1479 if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
1480 numdw = remaining_dw;
1481
1482 switch (*dw & GFXPIPE_MATCH_MASK) {
1483 #define MATCH(cmd) \
1484 case cmd: \
1485 drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
1486 return numdw
1487 #define MATCH3D(cmd) \
1488 case CMD_##cmd: \
1489 drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
1490 return numdw
1491
1492 MATCH(STATE_BASE_ADDRESS);
1493 MATCH(STATE_SIP);
1494 MATCH(GPGPU_CSR_BASE_ADDRESS);
1495 MATCH(STATE_COMPUTE_MODE);
1496 MATCH3D(3DSTATE_BTD);
1497 MATCH(STATE_SYSTEM_MEM_FENCE_ADDRESS);
1498 MATCH(STATE_CONTEXT_DATA_BASE_ADDRESS);
1499
1500 MATCH3D(3DSTATE_VF_STATISTICS);
1501
1502 MATCH(PIPELINE_SELECT);
1503
1504 MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST);
1505 MATCH3D(3DSTATE_CLEAR_PARAMS);
1506 MATCH3D(3DSTATE_DEPTH_BUFFER);
1507 MATCH3D(3DSTATE_STENCIL_BUFFER);
1508 MATCH3D(3DSTATE_HIER_DEPTH_BUFFER);
1509 MATCH3D(3DSTATE_VERTEX_BUFFERS);
1510 MATCH3D(3DSTATE_VERTEX_ELEMENTS);
1511 MATCH3D(3DSTATE_INDEX_BUFFER);
1512 MATCH3D(3DSTATE_VF);
1513 MATCH3D(3DSTATE_MULTISAMPLE);
1514 MATCH3D(3DSTATE_CC_STATE_POINTERS);
1515 MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS);
1516 MATCH3D(3DSTATE_VS);
1517 MATCH3D(3DSTATE_GS);
1518 MATCH3D(3DSTATE_CLIP);
1519 MATCH3D(3DSTATE_SF);
1520 MATCH3D(3DSTATE_WM);
1521 MATCH3D(3DSTATE_CONSTANT_VS);
1522 MATCH3D(3DSTATE_CONSTANT_GS);
1523 MATCH3D(3DSTATE_CONSTANT_PS);
1524 MATCH3D(3DSTATE_SAMPLE_MASK);
1525 MATCH3D(3DSTATE_CONSTANT_HS);
1526 MATCH3D(3DSTATE_CONSTANT_DS);
1527 MATCH3D(3DSTATE_HS);
1528 MATCH3D(3DSTATE_TE);
1529 MATCH3D(3DSTATE_DS);
1530 MATCH3D(3DSTATE_STREAMOUT);
1531 MATCH3D(3DSTATE_SBE);
1532 MATCH3D(3DSTATE_PS);
1533 MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
1534 MATCH3D(3DSTATE_CPS_POINTERS);
1535 MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC);
1536 MATCH3D(3DSTATE_BLEND_STATE_POINTERS);
1537 MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS);
1538 MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS);
1539 MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS);
1540 MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS);
1541 MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS);
1542 MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS);
1543 MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS);
1544 MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS);
1545 MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS);
1546 MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS);
1547 MATCH3D(3DSTATE_VF_INSTANCING);
1548 MATCH3D(3DSTATE_VF_SGVS);
1549 MATCH3D(3DSTATE_VF_TOPOLOGY);
1550 MATCH3D(3DSTATE_WM_CHROMAKEY);
1551 MATCH3D(3DSTATE_PS_BLEND);
1552 MATCH3D(3DSTATE_WM_DEPTH_STENCIL);
1553 MATCH3D(3DSTATE_PS_EXTRA);
1554 MATCH3D(3DSTATE_RASTER);
1555 MATCH3D(3DSTATE_SBE_SWIZ);
1556 MATCH3D(3DSTATE_WM_HZ_OP);
1557 MATCH3D(3DSTATE_VF_COMPONENT_PACKING);
1558 MATCH3D(3DSTATE_VF_SGVS_2);
1559 MATCH3D(3DSTATE_VFG);
1560 MATCH3D(3DSTATE_URB_ALLOC_VS);
1561 MATCH3D(3DSTATE_URB_ALLOC_HS);
1562 MATCH3D(3DSTATE_URB_ALLOC_DS);
1563 MATCH3D(3DSTATE_URB_ALLOC_GS);
1564 MATCH3D(3DSTATE_SO_BUFFER_INDEX_0);
1565 MATCH3D(3DSTATE_SO_BUFFER_INDEX_1);
1566 MATCH3D(3DSTATE_SO_BUFFER_INDEX_2);
1567 MATCH3D(3DSTATE_SO_BUFFER_INDEX_3);
1568 MATCH3D(3DSTATE_PRIMITIVE_REPLICATION);
1569 MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO);
1570 MATCH3D(3DSTATE_AMFS);
1571 MATCH3D(3DSTATE_DEPTH_BOUNDS);
1572 MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS);
1573 MATCH3D(3DSTATE_CONSTANT_TS_POINTER);
1574 MATCH3D(3DSTATE_MESH_CONTROL);
1575 MATCH3D(3DSTATE_MESH_DISTRIB);
1576 MATCH3D(3DSTATE_TASK_REDISTRIB);
1577 MATCH3D(3DSTATE_MESH_SHADER);
1578 MATCH3D(3DSTATE_MESH_SHADER_DATA);
1579 MATCH3D(3DSTATE_TASK_CONTROL);
1580 MATCH3D(3DSTATE_TASK_SHADER);
1581 MATCH3D(3DSTATE_TASK_SHADER_DATA);
1582 MATCH3D(3DSTATE_URB_ALLOC_MESH);
1583 MATCH3D(3DSTATE_URB_ALLOC_TASK);
1584 MATCH3D(3DSTATE_CLIP_MESH);
1585 MATCH3D(3DSTATE_SBE_MESH);
1586 MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER);
1587 MATCH3D(3DSTATE_COARSE_PIXEL);
1588
1589 MATCH3D(3DSTATE_DRAWING_RECTANGLE);
1590 MATCH3D(3DSTATE_CHROMA_KEY);
1591 MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET);
1592 MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN);
1593 MATCH3D(3DSTATE_LINE_STIPPLE);
1594 MATCH3D(3DSTATE_AA_LINE_PARAMETERS);
1595 MATCH3D(3DSTATE_MONOFILTER_SIZE);
1596 MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS);
1597 MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS);
1598 MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS);
1599 MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS);
1600 MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS);
1601 MATCH3D(3DSTATE_SO_DECL_LIST);
1602 MATCH3D(3DSTATE_SO_BUFFER);
1603 MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC);
1604 MATCH3D(3DSTATE_SAMPLE_PATTERN);
1605 MATCH3D(3DSTATE_3D_MODE);
1606 MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE);
1607 MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS);
1608 MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO);
1609
1610 default:
1611 drm_printf(p, "[%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n",
1612 *dw, pipeline, opcode, subopcode, numdw);
1613 return numdw;
1614 }
1615 }
1616
static int dump_gfx_state_command(struct drm_printer *p,
1618 struct xe_gt *gt,
1619 u32 *dw,
1620 int remaining_dw)
1621 {
1622 u32 numdw = instr_dw(*dw);
1623 u32 opcode = REG_FIELD_GET(GFX_STATE_OPCODE, *dw);
1624
1625 /*
1626 * Make sure we haven't mis-parsed a number of dwords that exceeds the
1627 * remaining size of the LRC.
1628 */
1629 if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
1630 numdw = remaining_dw;
1631
1632 switch (*dw & (XE_INSTR_GFX_STATE | GFX_STATE_OPCODE)) {
1633 MATCH(STATE_WRITE_INLINE);
1634
1635 default:
1636 drm_printf(p, "[%#010x] unknown GFX_STATE command (opcode=%#x), likely %d dwords\n",
1637 *dw, opcode, numdw);
1638 return numdw;
1639 }
1640 }
1641
void xe_lrc_dump_default(struct drm_printer *p,
1643 struct xe_gt *gt,
1644 enum xe_engine_class hwe_class)
1645 {
1646 u32 *dw;
1647 int remaining_dw, num_dw;
1648
1649 if (!gt->default_lrc[hwe_class]) {
1650 drm_printf(p, "No default LRC for class %d\n", hwe_class);
1651 return;
1652 }
1653
1654 /*
1655 * Skip the beginning of the LRC since it contains the per-process
1656 * hardware status page.
1657 */
1658 dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
1659 remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;
1660
1661 while (remaining_dw > 0) {
1662 if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
1663 num_dw = dump_mi_command(p, gt, dw, remaining_dw);
1664 } else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
1665 num_dw = dump_gfxpipe_command(p, gt, dw, remaining_dw);
1666 } else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFX_STATE) {
1667 num_dw = dump_gfx_state_command(p, gt, dw, remaining_dw);
1668 } else {
1669 num_dw = min(instr_dw(*dw), remaining_dw);
1670 drm_printf(p, "[%#10x] Unknown instruction of type %#x, likely %d dwords\n",
1671 *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw),
1672 num_dw);
1673 }
1674
1675 dw += num_dw;
1676 remaining_dw -= num_dw;
1677 }
1678 }
1679
1680 struct instr_state {
1681 u32 instr;
1682 u16 num_dw;
1683 };
1684
1685 static const struct instr_state xe_hpg_svg_state[] = {
1686 { .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 },
1687 { .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 },
1688 { .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 },
1689 { .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 },
1690 { .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 },
1691 { .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 },
1692 { .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 },
1693 { .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 },
1694 { .instr = CMD_3DSTATE_VS, .num_dw = 9 },
1695 { .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 },
1696 { .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 },
1697 { .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 },
1698 { .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 },
1699 { .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 },
1700 { .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 },
1701 { .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 },
1702 { .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 },
1703 { .instr = CMD_3DSTATE_CLIP, .num_dw = 4 },
1704 { .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 },
1705 { .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 },
1706 { .instr = CMD_3DSTATE_SF, .num_dw = 4 },
1707 { .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 },
1708 { .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 },
1709 { .instr = CMD_3DSTATE_RASTER, .num_dw = 5 },
1710 { .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 },
1711 { .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 },
1712 { .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 },
1713 { .instr = CMD_3DSTATE_HS, .num_dw = 9 },
1714 { .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 },
1715 { .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 },
1716 { .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 },
1717 { .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 },
1718 { .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 },
1719 { .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 },
1720 { .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 },
1721 { .instr = CMD_3DSTATE_TE, .num_dw = 5 },
1722 { .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 },
1723 { .instr = CMD_3DSTATE_DS, .num_dw = 11 },
1724 { .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
1725 { .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
1726 { .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
1727 { .instr = CMD_3DSTATE_GS, .num_dw = 10 },
1728 { .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
1729 { .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
1730 { .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
1731 { .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
1732 { .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
1733 { .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
1734 { .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
1735 { .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
1736 };
1737
void xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, struct xe_bb *bb)
1739 {
1740 struct xe_gt *gt = q->hwe->gt;
1741 struct xe_device *xe = gt_to_xe(gt);
1742 const struct instr_state *state_table = NULL;
1743 int state_table_size = 0;
1744
1745 /*
1746 * Wa_14019789679
1747 *
1748 * If the driver doesn't explicitly emit the SVG instructions while
1749 * setting up the default LRC, the context switch will write 0's
1750 * (noops) into the LRC memory rather than the expected instruction
1751 * headers. Application contexts start out as a copy of the default
1752 * LRC, and if they also do not emit specific settings for some SVG
1753 * state, then on context restore they'll unintentionally inherit
1754 * whatever state setting the previous context had programmed into the
1755 * hardware (i.e., the lack of a 3DSTATE_* instruction in the LRC will
1756 * prevent the hardware from resetting that state back to any specific
1757 * value).
1758 *
1759 * The official workaround only requires emitting 3DSTATE_MESH_CONTROL
1760 * since that's a specific state setting that can easily cause GPU
1761 * hangs if unintentionally inherited. However to be safe we'll
1762 * continue to emit all of the SVG state since it's best not to leak
1763 * any of the state between contexts, even if that leakage is harmless.
1764 */
1765 if (XE_WA(gt, 14019789679) && q->hwe->class == XE_ENGINE_CLASS_RENDER) {
1766 state_table = xe_hpg_svg_state;
1767 state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
1768 }
1769
1770 if (!state_table) {
1771 xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
1772 GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
1773 return;
1774 }
1775
1776 for (int i = 0; i < state_table_size; i++) {
1777 u32 instr = state_table[i].instr;
1778 u16 num_dw = state_table[i].num_dw;
1779 bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);
1780
1781 xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
1782 xe_gt_assert(gt, num_dw != 0);
1783 xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));
1784
1785 /*
1786 * Xe2's SVG context is the same as the one on DG2 / MTL
1787 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
1788 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
1789 * Just make the replacement here rather than defining a
1790 * whole separate table for the single trivial change.
1791 */
1792 if (GRAPHICS_VER(xe) >= 20 &&
1793 instr == CMD_3DSTATE_DRAWING_RECTANGLE)
1794 instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;
1795
1796 bb->cs[bb->len] = instr;
1797 if (!is_single_dw)
1798 bb->cs[bb->len] |= (num_dw - 2);
1799
1800 bb->len += num_dw;
1801 }
1802 }
1803
struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
1805 {
1806 struct xe_lrc_snapshot *snapshot = kmalloc(sizeof(*snapshot), GFP_NOWAIT);
1807
1808 if (!snapshot)
1809 return NULL;
1810
1811 snapshot->context_desc = xe_lrc_ggtt_addr(lrc);
1812 snapshot->ring_addr = __xe_lrc_ring_ggtt_addr(lrc);
1813 snapshot->indirect_context_desc = xe_lrc_indirect_ring_ggtt_addr(lrc);
1814 snapshot->head = xe_lrc_ring_head(lrc);
1815 snapshot->tail.internal = lrc->ring.tail;
1816 snapshot->tail.memory = xe_lrc_ring_tail(lrc);
1817 snapshot->start = xe_lrc_ring_start(lrc);
1818 snapshot->start_seqno = xe_lrc_start_seqno(lrc);
1819 snapshot->seqno = xe_lrc_seqno(lrc);
1820 snapshot->lrc_bo = xe_bo_get(lrc->bo);
1821 snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
1822 snapshot->lrc_size = lrc->bo->size - snapshot->lrc_offset;
1823 snapshot->lrc_snapshot = NULL;
1824 snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc));
1825 snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc);
1826 return snapshot;
1827 }
1828
void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot)
1830 {
1831 struct xe_bo *bo;
1832 struct iosys_map src;
1833
1834 if (!snapshot)
1835 return;
1836
1837 bo = snapshot->lrc_bo;
1838 snapshot->lrc_bo = NULL;
1839
1840 snapshot->lrc_snapshot = kvmalloc(snapshot->lrc_size, GFP_KERNEL);
1841 if (!snapshot->lrc_snapshot)
1842 goto put_bo;
1843
1844 xe_bo_lock(bo, false);
1845 if (!ttm_bo_vmap(&bo->ttm, &src)) {
1846 xe_map_memcpy_from(xe_bo_device(bo),
1847 snapshot->lrc_snapshot, &src, snapshot->lrc_offset,
1848 snapshot->lrc_size);
1849 ttm_bo_vunmap(&bo->ttm, &src);
1850 } else {
1851 kvfree(snapshot->lrc_snapshot);
1852 snapshot->lrc_snapshot = NULL;
1853 }
1854 xe_bo_unlock(bo);
1855 put_bo:
1856 xe_bo_put(bo);
1857 }
1858
void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer *p)
1860 {
1861 unsigned long i;
1862
1863 if (!snapshot)
1864 return;
1865
1866 drm_printf(p, "\tHW Context Desc: 0x%08x\n", snapshot->context_desc);
1867 drm_printf(p, "\tHW Ring address: 0x%08x\n",
1868 snapshot->ring_addr);
1869 drm_printf(p, "\tHW Indirect Ring State: 0x%08x\n",
1870 snapshot->indirect_context_desc);
1871 drm_printf(p, "\tLRC Head: (memory) %u\n", snapshot->head);
1872 drm_printf(p, "\tLRC Tail: (internal) %u, (memory) %u\n",
1873 snapshot->tail.internal, snapshot->tail.memory);
1874 drm_printf(p, "\tRing start: (memory) 0x%08x\n", snapshot->start);
1875 drm_printf(p, "\tStart seqno: (memory) %d\n", snapshot->start_seqno);
1876 drm_printf(p, "\tSeqno: (memory) %d\n", snapshot->seqno);
1877 drm_printf(p, "\tTimestamp: 0x%08x\n", snapshot->ctx_timestamp);
1878 drm_printf(p, "\tJob Timestamp: 0x%08x\n", snapshot->ctx_job_timestamp);
1879
1880 if (!snapshot->lrc_snapshot)
1881 return;
1882
1883 drm_printf(p, "\t[HWSP].length: 0x%x\n", LRC_PPHWSP_SIZE);
1884 drm_puts(p, "\t[HWSP].data: ");
1885 for (i = 0; i < LRC_PPHWSP_SIZE; i += sizeof(u32)) {
1886 u32 *val = snapshot->lrc_snapshot + i;
1887 char dumped[ASCII85_BUFSZ];
1888
1889 drm_puts(p, ascii85_encode(*val, dumped));
1890 }
1891
1892 drm_printf(p, "\n\t[HWCTX].length: 0x%lx\n", snapshot->lrc_size - LRC_PPHWSP_SIZE);
1893 drm_puts(p, "\t[HWCTX].data: ");
1894 for (; i < snapshot->lrc_size; i += sizeof(u32)) {
1895 u32 *val = snapshot->lrc_snapshot + i;
1896 char dumped[ASCII85_BUFSZ];
1897
1898 drm_puts(p, ascii85_encode(*val, dumped));
1899 }
1900 drm_puts(p, "\n");
1901 }
1902
void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
1904 {
1905 if (!snapshot)
1906 return;
1907
1908 kvfree(snapshot->lrc_snapshot);
1909 if (snapshot->lrc_bo)
1910 xe_bo_put(snapshot->lrc_bo);
1911
1912 kfree(snapshot);
1913 }
1914
static int get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id, u64 *reg_ctx_ts)
1916 {
1917 u16 class = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id);
1918 u16 instance = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id);
1919 struct xe_hw_engine *hwe;
1920 u64 val;
1921
1922 hwe = xe_gt_hw_engine(lrc->gt, class, instance, false);
1923 if (xe_gt_WARN_ONCE(lrc->gt, !hwe || xe_hw_engine_is_reserved(hwe),
1924 "Unexpected engine class:instance %d:%d for context utilization\n",
1925 class, instance))
1926 return -1;
1927
1928 if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
1929 val = xe_mmio_read64_2x32(&hwe->gt->mmio,
1930 RING_CTX_TIMESTAMP(hwe->mmio_base));
1931 else
1932 val = xe_mmio_read32(&hwe->gt->mmio,
1933 RING_CTX_TIMESTAMP(hwe->mmio_base));
1934
1935 *reg_ctx_ts = val;
1936
1937 return 0;
1938 }
1939
1940 /**
1941 * xe_lrc_update_timestamp() - Update ctx timestamp
1942 * @lrc: Pointer to the lrc.
1943 * @old_ts: Old timestamp value
1944 *
1945 * Populate @old_ts current saved ctx timestamp, read new ctx timestamp and
1946 * update saved value. With support for active contexts, the calculation may be
1947 * slightly racy, so follow a read-again logic to ensure that the context is
1948 * still active before returning the right timestamp.
1949 *
1950 * Returns: New ctx timestamp value
1951 */
u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts)
1953 {
1954 u64 lrc_ts, reg_ts;
1955 u32 engine_id;
1956
1957 *old_ts = lrc->ctx_timestamp;
1958
1959 lrc_ts = xe_lrc_ctx_timestamp(lrc);
1960 /* CTX_TIMESTAMP mmio read is invalid on VF, so return the LRC value */
1961 if (IS_SRIOV_VF(lrc_to_xe(lrc))) {
1962 lrc->ctx_timestamp = lrc_ts;
1963 goto done;
1964 }
1965
1966 if (lrc_ts == CONTEXT_ACTIVE) {
1967 engine_id = xe_lrc_engine_id(lrc);
if (!get_ctx_timestamp(lrc, engine_id, &reg_ts))
1969 lrc->ctx_timestamp = reg_ts;
1970
1971 /* read lrc again to ensure context is still active */
1972 lrc_ts = xe_lrc_ctx_timestamp(lrc);
1973 }
1974
1975 /*
1976 * If context switched out, just use the lrc_ts. Note that this needs to
1977 * be a separate if condition.
1978 */
1979 if (lrc_ts != CONTEXT_ACTIVE)
1980 lrc->ctx_timestamp = lrc_ts;
1981
1982 done:
1983 trace_xe_lrc_update_timestamp(lrc, *old_ts);
1984
1985 return lrc->ctx_timestamp;
1986 }
1987
1988 /**
1989 * xe_lrc_ring_is_idle() - LRC is idle
1990 * @lrc: Pointer to the lrc.
1991 *
1992 * Compare LRC ring head and tail to determine if idle.
1993 *
 * Return: True if the ring is idle, False otherwise
1995 */
bool xe_lrc_ring_is_idle(struct xe_lrc *lrc)
1997 {
1998 return xe_lrc_ring_head(lrc) == xe_lrc_ring_tail(lrc);
1999 }
2000