xref: /linux/drivers/gpu/drm/xe/xe_lrc.c (revision 8cdcef1c2f82d207aa8b2a02298fbc17191c6261)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5 
6 #include "xe_lrc.h"
7 
8 #include "instructions/xe_mi_commands.h"
9 #include "instructions/xe_gfxpipe_commands.h"
10 #include "regs/xe_engine_regs.h"
11 #include "regs/xe_gpu_commands.h"
12 #include "regs/xe_gt_regs.h"
13 #include "regs/xe_lrc_layout.h"
14 #include "regs/xe_regs.h"
15 #include "xe_bb.h"
16 #include "xe_bo.h"
17 #include "xe_device.h"
18 #include "xe_drm_client.h"
19 #include "xe_exec_queue_types.h"
20 #include "xe_gt.h"
21 #include "xe_gt_printk.h"
22 #include "xe_hw_fence.h"
23 #include "xe_map.h"
24 #include "xe_vm.h"
25 
26 #define CTX_VALID				(1 << 0)
27 #define CTX_PRIVILEGE				(1 << 8)
28 #define CTX_ADDRESSING_MODE_SHIFT		3
29 #define LEGACY_64B_CONTEXT			3
30 
31 #define ENGINE_CLASS_SHIFT			61
32 #define ENGINE_INSTANCE_SHIFT			48
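
/*
 * These bits are assembled into lrc->desc by xe_lrc_init() below and then
 * combined with the PPHWSP GGTT address in xe_lrc_descriptor(): bit 0 marks
 * the context valid, bits [4:3] select the legacy 64b addressing mode,
 * bit 8 ("privilege") really just selects PPGTT mode, and on pre-12.50
 * platforms the engine class/instance are encoded at bits 61/48.
 */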
33 
34 static struct xe_device *
35 lrc_to_xe(struct xe_lrc *lrc)
36 {
37 	return gt_to_xe(lrc->fence_ctx.gt);
38 }
39 
40 size_t xe_lrc_size(struct xe_device *xe, enum xe_engine_class class)
41 {
42 	switch (class) {
43 	case XE_ENGINE_CLASS_RENDER:
44 		if (GRAPHICS_VER(xe) >= 20)
45 			return 4 * SZ_4K;
46 		else
47 			return 14 * SZ_4K;
48 	case XE_ENGINE_CLASS_COMPUTE:
49 		/* 14 pages since graphics_ver == 11 */
50 		if (GRAPHICS_VER(xe) >= 20)
51 			return 3 * SZ_4K;
52 		else
53 			return 14 * SZ_4K;
54 	default:
55 		WARN(1, "Unknown engine class: %d", class);
56 		fallthrough;
57 	case XE_ENGINE_CLASS_COPY:
58 	case XE_ENGINE_CLASS_VIDEO_DECODE:
59 	case XE_ENGINE_CLASS_VIDEO_ENHANCE:
60 	case XE_ENGINE_CLASS_OTHER:
61 		return 2 * SZ_4K;
62 	}
63 }
64 
65 /*
66  * The per-platform tables are u8-encoded in @data. Decode @data and write the
67  * commands and register offsets into @regs. The following encoding is used for
68  * each byte. There are two steps: decoding commands and decoding addresses.
69  *
70  * Commands:
71  * [7]: create NOPs - the number of NOPs is encoded in the lower bits
72  * [6]: When creating an MI_LOAD_REGISTER_IMM command, allows setting
73  *      MI_LRI_FORCE_POSTED
74  * [5:0]: Number of NOPs, or the number of registers to write in the case of
75  *        MI_LOAD_REGISTER_IMM
76  *
77  * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
78  * number of registers. They are set by using the REG/REG16 macros: the former
79  * is used for offsets smaller than 0x200 while the latter is for values bigger
80  * than that. Those macros already set all the bits documented below correctly:
81  *
82  * [7]: When a register offset needs more than 6 bits, additional bytes follow,
83  *      carrying the lower bits
84  * [6:0]: Register offset, without considering the engine base.
85  *
86  * This function only tweaks the commands and register offsets. Values are not
87  * filled out.
88  */
89 static void set_offsets(u32 *regs,
90 			const u8 *data,
91 			const struct xe_hw_engine *hwe)
92 #define NOP(x) (BIT(7) | (x))
93 #define LRI(count, flags) ((flags) << 6 | (count) | \
94 			   BUILD_BUG_ON_ZERO(count >= BIT(6)))
95 #define POSTED BIT(0)
96 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
97 #define REG16(x) \
98 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
99 	(((x) >> 2) & 0x7f)
100 #define END 0
101 {
102 	const u32 base = hwe->mmio_base;
103 
104 	while (*data) {
105 		u8 count, flags;
106 
107 		if (*data & BIT(7)) { /* skip */
108 			count = *data++ & ~BIT(7);
109 			regs += count;
110 			continue;
111 		}
112 
113 		count = *data & 0x3f;
114 		flags = *data >> 6;
115 		data++;
116 
117 		*regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
118 		if (flags & POSTED)
119 			*regs |= MI_LRI_FORCE_POSTED;
120 		*regs |= MI_LRI_LRM_CS_MMIO;
121 		regs++;
122 
123 		xe_gt_assert(hwe->gt, count);
124 		do {
125 			u32 offset = 0;
126 			u8 v;
127 
128 			do {
129 				v = *data++;
130 				offset <<= 7;
131 				offset |= v & ~BIT(7);
132 			} while (v & BIT(7));
133 
134 			regs[0] = base + (offset << 2);
135 			regs += 2;
136 		} while (--count);
137 	}
138 
139 	*regs = MI_BATCH_BUFFER_END | BIT(0);
140 }
141 
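/*
 * Worked example of the encoding above, using the start of gen12_xcs_offsets
 * (shown for illustration only):
 *
 *   NOP(1)          -> 0x81: bit 7 set, skip one dword in @regs
 *   LRI(13, POSTED) -> 0x4d: emit MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(13) |
 *                      MI_LRI_FORCE_POSTED | MI_LRI_LRM_CS_MMIO
 *   REG16(0x244)    -> 0x81, 0x11: offset = ((0x01 << 7) | 0x11) << 2 = 0x244,
 *                      so the next dword becomes mmio_base + 0x244 and the
 *                      value dword after it is left for the caller
 *   REG(0x034)      -> 0x0d: mmio_base + (0x0d << 2) = mmio_base + 0x34
 */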
142 static const u8 gen12_xcs_offsets[] = {
143 	NOP(1),
144 	LRI(13, POSTED),
145 	REG16(0x244),
146 	REG(0x034),
147 	REG(0x030),
148 	REG(0x038),
149 	REG(0x03c),
150 	REG(0x168),
151 	REG(0x140),
152 	REG(0x110),
153 	REG(0x1c0),
154 	REG(0x1c4),
155 	REG(0x1c8),
156 	REG(0x180),
157 	REG16(0x2b4),
158 
159 	NOP(5),
160 	LRI(9, POSTED),
161 	REG16(0x3a8),
162 	REG16(0x28c),
163 	REG16(0x288),
164 	REG16(0x284),
165 	REG16(0x280),
166 	REG16(0x27c),
167 	REG16(0x278),
168 	REG16(0x274),
169 	REG16(0x270),
170 
171 	END
172 };
173 
174 static const u8 dg2_xcs_offsets[] = {
175 	NOP(1),
176 	LRI(15, POSTED),
177 	REG16(0x244),
178 	REG(0x034),
179 	REG(0x030),
180 	REG(0x038),
181 	REG(0x03c),
182 	REG(0x168),
183 	REG(0x140),
184 	REG(0x110),
185 	REG(0x1c0),
186 	REG(0x1c4),
187 	REG(0x1c8),
188 	REG(0x180),
189 	REG16(0x2b4),
190 	REG(0x120),
191 	REG(0x124),
192 
193 	NOP(1),
194 	LRI(9, POSTED),
195 	REG16(0x3a8),
196 	REG16(0x28c),
197 	REG16(0x288),
198 	REG16(0x284),
199 	REG16(0x280),
200 	REG16(0x27c),
201 	REG16(0x278),
202 	REG16(0x274),
203 	REG16(0x270),
204 
205 	END
206 };
207 
208 static const u8 gen12_rcs_offsets[] = {
209 	NOP(1),
210 	LRI(13, POSTED),
211 	REG16(0x244),
212 	REG(0x034),
213 	REG(0x030),
214 	REG(0x038),
215 	REG(0x03c),
216 	REG(0x168),
217 	REG(0x140),
218 	REG(0x110),
219 	REG(0x1c0),
220 	REG(0x1c4),
221 	REG(0x1c8),
222 	REG(0x180),
223 	REG16(0x2b4),
224 
225 	NOP(5),
226 	LRI(9, POSTED),
227 	REG16(0x3a8),
228 	REG16(0x28c),
229 	REG16(0x288),
230 	REG16(0x284),
231 	REG16(0x280),
232 	REG16(0x27c),
233 	REG16(0x278),
234 	REG16(0x274),
235 	REG16(0x270),
236 
237 	LRI(3, POSTED),
238 	REG(0x1b0),
239 	REG16(0x5a8),
240 	REG16(0x5ac),
241 
242 	NOP(6),
243 	LRI(1, 0),
244 	REG(0x0c8),
245 	NOP(3 + 9 + 1),
246 
247 	LRI(51, POSTED),
248 	REG16(0x588),
249 	REG16(0x588),
250 	REG16(0x588),
251 	REG16(0x588),
252 	REG16(0x588),
253 	REG16(0x588),
254 	REG(0x028),
255 	REG(0x09c),
256 	REG(0x0c0),
257 	REG(0x178),
258 	REG(0x17c),
259 	REG16(0x358),
260 	REG(0x170),
261 	REG(0x150),
262 	REG(0x154),
263 	REG(0x158),
264 	REG16(0x41c),
265 	REG16(0x600),
266 	REG16(0x604),
267 	REG16(0x608),
268 	REG16(0x60c),
269 	REG16(0x610),
270 	REG16(0x614),
271 	REG16(0x618),
272 	REG16(0x61c),
273 	REG16(0x620),
274 	REG16(0x624),
275 	REG16(0x628),
276 	REG16(0x62c),
277 	REG16(0x630),
278 	REG16(0x634),
279 	REG16(0x638),
280 	REG16(0x63c),
281 	REG16(0x640),
282 	REG16(0x644),
283 	REG16(0x648),
284 	REG16(0x64c),
285 	REG16(0x650),
286 	REG16(0x654),
287 	REG16(0x658),
288 	REG16(0x65c),
289 	REG16(0x660),
290 	REG16(0x664),
291 	REG16(0x668),
292 	REG16(0x66c),
293 	REG16(0x670),
294 	REG16(0x674),
295 	REG16(0x678),
296 	REG16(0x67c),
297 	REG(0x068),
298 	REG(0x084),
299 	NOP(1),
300 
301 	END
302 };
303 
304 static const u8 xehp_rcs_offsets[] = {
305 	NOP(1),
306 	LRI(13, POSTED),
307 	REG16(0x244),
308 	REG(0x034),
309 	REG(0x030),
310 	REG(0x038),
311 	REG(0x03c),
312 	REG(0x168),
313 	REG(0x140),
314 	REG(0x110),
315 	REG(0x1c0),
316 	REG(0x1c4),
317 	REG(0x1c8),
318 	REG(0x180),
319 	REG16(0x2b4),
320 
321 	NOP(5),
322 	LRI(9, POSTED),
323 	REG16(0x3a8),
324 	REG16(0x28c),
325 	REG16(0x288),
326 	REG16(0x284),
327 	REG16(0x280),
328 	REG16(0x27c),
329 	REG16(0x278),
330 	REG16(0x274),
331 	REG16(0x270),
332 
333 	LRI(3, POSTED),
334 	REG(0x1b0),
335 	REG16(0x5a8),
336 	REG16(0x5ac),
337 
338 	NOP(6),
339 	LRI(1, 0),
340 	REG(0x0c8),
341 
342 	END
343 };
344 
345 static const u8 dg2_rcs_offsets[] = {
346 	NOP(1),
347 	LRI(15, POSTED),
348 	REG16(0x244),
349 	REG(0x034),
350 	REG(0x030),
351 	REG(0x038),
352 	REG(0x03c),
353 	REG(0x168),
354 	REG(0x140),
355 	REG(0x110),
356 	REG(0x1c0),
357 	REG(0x1c4),
358 	REG(0x1c8),
359 	REG(0x180),
360 	REG16(0x2b4),
361 	REG(0x120),
362 	REG(0x124),
363 
364 	NOP(1),
365 	LRI(9, POSTED),
366 	REG16(0x3a8),
367 	REG16(0x28c),
368 	REG16(0x288),
369 	REG16(0x284),
370 	REG16(0x280),
371 	REG16(0x27c),
372 	REG16(0x278),
373 	REG16(0x274),
374 	REG16(0x270),
375 
376 	LRI(3, POSTED),
377 	REG(0x1b0),
378 	REG16(0x5a8),
379 	REG16(0x5ac),
380 
381 	NOP(6),
382 	LRI(1, 0),
383 	REG(0x0c8),
384 
385 	END
386 };
387 
388 static const u8 mtl_rcs_offsets[] = {
389 	NOP(1),
390 	LRI(15, POSTED),
391 	REG16(0x244),
392 	REG(0x034),
393 	REG(0x030),
394 	REG(0x038),
395 	REG(0x03c),
396 	REG(0x168),
397 	REG(0x140),
398 	REG(0x110),
399 	REG(0x1c0),
400 	REG(0x1c4),
401 	REG(0x1c8),
402 	REG(0x180),
403 	REG16(0x2b4),
404 	REG(0x120),
405 	REG(0x124),
406 
407 	NOP(1),
408 	LRI(9, POSTED),
409 	REG16(0x3a8),
410 	REG16(0x28c),
411 	REG16(0x288),
412 	REG16(0x284),
413 	REG16(0x280),
414 	REG16(0x27c),
415 	REG16(0x278),
416 	REG16(0x274),
417 	REG16(0x270),
418 
419 	NOP(2),
420 	LRI(2, POSTED),
421 	REG16(0x5a8),
422 	REG16(0x5ac),
423 
424 	NOP(6),
425 	LRI(1, 0),
426 	REG(0x0c8),
427 
428 	END
429 };
430 
431 #define XE2_CTX_COMMON \
432 	NOP(1),                 /* [0x00] */ \
433 	LRI(15, POSTED),        /* [0x01] */ \
434 	REG16(0x244),           /* [0x02] CTXT_SR_CTL */ \
435 	REG(0x034),             /* [0x04] RING_BUFFER_HEAD */ \
436 	REG(0x030),             /* [0x06] RING_BUFFER_TAIL */ \
437 	REG(0x038),             /* [0x08] RING_BUFFER_START */ \
438 	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */ \
439 	REG(0x168),             /* [0x0c] BB_ADDR_UDW */ \
440 	REG(0x140),             /* [0x0e] BB_ADDR */ \
441 	REG(0x110),             /* [0x10] BB_STATE */ \
442 	REG(0x1c0),             /* [0x12] BB_PER_CTX_PTR */ \
443 	REG(0x1c4),             /* [0x14] RCS_INDIRECT_CTX */ \
444 	REG(0x1c8),             /* [0x16] RCS_INDIRECT_CTX_OFFSET */ \
445 	REG(0x180),             /* [0x18] CCID */ \
446 	REG16(0x2b4),           /* [0x1a] SEMAPHORE_TOKEN */ \
447 	REG(0x120),             /* [0x1c] PRT_BB_STATE */ \
448 	REG(0x124),             /* [0x1e] PRT_BB_STATE_UDW */ \
449 	\
450 	NOP(1),                 /* [0x20] */ \
451 	LRI(9, POSTED),         /* [0x21] */ \
452 	REG16(0x3a8),           /* [0x22] CTX_TIMESTAMP */ \
453 	REG16(0x3ac),           /* [0x24] CTX_TIMESTAMP_UDW */ \
454 	REG(0x108),             /* [0x26] INDIRECT_RING_STATE */ \
455 	REG16(0x284),           /* [0x28] dummy reg */ \
456 	REG16(0x280),           /* [0x2a] CS_ACC_CTR_THOLD */ \
457 	REG16(0x27c),           /* [0x2c] CS_CTX_SYS_PASID */ \
458 	REG16(0x278),           /* [0x2e] CS_CTX_ASID */ \
459 	REG16(0x274),           /* [0x30] PTBP_UDW */ \
460 	REG16(0x270)            /* [0x32] PTBP_LDW */
461 
462 static const u8 xe2_rcs_offsets[] = {
463 	XE2_CTX_COMMON,
464 
465 	NOP(2),                 /* [0x34] */
466 	LRI(2, POSTED),         /* [0x36] */
467 	REG16(0x5a8),           /* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
468 	REG16(0x5ac),           /* [0x39] PREEMPTION_STATUS */
469 
470 	NOP(6),                 /* [0x41] */
471 	LRI(1, 0),              /* [0x47] */
472 	REG(0x0c8),             /* [0x48] R_PWR_CLK_STATE */
473 
474 	END
475 };
476 
477 static const u8 xe2_bcs_offsets[] = {
478 	XE2_CTX_COMMON,
479 
480 	NOP(4 + 8 + 1),         /* [0x34] */
481 	LRI(2, POSTED),         /* [0x41] */
482 	REG16(0x200),           /* [0x42] BCS_SWCTRL */
483 	REG16(0x204),           /* [0x44] BLIT_CCTL */
484 
485 	END
486 };
487 
488 static const u8 xe2_xcs_offsets[] = {
489 	XE2_CTX_COMMON,
490 
491 	END
492 };
493 
494 #undef END
495 #undef REG16
496 #undef REG
497 #undef LRI
498 #undef NOP
499 
500 static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
501 {
502 	if (class == XE_ENGINE_CLASS_RENDER) {
503 		if (GRAPHICS_VER(xe) >= 20)
504 			return xe2_rcs_offsets;
505 		else if (GRAPHICS_VERx100(xe) >= 1270)
506 			return mtl_rcs_offsets;
507 		else if (GRAPHICS_VERx100(xe) >= 1255)
508 			return dg2_rcs_offsets;
509 		else if (GRAPHICS_VERx100(xe) >= 1250)
510 			return xehp_rcs_offsets;
511 		else
512 			return gen12_rcs_offsets;
513 	} else if (class == XE_ENGINE_CLASS_COPY) {
514 		if (GRAPHICS_VER(xe) >= 20)
515 			return xe2_bcs_offsets;
516 		else
517 			return gen12_xcs_offsets;
518 	} else {
519 		if (GRAPHICS_VER(xe) >= 20)
520 			return xe2_xcs_offsets;
521 		else if (GRAPHICS_VERx100(xe) >= 1255)
522 			return dg2_xcs_offsets;
523 		else
524 			return gen12_xcs_offsets;
525 	}
526 }
527 
528 static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
529 {
530 	regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH) |
531 				    _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) |
532 				    CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
533 
534 	/* TODO: Timestamp */
535 }
536 
537 static int lrc_ring_mi_mode(struct xe_hw_engine *hwe)
538 {
539 	struct xe_device *xe = gt_to_xe(hwe->gt);
540 
541 	if (GRAPHICS_VERx100(xe) >= 1250)
542 		return 0x70;
543 	else
544 		return 0x60;
545 }
546 
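/*
 * regs[x] holds the RING_MI_MODE register offset in the context image and
 * regs[x + 1] its value.  RING_MI_MODE is a masked register, so writing
 * STOP_RING << 16 (mask bit only) makes the context restore clear STOP_RING.
 */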
547 static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe)
548 {
549 	int x;
550 
551 	x = lrc_ring_mi_mode(hwe);
552 	regs[x + 1] &= ~STOP_RING;
553 	regs[x + 1] |= STOP_RING << 16;
554 }
555 
556 static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
557 {
558 	return 0;
559 }
560 
561 u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
562 {
563 	return lrc->ring.size;
564 }
565 
566 /* Make the magic macros work */
567 #define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset
568 
569 #define LRC_SEQNO_PPHWSP_OFFSET 512
570 #define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
571 #define LRC_PARALLEL_PPHWSP_OFFSET 2048
572 #define LRC_PPHWSP_SIZE SZ_4K
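
/*
 * Resulting layout of the LRC BO, as implied by the offset helpers above and
 * below:
 *
 *   [0, ring.size)                  ring buffer
 *   [ring.size, +LRC_PPHWSP_SIZE)   PPHWSP: seqno at +512, start seqno at
 *                                   +520, parallel area at +2048
 *   [+LRC_PPHWSP_SIZE, lrc size)    context (register) state
 */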
573 
574 static size_t lrc_reg_size(struct xe_device *xe)
575 {
576 	if (GRAPHICS_VERx100(xe) >= 1250)
577 		return 96 * sizeof(u32);
578 	else
579 		return 80 * sizeof(u32);
580 }
581 
582 size_t xe_lrc_skip_size(struct xe_device *xe)
583 {
584 	return LRC_PPHWSP_SIZE + lrc_reg_size(xe);
585 }
586 
587 static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
588 {
589 	/* The seqno is stored in the driver-defined portion of PPHWSP */
590 	return xe_lrc_pphwsp_offset(lrc) + LRC_SEQNO_PPHWSP_OFFSET;
591 }
592 
593 static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
594 {
595 	/* The start seqno is stored in the driver-defined portion of PPHWSP */
596 	return xe_lrc_pphwsp_offset(lrc) + LRC_START_SEQNO_PPHWSP_OFFSET;
597 }
598 
599 static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
600 {
601 	/* Parallel submission state is stored in the driver-defined portion of PPHWSP */
602 	return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
603 }
604 
605 static inline u32 __xe_lrc_regs_offset(struct xe_lrc *lrc)
606 {
607 	return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
608 }
609 
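/*
 * For each element, generate __xe_lrc_<elem>_map(), returning an iosys_map
 * pointing at that element within the LRC BO, and __xe_lrc_<elem>_ggtt_addr(),
 * returning its GGTT address.
 */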
610 #define DECL_MAP_ADDR_HELPERS(elem) \
611 static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \
612 { \
613 	struct iosys_map map = lrc->bo->vmap; \
614 \
615 	xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map));  \
616 	iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \
617 	return map; \
618 } \
619 static inline u32 __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \
620 { \
621 	return xe_bo_ggtt_addr(lrc->bo) + __xe_lrc_##elem##_offset(lrc); \
622 } \
623 
624 DECL_MAP_ADDR_HELPERS(ring)
625 DECL_MAP_ADDR_HELPERS(pphwsp)
626 DECL_MAP_ADDR_HELPERS(seqno)
627 DECL_MAP_ADDR_HELPERS(regs)
628 DECL_MAP_ADDR_HELPERS(start_seqno)
629 DECL_MAP_ADDR_HELPERS(parallel)
630 
631 #undef DECL_MAP_ADDR_HELPERS
632 
633 u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
634 {
635 	return __xe_lrc_pphwsp_ggtt_addr(lrc);
636 }
637 
638 u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
639 {
640 	struct xe_device *xe = lrc_to_xe(lrc);
641 	struct iosys_map map;
642 
643 	map = __xe_lrc_regs_map(lrc);
644 	iosys_map_incr(&map, reg_nr * sizeof(u32));
645 	return xe_map_read32(xe, &map);
646 }
647 
648 void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
649 {
650 	struct xe_device *xe = lrc_to_xe(lrc);
651 	struct iosys_map map;
652 
653 	map = __xe_lrc_regs_map(lrc);
654 	iosys_map_incr(&map, reg_nr * sizeof(u32));
655 	xe_map_write32(xe, &map, val);
656 }
657 
658 static void *empty_lrc_data(struct xe_hw_engine *hwe)
659 {
660 	struct xe_device *xe = gt_to_xe(hwe->gt);
661 	void *data;
662 	u32 *regs;
663 
664 	data = kzalloc(xe_lrc_size(xe, hwe->class), GFP_KERNEL);
665 	if (!data)
666 		return NULL;
667 
668 	/* 1st page: Per-Process HW Status Page (PPHWSP) */
669 	regs = data + LRC_PPHWSP_SIZE;
670 	set_offsets(regs, reg_offsets(xe, hwe->class), hwe);
671 	set_context_control(regs, hwe);
672 	reset_stop_ring(regs, hwe);
673 
674 	return data;
675 }
676 
677 static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
678 {
679 	u64 desc = xe_vm_pdp4_descriptor(vm, lrc->tile);
680 
681 	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
682 	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
683 }
684 
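/*
 * Dword indices into the context register image: 0x2e is the slot holding the
 * CS_CTX_ASID register offset (0x278) and 0x2a the CS_ACC_CTR_THOLD offset
 * (0x280), per the [0x2e]/[0x2a] annotations in XE2_CTX_COMMON above, so
 * "+ 1" addresses the value dword that follows each offset.
 */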
685 #define PVC_CTX_ASID		(0x2e + 1)
686 #define PVC_CTX_ACC_CTR_THOLD	(0x2a + 1)
687 #define ACC_GRANULARITY_S       20
688 #define ACC_NOTIFY_S            16
689 
690 int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
691 		struct xe_exec_queue *q, struct xe_vm *vm, u32 ring_size)
692 {
693 	struct xe_gt *gt = hwe->gt;
694 	struct xe_tile *tile = gt_to_tile(gt);
695 	struct xe_device *xe = gt_to_xe(gt);
696 	struct iosys_map map;
697 	void *init_data = NULL;
698 	u32 arb_enable;
699 	int err;
700 
701 	lrc->flags = 0;
702 
703 	/*
704 	 * FIXME: Perma-pinning LRC as we don't yet support moving GGTT address
705 	 * via VM bind calls.
706 	 */
707 	lrc->bo = xe_bo_create_pin_map(xe, tile, vm,
708 				      ring_size + xe_lrc_size(xe, hwe->class),
709 				      ttm_bo_type_kernel,
710 				      XE_BO_CREATE_VRAM_IF_DGFX(tile) |
711 				      XE_BO_CREATE_GGTT_BIT);
712 	if (IS_ERR(lrc->bo))
713 		return PTR_ERR(lrc->bo);
714 
715 	lrc->tile = gt_to_tile(hwe->gt);
716 	lrc->ring.size = ring_size;
717 	lrc->ring.tail = 0;
718 
719 	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
720 			     hwe->fence_irq, hwe->name);
721 
722 	if (!gt->default_lrc[hwe->class]) {
723 		init_data = empty_lrc_data(hwe);
724 		if (!init_data) {
725 			err = -ENOMEM;
726 			goto err_lrc_finish;
727 		}
728 	}
729 
730 	/*
731 	 * Init the Per-Process HW Status Page (PPHWSP) and the LRC / context
732 	 * state to known values
733 	 */
734 	map = __xe_lrc_pphwsp_map(lrc);
735 	if (!init_data) {
736 		xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE);	/* PPHWSP */
737 		xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
738 				 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
739 				 xe_lrc_size(xe, hwe->class) - LRC_PPHWSP_SIZE);
740 	} else {
741 		xe_map_memcpy_to(xe, &map, 0, init_data,
742 				 xe_lrc_size(xe, hwe->class));
743 		kfree(init_data);
744 	}
745 
746 	if (vm) {
747 		xe_lrc_set_ppgtt(lrc, vm);
748 
749 		if (vm->xef)
750 			xe_drm_client_add_bo(vm->xef->client, lrc->bo);
751 	}
752 
753 	xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
754 	xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, 0);
755 	xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);
756 	xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
757 			     RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
758 	if (xe->info.has_asid && vm)
759 		xe_lrc_write_ctx_reg(lrc, PVC_CTX_ASID,
760 				     (q->usm.acc_granularity <<
761 				      ACC_GRANULARITY_S) | vm->usm.asid);
762 	if (xe->info.supports_usm && vm)
763 		xe_lrc_write_ctx_reg(lrc, PVC_CTX_ACC_CTR_THOLD,
764 				     (q->usm.acc_notify << ACC_NOTIFY_S) |
765 				     q->usm.acc_trigger);
766 
767 	lrc->desc = CTX_VALID;
768 	lrc->desc |= LEGACY_64B_CONTEXT << CTX_ADDRESSING_MODE_SHIFT;
769 	/* TODO: Priority */
770 
771 	/* While this appears to have something about privileged batches or
772 	 * some such, it really just means PPGTT mode.
773 	 */
774 	if (vm)
775 		lrc->desc |= CTX_PRIVILEGE;
776 
777 	if (GRAPHICS_VERx100(xe) < 1250) {
778 		lrc->desc |= (u64)hwe->instance << ENGINE_INSTANCE_SHIFT;
779 		lrc->desc |= (u64)hwe->class << ENGINE_CLASS_SHIFT;
780 	}
781 
782 	arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
783 	xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));
784 
785 	map = __xe_lrc_seqno_map(lrc);
786 	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
787 
788 	map = __xe_lrc_start_seqno_map(lrc);
789 	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
790 
791 	return 0;
792 
793 err_lrc_finish:
794 	xe_lrc_finish(lrc);
795 	return err;
796 }
797 
798 void xe_lrc_finish(struct xe_lrc *lrc)
799 {
800 	xe_hw_fence_ctx_finish(&lrc->fence_ctx);
801 	xe_bo_lock(lrc->bo, false);
802 	xe_bo_unpin(lrc->bo);
803 	xe_bo_unlock(lrc->bo);
804 	xe_bo_put(lrc->bo);
805 }
806 
807 void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
808 {
809 	xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
810 }
811 
812 u32 xe_lrc_ring_head(struct xe_lrc *lrc)
813 {
814 	return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
815 }
816 
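/*
 * Bytes of free space left in the ring.  ring.size is a power of two, so for
 * example with size = SZ_4K: head == tail reports the full 4K free, while
 * head = 0 and tail = 0x100 reports 0xf00 bytes free.
 */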
817 u32 xe_lrc_ring_space(struct xe_lrc *lrc)
818 {
819 	const u32 head = xe_lrc_ring_head(lrc);
820 	const u32 tail = lrc->ring.tail;
821 	const u32 size = lrc->ring.size;
822 
823 	return ((head - tail - 1) & (size - 1)) + 1;
824 }
825 
826 static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
827 				const void *data, size_t size)
828 {
829 	struct xe_device *xe = lrc_to_xe(lrc);
830 
831 	iosys_map_incr(&ring, lrc->ring.tail);
832 	xe_map_memcpy_to(xe, &ring, 0, data, size);
833 	lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
834 }
835 
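/*
 * Copy @data into the ring at the current tail, splitting the copy in two when
 * it would run past the end of the ring, and padding with a single MI_NOOP
 * when @size is only dword (not qword) aligned so that the tail stays 8-byte
 * aligned.
 */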
836 void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
837 {
838 	struct xe_device *xe = lrc_to_xe(lrc);
839 	struct iosys_map ring;
840 	u32 rhs;
841 	size_t aligned_size;
842 
843 	xe_assert(xe, IS_ALIGNED(size, 4));
844 	aligned_size = ALIGN(size, 8);
845 
846 	ring = __xe_lrc_ring_map(lrc);
847 
848 	xe_assert(xe, lrc->ring.tail < lrc->ring.size);
849 	rhs = lrc->ring.size - lrc->ring.tail;
850 	if (size > rhs) {
851 		__xe_lrc_write_ring(lrc, ring, data, rhs);
852 		__xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
853 	} else {
854 		__xe_lrc_write_ring(lrc, ring, data, size);
855 	}
856 
857 	if (aligned_size > size) {
858 		u32 noop = MI_NOOP;
859 
860 		__xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
861 	}
862 }
863 
864 u64 xe_lrc_descriptor(struct xe_lrc *lrc)
865 {
866 	return lrc->desc | xe_lrc_ggtt_addr(lrc);
867 }
868 
869 u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
870 {
871 	return __xe_lrc_seqno_ggtt_addr(lrc);
872 }
873 
874 struct dma_fence *xe_lrc_create_seqno_fence(struct xe_lrc *lrc)
875 {
876 	return &xe_hw_fence_create(&lrc->fence_ctx,
877 				   __xe_lrc_seqno_map(lrc))->dma;
878 }
879 
880 s32 xe_lrc_seqno(struct xe_lrc *lrc)
881 {
882 	struct iosys_map map = __xe_lrc_seqno_map(lrc);
883 
884 	return xe_map_read32(lrc_to_xe(lrc), &map);
885 }
886 
887 s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
888 {
889 	struct iosys_map map = __xe_lrc_start_seqno_map(lrc);
890 
891 	return xe_map_read32(lrc_to_xe(lrc), &map);
892 }
893 
894 u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
895 {
896 	return __xe_lrc_start_seqno_ggtt_addr(lrc);
897 }
898 
899 u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
900 {
901 	return __xe_lrc_parallel_ggtt_addr(lrc);
902 }
903 
904 struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
905 {
906 	return __xe_lrc_parallel_map(lrc);
907 }
908 
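/*
 * Number of dwords occupied by the instruction whose header is @cmd_header.
 * For example, a header carrying 0x3 in bits 7:0 decodes to 3 + 2 = 5 dwords;
 * GFXPIPE "single dw" opcodes and 3DSTATE_SO_DECL_LIST are special-cased
 * below.
 */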
909 static int instr_dw(u32 cmd_header)
910 {
911 	/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
912 	if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
913 	    GFXPIPE_SINGLE_DW_CMD(0, 0))
914 		return 1;
915 
916 	/* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
917 	if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
918 		return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;
919 
920 	/* Most instructions have the # of dwords (minus 2) in 7:0 */
921 	return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
922 }
923 
924 static int dump_mi_command(struct drm_printer *p,
925 			   struct xe_gt *gt,
926 			   u32 *dw,
927 			   int remaining_dw)
928 {
929 	u32 inst_header = *dw;
930 	u32 numdw = instr_dw(inst_header);
931 	u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
932 	int num_noop;
933 
934 	/* First check for commands that don't have/use a '# DW' field */
935 	switch (inst_header & MI_OPCODE) {
936 	case MI_NOOP:
937 		num_noop = 1;
938 		while (num_noop < remaining_dw &&
939 		       (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
940 			num_noop++;
941 		drm_printf(p, "[%#010x] MI_NOOP (%d dwords)\n", inst_header, num_noop);
942 		return num_noop;
943 
944 	case MI_TOPOLOGY_FILTER:
945 		drm_printf(p, "[%#010x] MI_TOPOLOGY_FILTER\n", inst_header);
946 		return 1;
947 
948 	case MI_BATCH_BUFFER_END:
949 		drm_printf(p, "[%#010x] MI_BATCH_BUFFER_END\n", inst_header);
950 		/* Return 'remaining_dw' to consume the rest of the LRC */
951 		return remaining_dw;
952 	}
953 
954 	/*
955 	 * Any remaining commands include a # of dwords.  We should make sure
956 	 * it doesn't exceed the remaining size of the LRC.
957 	 */
958 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
959 		numdw = remaining_dw;
960 
961 	switch (inst_header & MI_OPCODE) {
962 	case MI_LOAD_REGISTER_IMM:
963 		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
964 			   inst_header, (numdw - 1) / 2);
965 		for (int i = 1; i < numdw; i += 2)
966 			drm_printf(p, " - %#6x = %#010x\n", dw[i], dw[i + 1]);
967 		return numdw;
968 
969 	case MI_FORCE_WAKEUP:
970 		drm_printf(p, "[%#010x] MI_FORCE_WAKEUP\n", inst_header);
971 		return numdw;
972 
973 	default:
974 		drm_printf(p, "[%#010x] unknown MI opcode %#x, likely %d dwords\n",
975 			   inst_header, opcode, numdw);
976 		return numdw;
977 	}
978 }
979 
980 static int dump_gfxpipe_command(struct drm_printer *p,
981 				struct xe_gt *gt,
982 				u32 *dw,
983 				int remaining_dw)
984 {
985 	u32 numdw = instr_dw(*dw);
986 	u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
987 	u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
988 	u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);
989 
990 	/*
991 	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
992 	 * remaining size of the LRC.
993 	 */
994 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
995 		numdw = remaining_dw;
996 
997 	switch (*dw & GFXPIPE_MATCH_MASK) {
998 #define MATCH(cmd) \
999 	case cmd: \
1000 		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
1001 		return numdw
1002 #define MATCH3D(cmd) \
1003 	case CMD_##cmd: \
1004 		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
1005 		return numdw
1006 
1007 	MATCH(STATE_BASE_ADDRESS);
1008 	MATCH(STATE_SIP);
1009 	MATCH(GPGPU_CSR_BASE_ADDRESS);
1010 	MATCH(STATE_COMPUTE_MODE);
1011 	MATCH3D(3DSTATE_BTD);
1012 
1013 	MATCH3D(3DSTATE_VF_STATISTICS);
1014 
1015 	MATCH(PIPELINE_SELECT);
1016 
1017 	MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST);
1018 	MATCH3D(3DSTATE_CLEAR_PARAMS);
1019 	MATCH3D(3DSTATE_DEPTH_BUFFER);
1020 	MATCH3D(3DSTATE_STENCIL_BUFFER);
1021 	MATCH3D(3DSTATE_HIER_DEPTH_BUFFER);
1022 	MATCH3D(3DSTATE_VERTEX_BUFFERS);
1023 	MATCH3D(3DSTATE_VERTEX_ELEMENTS);
1024 	MATCH3D(3DSTATE_INDEX_BUFFER);
1025 	MATCH3D(3DSTATE_VF);
1026 	MATCH3D(3DSTATE_MULTISAMPLE);
1027 	MATCH3D(3DSTATE_CC_STATE_POINTERS);
1028 	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS);
1029 	MATCH3D(3DSTATE_VS);
1030 	MATCH3D(3DSTATE_GS);
1031 	MATCH3D(3DSTATE_CLIP);
1032 	MATCH3D(3DSTATE_SF);
1033 	MATCH3D(3DSTATE_WM);
1034 	MATCH3D(3DSTATE_CONSTANT_VS);
1035 	MATCH3D(3DSTATE_CONSTANT_GS);
1036 	MATCH3D(3DSTATE_SAMPLE_MASK);
1037 	MATCH3D(3DSTATE_CONSTANT_HS);
1038 	MATCH3D(3DSTATE_CONSTANT_DS);
1039 	MATCH3D(3DSTATE_HS);
1040 	MATCH3D(3DSTATE_TE);
1041 	MATCH3D(3DSTATE_DS);
1042 	MATCH3D(3DSTATE_STREAMOUT);
1043 	MATCH3D(3DSTATE_SBE);
1044 	MATCH3D(3DSTATE_PS);
1045 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
1046 	MATCH3D(3DSTATE_CPS_POINTERS);
1047 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC);
1048 	MATCH3D(3DSTATE_BLEND_STATE_POINTERS);
1049 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS);
1050 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS);
1051 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS);
1052 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS);
1053 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS);
1054 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS);
1055 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS);
1056 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS);
1057 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS);
1058 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS);
1059 	MATCH3D(3DSTATE_VF_INSTANCING);
1060 	MATCH3D(3DSTATE_VF_SGVS);
1061 	MATCH3D(3DSTATE_VF_TOPOLOGY);
1062 	MATCH3D(3DSTATE_WM_CHROMAKEY);
1063 	MATCH3D(3DSTATE_PS_BLEND);
1064 	MATCH3D(3DSTATE_WM_DEPTH_STENCIL);
1065 	MATCH3D(3DSTATE_PS_EXTRA);
1066 	MATCH3D(3DSTATE_RASTER);
1067 	MATCH3D(3DSTATE_SBE_SWIZ);
1068 	MATCH3D(3DSTATE_WM_HZ_OP);
1069 	MATCH3D(3DSTATE_VF_COMPONENT_PACKING);
1070 	MATCH3D(3DSTATE_VF_SGVS_2);
1071 	MATCH3D(3DSTATE_VFG);
1072 	MATCH3D(3DSTATE_URB_ALLOC_VS);
1073 	MATCH3D(3DSTATE_URB_ALLOC_HS);
1074 	MATCH3D(3DSTATE_URB_ALLOC_DS);
1075 	MATCH3D(3DSTATE_URB_ALLOC_GS);
1076 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_0);
1077 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_1);
1078 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_2);
1079 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_3);
1080 	MATCH3D(3DSTATE_PRIMITIVE_REPLICATION);
1081 	MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO);
1082 	MATCH3D(3DSTATE_AMFS);
1083 	MATCH3D(3DSTATE_DEPTH_BOUNDS);
1084 	MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS);
1085 	MATCH3D(3DSTATE_CONSTANT_TS_POINTER);
1086 	MATCH3D(3DSTATE_MESH_CONTROL);
1087 	MATCH3D(3DSTATE_MESH_DISTRIB);
1088 	MATCH3D(3DSTATE_TASK_REDISTRIB);
1089 	MATCH3D(3DSTATE_MESH_SHADER);
1090 	MATCH3D(3DSTATE_MESH_SHADER_DATA);
1091 	MATCH3D(3DSTATE_TASK_CONTROL);
1092 	MATCH3D(3DSTATE_TASK_SHADER);
1093 	MATCH3D(3DSTATE_TASK_SHADER_DATA);
1094 	MATCH3D(3DSTATE_URB_ALLOC_MESH);
1095 	MATCH3D(3DSTATE_URB_ALLOC_TASK);
1096 	MATCH3D(3DSTATE_CLIP_MESH);
1097 	MATCH3D(3DSTATE_SBE_MESH);
1098 	MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER);
1099 
1100 	MATCH3D(3DSTATE_DRAWING_RECTANGLE);
1101 	MATCH3D(3DSTATE_CHROMA_KEY);
1102 	MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET);
1103 	MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN);
1104 	MATCH3D(3DSTATE_LINE_STIPPLE);
1105 	MATCH3D(3DSTATE_AA_LINE_PARAMETERS);
1106 	MATCH3D(3DSTATE_MONOFILTER_SIZE);
1107 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS);
1108 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS);
1109 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS);
1110 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS);
1111 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS);
1112 	MATCH3D(3DSTATE_SO_DECL_LIST);
1113 	MATCH3D(3DSTATE_SO_BUFFER);
1114 	MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC);
1115 	MATCH3D(3DSTATE_SAMPLE_PATTERN);
1116 	MATCH3D(3DSTATE_3D_MODE);
1117 	MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE);
1118 	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS);
1119 	MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO);
1120 
1121 	default:
1122 		drm_printf(p, "[%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n",
1123 			   *dw, pipeline, opcode, subopcode, numdw);
1124 		return numdw;
1125 	}
1126 }
1127 
1128 void xe_lrc_dump_default(struct drm_printer *p,
1129 			 struct xe_gt *gt,
1130 			 enum xe_engine_class hwe_class)
1131 {
1132 	u32 *dw;
1133 	int remaining_dw, num_dw;
1134 
1135 	if (!gt->default_lrc[hwe_class]) {
1136 		drm_printf(p, "No default LRC for class %d\n", hwe_class);
1137 		return;
1138 	}
1139 
1140 	/*
1141 	 * Skip the beginning of the LRC since it contains the per-process
1142 	 * hardware status page.
1143 	 */
1144 	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
1145 	remaining_dw = (xe_lrc_size(gt_to_xe(gt), hwe_class) - LRC_PPHWSP_SIZE) / 4;
1146 
1147 	while (remaining_dw > 0) {
1148 		if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
1149 			num_dw = dump_mi_command(p, gt, dw, remaining_dw);
1150 		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
1151 			num_dw = dump_gfxpipe_command(p, gt, dw, remaining_dw);
1152 		} else {
1153 			num_dw = min(instr_dw(*dw), remaining_dw);
1154 			drm_printf(p, "[%#010x] Unknown instruction of type %#x, likely %d dwords\n",
1155 				   *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw),
1156 				   num_dw);
1157 		}
1158 
1159 		dw += num_dw;
1160 		remaining_dw -= num_dw;
1161 	}
1162 }
1163 
1164 struct instr_state {
1165 	u32 instr;
1166 	u16 num_dw;
1167 };
1168 
1169 static const struct instr_state xe_hpg_svg_state[] = {
1170 	{ .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 },
1171 	{ .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 },
1172 	{ .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 },
1173 	{ .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 },
1174 	{ .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 },
1175 	{ .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 },
1176 	{ .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 },
1177 	{ .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 },
1178 	{ .instr = CMD_3DSTATE_VS, .num_dw = 9 },
1179 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 },
1180 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 },
1181 	{ .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 },
1182 	{ .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 },
1183 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 },
1184 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 },
1185 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 },
1186 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 },
1187 	{ .instr = CMD_3DSTATE_CLIP, .num_dw = 4 },
1188 	{ .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 },
1189 	{ .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 },
1190 	{ .instr = CMD_3DSTATE_SF, .num_dw = 4 },
1191 	{ .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 },
1192 	{ .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 },
1193 	{ .instr = CMD_3DSTATE_RASTER, .num_dw = 5 },
1194 	{ .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 },
1195 	{ .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 },
1196 	{ .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 },
1197 	{ .instr = CMD_3DSTATE_HS, .num_dw = 9 },
1198 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 },
1199 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 },
1200 	{ .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 },
1201 	{ .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 },
1202 	{ .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 },
1203 	{ .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 },
1204 	{ .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 },
1205 	{ .instr = CMD_3DSTATE_TE, .num_dw = 5 },
1206 	{ .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 },
1207 	{ .instr = CMD_3DSTATE_DS, .num_dw = 11 },
1208 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
1209 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
1210 	{ .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
1211 	{ .instr = CMD_3DSTATE_GS, .num_dw = 10 },
1212 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
1213 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
1214 	{ .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
1215 	{ .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
1216 	{ .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
1217 	{ .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
1218 	{ .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
1219 	{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
1220 };
1221 
1222 void xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, struct xe_bb *bb)
1223 {
1224 	struct xe_gt *gt = q->hwe->gt;
1225 	struct xe_device *xe = gt_to_xe(gt);
1226 	const struct instr_state *state_table = NULL;
1227 	int state_table_size = 0;
1228 
1229 	/*
1230 	 * At the moment we only need to emit non-register state for the RCS
1231 	 * engine.
1232 	 */
1233 	if (q->hwe->class != XE_ENGINE_CLASS_RENDER)
1234 		return;
1235 
1236 	switch (GRAPHICS_VERx100(xe)) {
1237 	case 1255:
1238 	case 1270 ... 2004:
1239 		state_table = xe_hpg_svg_state;
1240 		state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
1241 		break;
1242 	default:
1243 		xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
1244 			  GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
1245 		return;
1246 	}
1247 
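	/*
	 * Emit each instruction header, with its dword length minus two in the
	 * low bits for multi-dword instructions, then advance bb->len past the
	 * payload dwords, which are left untouched and are assumed to program
	 * default (zeroed) state.
	 */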
1248 	for (int i = 0; i < state_table_size; i++) {
1249 		u32 instr = state_table[i].instr;
1250 		u16 num_dw = state_table[i].num_dw;
1251 		bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);
1252 
1253 		xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
1254 		xe_gt_assert(gt, num_dw != 0);
1255 		xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));
1256 
1257 		/*
1258 		 * Xe2's SVG context is the same as the one on DG2 / MTL
1259 		 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
1260 		 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
1261 		 * Just make the replacement here rather than defining a
1262 		 * whole separate table for the single trivial change.
1263 		 */
1264 		if (GRAPHICS_VER(xe) >= 20 &&
1265 		    instr == CMD_3DSTATE_DRAWING_RECTANGLE)
1266 			instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;
1267 
1268 		bb->cs[bb->len] = instr;
1269 		if (!is_single_dw)
1270 			bb->cs[bb->len] |= (num_dw - 2);
1271 
1272 		bb->len += num_dw;
1273 	}
1274 }
1275