xref: /linux/drivers/gpu/drm/xe/xe_lrc.c (revision e28c5efc31397af17bc5a7d55b963f59bcde0166)
// SPDX-License-Identifier: MIT
/*
 * Copyright © 2021 Intel Corporation
 */

#include "xe_lrc.h"

#include "instructions/xe_mi_commands.h"
#include "instructions/xe_gfxpipe_commands.h"
#include "regs/xe_engine_regs.h"
#include "regs/xe_gpu_commands.h"
#include "regs/xe_lrc_layout.h"
#include "xe_bb.h"
#include "xe_bo.h"
#include "xe_device.h"
#include "xe_drm_client.h"
#include "xe_exec_queue_types.h"
#include "xe_gt.h"
#include "xe_gt_printk.h"
#include "xe_hw_fence.h"
#include "xe_map.h"
#include "xe_vm.h"

#define LRC_VALID				(1 << 0)
#define LRC_PRIVILEGE				(1 << 8)
#define LRC_ADDRESSING_MODE_SHIFT		3
#define LRC_LEGACY_64B_CONTEXT			3

#define ENGINE_CLASS_SHIFT			61
#define ENGINE_INSTANCE_SHIFT			48

static struct xe_device *
lrc_to_xe(struct xe_lrc *lrc)
{
	return gt_to_xe(lrc->fence_ctx.gt);
}

size_t xe_lrc_size(struct xe_device *xe, enum xe_engine_class class)
{
	switch (class) {
	case XE_ENGINE_CLASS_RENDER:
		if (GRAPHICS_VER(xe) >= 20)
			return 4 * SZ_4K;
		else
			return 14 * SZ_4K;
	case XE_ENGINE_CLASS_COMPUTE:
		/* 14 pages since graphics_ver == 11 */
		if (GRAPHICS_VER(xe) >= 20)
			return 3 * SZ_4K;
		else
			return 14 * SZ_4K;
	default:
		WARN(1, "Unknown engine class: %d", class);
		fallthrough;
	case XE_ENGINE_CLASS_COPY:
	case XE_ENGINE_CLASS_VIDEO_DECODE:
	case XE_ENGINE_CLASS_VIDEO_ENHANCE:
	case XE_ENGINE_CLASS_OTHER:
		return 2 * SZ_4K;
	}
}

/*
 * The per-platform tables are u8-encoded in @data. Decode @data and set the
 * addresses' offset and commands in @regs. The following encoding is used
 * for each byte. There are 2 steps: decoding commands and decoding addresses.
 *
 * Commands:
 * [7]: create NOPs - number of NOPs are set in lower bits
 * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set
 *      MI_LRI_FORCE_POSTED
 * [5:0]: Number of NOPs or registers to set values to in case of
 *        MI_LOAD_REGISTER_IMM
 *
 * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
 * number of registers. They are set by using the REG/REG16 macros: the former
 * is used for offsets smaller than 0x200 while the latter is for values bigger
 * than that. Those macros already set all the bits documented below correctly:
 *
 * [7]: When a register offset needs more than 6 bits, additional bytes
 *      follow, carrying the lower bits
 * [6:0]: Register offset, without considering the engine base.
 *
 * This function only tweaks the commands and register offsets. Values are not
 * filled out.
 */
static void set_offsets(u32 *regs,
			const u8 *data,
			const struct xe_hw_engine *hwe)
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | \
			   BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
#define END 0
{
	const u32 base = hwe->mmio_base;

	while (*data) {
		u8 count, flags;

		if (*data & BIT(7)) { /* skip */
			count = *data++ & ~BIT(7);
			regs += count;
			continue;
		}

		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		*regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		*regs |= MI_LRI_LRM_CS_MMIO;
		regs++;

		xe_gt_assert(hwe->gt, count);
		do {
			u32 offset = 0;
			u8 v;

			do {
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			regs[0] = base + (offset << 2);
			regs += 2;
		} while (--count);
	}

	*regs = MI_BATCH_BUFFER_END | BIT(0);
}

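/*
 * Decoding example (illustrative only, not an additional table): the first
 * entries of gen12_xcs_offsets below are NOP(1), LRI(13, POSTED), REG16(0x244).
 * set_offsets() turns NOP(1) into a skip of one dword, emits
 * MI_LOAD_REGISTER_IMM(13) with MI_LRI_FORCE_POSTED and MI_LRI_LRM_CS_MMIO
 * set, and then writes hwe->mmio_base + 0x244 as the first register offset,
 * leaving the corresponding value dword untouched.
 */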
static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};

static const u8 dg2_xcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};

static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),
	NOP(3 + 9 + 1),

	LRI(51, POSTED),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),
	REG(0x084),
	NOP(1),

	END
};

static const u8 xehp_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 dg2_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 mtl_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(2),
	LRI(2, POSTED),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};

#define XE2_CTX_COMMON \
	NOP(1),                 /* [0x00] */ \
	LRI(15, POSTED),        /* [0x01] */ \
	REG16(0x244),           /* [0x02] CTXT_SR_CTL */ \
	REG(0x034),             /* [0x04] RING_BUFFER_HEAD */ \
	REG(0x030),             /* [0x06] RING_BUFFER_TAIL */ \
	REG(0x038),             /* [0x08] RING_BUFFER_START */ \
	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */ \
	REG(0x168),             /* [0x0c] BB_ADDR_UDW */ \
	REG(0x140),             /* [0x0e] BB_ADDR */ \
	REG(0x110),             /* [0x10] BB_STATE */ \
	REG(0x1c0),             /* [0x12] BB_PER_CTX_PTR */ \
	REG(0x1c4),             /* [0x14] RCS_INDIRECT_CTX */ \
	REG(0x1c8),             /* [0x16] RCS_INDIRECT_CTX_OFFSET */ \
	REG(0x180),             /* [0x18] CCID */ \
	REG16(0x2b4),           /* [0x1a] SEMAPHORE_TOKEN */ \
	REG(0x120),             /* [0x1c] PRT_BB_STATE */ \
	REG(0x124),             /* [0x1e] PRT_BB_STATE_UDW */ \
	\
	NOP(1),                 /* [0x20] */ \
	LRI(9, POSTED),         /* [0x21] */ \
	REG16(0x3a8),           /* [0x22] CTX_TIMESTAMP */ \
	REG16(0x3ac),           /* [0x24] CTX_TIMESTAMP_UDW */ \
	REG(0x108),             /* [0x26] INDIRECT_RING_STATE */ \
	REG16(0x284),           /* [0x28] dummy reg */ \
	REG16(0x280),           /* [0x2a] CS_ACC_CTR_THOLD */ \
	REG16(0x27c),           /* [0x2c] CS_CTX_SYS_PASID */ \
	REG16(0x278),           /* [0x2e] CS_CTX_ASID */ \
	REG16(0x274),           /* [0x30] PTBP_UDW */ \
	REG16(0x270)            /* [0x32] PTBP_LDW */

static const u8 xe2_rcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(2),                 /* [0x34] */
	LRI(2, POSTED),         /* [0x36] */
	REG16(0x5a8),           /* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
	REG16(0x5ac),           /* [0x39] PREEMPTION_STATUS */

	NOP(6),                 /* [0x41] */
	LRI(1, 0),              /* [0x47] */
	REG(0x0c8),             /* [0x48] R_PWR_CLK_STATE */

	END
};

static const u8 xe2_bcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(4 + 8 + 1),         /* [0x34] */
	LRI(2, POSTED),         /* [0x41] */
	REG16(0x200),           /* [0x42] BCS_SWCTRL */
	REG16(0x204),           /* [0x44] BLIT_CCTL */

	END
};

static const u8 xe2_xcs_offsets[] = {
	XE2_CTX_COMMON,

	END
};

#undef END
#undef REG16
#undef REG
#undef LRI
#undef NOP

static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
{
	if (class == XE_ENGINE_CLASS_RENDER) {
		if (GRAPHICS_VER(xe) >= 20)
			return xe2_rcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1270)
			return mtl_rcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1255)
			return dg2_rcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1250)
			return xehp_rcs_offsets;
		else
			return gen12_rcs_offsets;
	} else if (class == XE_ENGINE_CLASS_COPY) {
		if (GRAPHICS_VER(xe) >= 20)
			return xe2_bcs_offsets;
		else
			return gen12_xcs_offsets;
	} else {
		if (GRAPHICS_VER(xe) >= 20)
			return xe2_xcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1255)
			return dg2_xcs_offsets;
		else
			return gen12_xcs_offsets;
	}
}

static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
{
	regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH) |
				    _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) |
				    CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;

	/* TODO: Timestamp */
}

static int lrc_ring_mi_mode(struct xe_hw_engine *hwe)
{
	struct xe_device *xe = gt_to_xe(hwe->gt);

	if (GRAPHICS_VERx100(xe) >= 1250)
		return 0x70;
	else
		return 0x60;
}

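/*
 * MI_MODE is a masked register: the upper 16 bits of the written value select
 * which of the lower 16 bits get updated.  Clearing the STOP_RING value bit
 * while setting STOP_RING << 16 in the saved register image therefore makes
 * the context-restore write clear STOP_RING without touching the other bits.
 */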
static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe)
{
	int x;

	x = lrc_ring_mi_mode(hwe);
	regs[x + 1] &= ~STOP_RING;
	regs[x + 1] |= STOP_RING << 16;
}

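/*
 * Layout of the LRC BO, as implied by the offset helpers below:
 *
 *   [0, ring.size)                   ring buffer
 *   [ring.size, +LRC_PPHWSP_SIZE)    per-process HW status page (PPHWSP); the
 *                                    driver keeps the seqno, start seqno and
 *                                    parallel scratch data at fixed offsets
 *                                    inside it
 *   [end of PPHWSP, LRC size)        context register state
 */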
static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
{
	return 0;
}

u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
{
	return lrc->ring.size;
}

/* Make the magic macros work */
#define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset

#define LRC_SEQNO_PPHWSP_OFFSET 512
#define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
#define LRC_PARALLEL_PPHWSP_OFFSET 2048
#define LRC_PPHWSP_SIZE SZ_4K

static size_t lrc_reg_size(struct xe_device *xe)
{
	if (GRAPHICS_VERx100(xe) >= 1250)
		return 96 * sizeof(u32);
	else
		return 80 * sizeof(u32);
}

size_t xe_lrc_skip_size(struct xe_device *xe)
{
	return LRC_PPHWSP_SIZE + lrc_reg_size(xe);
}

static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
{
	/* The seqno is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_SEQNO_PPHWSP_OFFSET;
}

static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
{
	/* The start seqno is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_START_SEQNO_PPHWSP_OFFSET;
}

static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
{
	/* The parallel scratch data is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
}

static inline u32 __xe_lrc_regs_offset(struct xe_lrc *lrc)
{
	return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
}

#define DECL_MAP_ADDR_HELPERS(elem) \
static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \
{ \
	struct iosys_map map = lrc->bo->vmap; \
\
	xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map));  \
	iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \
	return map; \
} \
static inline u32 __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \
{ \
	return xe_bo_ggtt_addr(lrc->bo) + __xe_lrc_##elem##_offset(lrc); \
} \

DECL_MAP_ADDR_HELPERS(ring)
DECL_MAP_ADDR_HELPERS(pphwsp)
DECL_MAP_ADDR_HELPERS(seqno)
DECL_MAP_ADDR_HELPERS(regs)
DECL_MAP_ADDR_HELPERS(start_seqno)
DECL_MAP_ADDR_HELPERS(parallel)

#undef DECL_MAP_ADDR_HELPERS

u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_pphwsp_ggtt_addr(lrc);
}

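/*
 * reg_nr below is a dword index into the context register state that follows
 * the PPHWSP (e.g. CTX_RING_TAIL from regs/xe_lrc_layout.h), not an MMIO
 * offset.
 */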
u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_regs_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	return xe_map_read32(xe, &map);
}

void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_regs_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	xe_map_write32(xe, &map, val);
}

static void *empty_lrc_data(struct xe_hw_engine *hwe)
{
	struct xe_device *xe = gt_to_xe(hwe->gt);
	void *data;
	u32 *regs;

	data = kzalloc(xe_lrc_size(xe, hwe->class), GFP_KERNEL);
	if (!data)
		return NULL;

	/* 1st page: Per-Process of HW status Page */
	regs = data + LRC_PPHWSP_SIZE;
	set_offsets(regs, reg_offsets(xe, hwe->class), hwe);
	set_context_control(regs, hwe);
	reset_stop_ring(regs, hwe);

	return data;
}

static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
{
	u64 desc = xe_vm_pdp4_descriptor(vm, lrc->tile);

	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
}

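/*
 * Dword indices into the context register state: each points at the value
 * slot of an (offset, value) pair programmed by the LRI tables above (hence
 * the "+ 1"); [0x2e] and [0x2a] are annotated as CS_CTX_ASID and
 * CS_ACC_CTR_THOLD in XE2_CTX_COMMON.
 */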
#define PVC_CTX_ASID		(0x2e + 1)
#define PVC_CTX_ACC_CTR_THOLD	(0x2a + 1)

int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
		struct xe_exec_queue *q, struct xe_vm *vm, u32 ring_size)
{
	struct xe_gt *gt = hwe->gt;
	struct xe_tile *tile = gt_to_tile(gt);
	struct xe_device *xe = gt_to_xe(gt);
	struct iosys_map map;
	void *init_data = NULL;
	u32 arb_enable;
	int err;

	lrc->flags = 0;

	/*
	 * FIXME: Perma-pinning LRC as we don't yet support moving GGTT address
	 * via VM bind calls.
	 */
	lrc->bo = xe_bo_create_pin_map(xe, tile, vm,
				      ring_size + xe_lrc_size(xe, hwe->class),
				      ttm_bo_type_kernel,
				      XE_BO_CREATE_VRAM_IF_DGFX(tile) |
				      XE_BO_CREATE_GGTT_BIT);
	if (IS_ERR(lrc->bo))
		return PTR_ERR(lrc->bo);

	lrc->tile = gt_to_tile(hwe->gt);
	lrc->ring.size = ring_size;
	lrc->ring.tail = 0;

	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
			     hwe->fence_irq, hwe->name);

	if (!gt->default_lrc[hwe->class]) {
		init_data = empty_lrc_data(hwe);
		if (!init_data) {
			err = -ENOMEM;
			goto err_lrc_finish;
		}
	}

	/*
	 * Init Per-Process of HW status Page, LRC / context state to known
	 * values
	 */
	map = __xe_lrc_pphwsp_map(lrc);
	if (!init_data) {
		xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE);	/* PPHWSP */
		xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
				 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
				 xe_lrc_size(xe, hwe->class) - LRC_PPHWSP_SIZE);
	} else {
		xe_map_memcpy_to(xe, &map, 0, init_data,
				 xe_lrc_size(xe, hwe->class));
		kfree(init_data);
	}

	if (vm) {
		xe_lrc_set_ppgtt(lrc, vm);

		if (vm->xef)
			xe_drm_client_add_bo(vm->xef->client, lrc->bo);
	}

	xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
	xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, 0);
	xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);
	xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
			     RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
	if (xe->info.has_asid && vm)
		xe_lrc_write_ctx_reg(lrc, PVC_CTX_ASID, vm->usm.asid);

	lrc->desc = LRC_VALID;
	lrc->desc |= LRC_LEGACY_64B_CONTEXT << LRC_ADDRESSING_MODE_SHIFT;
	/* TODO: Priority */

	/* While this appears to have something about privileged batches or
	 * some such, it really just means PPGTT mode.
	 */
	if (vm)
		lrc->desc |= LRC_PRIVILEGE;

	if (GRAPHICS_VERx100(xe) < 1250) {
		lrc->desc |= (u64)hwe->instance << ENGINE_INSTANCE_SHIFT;
		lrc->desc |= (u64)hwe->class << ENGINE_CLASS_SHIFT;
	}

	arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
	xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));

	map = __xe_lrc_seqno_map(lrc);
	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);

	map = __xe_lrc_start_seqno_map(lrc);
	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);

	return 0;

err_lrc_finish:
	xe_lrc_finish(lrc);
	return err;
}

void xe_lrc_finish(struct xe_lrc *lrc)
{
	xe_hw_fence_ctx_finish(&lrc->fence_ctx);
	xe_bo_lock(lrc->bo, false);
	xe_bo_unpin(lrc->bo);
	xe_bo_unlock(lrc->bo);
	xe_bo_put(lrc->bo);
}

void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
{
	xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
}

u32 xe_lrc_ring_head(struct xe_lrc *lrc)
{
	return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
}

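/*
 * Free space is everything not between head and tail.  A worked example with
 * the formula below (numbers chosen purely for illustration): size = SZ_4K,
 * head = 0, tail = 256  ->  ((0 - 256 - 1) & 4095) + 1 = 3840 bytes available.
 */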
u32 xe_lrc_ring_space(struct xe_lrc *lrc)
{
	const u32 head = xe_lrc_ring_head(lrc);
	const u32 tail = lrc->ring.tail;
	const u32 size = lrc->ring.size;

	return ((head - tail - 1) & (size - 1)) + 1;
}

static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
				const void *data, size_t size)
{
	struct xe_device *xe = lrc_to_xe(lrc);

	iosys_map_incr(&ring, lrc->ring.tail);
	xe_map_memcpy_to(xe, &ring, 0, data, size);
	lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
}

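/*
 * Writes that would run past the end of the ring are split into two copies
 * around the wrap point.  Payloads are dword sized; when the dword count is
 * odd, one MI_NOOP is appended so the tail stays qword (8 byte) aligned.
 */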
void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map ring;
	u32 rhs;
	size_t aligned_size;

	xe_assert(xe, IS_ALIGNED(size, 4));
	aligned_size = ALIGN(size, 8);

	ring = __xe_lrc_ring_map(lrc);

	xe_assert(xe, lrc->ring.tail < lrc->ring.size);
	rhs = lrc->ring.size - lrc->ring.tail;
	if (size > rhs) {
		__xe_lrc_write_ring(lrc, ring, data, rhs);
		__xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
	} else {
		__xe_lrc_write_ring(lrc, ring, data, size);
	}

	if (aligned_size > size) {
		u32 noop = MI_NOOP;

		__xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
	}
}

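/*
 * The submitted context descriptor is the GGTT address of the PPHWSP (what
 * xe_lrc_ggtt_addr() returns) OR'ed with the LRC_* flag bits set up in
 * xe_lrc_init(), plus the engine class/instance fields on platforms before
 * graphics version 12.50.
 */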
u64 xe_lrc_descriptor(struct xe_lrc *lrc)
{
	return lrc->desc | xe_lrc_ggtt_addr(lrc);
}

u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_seqno_ggtt_addr(lrc);
}

struct dma_fence *xe_lrc_create_seqno_fence(struct xe_lrc *lrc)
{
	return &xe_hw_fence_create(&lrc->fence_ctx,
				   __xe_lrc_seqno_map(lrc))->dma;
}

s32 xe_lrc_seqno(struct xe_lrc *lrc)
{
	struct iosys_map map = __xe_lrc_seqno_map(lrc);

	return xe_map_read32(lrc_to_xe(lrc), &map);
}

s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
{
	struct iosys_map map = __xe_lrc_start_seqno_map(lrc);

	return xe_map_read32(lrc_to_xe(lrc), &map);
}

u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_start_seqno_ggtt_addr(lrc);
}

u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_parallel_ggtt_addr(lrc);
}

struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
{
	return __xe_lrc_parallel_map(lrc);
}

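/*
 * Example (values made up for illustration): a GFXPIPE header whose 7:0
 * length field reads 0x05 decodes to 0x05 + 2 = 7 dwords total, i.e. the
 * header plus six payload dwords.
 */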
static int instr_dw(u32 cmd_header)
{
	/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
	if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
	    GFXPIPE_SINGLE_DW_CMD(0, 0))
		return 1;

	/* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
	if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
		return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;

	/* Most instructions have the # of dwords (minus 2) in 7:0 */
	return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
}

static int dump_mi_command(struct drm_printer *p,
			   struct xe_gt *gt,
			   u32 *dw,
			   int remaining_dw)
{
	u32 inst_header = *dw;
	u32 numdw = instr_dw(inst_header);
	u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
	int num_noop;

	/* First check for commands that don't have/use a '# DW' field */
	switch (inst_header & MI_OPCODE) {
	case MI_NOOP:
		num_noop = 1;
		while (num_noop < remaining_dw &&
		       (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
			num_noop++;
		drm_printf(p, "[%#010x] MI_NOOP (%d dwords)\n", inst_header, num_noop);
		return num_noop;

	case MI_TOPOLOGY_FILTER:
		drm_printf(p, "[%#010x] MI_TOPOLOGY_FILTER\n", inst_header);
		return 1;

	case MI_BATCH_BUFFER_END:
		drm_printf(p, "[%#010x] MI_BATCH_BUFFER_END\n", inst_header);
		/* Return 'remaining_dw' to consume the rest of the LRC */
		return remaining_dw;
	}

	/*
	 * Any remaining commands include a # of dwords.  We should make sure
	 * it doesn't exceed the remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (inst_header & MI_OPCODE) {
	case MI_LOAD_REGISTER_IMM:
		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
			   inst_header, (numdw - 1) / 2);
		for (int i = 1; i < numdw; i += 2)
			drm_printf(p, " - %#6x = %#010x\n", dw[i], dw[i + 1]);
		return numdw;

	case MI_FORCE_WAKEUP:
		drm_printf(p, "[%#010x] MI_FORCE_WAKEUP\n", inst_header);
		return numdw;

	default:
		drm_printf(p, "[%#010x] unknown MI opcode %#x, likely %d dwords\n",
			   inst_header, opcode, numdw);
		return numdw;
	}
}

static int dump_gfxpipe_command(struct drm_printer *p,
				struct xe_gt *gt,
				u32 *dw,
				int remaining_dw)
{
	u32 numdw = instr_dw(*dw);
	u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
	u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
	u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);

	/*
	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
	 * remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (*dw & GFXPIPE_MATCH_MASK) {
#define MATCH(cmd) \
	case cmd: \
		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
		return numdw
#define MATCH3D(cmd) \
	case CMD_##cmd: \
		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
		return numdw

	MATCH(STATE_BASE_ADDRESS);
	MATCH(STATE_SIP);
	MATCH(GPGPU_CSR_BASE_ADDRESS);
	MATCH(STATE_COMPUTE_MODE);
	MATCH3D(3DSTATE_BTD);

	MATCH3D(3DSTATE_VF_STATISTICS);

	MATCH(PIPELINE_SELECT);

	MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST);
	MATCH3D(3DSTATE_CLEAR_PARAMS);
	MATCH3D(3DSTATE_DEPTH_BUFFER);
	MATCH3D(3DSTATE_STENCIL_BUFFER);
	MATCH3D(3DSTATE_HIER_DEPTH_BUFFER);
	MATCH3D(3DSTATE_VERTEX_BUFFERS);
	MATCH3D(3DSTATE_VERTEX_ELEMENTS);
	MATCH3D(3DSTATE_INDEX_BUFFER);
	MATCH3D(3DSTATE_VF);
	MATCH3D(3DSTATE_MULTISAMPLE);
	MATCH3D(3DSTATE_CC_STATE_POINTERS);
	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS);
	MATCH3D(3DSTATE_VS);
	MATCH3D(3DSTATE_GS);
	MATCH3D(3DSTATE_CLIP);
	MATCH3D(3DSTATE_SF);
	MATCH3D(3DSTATE_WM);
	MATCH3D(3DSTATE_CONSTANT_VS);
	MATCH3D(3DSTATE_CONSTANT_GS);
	MATCH3D(3DSTATE_SAMPLE_MASK);
	MATCH3D(3DSTATE_CONSTANT_HS);
	MATCH3D(3DSTATE_CONSTANT_DS);
	MATCH3D(3DSTATE_HS);
	MATCH3D(3DSTATE_TE);
	MATCH3D(3DSTATE_DS);
	MATCH3D(3DSTATE_STREAMOUT);
	MATCH3D(3DSTATE_SBE);
	MATCH3D(3DSTATE_PS);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
	MATCH3D(3DSTATE_CPS_POINTERS);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC);
	MATCH3D(3DSTATE_BLEND_STATE_POINTERS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS);
	MATCH3D(3DSTATE_VF_INSTANCING);
	MATCH3D(3DSTATE_VF_SGVS);
	MATCH3D(3DSTATE_VF_TOPOLOGY);
	MATCH3D(3DSTATE_WM_CHROMAKEY);
	MATCH3D(3DSTATE_PS_BLEND);
	MATCH3D(3DSTATE_WM_DEPTH_STENCIL);
	MATCH3D(3DSTATE_PS_EXTRA);
	MATCH3D(3DSTATE_RASTER);
	MATCH3D(3DSTATE_SBE_SWIZ);
	MATCH3D(3DSTATE_WM_HZ_OP);
	MATCH3D(3DSTATE_VF_COMPONENT_PACKING);
	MATCH3D(3DSTATE_VF_SGVS_2);
	MATCH3D(3DSTATE_VFG);
	MATCH3D(3DSTATE_URB_ALLOC_VS);
	MATCH3D(3DSTATE_URB_ALLOC_HS);
	MATCH3D(3DSTATE_URB_ALLOC_DS);
	MATCH3D(3DSTATE_URB_ALLOC_GS);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_0);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_1);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_2);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_3);
	MATCH3D(3DSTATE_PRIMITIVE_REPLICATION);
	MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO);
	MATCH3D(3DSTATE_AMFS);
	MATCH3D(3DSTATE_DEPTH_BOUNDS);
	MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS);
	MATCH3D(3DSTATE_CONSTANT_TS_POINTER);
	MATCH3D(3DSTATE_MESH_CONTROL);
	MATCH3D(3DSTATE_MESH_DISTRIB);
	MATCH3D(3DSTATE_TASK_REDISTRIB);
	MATCH3D(3DSTATE_MESH_SHADER);
	MATCH3D(3DSTATE_MESH_SHADER_DATA);
	MATCH3D(3DSTATE_TASK_CONTROL);
	MATCH3D(3DSTATE_TASK_SHADER);
	MATCH3D(3DSTATE_TASK_SHADER_DATA);
	MATCH3D(3DSTATE_URB_ALLOC_MESH);
	MATCH3D(3DSTATE_URB_ALLOC_TASK);
	MATCH3D(3DSTATE_CLIP_MESH);
	MATCH3D(3DSTATE_SBE_MESH);
	MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER);

	MATCH3D(3DSTATE_DRAWING_RECTANGLE);
	MATCH3D(3DSTATE_CHROMA_KEY);
	MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET);
	MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN);
	MATCH3D(3DSTATE_LINE_STIPPLE);
	MATCH3D(3DSTATE_AA_LINE_PARAMETERS);
	MATCH3D(3DSTATE_MONOFILTER_SIZE);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS);
	MATCH3D(3DSTATE_SO_DECL_LIST);
	MATCH3D(3DSTATE_SO_BUFFER);
	MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC);
	MATCH3D(3DSTATE_SAMPLE_PATTERN);
	MATCH3D(3DSTATE_3D_MODE);
	MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE);
	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS);
	MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO);

	default:
		drm_printf(p, "[%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n",
			   *dw, pipeline, opcode, subopcode, numdw);
		return numdw;
	}
}

void xe_lrc_dump_default(struct drm_printer *p,
			 struct xe_gt *gt,
			 enum xe_engine_class hwe_class)
{
	u32 *dw;
	int remaining_dw, num_dw;

	if (!gt->default_lrc[hwe_class]) {
		drm_printf(p, "No default LRC for class %d\n", hwe_class);
		return;
	}

	/*
	 * Skip the beginning of the LRC since it contains the per-process
	 * hardware status page.
	 */
	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
	remaining_dw = (xe_lrc_size(gt_to_xe(gt), hwe_class) - LRC_PPHWSP_SIZE) / 4;

	while (remaining_dw > 0) {
		if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
			num_dw = dump_mi_command(p, gt, dw, remaining_dw);
		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
			num_dw = dump_gfxpipe_command(p, gt, dw, remaining_dw);
		} else {
			num_dw = min(instr_dw(*dw), remaining_dw);
			drm_printf(p, "[%#010x] Unknown instruction of type %#x, likely %d dwords\n",
				   *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw),
				   num_dw);
		}

		dw += num_dw;
		remaining_dw -= num_dw;
	}
}

struct instr_state {
	u32 instr;
	u16 num_dw;
};

static const struct instr_state xe_hpg_svg_state[] = {
	{ .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 },
	{ .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_VS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_CLIP, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SF, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_RASTER, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_HS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 },
	{ .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TE, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_GS, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
};

void xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, struct xe_bb *bb)
{
	struct xe_gt *gt = q->hwe->gt;
	struct xe_device *xe = gt_to_xe(gt);
	const struct instr_state *state_table = NULL;
	int state_table_size = 0;

	/*
	 * At the moment we only need to emit non-register state for the RCS
	 * engine.
	 */
	if (q->hwe->class != XE_ENGINE_CLASS_RENDER)
		return;

	switch (GRAPHICS_VERx100(xe)) {
	case 1255:
	case 1270 ... 2004:
		state_table = xe_hpg_svg_state;
		state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
		break;
	default:
		xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
			  GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
		return;
	}

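	/*
	 * Emit one header dword per table entry.  For multi-dword instructions
	 * the GFXPIPE length field is programmed with (num_dw - 2), e.g.
	 * CMD_3DSTATE_VS with num_dw = 9 is emitted as the opcode OR'ed with 7;
	 * the remaining dwords of each instruction are simply skipped over
	 * (bb->len is advanced past them) rather than written here.
	 */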
	for (int i = 0; i < state_table_size; i++) {
		u32 instr = state_table[i].instr;
		u16 num_dw = state_table[i].num_dw;
		bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);

		xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
		xe_gt_assert(gt, num_dw != 0);
		xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));

		/*
		 * Xe2's SVG context is the same as the one on DG2 / MTL
		 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
		 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
		 * Just make the replacement here rather than defining a
		 * whole separate table for the single trivial change.
		 */
		if (GRAPHICS_VER(xe) >= 20 &&
		    instr == CMD_3DSTATE_DRAWING_RECTANGLE)
			instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;

		bb->cs[bb->len] = instr;
		if (!is_single_dw)
			bb->cs[bb->len] |= (num_dw - 2);

		bb->len += num_dw;
	}
}
1265