xref: /linux/drivers/gpu/drm/xe/xe_lrc.c (revision 1a562c0d44974d3cf89c6cc5c34c708c08af420e)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5 
6 #include "xe_lrc.h"
7 
8 #include "instructions/xe_mi_commands.h"
9 #include "instructions/xe_gfxpipe_commands.h"
10 #include "regs/xe_engine_regs.h"
11 #include "regs/xe_gpu_commands.h"
12 #include "regs/xe_lrc_layout.h"
13 #include "xe_bb.h"
14 #include "xe_bo.h"
15 #include "xe_device.h"
16 #include "xe_drm_client.h"
17 #include "xe_exec_queue_types.h"
18 #include "xe_gt.h"
19 #include "xe_gt_printk.h"
20 #include "xe_hw_fence.h"
21 #include "xe_map.h"
22 #include "xe_vm.h"
23 
/* Bits and fields OR-ed into lrc->desc by xe_lrc_init(). */
#define CTX_VALID				(1 << 0)
#define CTX_PRIVILEGE				(1 << 8)
#define CTX_ADDRESSING_MODE_SHIFT		3
#define LEGACY_64B_CONTEXT			3

/*
 * Shifts for the engine class/instance descriptor fields; xe_lrc_init() only
 * fills these in when GRAPHICS_VERx100 < 1250.
 */
#define ENGINE_CLASS_SHIFT			61
#define ENGINE_INSTANCE_SHIFT			48
31 
32 static struct xe_device *
33 lrc_to_xe(struct xe_lrc *lrc)
34 {
35 	return gt_to_xe(lrc->fence_ctx.gt);
36 }
37 
38 size_t xe_lrc_size(struct xe_device *xe, enum xe_engine_class class)
39 {
40 	switch (class) {
41 	case XE_ENGINE_CLASS_RENDER:
42 		if (GRAPHICS_VER(xe) >= 20)
43 			return 4 * SZ_4K;
44 		else
45 			return 14 * SZ_4K;
46 	case XE_ENGINE_CLASS_COMPUTE:
47 		/* 14 pages since graphics_ver == 11 */
48 		if (GRAPHICS_VER(xe) >= 20)
49 			return 3 * SZ_4K;
50 		else
51 			return 14 * SZ_4K;
52 	default:
53 		WARN(1, "Unknown engine class: %d", class);
54 		fallthrough;
55 	case XE_ENGINE_CLASS_COPY:
56 	case XE_ENGINE_CLASS_VIDEO_DECODE:
57 	case XE_ENGINE_CLASS_VIDEO_ENHANCE:
58 	case XE_ENGINE_CLASS_OTHER:
59 		return 2 * SZ_4K;
60 	}
61 }
62 
/*
 * The per-platform tables are u8-encoded in @data. Decode @data and set the
 * addresses' offset and commands in @regs. The following encoding is used
 * for each byte. There are 2 steps: decoding commands and decoding addresses.
 *
 * Commands:
 * [7]: create NOPs - number of NOPs are set in lower bits
 * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set
 *      MI_LRI_FORCE_POSTED
 * [5:0]: Number of NOPs or registers to set values to in case of
 *        MI_LOAD_REGISTER_IMM
 *
 * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
 * number of registers. They are set by using the REG/REG16 macros: the former
 * is used for offsets smaller than 0x200 while the latter is for values bigger
 * than that. Those macros already set all the bits documented below correctly:
 *
 * [7]: When a register offset needs more than 7 bits, use additional bytes, to
 *      follow, for the lower bits
 * [6:0]: Register offset, without considering the engine base.
 *
 * This function only tweaks the commands and register offsets. Values are not
 * filled out.
 *
 * @regs: destination dword array (the context register state being built)
 * @data: encoded table, terminated by a zero byte (END)
 * @hwe: hardware engine; its mmio_base is added to every decoded offset
 *
 * After the last table entry an MI_BATCH_BUFFER_END dword is written.
 */
static void set_offsets(u32 *regs,
			const u8 *data,
			const struct xe_hw_engine *hwe)
/* Table-encoding helpers; defined here so the tables below can use them. */
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | \
			   BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
#define END 0
{
	const u32 base = hwe->mmio_base;

	/* A zero byte (END) terminates the table. */
	while (*data) {
		u8 count, flags;

		if (*data & BIT(7)) { /* skip */
			count = *data++ & ~BIT(7);
			regs += count;
			continue;
		}

		/* LRI header byte: [5:0] register count, [7:6] flags. */
		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		*regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		*regs |= MI_LRI_LRM_CS_MMIO;
		regs++;

		/* The do/while below would wrap around on a zero count. */
		xe_gt_assert(hwe->gt, count);
		do {
			u32 offset = 0;
			u8 v;

			/*
			 * Accumulate 7 bits per byte, most significant group
			 * first; bit 7 set means another byte follows.
			 */
			do {
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			/*
			 * The encoded offset is in dwords. Each register takes
			 * an (offset, value) dword pair; only the offset slot
			 * is written here.
			 */
			regs[0] = base + (offset << 2);
			regs += 2;
		} while (--count);
	}

	*regs = MI_BATCH_BUFFER_END | BIT(0);
}
139 
/*
 * Encoded register-offset table consumed by set_offsets(). reg_offsets()
 * returns it for copy engines when GRAPHICS_VER < 20 and for the other
 * non-render classes when GRAPHICS_VERx100 < 1255.
 */
static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};
171 
/*
 * Encoded register-offset table consumed by set_offsets(). reg_offsets()
 * returns it for classes other than render and copy when
 * GRAPHICS_VERx100 >= 1255 and GRAPHICS_VER < 20.
 */
static const u8 dg2_xcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};
205 
/*
 * Encoded register-offset table consumed by set_offsets(). reg_offsets()
 * returns it for the render class when GRAPHICS_VERx100 < 1250.
 */
static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),
	NOP(3 + 9 + 1),

	LRI(51, POSTED),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),
	REG(0x084),
	NOP(1),

	END
};
301 
/*
 * Encoded register-offset table consumed by set_offsets(). reg_offsets()
 * returns it for the render class when 1250 <= GRAPHICS_VERx100 < 1255.
 */
static const u8 xehp_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};
342 
/*
 * Encoded register-offset table consumed by set_offsets(). reg_offsets()
 * returns it for the render class when 1255 <= GRAPHICS_VERx100 < 1270.
 */
static const u8 dg2_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};
385 
/*
 * Encoded register-offset table consumed by set_offsets(). reg_offsets()
 * returns it for the render class when GRAPHICS_VERx100 >= 1270 and
 * GRAPHICS_VER < 20.
 */
static const u8 mtl_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(2),
	LRI(2, POSTED),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};
428 
/*
 * Leading entries shared by all xe2_*_offsets tables. The bracketed number in
 * each comment is the dword index the entry lands on once set_offsets() has
 * decoded the table (each register takes an offset dword plus a value dword).
 */
#define XE2_CTX_COMMON \
	NOP(1),                 /* [0x00] */ \
	LRI(15, POSTED),        /* [0x01] */ \
	REG16(0x244),           /* [0x02] CTXT_SR_CTL */ \
	REG(0x034),             /* [0x04] RING_BUFFER_HEAD */ \
	REG(0x030),             /* [0x06] RING_BUFFER_TAIL */ \
	REG(0x038),             /* [0x08] RING_BUFFER_START */ \
	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */ \
	REG(0x168),             /* [0x0c] BB_ADDR_UDW */ \
	REG(0x140),             /* [0x0e] BB_ADDR */ \
	REG(0x110),             /* [0x10] BB_STATE */ \
	REG(0x1c0),             /* [0x12] BB_PER_CTX_PTR */ \
	REG(0x1c4),             /* [0x14] RCS_INDIRECT_CTX */ \
	REG(0x1c8),             /* [0x16] RCS_INDIRECT_CTX_OFFSET */ \
	REG(0x180),             /* [0x18] CCID */ \
	REG16(0x2b4),           /* [0x1a] SEMAPHORE_TOKEN */ \
	REG(0x120),             /* [0x1c] PRT_BB_STATE */ \
	REG(0x124),             /* [0x1e] PRT_BB_STATE_UDW */ \
	\
	NOP(1),                 /* [0x20] */ \
	LRI(9, POSTED),         /* [0x21] */ \
	REG16(0x3a8),           /* [0x22] CTX_TIMESTAMP */ \
	REG16(0x3ac),           /* [0x24] CTX_TIMESTAMP_UDW */ \
	REG(0x108),             /* [0x26] INDIRECT_RING_STATE */ \
	REG16(0x284),           /* [0x28] dummy reg */ \
	REG16(0x280),           /* [0x2a] CS_ACC_CTR_THOLD */ \
	REG16(0x27c),           /* [0x2c] CS_CTX_SYS_PASID */ \
	REG16(0x278),           /* [0x2e] CS_CTX_ASID */ \
	REG16(0x274),           /* [0x30] PTBP_UDW */ \
	REG16(0x270)            /* [0x32] PTBP_LDW */
459 
/*
 * Encoded register-offset table consumed by set_offsets(). reg_offsets()
 * returns it for the render class when GRAPHICS_VER >= 20. Bracketed numbers
 * are the dword indices the entries decode to.
 */
static const u8 xe2_rcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(2),                 /* [0x34] */
	LRI(2, POSTED),         /* [0x36] */
	REG16(0x5a8),           /* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
	REG16(0x5ac),           /* [0x39] PREEMPTION_STATUS */

	NOP(6),                 /* [0x3b] */
	LRI(1, 0),              /* [0x41] */
	REG(0x0c8),             /* [0x42] R_PWR_CLK_STATE */

	END
};
474 
/*
 * Encoded register-offset table consumed by set_offsets(). reg_offsets()
 * returns it for the copy class when GRAPHICS_VER >= 20. Bracketed numbers
 * are the dword indices the entries decode to.
 */
static const u8 xe2_bcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(4 + 8 + 1),         /* [0x34] */
	LRI(2, POSTED),         /* [0x41] */
	REG16(0x200),           /* [0x42] BCS_SWCTRL */
	REG16(0x204),           /* [0x44] BLIT_CCTL */

	END
};
485 
/*
 * Encoded register-offset table consumed by set_offsets(). reg_offsets()
 * returns it for classes other than render and copy when GRAPHICS_VER >= 20;
 * it contains only the common XE2 entries.
 */
static const u8 xe2_xcs_offsets[] = {
	XE2_CTX_COMMON,

	END
};
491 
492 #undef END
493 #undef REG16
494 #undef REG
495 #undef LRI
496 #undef NOP
497 
498 static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
499 {
500 	if (class == XE_ENGINE_CLASS_RENDER) {
501 		if (GRAPHICS_VER(xe) >= 20)
502 			return xe2_rcs_offsets;
503 		else if (GRAPHICS_VERx100(xe) >= 1270)
504 			return mtl_rcs_offsets;
505 		else if (GRAPHICS_VERx100(xe) >= 1255)
506 			return dg2_rcs_offsets;
507 		else if (GRAPHICS_VERx100(xe) >= 1250)
508 			return xehp_rcs_offsets;
509 		else
510 			return gen12_rcs_offsets;
511 	} else if (class == XE_ENGINE_CLASS_COPY) {
512 		if (GRAPHICS_VER(xe) >= 20)
513 			return xe2_bcs_offsets;
514 		else
515 			return gen12_xcs_offsets;
516 	} else {
517 		if (GRAPHICS_VER(xe) >= 20)
518 			return xe2_xcs_offsets;
519 		else if (GRAPHICS_VERx100(xe) >= 1255)
520 			return dg2_xcs_offsets;
521 		else
522 			return gen12_xcs_offsets;
523 	}
524 }
525 
526 static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
527 {
528 	regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH) |
529 				    _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) |
530 				    CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
531 
532 	/* TODO: Timestamp */
533 }
534 
535 static int lrc_ring_mi_mode(struct xe_hw_engine *hwe)
536 {
537 	struct xe_device *xe = gt_to_xe(hwe->gt);
538 
539 	if (GRAPHICS_VERx100(xe) >= 1250)
540 		return 0x70;
541 	else
542 		return 0x60;
543 }
544 
545 static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe)
546 {
547 	int x;
548 
549 	x = lrc_ring_mi_mode(hwe);
550 	regs[x + 1] &= ~STOP_RING;
551 	regs[x + 1] |= STOP_RING << 16;
552 }
553 
554 static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
555 {
556 	return 0;
557 }
558 
559 u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
560 {
561 	return lrc->ring.size;
562 }
563 
/*
 * Make the magic macros work: DECL_MAP_ADDR_HELPERS() token-pastes
 * __xe_lrc_<elem>_offset, so the public pphwsp helper needs this alias.
 */
#define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset

/* Byte offsets of driver data within the PPHWSP, and the PPHWSP size. */
#define LRC_SEQNO_PPHWSP_OFFSET 512
#define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
#define LRC_PARALLEL_PPHWSP_OFFSET 2048
#define LRC_PPHWSP_SIZE SZ_4K
571 
572 static size_t lrc_reg_size(struct xe_device *xe)
573 {
574 	if (GRAPHICS_VERx100(xe) >= 1250)
575 		return 96 * sizeof(u32);
576 	else
577 		return 80 * sizeof(u32);
578 }
579 
580 size_t xe_lrc_skip_size(struct xe_device *xe)
581 {
582 	return LRC_PPHWSP_SIZE + lrc_reg_size(xe);
583 }
584 
585 static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
586 {
587 	/* The seqno is stored in the driver-defined portion of PPHWSP */
588 	return xe_lrc_pphwsp_offset(lrc) + LRC_SEQNO_PPHWSP_OFFSET;
589 }
590 
591 static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
592 {
593 	/* The start seqno is stored in the driver-defined portion of PPHWSP */
594 	return xe_lrc_pphwsp_offset(lrc) + LRC_START_SEQNO_PPHWSP_OFFSET;
595 }
596 
597 static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
598 {
599 	/* The parallel is stored in the driver-defined portion of PPHWSP */
600 	return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
601 }
602 
603 static inline u32 __xe_lrc_regs_offset(struct xe_lrc *lrc)
604 {
605 	return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
606 }
607 
/*
 * For each @elem this generates two helpers built on __xe_lrc_<elem>_offset():
 *  - __xe_lrc_<elem>_map(): a copy of the BO's vmap advanced to the element
 *    (asserts that the BO is mapped);
 *  - __xe_lrc_<elem>_ggtt_addr(): the BO's GGTT address plus the element
 *    offset.
 */
#define DECL_MAP_ADDR_HELPERS(elem) \
static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \
{ \
	struct iosys_map map = lrc->bo->vmap; \
\
	xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map));  \
	iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \
	return map; \
} \
static inline u32 __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \
{ \
	return xe_bo_ggtt_addr(lrc->bo) + __xe_lrc_##elem##_offset(lrc); \
} \

DECL_MAP_ADDR_HELPERS(ring)
DECL_MAP_ADDR_HELPERS(pphwsp)
DECL_MAP_ADDR_HELPERS(seqno)
DECL_MAP_ADDR_HELPERS(regs)
DECL_MAP_ADDR_HELPERS(start_seqno)
DECL_MAP_ADDR_HELPERS(parallel)

#undef DECL_MAP_ADDR_HELPERS
630 
631 u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
632 {
633 	return __xe_lrc_pphwsp_ggtt_addr(lrc);
634 }
635 
636 u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
637 {
638 	struct xe_device *xe = lrc_to_xe(lrc);
639 	struct iosys_map map;
640 
641 	map = __xe_lrc_regs_map(lrc);
642 	iosys_map_incr(&map, reg_nr * sizeof(u32));
643 	return xe_map_read32(xe, &map);
644 }
645 
646 void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
647 {
648 	struct xe_device *xe = lrc_to_xe(lrc);
649 	struct iosys_map map;
650 
651 	map = __xe_lrc_regs_map(lrc);
652 	iosys_map_incr(&map, reg_nr * sizeof(u32));
653 	xe_map_write32(xe, &map, val);
654 }
655 
656 static void *empty_lrc_data(struct xe_hw_engine *hwe)
657 {
658 	struct xe_device *xe = gt_to_xe(hwe->gt);
659 	void *data;
660 	u32 *regs;
661 
662 	data = kzalloc(xe_lrc_size(xe, hwe->class), GFP_KERNEL);
663 	if (!data)
664 		return NULL;
665 
666 	/* 1st page: Per-Process of HW status Page */
667 	regs = data + LRC_PPHWSP_SIZE;
668 	set_offsets(regs, reg_offsets(xe, hwe->class), hwe);
669 	set_context_control(regs, hwe);
670 	reset_stop_ring(regs, hwe);
671 
672 	return data;
673 }
674 
675 static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
676 {
677 	u64 desc = xe_vm_pdp4_descriptor(vm, lrc->tile);
678 
679 	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
680 	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
681 }
682 
/*
 * Context register indices written by xe_lrc_init(): the "+ 1" selects the
 * value dword that follows the register-offset dword at 0x2e / 0x2a.
 */
#define PVC_CTX_ASID		(0x2e + 1)
#define PVC_CTX_ACC_CTR_THOLD	(0x2a + 1)
/* Shifts used when packing the q->usm access-counter fields. */
#define ACC_GRANULARITY_S       20
#define ACC_NOTIFY_S            16

/*
 * xe_lrc_init() - allocate and initialise a logical ring context
 * @lrc: LRC to initialise
 * @hwe: hardware engine the LRC is created for
 * @q: exec queue; only dereferenced (q->usm) when @vm is set and the device
 *     has ASID / USM support
 * @vm: optional VM; when set, the PPGTT descriptor and ASID are programmed
 * @ring_size: size in bytes of the ring placed at the start of the BO
 *
 * A single pinned, mapped BO holds the ring followed by the LRC image
 * (PPHWSP page, then context register state). The image is seeded either from
 * gt->default_lrc[] for the engine class or, when none exists yet, from a
 * freshly generated empty image.
 *
 * Return: 0 on success, the BO creation error, or -ENOMEM if the empty image
 * cannot be allocated.
 */
int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
		struct xe_exec_queue *q, struct xe_vm *vm, u32 ring_size)
{
	struct xe_gt *gt = hwe->gt;
	struct xe_tile *tile = gt_to_tile(gt);
	struct xe_device *xe = gt_to_xe(gt);
	struct iosys_map map;
	void *init_data = NULL;
	u32 arb_enable;
	int err;

	lrc->flags = 0;

	/*
	 * FIXME: Perma-pinning LRC as we don't yet support moving GGTT address
	 * via VM bind calls.
	 */
	lrc->bo = xe_bo_create_pin_map(xe, tile, vm,
				      ring_size + xe_lrc_size(xe, hwe->class),
				      ttm_bo_type_kernel,
				      XE_BO_CREATE_VRAM_IF_DGFX(tile) |
				      XE_BO_CREATE_GGTT_BIT);
	if (IS_ERR(lrc->bo))
		return PTR_ERR(lrc->bo);

	lrc->tile = gt_to_tile(hwe->gt);
	lrc->ring.size = ring_size;
	lrc->ring.tail = 0;

	/* Must precede any lrc_to_xe() use: that helper reads fence_ctx.gt. */
	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
			     hwe->fence_irq, hwe->name);

	/* No default image recorded for this class yet: generate an empty one. */
	if (!gt->default_lrc[hwe->class]) {
		init_data = empty_lrc_data(hwe);
		if (!init_data) {
			err = -ENOMEM;
			goto err_lrc_finish;
		}
	}

	/*
	 * Init Per-Process of HW status Page, LRC / context state to known
	 * values
	 */
	map = __xe_lrc_pphwsp_map(lrc);
	if (!init_data) {
		xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE);	/* PPHWSP */
		xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
				 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
				 xe_lrc_size(xe, hwe->class) - LRC_PPHWSP_SIZE);
	} else {
		xe_map_memcpy_to(xe, &map, 0, init_data,
				 xe_lrc_size(xe, hwe->class));
		kfree(init_data);
	}

	if (vm) {
		xe_lrc_set_ppgtt(lrc, vm);

		if (vm->xef)
			xe_drm_client_add_bo(vm->xef->client, lrc->bo);
	}

	/* Ring registers: start address, empty head/tail, size + valid. */
	xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
	xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, 0);
	xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);
	xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
			     RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
	if (xe->info.has_asid && vm)
		xe_lrc_write_ctx_reg(lrc, PVC_CTX_ASID,
				     (q->usm.acc_granularity <<
				      ACC_GRANULARITY_S) | vm->usm.asid);
	if (xe->info.has_usm && vm)
		xe_lrc_write_ctx_reg(lrc, PVC_CTX_ACC_CTR_THOLD,
				     (q->usm.acc_notify << ACC_NOTIFY_S) |
				     q->usm.acc_trigger);

	/* Descriptor bits; xe_lrc_descriptor() ORs in the GGTT address later. */
	lrc->desc = CTX_VALID;
	lrc->desc |= LEGACY_64B_CONTEXT << CTX_ADDRESSING_MODE_SHIFT;
	/* TODO: Priority */

	/* While this appears to have something about privileged batches or
	 * some such, it really just means PPGTT mode.
	 */
	if (vm)
		lrc->desc |= CTX_PRIVILEGE;

	if (GRAPHICS_VERx100(xe) < 1250) {
		lrc->desc |= (u64)hwe->instance << ENGINE_INSTANCE_SHIFT;
		lrc->desc |= (u64)hwe->class << ENGINE_CLASS_SHIFT;
	}

	/* The first ring entry enables arbitration (padded to 8 bytes). */
	arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
	xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));

	/* Seed both seqno slots with the value just before the next seqno. */
	map = __xe_lrc_seqno_map(lrc);
	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);

	map = __xe_lrc_start_seqno_map(lrc);
	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);

	return 0;

err_lrc_finish:
	xe_lrc_finish(lrc);
	return err;
}
795 
796 void xe_lrc_finish(struct xe_lrc *lrc)
797 {
798 	xe_hw_fence_ctx_finish(&lrc->fence_ctx);
799 	xe_bo_lock(lrc->bo, false);
800 	xe_bo_unpin(lrc->bo);
801 	xe_bo_unlock(lrc->bo);
802 	xe_bo_put(lrc->bo);
803 }
804 
805 void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
806 {
807 	xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
808 }
809 
810 u32 xe_lrc_ring_head(struct xe_lrc *lrc)
811 {
812 	return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
813 }
814 
815 u32 xe_lrc_ring_space(struct xe_lrc *lrc)
816 {
817 	const u32 head = xe_lrc_ring_head(lrc);
818 	const u32 tail = lrc->ring.tail;
819 	const u32 size = lrc->ring.size;
820 
821 	return ((head - tail - 1) & (size - 1)) + 1;
822 }
823 
824 static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
825 				const void *data, size_t size)
826 {
827 	struct xe_device *xe = lrc_to_xe(lrc);
828 
829 	iosys_map_incr(&ring, lrc->ring.tail);
830 	xe_map_memcpy_to(xe, &ring, 0, data, size);
831 	lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
832 }
833 
834 void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
835 {
836 	struct xe_device *xe = lrc_to_xe(lrc);
837 	struct iosys_map ring;
838 	u32 rhs;
839 	size_t aligned_size;
840 
841 	xe_assert(xe, IS_ALIGNED(size, 4));
842 	aligned_size = ALIGN(size, 8);
843 
844 	ring = __xe_lrc_ring_map(lrc);
845 
846 	xe_assert(xe, lrc->ring.tail < lrc->ring.size);
847 	rhs = lrc->ring.size - lrc->ring.tail;
848 	if (size > rhs) {
849 		__xe_lrc_write_ring(lrc, ring, data, rhs);
850 		__xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
851 	} else {
852 		__xe_lrc_write_ring(lrc, ring, data, size);
853 	}
854 
855 	if (aligned_size > size) {
856 		u32 noop = MI_NOOP;
857 
858 		__xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
859 	}
860 }
861 
862 u64 xe_lrc_descriptor(struct xe_lrc *lrc)
863 {
864 	return lrc->desc | xe_lrc_ggtt_addr(lrc);
865 }
866 
867 u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
868 {
869 	return __xe_lrc_seqno_ggtt_addr(lrc);
870 }
871 
872 struct dma_fence *xe_lrc_create_seqno_fence(struct xe_lrc *lrc)
873 {
874 	return &xe_hw_fence_create(&lrc->fence_ctx,
875 				   __xe_lrc_seqno_map(lrc))->dma;
876 }
877 
878 s32 xe_lrc_seqno(struct xe_lrc *lrc)
879 {
880 	struct iosys_map map = __xe_lrc_seqno_map(lrc);
881 
882 	return xe_map_read32(lrc_to_xe(lrc), &map);
883 }
884 
885 s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
886 {
887 	struct iosys_map map = __xe_lrc_start_seqno_map(lrc);
888 
889 	return xe_map_read32(lrc_to_xe(lrc), &map);
890 }
891 
892 u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
893 {
894 	return __xe_lrc_start_seqno_ggtt_addr(lrc);
895 }
896 
897 u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
898 {
899 	return __xe_lrc_parallel_ggtt_addr(lrc);
900 }
901 
902 struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
903 {
904 	return __xe_lrc_parallel_map(lrc);
905 }
906 
907 static int instr_dw(u32 cmd_header)
908 {
909 	/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
910 	if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
911 	    GFXPIPE_SINGLE_DW_CMD(0, 0))
912 		return 1;
913 
914 	/* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
915 	if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
916 		return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;
917 
918 	/* Most instructions have the # of dwords (minus 2) in 7:0 */
919 	return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
920 }
921 
922 static int dump_mi_command(struct drm_printer *p,
923 			   struct xe_gt *gt,
924 			   u32 *dw,
925 			   int remaining_dw)
926 {
927 	u32 inst_header = *dw;
928 	u32 numdw = instr_dw(inst_header);
929 	u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
930 	int num_noop;
931 
932 	/* First check for commands that don't have/use a '# DW' field */
933 	switch (inst_header & MI_OPCODE) {
934 	case MI_NOOP:
935 		num_noop = 1;
936 		while (num_noop < remaining_dw &&
937 		       (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
938 			num_noop++;
939 		drm_printf(p, "[%#010x] MI_NOOP (%d dwords)\n", inst_header, num_noop);
940 		return num_noop;
941 
942 	case MI_TOPOLOGY_FILTER:
943 		drm_printf(p, "[%#010x] MI_TOPOLOGY_FILTER\n", inst_header);
944 		return 1;
945 
946 	case MI_BATCH_BUFFER_END:
947 		drm_printf(p, "[%#010x] MI_BATCH_BUFFER_END\n", inst_header);
948 		/* Return 'remaining_dw' to consume the rest of the LRC */
949 		return remaining_dw;
950 	}
951 
952 	/*
953 	 * Any remaining commands include a # of dwords.  We should make sure
954 	 * it doesn't exceed the remaining size of the LRC.
955 	 */
956 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
957 		numdw = remaining_dw;
958 
959 	switch (inst_header & MI_OPCODE) {
960 	case MI_LOAD_REGISTER_IMM:
961 		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
962 			   inst_header, (numdw - 1) / 2);
963 		for (int i = 1; i < numdw; i += 2)
964 			drm_printf(p, " - %#6x = %#010x\n", dw[i], dw[i + 1]);
965 		return numdw;
966 
967 	case MI_FORCE_WAKEUP:
968 		drm_printf(p, "[%#010x] MI_FORCE_WAKEUP\n", inst_header);
969 		return numdw;
970 
971 	default:
972 		drm_printf(p, "[%#010x] unknown MI opcode %#x, likely %d dwords\n",
973 			   inst_header, opcode, numdw);
974 		return numdw;
975 	}
976 }
977 
/*
 * Print one GFXPIPE command from an LRC image.
 * @p: printer receiving the output
 * @gt: GT, used for the truncation warning
 * @dw: pointer to the command header dword
 * @remaining_dw: number of dwords left in the image, starting at @dw
 *
 * Known commands are printed by name; anything else is printed with its
 * pipeline/opcode/subopcode fields.
 *
 * Return: number of dwords consumed, clamped to @remaining_dw.
 */
static int dump_gfxpipe_command(struct drm_printer *p,
				struct xe_gt *gt,
				u32 *dw,
				int remaining_dw)
{
	u32 numdw = instr_dw(*dw);
	u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
	u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
	u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);

	/*
	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
	 * remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (*dw & GFXPIPE_MATCH_MASK) {
/*
 * Each MATCH()/MATCH3D() line expands to a full case that prints the command
 * name and returns. MATCH3D() adds the CMD_ prefix to the case label because
 * the printed name starts with a digit.
 */
#define MATCH(cmd) \
	case cmd: \
		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
		return numdw
#define MATCH3D(cmd) \
	case CMD_##cmd: \
		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
		return numdw

	MATCH(STATE_BASE_ADDRESS);
	MATCH(STATE_SIP);
	MATCH(GPGPU_CSR_BASE_ADDRESS);
	MATCH(STATE_COMPUTE_MODE);
	MATCH3D(3DSTATE_BTD);

	MATCH3D(3DSTATE_VF_STATISTICS);

	MATCH(PIPELINE_SELECT);

	MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST);
	MATCH3D(3DSTATE_CLEAR_PARAMS);
	MATCH3D(3DSTATE_DEPTH_BUFFER);
	MATCH3D(3DSTATE_STENCIL_BUFFER);
	MATCH3D(3DSTATE_HIER_DEPTH_BUFFER);
	MATCH3D(3DSTATE_VERTEX_BUFFERS);
	MATCH3D(3DSTATE_VERTEX_ELEMENTS);
	MATCH3D(3DSTATE_INDEX_BUFFER);
	MATCH3D(3DSTATE_VF);
	MATCH3D(3DSTATE_MULTISAMPLE);
	MATCH3D(3DSTATE_CC_STATE_POINTERS);
	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS);
	MATCH3D(3DSTATE_VS);
	MATCH3D(3DSTATE_GS);
	MATCH3D(3DSTATE_CLIP);
	MATCH3D(3DSTATE_SF);
	MATCH3D(3DSTATE_WM);
	MATCH3D(3DSTATE_CONSTANT_VS);
	MATCH3D(3DSTATE_CONSTANT_GS);
	MATCH3D(3DSTATE_SAMPLE_MASK);
	MATCH3D(3DSTATE_CONSTANT_HS);
	MATCH3D(3DSTATE_CONSTANT_DS);
	MATCH3D(3DSTATE_HS);
	MATCH3D(3DSTATE_TE);
	MATCH3D(3DSTATE_DS);
	MATCH3D(3DSTATE_STREAMOUT);
	MATCH3D(3DSTATE_SBE);
	MATCH3D(3DSTATE_PS);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
	MATCH3D(3DSTATE_CPS_POINTERS);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC);
	MATCH3D(3DSTATE_BLEND_STATE_POINTERS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS);
	MATCH3D(3DSTATE_VF_INSTANCING);
	MATCH3D(3DSTATE_VF_SGVS);
	MATCH3D(3DSTATE_VF_TOPOLOGY);
	MATCH3D(3DSTATE_WM_CHROMAKEY);
	MATCH3D(3DSTATE_PS_BLEND);
	MATCH3D(3DSTATE_WM_DEPTH_STENCIL);
	MATCH3D(3DSTATE_PS_EXTRA);
	MATCH3D(3DSTATE_RASTER);
	MATCH3D(3DSTATE_SBE_SWIZ);
	MATCH3D(3DSTATE_WM_HZ_OP);
	MATCH3D(3DSTATE_VF_COMPONENT_PACKING);
	MATCH3D(3DSTATE_VF_SGVS_2);
	MATCH3D(3DSTATE_VFG);
	MATCH3D(3DSTATE_URB_ALLOC_VS);
	MATCH3D(3DSTATE_URB_ALLOC_HS);
	MATCH3D(3DSTATE_URB_ALLOC_DS);
	MATCH3D(3DSTATE_URB_ALLOC_GS);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_0);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_1);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_2);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_3);
	MATCH3D(3DSTATE_PRIMITIVE_REPLICATION);
	MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO);
	MATCH3D(3DSTATE_AMFS);
	MATCH3D(3DSTATE_DEPTH_BOUNDS);
	MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS);
	MATCH3D(3DSTATE_CONSTANT_TS_POINTER);
	MATCH3D(3DSTATE_MESH_CONTROL);
	MATCH3D(3DSTATE_MESH_DISTRIB);
	MATCH3D(3DSTATE_TASK_REDISTRIB);
	MATCH3D(3DSTATE_MESH_SHADER);
	MATCH3D(3DSTATE_MESH_SHADER_DATA);
	MATCH3D(3DSTATE_TASK_CONTROL);
	MATCH3D(3DSTATE_TASK_SHADER);
	MATCH3D(3DSTATE_TASK_SHADER_DATA);
	MATCH3D(3DSTATE_URB_ALLOC_MESH);
	MATCH3D(3DSTATE_URB_ALLOC_TASK);
	MATCH3D(3DSTATE_CLIP_MESH);
	MATCH3D(3DSTATE_SBE_MESH);
	MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER);

	MATCH3D(3DSTATE_DRAWING_RECTANGLE);
	MATCH3D(3DSTATE_CHROMA_KEY);
	MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET);
	MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN);
	MATCH3D(3DSTATE_LINE_STIPPLE);
	MATCH3D(3DSTATE_AA_LINE_PARAMETERS);
	MATCH3D(3DSTATE_MONOFILTER_SIZE);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS);
	MATCH3D(3DSTATE_SO_DECL_LIST);
	MATCH3D(3DSTATE_SO_BUFFER);
	MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC);
	MATCH3D(3DSTATE_SAMPLE_PATTERN);
	MATCH3D(3DSTATE_3D_MODE);
	MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE);
	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS);
	MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO);

	default:
		drm_printf(p, "[%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n",
			   *dw, pipeline, opcode, subopcode, numdw);
		return numdw;
	}
}
1125 
1126 void xe_lrc_dump_default(struct drm_printer *p,
1127 			 struct xe_gt *gt,
1128 			 enum xe_engine_class hwe_class)
1129 {
1130 	u32 *dw;
1131 	int remaining_dw, num_dw;
1132 
1133 	if (!gt->default_lrc[hwe_class]) {
1134 		drm_printf(p, "No default LRC for class %d\n", hwe_class);
1135 		return;
1136 	}
1137 
1138 	/*
1139 	 * Skip the beginning of the LRC since it contains the per-process
1140 	 * hardware status page.
1141 	 */
1142 	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
1143 	remaining_dw = (xe_lrc_size(gt_to_xe(gt), hwe_class) - LRC_PPHWSP_SIZE) / 4;
1144 
1145 	while (remaining_dw > 0) {
1146 		if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
1147 			num_dw = dump_mi_command(p, gt, dw, remaining_dw);
1148 		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
1149 			num_dw = dump_gfxpipe_command(p, gt, dw, remaining_dw);
1150 		} else {
1151 			num_dw = min(instr_dw(*dw), remaining_dw);
1152 			drm_printf(p, "[%#10x] Unknown instruction of type %#x, likely %d dwords\n",
1153 				   *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw),
1154 				   num_dw);
1155 		}
1156 
1157 		dw += num_dw;
1158 		remaining_dw -= num_dw;
1159 	}
1160 }
1161 
/**
 * struct instr_state - one state instruction and its length
 * @instr: instruction header dword; xe_lrc_emit_hwe_state_instructions()
 *	   asserts that it is a GFXPIPE instruction
 * @num_dw: total length of the instruction in dwords, header included; this
 *	    is how far the batch buffer length advances per emitted entry
 */
1162 struct instr_state {
1163 	u32 instr;
1164 	u16 num_dw;
1165 };
1166 
/*
 * State instructions emitted by xe_lrc_emit_hwe_state_instructions() for the
 * render engine when the graphics IP version is 12.55 or in the range 12.70
 * through 20.04.  Entries are emitted in table order, so the order here is the
 * order in which the instruction headers land in the batch buffer.
 *
 * On graphics version 20 and later the 3DSTATE_DRAWING_RECTANGLE entry is
 * replaced with 3DSTATE_DRAWING_RECTANGLE_FAST at emit time rather than in
 * this table.
 */
1167 static const struct instr_state xe_hpg_svg_state[] = {
1168 	{ .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 },
1169 	{ .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 },
1170 	{ .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 },
1171 	{ .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 },
1172 	{ .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 },
1173 	{ .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 },
1174 	{ .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 },
1175 	{ .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 },
1176 	{ .instr = CMD_3DSTATE_VS, .num_dw = 9 },
1177 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 },
1178 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 },
1179 	{ .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 },
1180 	{ .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 },
1181 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 },
1182 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 },
1183 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 },
1184 	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 },
1185 	{ .instr = CMD_3DSTATE_CLIP, .num_dw = 4 },
1186 	{ .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 },
1187 	{ .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 },
1188 	{ .instr = CMD_3DSTATE_SF, .num_dw = 4 },
1189 	{ .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 },
1190 	{ .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 },
1191 	{ .instr = CMD_3DSTATE_RASTER, .num_dw = 5 },
1192 	{ .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 },
1193 	{ .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 },
1194 	{ .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 },
1195 	{ .instr = CMD_3DSTATE_HS, .num_dw = 9 },
1196 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 },
1197 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 },
1198 	{ .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 },
1199 	{ .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 },
1200 	{ .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 },
1201 	{ .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 },
1202 	{ .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 },
1203 	{ .instr = CMD_3DSTATE_TE, .num_dw = 5 },
1204 	{ .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 },
1205 	{ .instr = CMD_3DSTATE_DS, .num_dw = 11 },
1206 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
1207 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
1208 	{ .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
1209 	{ .instr = CMD_3DSTATE_GS, .num_dw = 10 },
1210 	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
1211 	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
1212 	{ .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
1213 	{ .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
1214 	{ .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
1215 	{ .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
1216 	{ .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
1217 	{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
1218 };
1219 
1220 void xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, struct xe_bb *bb)
1221 {
1222 	struct xe_gt *gt = q->hwe->gt;
1223 	struct xe_device *xe = gt_to_xe(gt);
1224 	const struct instr_state *state_table = NULL;
1225 	int state_table_size = 0;
1226 
1227 	/*
1228 	 * At the moment we only need to emit non-register state for the RCS
1229 	 * engine.
1230 	 */
1231 	if (q->hwe->class != XE_ENGINE_CLASS_RENDER)
1232 		return;
1233 
1234 	switch (GRAPHICS_VERx100(xe)) {
1235 	case 1255:
1236 	case 1270 ... 2004:
1237 		state_table = xe_hpg_svg_state;
1238 		state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
1239 		break;
1240 	default:
1241 		xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
1242 			  GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
1243 		return;
1244 	}
1245 
1246 	for (int i = 0; i < state_table_size; i++) {
1247 		u32 instr = state_table[i].instr;
1248 		u16 num_dw = state_table[i].num_dw;
1249 		bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);
1250 
1251 		xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
1252 		xe_gt_assert(gt, num_dw != 0);
1253 		xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));
1254 
1255 		/*
1256 		 * Xe2's SVG context is the same as the one on DG2 / MTL
1257 		 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
1258 		 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
1259 		 * Just make the replacement here rather than defining a
1260 		 * whole separate table for the single trivial change.
1261 		 */
1262 		if (GRAPHICS_VER(xe) >= 20 &&
1263 		    instr == CMD_3DSTATE_DRAWING_RECTANGLE)
1264 			instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;
1265 
1266 		bb->cs[bb->len] = instr;
1267 		if (!is_single_dw)
1268 			bb->cs[bb->len] |= (num_dw - 2);
1269 
1270 		bb->len += num_dw;
1271 	}
1272 }
1273