/* xref: /linux/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm
 * (revision eeccf287a2a517954b57cf9d733b3cf5d47afa34)
 */
/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

/* To compile this assembly code:
 *
 * gfx12:
 *   cpp -DASIC_FAMILY=CHIP_GFX12 cwsr_trap_handler_gfx12.asm -P -o gfx12.sp3
 *   sp3 gfx12.sp3 -hex gfx12.hex
 */
29
/* ASIC identifiers compared against ASIC_FAMILY, which is supplied on the
 * cpp command line (see build instructions above). */
#define CHIP_GFX12 37
#define CHIP_GC_12_0_3 38

/* Per-ASIC feature selection: each macro evaluates to 0/1 (or a count)
 * for the ASIC_FAMILY this handler is built for. */
#define HAVE_XNACK (ASIC_FAMILY == CHIP_GC_12_0_3)
#define HAVE_57BIT_ADDRESS (ASIC_FAMILY == CHIP_GC_12_0_3)
#define HAVE_BANKED_VGPRS (ASIC_FAMILY == CHIP_GC_12_0_3)
#define NUM_NAMED_BARRIERS (ASIC_FAMILY == CHIP_GC_12_0_3 ? 0x10 : 0)
#define HAVE_CLUSTER_BARRIER (ASIC_FAMILY == CHIP_GC_12_0_3)
#define CLUSTER_BARRIER_SERIALIZE_WORKAROUND (ASIC_FAMILY == CHIP_GC_12_0_3)
#define RELAXED_SCHEDULING_IN_TRAP (ASIC_FAMILY == CHIP_GFX12)
#define HAVE_INSTRUCTION_FIXUP (ASIC_FAMILY == CHIP_GC_12_0_3)

#define SINGLE_STEP_MISSED_WORKAROUND 1	//workaround for lost TRAP_AFTER_INST exception when SAVECTX raised
#define HAVE_VALU_SGPR_HAZARD (ASIC_FAMILY == CHIP_GFX12)
#define WAVE32_ONLY (ASIC_FAMILY == CHIP_GC_12_0_3)
#define SAVE_TTMPS_IN_SGPR_BLOCK (ASIC_FAMILY >= CHIP_GC_12_0_3)

/* The XNACK paths in this handler assume wave32-only execution; reject any
 * configuration where XNACK is enabled but wave64 is still possible. */
#if HAVE_XNACK && !WAVE32_ONLY
# error
#endif

/* Number of significant bits in the high dword of a virtual address
 * (57- or 48-bit VA minus the low 32 bits), and a mask selecting them. */
#define ADDRESS_HI32_NUM_BITS ((HAVE_57BIT_ADDRESS ? 57 : 48) - 32)
#define ADDRESS_HI32_MASK ((1 << ADDRESS_HI32_NUM_BITS) - 1)
53
// Bit-field positions of the wave hardware registers the handler reads and
// writes (HW_REG_WAVE_STATE_PRIV / STATUS / LDS_ALLOC / GPR_ALLOC /
// EXCP_FLAG_PRIV / TRAP_CTRL).
var SQ_WAVE_STATE_PRIV_ALL_BARRIER_COMPLETE_MASK	= 0x4 | (NUM_NAMED_BARRIERS ? 0x8 : 0) | (HAVE_CLUSTER_BARRIER ? 0x10000 : 0)
var SQ_WAVE_STATE_PRIV_SCC_SHIFT		= 9
var SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK		= 0xC00
var SQ_WAVE_STATE_PRIV_HALT_MASK		= 0x4000
var SQ_WAVE_STATE_PRIV_POISON_ERR_MASK		= 0x8000
var SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT		= 15
var SQ_WAVE_STATUS_WAVE64_SHIFT			= 29
var SQ_WAVE_STATUS_WAVE64_SIZE			= 1
var SQ_WAVE_STATUS_NO_VGPRS_SHIFT		= 24
var SQ_WAVE_STATUS_IN_WG_SHIFT			= 11
// Fields cleared from STATE_PRIV before it is written back: SYS_PRIO (do not
// run the trap/save at elevated priority) and POISON_ERR.
var SQ_WAVE_STATE_PRIV_ALWAYS_CLEAR_MASK	= SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK|SQ_WAVE_STATE_PRIV_POISON_ERR_MASK
// Trap id is delivered in the saved PC_HI (ttmp1) bits [31:28] on trap entry.
var S_SAVE_PC_HI_TRAP_ID_MASK			= 0xF0000000

var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT		= 12
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE		= 9
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE		= 8
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT		= 12

// LDS_ALLOC.LDS_SIZE is converted to bytes by shifting left by this amount
// (see the L_SAVE_LDS path); the granularity differs between gfx12 and 12.5.
#if ASIC_FAMILY < CHIP_GC_12_0_3
var SQ_WAVE_LDS_ALLOC_GRANULARITY		= 9
#else
var SQ_WAVE_LDS_ALLOC_GRANULARITY		= 10
#endif

// Privileged exception flags (HW_REG_WAVE_EXCP_FLAG_PRIV).
var SQ_WAVE_EXCP_FLAG_PRIV_ADDR_WATCH_MASK	= 0xF
var SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_SHIFT	= 4
var SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_MASK	= 0x10
var SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT	= 5
var SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_MASK	= 0x20
var SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_MASK	= 0x40
var SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT	= 6
var SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK	= 0x80
var SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_SHIFT	= 7
var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_MASK	= 0x100
var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT	= 8
var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_MASK	= 0x200
var SQ_WAVE_EXCP_FLAG_PRIV_TRAP_AFTER_INST_MASK	= 0x800
// Per-exception trap enables (HW_REG_WAVE_TRAP_CTRL).
var SQ_WAVE_TRAP_CTRL_ADDR_WATCH_MASK		= 0x80
var SQ_WAVE_TRAP_CTRL_TRAP_AFTER_INST_MASK	= 0x200

// Exceptions that always enter the trap handler regardless of enables.
var SQ_WAVE_EXCP_FLAG_PRIV_NON_MASKABLE_EXCP_MASK= SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_MASK		|\
						  SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_MASK	|\
						  SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_MASK	|\
						  SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK		|\
						  SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_MASK		|\
						  SQ_WAVE_EXCP_FLAG_PRIV_TRAP_AFTER_INST_MASK
// EXCP_FLAG_PRIV restore ranges: part 1 = bits [0,SAVE_CONTEXT), part 2 =
// [ILLEGAL_INST,HOST_TRAP), part 3 = [WAVE_START,32) — i.e. the
// SAVE_CONTEXT and HOST_TRAP bits are structurally excluded from restore.
var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_1_SIZE	= SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT
var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SHIFT	= SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT
var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SIZE	= SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_SHIFT - SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT
var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT	= SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT
var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SIZE	= 32 - SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT

// HW_REG_WAVE_SCHED_MODE dependency-mode field (cleared on trap entry when
// RELAXED_SCHEDULING_IN_TRAP is enabled).
var SQ_WAVE_SCHED_MODE_DEP_MODE_SHIFT		= 0
var SQ_WAVE_SCHED_MODE_DEP_MODE_SIZE		= 2

// Barrier-state bit-fields (used with s_get_barrier_state results; the
// consumers are presumably in the restore path beyond this chunk — confirm).
var BARRIER_STATE_SIGNAL_OFFSET			= 16
var BARRIER_STATE_SIGNAL_SIZE			= 7
var BARRIER_STATE_MEMBER_OFFSET			= 4
var BARRIER_STATE_MEMBER_SIZE			= 7
var BARRIER_STATE_VALID_OFFSET			= 0
114
// Scratch save/restore offset of the named-barrier block relative to the
// HWREG block, and s_barrier_init member-count field layout.
var NAMED_BARRIERS_SR_OFFSET_FROM_HWREG		= 0x80
var S_BARRIER_INIT_MEMBERCNT_MASK		= 0x7F0000
var S_BARRIER_INIT_MEMBERCNT_SHIFT		= 0x10

// HW_REG_WAVE_XNACK_STATE_PRIV field layout.
var SQ_WAVE_XNACK_STATE_PRIV_FIRST_REPLAY_SHIFT	= 18
var SQ_WAVE_XNACK_STATE_PRIV_FIRST_REPLAY_SIZE	= 1
var SQ_WAVE_XNACK_STATE_PRIV_REPLAY_W64H_SHIFT	= 16
var SQ_WAVE_XNACK_STATE_PRIV_REPLAY_W64H_SIZE	= 1
var SQ_WAVE_XNACK_STATE_PRIV_FXPTR_SHIFT	= 0
var SQ_WAVE_XNACK_STATE_PRIV_FXPTR_SIZE		= 7

#if HAVE_BANKED_VGPRS
// DST/SRC0/SRC1 VGPR bank-select field in HW_REG_WAVE_MODE.
var SQ_WAVE_MODE_DST_SRC0_SRC1_VGPR_MSB_SHIFT	= 12
var SQ_WAVE_MODE_DST_SRC0_SRC1_VGPR_MSB_SIZE	= 6
#endif

// Layout of handler-private state stashed in ttmp11.
// NOTE: the TTMP11_SCHED_MODE_* values are needed unconditionally here
// (they previously had a second, identical definition guarded by
// #if RELAXED_SCHEDULING_IN_TRAP; the redundant duplicate was removed).
var TTMP11_SCHED_MODE_SHIFT			= 26
var TTMP11_SCHED_MODE_SIZE			= 2
var TTMP11_SCHED_MODE_MASK			= 0xC000000
var TTMP11_DEBUG_TRAP_ENABLED_SHIFT		= 23
var TTMP11_DEBUG_TRAP_ENABLED_MASK		= 0x800000
var TTMP11_FIRST_REPLAY_SHIFT			= 22
var TTMP11_FIRST_REPLAY_MASK			= 0x400000
var TTMP11_REPLAY_W64H_SHIFT			= 21
var TTMP11_REPLAY_W64H_MASK			= 0x200000
var TTMP11_FXPTR_SHIFT				= 14
var TTMP11_FXPTR_MASK				= 0x1FC000
148
// SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14]
// when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
var S_SAVE_BUF_RSRC_WORD1_STRIDE		= 0x00040000
var S_SAVE_BUF_RSRC_WORD3_MISC			= 0x10807FAC

// The first-wave flag arrives in spi_init_hi bit 26 and is relocated into
// the spare top bit of the saved PC_HI (see the L_SAVE path).
var S_SAVE_SPI_INIT_FIRST_WAVE_MASK		= 0x04000000
var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT		= 26

var S_SAVE_PC_HI_FIRST_WAVE_MASK		= 0x80000000
var S_SAVE_PC_HI_FIRST_WAVE_SHIFT		= 31

#if HAVE_BANKED_VGPRS
// The wave's DST/SRC0/SRC1 VGPR bank selection is stashed in otherwise
// unused bits of the saved PC_HI during the save sequence.
var S_SAVE_PC_HI_DST_SRC0_SRC1_VGPR_MSB_SHIFT	= 25
var S_SAVE_PC_HI_DST_SRC0_SRC1_VGPR_MSB_SIZE	= 6
#endif

var s_sgpr_save_num				= 108

// Register aliases used by the save path.  All live state is kept in ttmps
// and EXEC (the spi_init value is received in EXEC via MSG_RTN_SAVE_WAVE).
// Several aliases share a physical register (e.g. s_save_alloc_size /
// s_save_excp_flag_priv / s_save_ttmps_hi) — their live ranges must not
// overlap; preserve statement order when modifying the save path.
var s_save_spi_init_lo				= exec_lo
var s_save_spi_init_hi				= exec_hi
var s_save_pc_lo				= ttmp0
var s_save_pc_hi				= ttmp1
var s_save_exec_lo				= ttmp2
var s_save_exec_hi				= ttmp3
var s_save_state_priv				= ttmp12
var s_save_excp_flag_priv			= ttmp15
var s_save_xnack_mask				= s_save_exec_hi
var s_wave_size					= ttmp7
var s_save_base_addr_lo				= ttmp8
var s_save_base_addr_hi				= ttmp9
var s_save_addr_lo				= ttmp10
var s_save_addr_hi				= ttmp11
var s_save_mem_offset				= ttmp4
var s_save_alloc_size				= s_save_excp_flag_priv
var s_save_tmp					= ttmp14
var s_save_m0					= ttmp5
var s_save_ttmps_lo				= s_save_tmp
var s_save_ttmps_hi				= s_save_excp_flag_priv
186
var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK		= 0x04000000
var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT		= 26
// Bit position of the wave64 flag kept in s_wave_size / s_restore_size.
var S_WAVE_SIZE					= 25

// Register aliases used by the restore path (L_RESTORE itself lies beyond
// this chunk).  As with the save aliases, several names share a physical
// register; live ranges are disjoint by construction.
var s_restore_spi_init_lo			= exec_lo
var s_restore_spi_init_hi			= exec_hi
var s_restore_mem_offset			= ttmp12
var s_restore_alloc_size			= ttmp3
var s_restore_tmp				= ttmp2
var s_restore_mem_offset_save			= s_restore_tmp
var s_restore_m0				= s_restore_alloc_size
var s_restore_mode				= ttmp7
var s_restore_flat_scratch			= s_restore_tmp
var s_restore_pc_lo				= ttmp0
var s_restore_pc_hi				= ttmp1
var s_restore_exec_lo				= ttmp4
var s_restore_exec_hi				= ttmp5
var s_restore_state_priv			= ttmp14
var s_restore_excp_flag_priv			= ttmp15
var s_restore_xnack_mask			= ttmp13
var s_restore_base_addr_lo			= ttmp8
var s_restore_base_addr_hi			= ttmp9
var s_restore_addr_lo				= ttmp10
var s_restore_addr_hi				= ttmp11
var s_restore_size				= ttmp6
var s_restore_ttmps_lo				= s_restore_tmp
var s_restore_ttmps_hi				= s_restore_alloc_size
var s_restore_spi_init_hi_save			= s_restore_exec_hi

// Offset of the saved ttmp block relative to the HWREG block:
// gfx12.5 keeps ttmps inside the tail of the SGPR block (negative offset),
// gfx12 stores them just after the HWREG block (see L_SAVE comments).
#if SAVE_TTMPS_IN_SGPR_BLOCK
var TTMP_SR_OFFSET_FROM_HWREG			= -0x40
#else
var TTMP_SR_OFFSET_FROM_HWREG			= 0x40
#endif
221
// First-level trap handler entry.  The handler is entered at its first
// instruction for a trap/context-save, while a context restore enters one
// instruction in (L_JUMP_TO_RESTORE), which immediately branches to the
// restore sequence.
shader main
	asic(DEFAULT)
	type(CS)
	wave_size(32)

	s_branch	L_SKIP_RESTORE						//NOT restore. might be a regular trap or save

L_JUMP_TO_RESTORE:
	s_branch	L_RESTORE
231
// Trap/save triage: decide between entering context save (L_SAVE),
// delegating to the second-level handler (L_FETCH_2ND_TRAP), or spinning
// until a save request arrives for an already-halted wave.
L_SKIP_RESTORE:
#if RELAXED_SCHEDULING_IN_TRAP
	// Assume most relaxed scheduling mode is set. Save and revert to normal mode.
	s_getreg_b32	ttmp2, hwreg(HW_REG_WAVE_SCHED_MODE)
	s_wait_alu	0
	s_setreg_imm32_b32	hwreg(HW_REG_WAVE_SCHED_MODE, \
		SQ_WAVE_SCHED_MODE_DEP_MODE_SHIFT, SQ_WAVE_SCHED_MODE_DEP_MODE_SIZE), 0
#endif

	s_getreg_b32	s_save_state_priv, hwreg(HW_REG_WAVE_STATE_PRIV)	//save STATUS since we will change SCC

#if RELAXED_SCHEDULING_IN_TRAP
	// Save SCHED_MODE[1:0] into ttmp11[27:26].
	s_andn2_b32	ttmp11, ttmp11, TTMP11_SCHED_MODE_MASK
	s_lshl_b32	ttmp2, ttmp2, TTMP11_SCHED_MODE_SHIFT
	s_or_b32	ttmp11, ttmp11, ttmp2
#endif

	// Clear SPI_PRIO: do not save with elevated priority.
	// Clear ECC_ERR: prevents SQC store and triggers FATAL_HALT if setreg'd.
	s_andn2_b32	s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_ALWAYS_CLEAR_MASK

	s_getreg_b32	s_save_excp_flag_priv, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV)

	// Halted and non-halted waves take different triage paths.
	s_and_b32       ttmp2, s_save_state_priv, SQ_WAVE_STATE_PRIV_HALT_MASK
	s_cbranch_scc0	L_NOT_HALTED

L_HALTED:
	// Host trap may occur while wave is halted.
	s_and_b32	ttmp2, s_save_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK
	s_cbranch_scc1	L_FETCH_2ND_TRAP

L_CHECK_SAVE:
	s_and_b32	ttmp2, s_save_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_MASK
	s_cbranch_scc1	L_SAVE

	// Wave is halted but neither host trap nor SAVECTX is raised.
	// Caused by instruction fetch memory violation.
	// Spin wait until context saved to prevent interrupt storm.
	s_sleep		0x10
	s_getreg_b32	s_save_excp_flag_priv, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV)
	s_branch	L_CHECK_SAVE

L_NOT_HALTED:
	// Let second-level handle non-SAVECTX exception or trap.
	// Any concurrent SAVECTX will be handled upon re-entry once halted.

	// Check non-maskable exceptions. memory_violation, illegal_instruction
	// and xnack_error exceptions always cause the wave to enter the trap
	// handler.
	s_and_b32	ttmp2, s_save_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_NON_MASKABLE_EXCP_MASK
	s_cbranch_scc1	L_FETCH_2ND_TRAP

	// Check for maskable exceptions in trapsts.excp and trapsts.excp_hi.
	// Maskable exceptions only cause the wave to enter the trap handler if
	// their respective bit in mode.excp_en is set.
	s_getreg_b32	ttmp2, hwreg(HW_REG_WAVE_EXCP_FLAG_USER)
	s_and_b32	ttmp3, s_save_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_ADDR_WATCH_MASK
	s_cbranch_scc0	L_NOT_ADDR_WATCH
	// Fold the (privileged) address-watch flags into the user flags so a
	// single AND against TRAP_CTRL decides whether to take the trap.
	s_or_b32	ttmp2, ttmp2, SQ_WAVE_TRAP_CTRL_ADDR_WATCH_MASK

L_NOT_ADDR_WATCH:
	s_getreg_b32	ttmp3, hwreg(HW_REG_WAVE_TRAP_CTRL)
	s_and_b32	ttmp2, ttmp3, ttmp2
	s_cbranch_scc1	L_FETCH_2ND_TRAP

L_CHECK_TRAP_ID:
	// Check trap_id != 0
	s_and_b32	ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK
	s_cbranch_scc1	L_FETCH_2ND_TRAP

#if SINGLE_STEP_MISSED_WORKAROUND
	// Prioritize single step exception over context save.
	// Second-level trap will halt wave and RFE, re-entering for SAVECTX.
	// WAVE_TRAP_CTRL is already in ttmp3.
	s_and_b32	ttmp3, ttmp3, SQ_WAVE_TRAP_CTRL_TRAP_AFTER_INST_MASK
	s_cbranch_scc1	L_FETCH_2ND_TRAP
#endif

	// No exception/trap to delegate: take the context-save request if set.
	s_and_b32	ttmp2, s_save_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_MASK
	s_cbranch_scc1	L_SAVE
313
// Fetch and dispatch the second-level (driver-installed) trap handler, or
// fall through to L_EXIT_TRAP / s_rfe if none is installed.
L_FETCH_2ND_TRAP:
#if HAVE_XNACK
	// Macro defined elsewhere in this file (not visible in this chunk).
	save_and_clear_xnack_state_priv(ttmp14)
#endif

	// Read second-level TBA/TMA from first-level TMA and jump if available.
	// ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data)
	// ttmp12 holds SQ_WAVE_STATUS
	s_sendmsg_rtn_b64       [ttmp14, ttmp15], sendmsg(MSG_RTN_GET_TMA)
	s_wait_idle
	// TMA is returned shifted right by 8; reconstruct the byte address.
	s_lshl_b64	[ttmp14, ttmp15], [ttmp14, ttmp15], 0x8

	// Sign-extend the VA from its top implemented bit to a canonical 64-bit
	// address.
	s_bitcmp1_b32	ttmp15, (ADDRESS_HI32_NUM_BITS - 1)
	s_cbranch_scc0	L_NO_SIGN_EXTEND_TMA
	s_or_b32	ttmp15, ttmp15, ~ADDRESS_HI32_MASK
L_NO_SIGN_EXTEND_TMA:
#if RELAXED_SCHEDULING_IN_TRAP
	// Move SCHED_MODE[1:0] from ttmp11 to unused bits in ttmp1[27:26] (return PC_HI).
	// The second-level trap will restore from ttmp1 for backwards compatibility.
	s_and_b32	ttmp2, ttmp11, TTMP11_SCHED_MODE_MASK
	s_andn2_b32	ttmp1, ttmp1, TTMP11_SCHED_MODE_MASK
	s_or_b32	ttmp1, ttmp1, ttmp2
#endif

	s_load_dword    ttmp2, [ttmp14, ttmp15], 0x10 scope:SCOPE_SYS		// debug trap enabled flag
	s_wait_idle
	s_lshl_b32      ttmp2, ttmp2, TTMP11_DEBUG_TRAP_ENABLED_SHIFT
	s_andn2_b32     ttmp11, ttmp11, TTMP11_DEBUG_TRAP_ENABLED_MASK
	s_or_b32        ttmp11, ttmp11, ttmp2

	s_load_dwordx2	[ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 scope:SCOPE_SYS	// second-level TBA
	s_wait_idle
	s_load_dwordx2	[ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 scope:SCOPE_SYS	// second-level TMA
	s_wait_idle

	// Self-AND only to set SCC from the 64-bit TBA value (zero = not set).
	s_and_b64	[ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3]
	s_cbranch_scc0	L_NO_NEXT_TRAP						// second-level trap handler not been set
	s_setpc_b64	[ttmp2, ttmp3]						// jump to second-level trap handler

L_NO_NEXT_TRAP:
	// If not caused by trap then halt wave to prevent re-entry.
	s_and_b32	ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK
	s_cbranch_scc1	L_TRAP_CASE

	// Host trap will not cause trap re-entry.
	s_getreg_b32	ttmp2, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV)
	s_and_b32	ttmp2, ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK
	s_cbranch_scc1	L_EXIT_TRAP
	s_or_b32	s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_HALT_MASK

	// If the PC points to S_ENDPGM then context save will fail if STATE_PRIV.HALT is set.
	// Rewind the PC to prevent this from occurring.
	s_sub_u32	ttmp0, ttmp0, 0x8
	s_subb_u32	ttmp1, ttmp1, 0x0

	s_branch	L_EXIT_TRAP

L_TRAP_CASE:
	// Advance past trap instruction to prevent re-entry.
	s_add_u32	ttmp0, ttmp0, 0x4
	s_addc_u32	ttmp1, ttmp1, 0x0

L_EXIT_TRAP:
	// Drop any carry/borrow into the unimplemented VA bits of PC_HI.
	s_and_b32	ttmp1, ttmp1, ADDRESS_HI32_MASK

#if HAVE_INSTRUCTION_FIXUP
	s_getreg_b32	s_save_excp_flag_priv, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV)
	fixup_instruction()
#endif

#if HAVE_XNACK
	restore_xnack_state_priv(s_save_tmp)
#endif

	// Restore SQ_WAVE_STATUS.
	s_and_b64	exec, exec, exec					// Restore STATUS.EXECZ, not writable by s_setreg_b32
	s_and_b64	vcc, vcc, vcc						// Restore STATUS.VCCZ, not writable by s_setreg_b32

	// STATE_PRIV.*BARRIER_COMPLETE may have changed since we read it.
	// Only restore fields which the trap handler changes.
	s_lshr_b32	s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_SCC_SHIFT

#if RELAXED_SCHEDULING_IN_TRAP
	// Assume relaxed scheduling mode after this point.
	restore_sched_mode(ttmp2)
#endif

	s_setreg_b32	hwreg(HW_REG_WAVE_STATE_PRIV, SQ_WAVE_STATE_PRIV_SCC_SHIFT, \
		SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT - SQ_WAVE_STATE_PRIV_SCC_SHIFT + 1), s_save_state_priv

	s_rfe_b64	[ttmp0, ttmp1]
405
// Context-save entry: handshake with SPI, then save v0 and the trap
// temporaries before the main VGPR/SGPR/HWREG/LDS save sequence.
L_SAVE:
	// If VGPRs have been deallocated then terminate the wavefront.
	// It has no remaining program to run and cannot save without VGPRs.
	s_getreg_b32	s_save_tmp, hwreg(HW_REG_WAVE_STATUS)
	s_bitcmp1_b32	s_save_tmp, SQ_WAVE_STATUS_NO_VGPRS_SHIFT
	s_cbranch_scc0	L_HAVE_VGPRS
	s_endpgm
L_HAVE_VGPRS:
	// Strip trap id / flag bits from PC_HI; only VA bits remain.
	s_and_b32	s_save_pc_hi, s_save_pc_hi, ADDRESS_HI32_MASK
	s_mov_b32	s_save_tmp, 0
	s_setreg_b32	hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV, SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT, 1), s_save_tmp	//clear saveCtx bit

#if HAVE_XNACK
	save_and_clear_xnack_state_priv(s_save_tmp)
#endif

#if HAVE_INSTRUCTION_FIXUP
	fixup_instruction()
#endif

	/* inform SPI the readiness and wait for SPI's go signal */
	s_mov_b32	s_save_exec_lo, exec_lo					//save EXEC and use EXEC for the go signal from SPI
	s_mov_b32	s_save_exec_hi, exec_hi
	s_mov_b64	exec, 0x0						//clear EXEC to get ready to receive

	// Returns the spi_init value (save-area address + flags) in EXEC.
	s_sendmsg_rtn_b64       [exec_lo, exec_hi], sendmsg(MSG_RTN_SAVE_WAVE)
	s_wait_idle

	// Save first_wave flag so we can clear high bits of save address.
	s_and_b32	s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK
	s_lshl_b32	s_save_tmp, s_save_tmp, (S_SAVE_PC_HI_FIRST_WAVE_SHIFT - S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT)
	s_or_b32	s_save_pc_hi, s_save_pc_hi, s_save_tmp

#if HAVE_XNACK
	s_getreg_b32	s_save_xnack_mask, hwreg(HW_REG_WAVE_XNACK_MASK)
	s_setreg_imm32_b32	hwreg(HW_REG_WAVE_XNACK_MASK), 0
#endif

#if HAVE_BANKED_VGPRS
	// Save and clear shader's DST/SRC0/SRC1 VGPR bank selection so we can use v[0-255].
	s_getreg_b32	s_save_tmp, hwreg(HW_REG_WAVE_MODE, SQ_WAVE_MODE_DST_SRC0_SRC1_VGPR_MSB_SHIFT, SQ_WAVE_MODE_DST_SRC0_SRC1_VGPR_MSB_SIZE)
	s_lshl_b32	s_save_tmp, s_save_tmp, S_SAVE_PC_HI_DST_SRC0_SRC1_VGPR_MSB_SHIFT
	s_or_b32	s_save_pc_hi, s_save_pc_hi, s_save_tmp
	s_mov_b32	s_save_tmp, 0
	s_setreg_b32	hwreg(HW_REG_WAVE_MODE, SQ_WAVE_MODE_DST_SRC0_SRC1_VGPR_MSB_SHIFT, SQ_WAVE_MODE_DST_SRC0_SRC1_VGPR_MSB_SIZE), s_save_tmp
#endif

	// Trap temporaries must be saved via VGPR but all VGPRs are in use.
	// There is no ttmp space to hold the resource constant for VGPR save.
	// Save v0 by itself since it requires only two SGPRs.
	s_mov_b32	s_save_ttmps_lo, exec_lo
	s_and_b32	s_save_ttmps_hi, exec_hi, ADDRESS_HI32_MASK
	s_mov_b32	exec_lo, 0xFFFFFFFF
	s_mov_b32	exec_hi, 0xFFFFFFFF
	global_store_dword_addtid	v0, [s_save_ttmps_lo, s_save_ttmps_hi] scope:SCOPE_SYS
	v_mov_b32	v0, 0x0
	s_mov_b32	exec_lo, s_save_ttmps_lo
	s_mov_b32	exec_hi, s_save_ttmps_hi

	// Save trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
	// ttmp SR memory offset:
	// - gfx12:   size(VGPR)+size(SGPR)+0x40
	// - gfx12.5: size(VGPR)+size(SGPR)-0x40
	// (get_wave_size2 / get_vgpr_size_bytes / get_sgpr_size_bytes are
	// macros defined elsewhere in this file.)
	get_wave_size2(s_save_ttmps_hi)
	get_vgpr_size_bytes(s_save_ttmps_lo, s_save_ttmps_hi)
	s_and_b32	s_save_ttmps_hi, s_save_spi_init_hi, ADDRESS_HI32_MASK
	s_add_u32	s_save_ttmps_lo, s_save_ttmps_lo, (get_sgpr_size_bytes() + TTMP_SR_OFFSET_FROM_HWREG)
	s_add_u32	s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo
	s_addc_u32	s_save_ttmps_hi, s_save_ttmps_hi, 0x0

	// Gather the ttmps into lanes of v0, lane index == ttmp number.
	v_writelane_b32	v0, ttmp4, 0x4
	v_writelane_b32	v0, ttmp5, 0x5
	v_writelane_b32	v0, ttmp6, 0x6
	v_writelane_b32	v0, ttmp7, 0x7
	v_writelane_b32	v0, ttmp8, 0x8
	v_writelane_b32	v0, ttmp9, 0x9
	v_writelane_b32	v0, ttmp10, 0xA
	v_writelane_b32	v0, ttmp11, 0xB
	v_writelane_b32	v0, ttmp13, 0xD
	v_writelane_b32	v0, exec_lo, 0xE
	v_writelane_b32	v0, exec_hi, 0xF
	// Store lanes 0-13 only (0x3FFF), then restore EXEC from lanes 14/15.
	s_mov_b32	exec_lo, 0x3FFF
	s_mov_b32	exec_hi, 0x0
	global_store_dword_addtid	v0, [s_save_ttmps_lo, s_save_ttmps_hi] scope:SCOPE_SYS
	v_readlane_b32	ttmp14, v0, 0xE
	v_readlane_b32	ttmp15, v0, 0xF
	s_mov_b32	exec_lo, ttmp14
	s_mov_b32	exec_hi, ttmp15
495
	// Latch the save-area base address (delivered in the spi_init value,
	// i.e. EXEC after MSG_RTN_SAVE_WAVE) and the shader's m0.
	s_mov_b32	s_save_base_addr_lo, s_save_spi_init_lo
	s_and_b32	s_save_base_addr_hi, s_save_spi_init_hi, ADDRESS_HI32_MASK
	s_mov_b32	s_save_m0, m0

	get_wave_size2(s_wave_size)

	/* save first 4 VGPRs, needed for SGPR save */
	// (v0 itself was already stored at the base address above.)
	s_mov_b32	exec_lo, 0xFFFFFFFF					//need every thread from now on
	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_ENABLE_SAVE_4VGPR_EXEC_HI
	s_mov_b32	exec_hi, 0x00000000
	s_branch	L_SAVE_4VGPR_WAVE32
L_ENABLE_SAVE_4VGPR_EXEC_HI:
	s_mov_b32	exec_hi, 0xFFFFFFFF
	s_branch	L_SAVE_4VGPR_WAVE64
L_SAVE_4VGPR_WAVE32:
	// VGPR Allocated in 4-GPR granularity
	// One VGPR occupies 32 lanes * 4 bytes = 128 bytes in wave32.
	global_store_addtid_b32	v1, [s_save_base_addr_lo, s_save_base_addr_hi] scope:SCOPE_SYS offset:128
	global_store_addtid_b32	v2, [s_save_base_addr_lo, s_save_base_addr_hi] scope:SCOPE_SYS offset:128*2
	global_store_addtid_b32	v3, [s_save_base_addr_lo, s_save_base_addr_hi] scope:SCOPE_SYS offset:128*3
	s_branch	L_SAVE_HWREG

L_SAVE_4VGPR_WAVE64:
	// VGPR Allocated in 4-GPR granularity
	// One VGPR occupies 64 lanes * 4 bytes = 256 bytes in wave64.
	global_store_addtid_b32	v1, [s_save_base_addr_lo, s_save_base_addr_hi] scope:SCOPE_SYS offset:256
	global_store_addtid_b32	v2, [s_save_base_addr_lo, s_save_base_addr_hi] scope:SCOPE_SYS offset:256*2
	global_store_addtid_b32	v3, [s_save_base_addr_lo, s_save_base_addr_hi] scope:SCOPE_SYS offset:256*3
525
	/* save HW registers */

// Collect the wave hardware registers into lanes of v2 (via the
// write_hwreg_to_v2 macro, defined elsewhere in this file; m0 tracks the
// next lane) and store them as one 128-byte HWREG block.
L_SAVE_HWREG:
	// HWREG SR memory offset : size(VGPR)+size(SGPR)
	get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
	s_add_u32	s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()

	v_mov_b32	v0, 0x0							//Offset[31:0] from buffer resource
	v_mov_b32	v1, 0x0							//Offset[63:32] from buffer resource
	v_mov_b32	v2, 0x0							//Set of SGPRs for TCP store
	s_mov_b32	m0, 0x0							//Next lane of v2 to write to

	write_hwreg_to_v2(s_save_m0)

	// Ensure no further changes to barrier or LDS state.
	// STATE_PRIV.*BARRIER_COMPLETE may change up to this point.
	wait_trap_barriers(s_save_tmp, s_save_m0, 1)

	// Re-read final state of *BARRIER_COMPLETE fields for save.
	s_getreg_b32	s_save_tmp, hwreg(HW_REG_WAVE_STATE_PRIV)
	s_and_b32	s_save_tmp, s_save_tmp, SQ_WAVE_STATE_PRIV_ALL_BARRIER_COMPLETE_MASK
	s_andn2_b32	s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_ALL_BARRIER_COMPLETE_MASK
	s_or_b32	s_save_state_priv, s_save_state_priv, s_save_tmp

	write_hwreg_to_v2(s_save_pc_lo)
	s_and_b32       s_save_tmp, s_save_pc_hi, ADDRESS_HI32_MASK
	write_hwreg_to_v2(s_save_tmp)
	write_hwreg_to_v2(s_save_exec_lo)
#if WAVE32_ONLY
	// exec_hi does not exist in wave32-only builds; save a zero placeholder
	// to keep the HWREG block layout fixed.
	s_mov_b32	s_save_tmp, 0
	write_hwreg_to_v2(s_save_tmp)
#else
	write_hwreg_to_v2(s_save_exec_hi)
#endif
	write_hwreg_to_v2(s_save_state_priv)

	s_getreg_b32	s_save_tmp, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV)
	write_hwreg_to_v2(s_save_tmp)

#if HAVE_XNACK
	write_hwreg_to_v2(s_save_xnack_mask)
#else
	// Placeholder slot keeps the layout identical across ASICs.
	s_mov_b32	s_save_tmp, 0
	write_hwreg_to_v2(s_save_tmp)
#endif

	s_getreg_b32	s_save_m0, hwreg(HW_REG_WAVE_MODE)

#if HAVE_BANKED_VGPRS
	// Merge the bank-select bits stashed in PC_HI back into the saved MODE.
	s_bfe_u32	s_save_tmp, s_save_pc_hi, (S_SAVE_PC_HI_DST_SRC0_SRC1_VGPR_MSB_SHIFT | (S_SAVE_PC_HI_DST_SRC0_SRC1_VGPR_MSB_SIZE << 0x10))
	s_lshl_b32	s_save_tmp, s_save_tmp, SQ_WAVE_MODE_DST_SRC0_SRC1_VGPR_MSB_SHIFT
	s_or_b32	s_save_m0, s_save_m0, s_save_tmp
#endif

	write_hwreg_to_v2(s_save_m0)

	s_getreg_b32	s_save_m0, hwreg(HW_REG_WAVE_SCRATCH_BASE_LO)
	write_hwreg_to_v2(s_save_m0)

	s_getreg_b32	s_save_m0, hwreg(HW_REG_WAVE_SCRATCH_BASE_HI)
	write_hwreg_to_v2(s_save_m0)

	s_getreg_b32	s_save_m0, hwreg(HW_REG_WAVE_EXCP_FLAG_USER)
	write_hwreg_to_v2(s_save_m0)

	s_getreg_b32	s_save_m0, hwreg(HW_REG_WAVE_TRAP_CTRL)
	write_hwreg_to_v2(s_save_m0)

	s_getreg_b32	s_save_tmp, hwreg(HW_REG_WAVE_STATUS)
	write_hwreg_to_v2(s_save_tmp)

	// Workgroup barrier state (-1 selects the wave's own barrier).
	s_get_barrier_state s_save_tmp, -1
	s_wait_kmcnt (0)
	write_hwreg_to_v2(s_save_tmp)

#if HAVE_CLUSTER_BARRIER
	s_sendmsg_rtn_b32	s_save_tmp, sendmsg(MSG_RTN_GET_CLUSTER_BARRIER_STATE)
	s_wait_kmcnt	0
	write_hwreg_to_v2(s_save_tmp)
#endif

#if ASIC_FAMILY >= CHIP_GC_12_0_3
	s_getreg_b32	s_save_tmp, hwreg(HW_REG_WAVE_SCHED_MODE)
	write_hwreg_to_v2(s_save_tmp)
#endif

#if ! SAVE_TTMPS_IN_SGPR_BLOCK
	// Write HWREGs with 16 VGPR lanes. TTMPs occupy space after this.
	s_mov_b32       exec_lo, 0xFFFF
#else
	// All 128 bytes are available for HWREGs.
	s_mov_b32       exec_lo, 0xFFFFFFFF
#endif
	s_mov_b32	exec_hi, 0x0
	s_add_u32	s_save_addr_lo, s_save_base_addr_lo, s_save_mem_offset
	s_addc_u32	s_save_addr_hi, s_save_base_addr_hi, 0x0
	global_store_addtid_b32	v2, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS

	// Write SGPRs with 32 VGPR lanes. This works in wave32 and wave64 mode.
	s_mov_b32       exec_lo, 0xFFFFFFFF

#if NUM_NAMED_BARRIERS
	// Gather the named-barrier states (indices 1..N) into one VGPR and
	// store them after the HWREG block.
	v_mov_b32	v2, 0

	for var bar_idx = 0; bar_idx < NUM_NAMED_BARRIERS; bar_idx ++
		s_get_barrier_state s_save_tmp, (bar_idx + 1)
		s_wait_kmcnt	0
		v_writelane_b32	v2, s_save_tmp, bar_idx
	end

	global_store_addtid_b32	v2, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS offset:NAMED_BARRIERS_SR_OFFSET_FROM_HWREG
#endif
638
	/* save SGPRs */
	// Save SGPR before LDS save, then the s0 to s4 can be used during LDS save...
	// Batches of 16 SGPRs are copied into v2 lanes (write_16sgpr_to_v2,
	// defined elsewhere in this file; ttmp13 tracks the fill level) and
	// flushed to memory 32 lanes (128 bytes) at a time.

	// SGPR SR memory offset : size(VGPR)
	get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)

	s_mov_b32	ttmp13, 0x0						//next VGPR lane to copy SGPR into

	s_mov_b32	m0, 0x0							//SGPR initial index value =0
	s_nop		0x0							//Manually inserted wait states
L_SAVE_SGPR_LOOP:
	// SGPR is allocated in 16 SGPR granularity
	// s_movrels reads s[N+m0], so m0 selects the current 16-SGPR group.
	s_movrels_b64	s0, s0							//s0 = s[0+m0], s1 = s[1+m0]
	s_movrels_b64	s2, s2							//s2 = s[2+m0], s3 = s[3+m0]
	s_movrels_b64	s4, s4							//s4 = s[4+m0], s5 = s[5+m0]
	s_movrels_b64	s6, s6							//s6 = s[6+m0], s7 = s[7+m0]
	s_movrels_b64	s8, s8							//s8 = s[8+m0], s9 = s[9+m0]
	s_movrels_b64	s10, s10						//s10 = s[10+m0], s11 = s[11+m0]
	s_movrels_b64	s12, s12						//s12 = s[12+m0], s13 = s[13+m0]
	s_movrels_b64	s14, s14						//s14 = s[14+m0], s15 = s[15+m0]

	write_16sgpr_to_v2(s0)

	s_cmp_eq_u32	ttmp13, 0x20						//have 32 VGPR lanes filled?
	s_cbranch_scc0	L_SAVE_SGPR_SKIP_TCP_STORE

	s_add_u32	s_save_addr_lo, s_save_base_addr_lo, s_save_mem_offset
	s_addc_u32	s_save_addr_hi, s_save_base_addr_hi, 0x0
	global_store_addtid_b32	v2, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS
	s_add_u32	s_save_mem_offset, s_save_mem_offset, 0x80
	s_mov_b32	ttmp13, 0x0
	v_mov_b32	v2, 0x0
L_SAVE_SGPR_SKIP_TCP_STORE:

	s_add_u32	m0, m0, 16						//next sgpr index
	s_cmp_lt_u32	m0, 96							//scc = (m0 < first 96 SGPR) ? 1 : 0
	s_cbranch_scc1	L_SAVE_SGPR_LOOP					//first 96 SGPR save is complete?

	//save the rest 12 SGPR
	s_movrels_b64	s0, s0							//s0 = s[0+m0], s1 = s[1+m0]
	s_movrels_b64	s2, s2							//s2 = s[2+m0], s3 = s[3+m0]
	s_movrels_b64	s4, s4							//s4 = s[4+m0], s5 = s[5+m0]
	s_movrels_b64	s6, s6							//s6 = s[6+m0], s7 = s[7+m0]
	s_movrels_b64	s8, s8							//s8 = s[8+m0], s9 = s[9+m0]
	s_movrels_b64	s10, s10						//s10 = s[10+m0], s11 = s[11+m0]
	write_12sgpr_to_v2(s0)

#if SAVE_TTMPS_IN_SGPR_BLOCK
	// Last 16 dwords of the SGPR block already contain the TTMPS.  Make
	// sure to not override them.
	s_mov_b32	exec_lo, 0xFFFF
#endif
	s_add_u32	s_save_addr_lo, s_save_base_addr_lo, s_save_mem_offset
	s_addc_u32	s_save_addr_hi, s_save_base_addr_hi, 0x0
	global_store_addtid_b32	v2, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS
694
	/* save LDS */

// LDS is workgroup-shared, so only the first wave of the workgroup
// (flag carried in the saved PC_HI) performs this copy.
L_SAVE_LDS:
	// Change EXEC to all threads...
	s_mov_b32	exec_lo, 0xFFFFFFFF					//need every thread from now on
	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_ENABLE_SAVE_LDS_EXEC_HI
	s_mov_b32	exec_hi, 0x00000000
	s_branch	L_SAVE_LDS_NORMAL
L_ENABLE_SAVE_LDS_EXEC_HI:
	s_mov_b32	exec_hi, 0xFFFFFFFF
L_SAVE_LDS_NORMAL:
	s_getreg_b32	s_save_alloc_size, hwreg(HW_REG_WAVE_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
	s_and_b32	s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF	//lds_size is zero?
	s_cbranch_scc0	L_SAVE_LDS_DONE						//no lds used? jump to L_SAVE_DONE

	s_and_b32	s_save_tmp, s_save_pc_hi, S_SAVE_PC_HI_FIRST_WAVE_MASK
	s_cbranch_scc0	L_SAVE_LDS_DONE

	// first wave do LDS save;

	// Convert the LDS_SIZE field to a byte count.
	s_lshl_b32	s_save_alloc_size, s_save_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY

	// LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
	//
	get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
	s_add_u32	s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()
	s_add_u32	s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()

	//load 0~63*4(byte address) to vgpr v0
	// v0 = lane_id * 4, via mbcnt of an all-ones mask.
	v_mbcnt_lo_u32_b32	v0, -1, 0
	v_mbcnt_hi_u32_b32	v0, -1, v0
	v_mul_u32_u24	v0, 4, v0

	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_mov_b32	m0, 0x0
	s_cbranch_scc1	L_SAVE_LDS_W64

L_SAVE_LDS_W32:
	// s3 = bytes copied per iteration (32 lanes * 4 bytes).
	s_mov_b32	s3, 128
	s_nop		0
	s_nop		0
	s_nop		0
L_SAVE_LDS_LOOP_W32:
	ds_read_b32	v1, v0
	s_wait_idle
	s_add_u32	s_save_addr_lo, s_save_base_addr_lo, s_save_mem_offset
	s_addc_u32	s_save_addr_hi, s_save_base_addr_hi, 0x0
	global_store_addtid_b32	v1, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS

	s_add_u32	m0, m0, s3						//every buffer_store_lds does 128 bytes
	s_add_u32	s_save_mem_offset, s_save_mem_offset, s3
	v_add_nc_u32	v0, v0, 128						//mem offset increased by 128 bytes
	s_cmp_lt_u32	m0, s_save_alloc_size					//scc=(m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_SAVE_LDS_LOOP_W32					//LDS save is complete?

	s_branch	L_SAVE_LDS_DONE

L_SAVE_LDS_W64:
	// s3 = bytes copied per iteration (64 lanes * 4 bytes).
	s_mov_b32	s3, 256
	s_nop		0
	s_nop		0
	s_nop		0
L_SAVE_LDS_LOOP_W64:
	ds_read_b32	v1, v0
	s_wait_idle
	s_add_u32	s_save_addr_lo, s_save_base_addr_lo, s_save_mem_offset
	s_addc_u32	s_save_addr_hi, s_save_base_addr_hi, 0x0
	global_store_addtid_b32	v1, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS

	s_add_u32	m0, m0, s3						//every buffer_store_lds does 256 bytes
	s_add_u32	s_save_mem_offset, s_save_mem_offset, s3
	v_add_nc_u32	v0, v0, 256						//mem offset increased by 256 bytes
	s_cmp_lt_u32	m0, s_save_alloc_size					//scc=(m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_SAVE_LDS_LOOP_W64					//LDS save is complete?
774
L_SAVE_LDS_DONE:
	/* save VGPRs  - set the Rest VGPRs */
// v0-v3 were saved at the start of the VGPR block earlier, so the loop
// below starts at VGPR index 4.
L_SAVE_VGPR:
	// VGPR SR memory offset: 0
	s_mov_b32	exec_lo, 0xFFFFFFFF					//need every thread from now on
	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_ENABLE_SAVE_VGPR_EXEC_HI
	s_mov_b32	s_save_mem_offset, (0+128*4)				// for the rest VGPRs
	s_mov_b32	exec_hi, 0x00000000
	s_branch	L_SAVE_VGPR_NORMAL
L_ENABLE_SAVE_VGPR_EXEC_HI:
	s_mov_b32	s_save_mem_offset, (0+256*4)				// for the rest VGPRs
	s_mov_b32	exec_hi, 0xFFFFFFFF
L_SAVE_VGPR_NORMAL:
	s_getreg_b32	s_save_alloc_size, hwreg(HW_REG_WAVE_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
	s_add_u32	s_save_alloc_size, s_save_alloc_size, 1
	s_lshl_b32	s_save_alloc_size, s_save_alloc_size, 2			//Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)
	//determine it is wave32 or wave64
	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_SAVE_VGPR_WAVE64

	// VGPR Allocated in 4-GPR granularity

	// VGPR store using dw burst
	s_mov_b32	m0, 0x4							//VGPR initial index value =4
	s_cmp_lt_u32	m0, s_save_alloc_size
	s_cbranch_scc0	L_SAVE_VGPR_END

L_SAVE_VGPR_W32_LOOP:
	// v_movrels reads v[N+m0]; copies the current 4-VGPR group into v0-v3.
	v_movrels_b32	v0, v0							//v0 = v[0+m0]
	v_movrels_b32	v1, v1							//v1 = v[1+m0]
	v_movrels_b32	v2, v2							//v2 = v[2+m0]
	v_movrels_b32	v3, v3							//v3 = v[3+m0]

	s_add_u32	s_save_addr_lo, s_save_base_addr_lo, s_save_mem_offset
	s_addc_u32	s_save_addr_hi, s_save_base_addr_hi, 0x0
	global_store_addtid_b32	v0, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS
	global_store_addtid_b32	v1, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS offset:128
	global_store_addtid_b32	v2, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS offset:128*2
	global_store_addtid_b32	v3, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS offset:128*3

	s_add_u32	m0, m0, 4						//next vgpr index
	s_add_u32	s_save_mem_offset, s_save_mem_offset, 128*4		//every buffer_store_dword does 128 bytes
	s_cmp_lt_u32	m0, s_save_alloc_size					//scc = (m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_SAVE_VGPR_W32_LOOP					//VGPR save is complete?

	s_branch	L_SAVE_VGPR_END
826
827L_SAVE_VGPR_WAVE64:
828	// VGPR store using dw burst
829	s_mov_b32	m0, 0x4							//VGPR initial index value =4
830	s_cmp_lt_u32	m0, s_save_alloc_size
831	s_cbranch_scc0	L_SAVE_VGPR_END
832
833L_SAVE_VGPR_W64_LOOP:
834	v_movrels_b32	v0, v0							//v0 = v[0+m0]
835	v_movrels_b32	v1, v1							//v1 = v[1+m0]
836	v_movrels_b32	v2, v2							//v2 = v[2+m0]
837	v_movrels_b32	v3, v3							//v3 = v[3+m0]
838
839	s_add_u32	s_save_addr_lo, s_save_base_addr_lo, s_save_mem_offset
840	s_addc_u32	s_save_addr_hi, s_save_base_addr_hi, 0x0
841	global_store_addtid_b32	v0, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS
842	global_store_addtid_b32	v1, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS offset:256
843	global_store_addtid_b32	v2, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS offset:256*2
844	global_store_addtid_b32	v3, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS offset:256*3
845
846	s_add_u32	m0, m0, 4						//next vgpr index
847	s_add_u32	s_save_mem_offset, s_save_mem_offset, 256*4		//every buffer_store_dword does 256 bytes
848	s_cmp_lt_u32	m0, s_save_alloc_size					//scc = (m0 < s_save_alloc_size) ? 1 : 0
849	s_cbranch_scc1	L_SAVE_VGPR_W64_LOOP					//VGPR save is complete?
850
851L_SAVE_VGPR_END:
852	s_branch	L_END_PGM
853
	// Context-restore entry point: rebuild the save-area base address from the
	// SPI-provided init registers, then restore LDS (first wave only), VGPRs,
	// SGPRs and hardware registers in that order.
854L_RESTORE:
855	s_mov_b32	s_restore_base_addr_lo, s_restore_spi_init_lo
856	s_and_b32	s_restore_base_addr_hi, s_restore_spi_init_hi, ADDRESS_HI32_MASK
857
858	// Save s_restore_spi_init_hi for later use.
859	s_mov_b32 s_restore_spi_init_hi_save, s_restore_spi_init_hi
860
861	//determine it is wave32 or wave64
862	get_wave_size2(s_restore_size)
863
	// s_and_b32 sets SCC = (result != 0): only the wave flagged as the first
	// wave of the group restores LDS; the others go straight to VGPRs.
864	s_and_b32	s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
865	s_cbranch_scc0	L_RESTORE_VGPR
866
867	/* restore LDS */
868L_RESTORE_LDS:
869	s_mov_b32	exec_lo, 0xFFFFFFFF					//need every thread from now on
870	s_lshr_b32	m0, s_restore_size, S_WAVE_SIZE
871	s_and_b32	m0, m0, 1
872	s_cmp_eq_u32	m0, 1
873	s_cbranch_scc1	L_ENABLE_RESTORE_LDS_EXEC_HI
874	s_mov_b32	exec_hi, 0x00000000
875	s_branch	L_RESTORE_LDS_NORMAL
876L_ENABLE_RESTORE_LDS_EXEC_HI:
877	s_mov_b32	exec_hi, 0xFFFFFFFF
878L_RESTORE_LDS_NORMAL:
879	s_getreg_b32	s_restore_alloc_size, hwreg(HW_REG_WAVE_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
	// AND with all-ones only to set SCC = (lds_size != 0).
880	s_and_b32	s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF	//lds_size is zero?
881	s_cbranch_scc0	L_RESTORE_VGPR						//no lds used? jump to L_RESTORE_VGPR
882	s_lshl_b32	s_restore_alloc_size, s_restore_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY
883
884	// LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
885	//
886	get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
887	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
888	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes()
889
	// Wave-size test: SCC is consumed only at the branch below; s_mov and the
	// VALU instructions in between do not write SCC, so it survives.
890	s_lshr_b32	m0, s_restore_size, S_WAVE_SIZE
891	s_and_b32	m0, m0, 1
892	s_cmp_eq_u32	m0, 1
893	s_mov_b32	m0, 0x0
894
	// v1 = lane_id * 4: per-lane LDS byte address for ds_store_b32.
895	v_mbcnt_lo_u32_b32	v1, -1, 0
896	v_mbcnt_hi_u32_b32	v1, -1, v1
897	v_lshlrev_b32		v1, 2, v1					// 0, 4, 8, ... 124 (W32) or 252 (W64)
898
899	s_cbranch_scc1	L_RESTORE_LDS_LOOP_W64
900
	// Mirror of the LDS save loops: load one row of the save area and write it
	// back into LDS, 128 bytes (wave32) or 256 bytes (wave64) per iteration.
901L_RESTORE_LDS_LOOP_W32:
902	s_add_u32	s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset
903	s_addc_u32	s_restore_addr_hi, s_restore_base_addr_hi, 0x0
904	global_load_addtid_b32	v0, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS
905	s_wait_idle
906	ds_store_b32	v1, v0
907	v_add_nc_u32	v1, v1, 128
908	s_add_u32	m0, m0, 128						// 128 DW
909	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 128		//mem offset increased by 128DW
910	s_cmp_lt_u32	m0, s_restore_alloc_size				//scc=(m0 < s_restore_alloc_size) ? 1 : 0
911	s_cbranch_scc1	L_RESTORE_LDS_LOOP_W32					//LDS restore is complete?
912	s_branch	L_RESTORE_VGPR
913
914L_RESTORE_LDS_LOOP_W64:
915	s_add_u32	s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset
916	s_addc_u32	s_restore_addr_hi, s_restore_base_addr_hi, 0x0
917	global_load_addtid_b32	v0, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS
918	s_wait_idle
919	ds_store_b32	v1, v0
920	v_add_nc_u32	v1, v1, 256
921	s_add_u32	m0, m0, 256						// 256 DW
922	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 256		//mem offset increased by 256DW
923	s_cmp_lt_u32	m0, s_restore_alloc_size				//scc=(m0 < s_restore_alloc_size) ? 1 : 0
924	s_cbranch_scc1	L_RESTORE_LDS_LOOP_W64					//LDS restore is complete?
925
926	/* restore VGPRs */
	// Restore VGPRs v4.. first via the loop, then v0..v3 last (they are used
	// as scratch by the loop itself).
927L_RESTORE_VGPR:
928	// VGPR SR memory offset : 0
929	s_mov_b32	s_restore_mem_offset, 0x0
930	s_mov_b32	exec_lo, 0xFFFFFFFF					//need every thread from now on
931	s_lshr_b32	m0, s_restore_size, S_WAVE_SIZE
932	s_and_b32	m0, m0, 1
933	s_cmp_eq_u32	m0, 1
934	s_cbranch_scc1	L_ENABLE_RESTORE_VGPR_EXEC_HI
935	s_mov_b32	exec_hi, 0x00000000
936	s_branch	L_RESTORE_VGPR_NORMAL
937L_ENABLE_RESTORE_VGPR_EXEC_HI:
938	s_mov_b32	exec_hi, 0xFFFFFFFF
939L_RESTORE_VGPR_NORMAL:
940	s_getreg_b32	s_restore_alloc_size, hwreg(HW_REG_WAVE_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
941	s_add_u32	s_restore_alloc_size, s_restore_alloc_size, 1
942	s_lshl_b32	s_restore_alloc_size, s_restore_alloc_size, 2		//Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)
943	//determine it is wave32 or wave64
944	s_lshr_b32	m0, s_restore_size, S_WAVE_SIZE
945	s_and_b32	m0, m0, 1
946	s_cmp_eq_u32	m0, 1
947	s_cbranch_scc1	L_RESTORE_VGPR_WAVE64
948
949	// VGPR load using dw burst
950	s_mov_b32	s_restore_mem_offset_save, s_restore_mem_offset		// restore starts at v4; v0..v3 are restored last
951	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 128*4
952	s_mov_b32	m0, 4							//VGPR initial index value = 4
953
	// NOTE(review): unlike the wave64 path below, there is no
	// "m0 < alloc_size" early-out here, so this loop always runs at least
	// once even when only v0..v3 are allocated -- confirm this is intended.
954L_RESTORE_VGPR_WAVE32_LOOP:
955	s_add_u32	s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset
956	s_addc_u32	s_restore_addr_hi, s_restore_base_addr_hi, 0x0
957	global_load_addtid_b32	v0, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS
958	global_load_addtid_b32	v1, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:128
959	global_load_addtid_b32	v2, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:128*2
960	global_load_addtid_b32	v3, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:128*3
961	s_wait_idle
962	v_movreld_b32	v0, v0							//v[0+m0] = v0
963	v_movreld_b32	v1, v1
964	v_movreld_b32	v2, v2
965	v_movreld_b32	v3, v3
966	s_add_u32	m0, m0, 4						//next vgpr index
967	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 128*4	//every buffer_load_dword does 128 bytes
968	s_cmp_lt_u32	m0, s_restore_alloc_size				//scc = (m0 < s_restore_alloc_size) ? 1 : 0
969	s_cbranch_scc1	L_RESTORE_VGPR_WAVE32_LOOP				//VGPR restore (except v0) is complete?
970
971	/* VGPR restore on v0..v3 (saved at the start of the VGPR save area) */
972	s_add_u32	s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset_save
973	s_addc_u32	s_restore_addr_hi, s_restore_base_addr_hi, 0x0
974	global_load_addtid_b32	v0, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS
975	global_load_addtid_b32	v1, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:128
976	global_load_addtid_b32	v2, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:128*2
977	global_load_addtid_b32	v3, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:128*3
978	s_wait_idle
979
980	s_branch	L_RESTORE_SGPR
981
	// Wave64 variant: identical structure with 256-byte VGPR rows.
982L_RESTORE_VGPR_WAVE64:
983	// VGPR load using dw burst
984	s_mov_b32	s_restore_mem_offset_save, s_restore_mem_offset		// restore start with v4, v0 will be the last
985	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 256*4
986	s_mov_b32	m0, 4							//VGPR initial index value = 4
987	s_cmp_lt_u32	m0, s_restore_alloc_size
988	s_cbranch_scc0	L_RESTORE_V0
989
990L_RESTORE_VGPR_WAVE64_LOOP:
991	s_add_u32	s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset
992	s_addc_u32	s_restore_addr_hi, s_restore_base_addr_hi, 0x0
993	global_load_addtid_b32	v0, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS
994	global_load_addtid_b32	v1, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:256
995	global_load_addtid_b32	v2, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:256*2
996	global_load_addtid_b32	v3, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:256*3
997	s_wait_idle
998	v_movreld_b32	v0, v0							//v[0+m0] = v0
999	v_movreld_b32	v1, v1
1000	v_movreld_b32	v2, v2
1001	v_movreld_b32	v3, v3
1002	s_add_u32	m0, m0, 4						//next vgpr index
1003	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 256*4	//every buffer_load_dword does 256 bytes
1004	s_cmp_lt_u32	m0, s_restore_alloc_size				//scc = (m0 < s_restore_alloc_size) ? 1 : 0
1005	s_cbranch_scc1	L_RESTORE_VGPR_WAVE64_LOOP				//VGPR restore (except v0) is complete?
1006
1007	/* VGPR restore on v0 */
1008L_RESTORE_V0:
1009	s_add_u32	s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset_save
1010	s_addc_u32	s_restore_addr_hi, s_restore_base_addr_hi, 0x0
1011	global_load_addtid_b32	v0, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS
1012	global_load_addtid_b32	v1, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:256
1013	global_load_addtid_b32	v2, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:256*2
1014	global_load_addtid_b32	v3, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:256*3
1015	s_wait_idle
1016
1017	/* restore SGPRs */
1018	//will be 2+8+16*6
1019	// SGPR SR memory offset : size(VGPR)
	// SGPRs are restored top-down: s[104:107] (b128), then s[96:103] (b256),
	// then 16 at a time (b512) down to s0. m0 is the movreld base index and is
	// decremented before each group; the address walks backward from the end
	// of the SGPR save area.
1020L_RESTORE_SGPR:
1021	get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
1022	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
1023	s_sub_u32	s_restore_mem_offset, s_restore_mem_offset, 24*4	// s[104:107]
1024	s_add_u32	s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset
1025	s_addc_u32	s_restore_addr_hi, s_restore_base_addr_hi, 0x0
1026
1027	s_mov_b32	m0, s_sgpr_save_num
1028
1029	s_load_b128	s0, [s_restore_addr_lo, s_restore_addr_hi], 0x0 scope:SCOPE_SYS
1030	s_wait_idle
1031
1032	s_sub_u32	m0, m0, 4						// Restore from S[0] to S[104]
1033	s_nop		0							// hazard SALU M0=> S_MOVREL
1034
1035	s_movreld_b64	s0, s0							//s[0+m0] = s0
1036	s_movreld_b64	s2, s2
1037
1038	s_sub_co_u32	s_restore_addr_lo, s_restore_addr_lo, 8*4		// s[96:103]
1039	s_sub_co_ci_u32	s_restore_addr_hi, s_restore_addr_hi, 0
1040	s_load_b256	s0, [s_restore_addr_lo, s_restore_addr_hi], 0x0 scope:SCOPE_SYS
1041	s_wait_idle
1042
1043	s_sub_u32	m0, m0, 8						// Restore from S[0] to S[96]
1044	s_nop		0							// hazard SALU M0=> S_MOVREL
1045
1046	s_movreld_b64	s0, s0							//s[0+m0] = s0
1047	s_movreld_b64	s2, s2
1048	s_movreld_b64	s4, s4
1049	s_movreld_b64	s6, s6
1050
1051 L_RESTORE_SGPR_LOOP:
1052	s_sub_co_u32	s_restore_addr_lo, s_restore_addr_lo, 16*4		// s[0,16,32,48,64,80]
1053	s_sub_co_ci_u32	s_restore_addr_hi, s_restore_addr_hi, 0
1054	s_load_b512	s0, [s_restore_addr_lo, s_restore_addr_hi], 0x0 scope:SCOPE_SYS
1055	s_wait_idle
1056
1057	s_sub_u32	m0, m0, 16						// Restore from S[n] to S[0]
1058	s_nop		0							// hazard SALU M0=> S_MOVREL
1059
1060	s_movreld_b64	s0, s0							//s[0+m0] = s0
1061	s_movreld_b64	s2, s2
1062	s_movreld_b64	s4, s4
1063	s_movreld_b64	s6, s6
1064	s_movreld_b64	s8, s8
1065	s_movreld_b64	s10, s10
1066	s_movreld_b64	s12, s12
1067	s_movreld_b64	s14, s14
1068
1069	s_cmp_eq_u32	m0, 0							//scc = (m0 == 0) ? 1 : 0 - all SGPRs restored?
1070	s_cbranch_scc0	L_RESTORE_SGPR_LOOP
1071
1072	// s_barrier with STATE_PRIV.TRAP_AFTER_INST=1, STATUS.PRIV=1 incorrectly asserts debug exception.
1073	// Clear DEBUG_EN before and restore MODE after the barrier.
1074	s_setreg_imm32_b32	hwreg(HW_REG_WAVE_MODE), 0
1075
1076	/* restore HW registers */
	// HWREG save-area layout (from the loads below, relative to the HWREG base):
	//   0x00 m0, 0x04 pc_lo, 0x08 pc_hi, 0x0C exec_lo, 0x10 exec_hi,
	//   0x14 state_priv, 0x18 excp_flag_priv, 0x1C xnack_mask, 0x20 mode,
	//   0x24/0x28 scratch base lo/hi, 0x2C excp_flag_user, 0x30 trap_ctrl,
	//   0x38 group barrier state, 0x3C cluster barrier state, 0x40 sched mode.
1077L_RESTORE_HWREG:
1078	// HWREG SR memory offset : size(VGPR)+size(SGPR)
1079	get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
1080	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
1081	s_add_u32	s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset
1082	s_addc_u32	s_restore_addr_hi, s_restore_base_addr_hi, 0x0
1083
1084	// Restore s_restore_spi_init_hi before the saved value gets clobbered.
1085	s_mov_b32 s_restore_spi_init_hi, s_restore_spi_init_hi_save
1086
1087	s_load_b32	s_restore_m0, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS
1088	s_load_b32	s_restore_pc_lo, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x4
1089	s_load_b32	s_restore_pc_hi, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x8
1090	s_load_b32	s_restore_exec_lo, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0xC
1091	s_load_b32	s_restore_exec_hi, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x10
1092	s_load_b32	s_restore_state_priv, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x14
1093	s_load_b32	s_restore_excp_flag_priv, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x18
1094	s_load_b32	s_restore_xnack_mask, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x1C
1095	s_load_b32	s_restore_mode, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x20
1096	s_load_b32	s_restore_flat_scratch, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x24
1097	s_wait_idle
1098
1099	s_setreg_b32	hwreg(HW_REG_WAVE_SCRATCH_BASE_LO), s_restore_flat_scratch
1100
	// s_restore_flat_scratch is reused for the high half of the scratch base.
1101	s_load_b32	s_restore_flat_scratch, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x28
1102	s_wait_idle
1103
1104	s_setreg_b32	hwreg(HW_REG_WAVE_SCRATCH_BASE_HI), s_restore_flat_scratch
1105
1106	s_load_b32	s_restore_tmp, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x2C
1107	s_wait_idle
1108	s_setreg_b32	hwreg(HW_REG_WAVE_EXCP_FLAG_USER), s_restore_tmp
1109
1110	s_load_b32	s_restore_tmp, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x30
1111	s_wait_idle
1112	s_setreg_b32	hwreg(HW_REG_WAVE_TRAP_CTRL), s_restore_tmp
1113
1114	// Only the first wave needs to restore group barriers.
1115	s_and_b32	s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
1116	s_cbranch_scc0	L_SKIP_GROUP_BARRIER_RESTORE
1117
1118	// Skip over WAVE_STATUS, since there is no state to restore from it
1119
1120	s_load_b32	s_restore_tmp, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x38
1121	s_wait_idle
1122
1123	// Skip group barriers if wave is not part of a group.
1124	s_bitcmp1_b32	s_restore_tmp, BARRIER_STATE_VALID_OFFSET
1125	s_cbranch_scc0	L_SKIP_GROUP_BARRIER_RESTORE
1126
1127	// Restore workgroup barrier signal count.
1128	restore_barrier_signal_count(-1)
1129
1130#if NUM_NAMED_BARRIERS
	// Re-create every named barrier (ids 1..NUM_NAMED_BARRIERS): re-init its
	// member count from the saved state, then replay its signal count.
1131	s_mov_b32	s_restore_mem_offset, NAMED_BARRIERS_SR_OFFSET_FROM_HWREG
1132	s_mov_b32	m0, 1
1133
1134L_RESTORE_NAMED_BARRIER_LOOP:
1135	s_load_b32	s_restore_tmp, [s_restore_addr_lo, s_restore_addr_hi], s_restore_mem_offset scope:SCOPE_SYS
1136	s_wait_kmcnt	0
1137	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 0x4
1138
1139	// Restore named barrier member count.
1140	s_bfe_u32	exec_lo, s_restore_tmp, (BARRIER_STATE_MEMBER_OFFSET | (BARRIER_STATE_MEMBER_SIZE << 16))
1141	s_lshl_b32	exec_lo, exec_lo, S_BARRIER_INIT_MEMBERCNT_SHIFT
1142	s_or_b32	m0, m0, exec_lo
1143	s_barrier_init	m0
	// Strip the member-count field so m0 is a plain barrier id again.
1144	s_andn2_b32	m0, m0, S_BARRIER_INIT_MEMBERCNT_MASK
1145
1146	// Restore named barrier signal count.
1147	restore_barrier_signal_count(m0)
1148
1149	s_add_u32	m0, m0, 1
1150	s_cmp_gt_u32	m0, NUM_NAMED_BARRIERS
1151	s_cbranch_scc0	L_RESTORE_NAMED_BARRIER_LOOP
1152#endif
1153
1154L_SKIP_GROUP_BARRIER_RESTORE:
1155#if HAVE_CLUSTER_BARRIER
1156	s_load_b32	s_restore_tmp, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x3C
1157	s_wait_kmcnt	0
1158
1159	// Skip cluster barrier restore if wave is not part of a cluster.
1160	s_bitcmp1_b32	s_restore_tmp, BARRIER_STATE_VALID_OFFSET
1161	s_cbranch_scc0	L_SKIP_CLUSTER_BARRIER_RESTORE
1162
1163	// Only the first wave in the group signals the trap cluster barrier.
1164	s_bitcmp1_b32	s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT
1165	s_cbranch_scc0	L_SKIP_TRAP_CLUSTER_BARRIER_SIGNAL
1166
1167	// Clear SCC: s_barrier_signal_isfirst -4 writes SCC=>1 but not SCC=>0.
1168	s_cmp_eq_u32	0, 1
1169	s_barrier_signal_isfirst	-4
1170L_SKIP_TRAP_CLUSTER_BARRIER_SIGNAL:
1171	s_barrier_wait	-4
1172
1173	// Only the first wave in the cluster restores the barrier.
1174	s_cbranch_scc0	L_SKIP_CLUSTER_BARRIER_RESTORE
1175
1176	// Restore cluster barrier signal count.
1177	restore_barrier_signal_count(-3)
1178L_SKIP_CLUSTER_BARRIER_RESTORE:
1179#endif
1180
1181#if ASIC_FAMILY >= CHIP_GC_12_0_3
1182	s_load_b32	s_restore_tmp, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x40
1183	s_wait_kmcnt	0
1184	s_setreg_b32	hwreg(HW_REG_WAVE_SCHED_MODE), s_restore_tmp
1185#endif
1186
	// Final state restore: put back m0 and exec, then the privileged wave
	// registers, trap temporaries, and finally return with s_rfe_b64.
1187	s_mov_b32	m0, s_restore_m0
1188	s_mov_b32	exec_lo, s_restore_exec_lo
1189	s_mov_b32	exec_hi, s_restore_exec_hi
1190
1191#if HAVE_XNACK
1192	s_setreg_b32	hwreg(HW_REG_WAVE_XNACK_MASK), s_restore_xnack_mask
1193#endif
1194
1195	// EXCP_FLAG_PRIV.SAVE_CONTEXT and HOST_TRAP may have changed.
1196	// Only restore the other fields to avoid clobbering them.
	// The saved value is written back in three setregs, shifting the source
	// right so each field lands at its own offset while the skipped fields
	// in between are left untouched.
1197	s_setreg_b32	hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV, 0, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_1_SIZE), s_restore_excp_flag_priv
1198	s_lshr_b32	s_restore_excp_flag_priv, s_restore_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SHIFT
1199	s_setreg_b32	hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SHIFT, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SIZE), s_restore_excp_flag_priv
1200	s_lshr_b32	s_restore_excp_flag_priv, s_restore_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT - SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SHIFT
1201	s_setreg_b32	hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SIZE), s_restore_excp_flag_priv
1202
1203	s_setreg_b32	hwreg(HW_REG_WAVE_MODE), s_restore_mode
1204
1205	// Restore trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
1206	// ttmp SR memory offset :
1207	// - gfx12:   size(VGPR)+size(SGPR)+0x40
1208	// - gfx12.5: size(VGPR)+size(SGPR)-0x40
1209	get_vgpr_size_bytes(s_restore_ttmps_lo, s_restore_size)
1210	s_add_u32	s_restore_ttmps_lo, s_restore_ttmps_lo, (get_sgpr_size_bytes() + TTMP_SR_OFFSET_FROM_HWREG)
1211	s_add_u32	s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_base_addr_lo
1212	s_addc_u32	s_restore_ttmps_hi, s_restore_base_addr_hi, 0x0
1213	s_load_dwordx4	[ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x10 scope:SCOPE_SYS
1214	s_load_dwordx4	[ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x20 scope:SCOPE_SYS
1215	s_load_dword	ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x34 scope:SCOPE_SYS
1216	s_wait_idle
1217
1218#if HAVE_XNACK
1219	restore_xnack_state_priv(s_restore_tmp)
1220#endif
1221
1222	s_and_b32	s_restore_pc_hi, s_restore_pc_hi, ADDRESS_HI32_MASK	//Do it here in order not to affect STATUS
	// Self-AND regenerates the read-only EXECZ/VCCZ status bits from the
	// just-restored exec/vcc values.
1223	s_and_b64	exec, exec, exec					// Restore STATUS.EXECZ, not writable by s_setreg_b32
1224	s_and_b64	vcc, vcc, vcc						// Restore STATUS.VCCZ, not writable by s_setreg_b32
1225
1226#if RELAXED_SCHEDULING_IN_TRAP
1227	// Assume relaxed scheduling mode after this point.
1228	restore_sched_mode(s_restore_tmp)
1229#endif
1230
1231	s_setreg_b32	hwreg(HW_REG_WAVE_STATE_PRIV), s_restore_state_priv	// SCC is included, which is changed by previous salu
1232
1233	// Make barrier and LDS state visible to all waves in the group/cluster.
1234	// STATE_PRIV.*BARRIER_COMPLETE may change after this point.
1235	wait_trap_barriers(s_restore_tmp, 0, 0)
1236
1237#if HAVE_CLUSTER_BARRIER
1238	// SCC is changed by wait_trap_barriers, restore it separately.
1239	s_lshr_b32	s_restore_state_priv, s_restore_state_priv, SQ_WAVE_STATE_PRIV_SCC_SHIFT
1240	s_setreg_b32	hwreg(HW_REG_WAVE_STATE_PRIV, SQ_WAVE_STATE_PRIV_SCC_SHIFT, 1), s_restore_state_priv
1241#endif
1242
1243	s_rfe_b64	s_restore_pc_lo						//Return to the main shader program and resume execution
1244
	// Save path exit: terminate the wave after the context has been saved.
1245L_END_PGM:
1246	// Make sure that no wave of the group/cluster can exit the trap handler
1247	// before the group/cluster barrier state is saved.
1248	wait_trap_barriers(s_restore_tmp, 0, 0)
1249
1250	s_endpgm_saved
1251end
1252
	// Pack one SGPR value into lane m0 of v2 (staging for a later vector
	// store of the HWREG save block) and advance the lane index.
1253function write_hwreg_to_v2(s)
1254	// Copy into VGPR for later TCP store.
1255	v_writelane_b32	v2, s, m0
1256	s_add_u32	m0, m0, 0x1
1257end
1258
1259
	// Pack 16 consecutive SGPRs (s[0]..s[15] relative to the argument) into
	// consecutive lanes of v2, using ttmp13 as the running lane index.
1260function write_16sgpr_to_v2(s)
1261	// Copy into VGPR for later TCP store.
1262	for var sgpr_idx = 0; sgpr_idx < 16; sgpr_idx ++
1263		v_writelane_b32	v2, s[sgpr_idx], ttmp13
1264		s_add_u32	ttmp13, ttmp13, 0x1
1265	end
1266end
1267
	// Same as write_16sgpr_to_v2 but for a 12-SGPR group.
1268function write_12sgpr_to_v2(s)
1269	// Copy into VGPR for later TCP store.
1270	for var sgpr_idx = 0; sgpr_idx < 12; sgpr_idx ++
1271		v_writelane_b32	v2, s[sgpr_idx], ttmp13
1272		s_add_u32	ttmp13, ttmp13, 0x1
1273	end
1274end
1275
	// Compute the byte size of the VGPR region of the save area:
	// (vgpr_alloc + 1) granules * 4 VGPRs * wave_size lanes * 4 bytes.
	// Clobbers SCC (s_bitcmp1_b32).
1276function get_vgpr_size_bytes(s_vgpr_size_byte, s_size)
1277	s_getreg_b32	s_vgpr_size_byte, hwreg(HW_REG_WAVE_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
1278	s_add_u32	s_vgpr_size_byte, s_vgpr_size_byte, 1
1279	s_bitcmp1_b32	s_size, S_WAVE_SIZE
1280	s_cbranch_scc1	L_ENABLE_SHIFT_W64
1281	s_lshl_b32	s_vgpr_size_byte, s_vgpr_size_byte, (2+7)		//Number of VGPRs = (vgpr_size + 1) * 4 * 32 * 4   (non-zero value)
1282	s_branch	L_SHIFT_DONE
1283L_ENABLE_SHIFT_W64:
1284	s_lshl_b32	s_vgpr_size_byte, s_vgpr_size_byte, (2+8)		//Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4   (non-zero value)
1285L_SHIFT_DONE:
1286end
1287
	// Fixed byte size of the SGPR region of the save area.
1288function get_sgpr_size_bytes
1289	return 512
1290end
1291
	// Byte size of the HWREG region; gfx12.5 reserves a larger block
	// (it also stores named-barrier and sched-mode state there).
1292function get_hwreg_size_bytes
1293#if ASIC_FAMILY >= CHIP_GC_12_0_3
1294	return 512
1295#else
1296	return 128
1297#endif
1298end
1299
	// Read STATUS.WAVE64 and leave it at bit S_WAVE_SIZE of s_reg
	// (0 => wave32, 1 => wave64).
1300function get_wave_size2(s_reg)
1301	s_getreg_b32	s_reg, hwreg(HW_REG_WAVE_STATUS,SQ_WAVE_STATUS_WAVE64_SHIFT,SQ_WAVE_STATUS_WAVE64_SIZE)
1302	s_lshl_b32	s_reg, s_reg, S_WAVE_SIZE
1303end
1304
1305#if HAVE_XNACK
	// Stash XNACK_STATE_PRIV.{FIRST_REPLAY, REPLAY_W64H, FXPTR} into the
	// TTMP11_* bit ranges of ttmp11, then zero the register so the trap
	// handler's own memory accesses start from clean replay state.
1306function save_and_clear_xnack_state_priv(s_tmp)
1307	// Preserve and clear XNACK state before issuing further translations.
1308	// Save XNACK_STATE_PRIV.{FIRST_REPLAY, REPLAY_W64H, FXPTR} into ttmp11[22:14].
1309	s_andn2_b32	ttmp11, ttmp11, (TTMP11_FIRST_REPLAY_MASK | TTMP11_REPLAY_W64H_MASK | TTMP11_FXPTR_MASK)
1310
1311	s_getreg_b32	s_tmp, hwreg(HW_REG_WAVE_XNACK_STATE_PRIV, SQ_WAVE_XNACK_STATE_PRIV_FIRST_REPLAY_SHIFT, SQ_WAVE_XNACK_STATE_PRIV_FIRST_REPLAY_SIZE)
1312	s_lshl_b32	s_tmp, s_tmp, TTMP11_FIRST_REPLAY_SHIFT
1313	s_or_b32	ttmp11, ttmp11, s_tmp
1314
1315	s_getreg_b32	s_tmp, hwreg(HW_REG_WAVE_XNACK_STATE_PRIV, SQ_WAVE_XNACK_STATE_PRIV_REPLAY_W64H_SHIFT, SQ_WAVE_XNACK_STATE_PRIV_REPLAY_W64H_SIZE)
1316	s_lshl_b32	s_tmp, s_tmp, TTMP11_REPLAY_W64H_SHIFT
1317	s_or_b32	ttmp11, ttmp11, s_tmp
1318
1319	s_getreg_b32	s_tmp, hwreg(HW_REG_WAVE_XNACK_STATE_PRIV, SQ_WAVE_XNACK_STATE_PRIV_FXPTR_SHIFT, SQ_WAVE_XNACK_STATE_PRIV_FXPTR_SIZE)
1320	s_lshl_b32	s_tmp, s_tmp, TTMP11_FXPTR_SHIFT
1321	s_or_b32	ttmp11, ttmp11, s_tmp
1322
1323	s_setreg_imm32_b32	hwreg(HW_REG_WAVE_XNACK_STATE_PRIV), 0
1324end
1325
	// Inverse of save_and_clear_xnack_state_priv: write the three fields
	// stashed in ttmp11 back into XNACK_STATE_PRIV. s_setreg takes only the
	// low SIZE bits of the source, so a plain right-shift suffices per field.
1326function restore_xnack_state_priv(s_tmp)
1327	s_lshr_b32	s_tmp, ttmp11, TTMP11_FIRST_REPLAY_SHIFT
1328	s_setreg_b32	hwreg(HW_REG_WAVE_XNACK_STATE_PRIV, SQ_WAVE_XNACK_STATE_PRIV_FIRST_REPLAY_SHIFT, SQ_WAVE_XNACK_STATE_PRIV_FIRST_REPLAY_SIZE), s_tmp
1329
1330	s_lshr_b32	s_tmp, ttmp11, TTMP11_REPLAY_W64H_SHIFT
1331	s_setreg_b32	hwreg(HW_REG_WAVE_XNACK_STATE_PRIV, SQ_WAVE_XNACK_STATE_PRIV_REPLAY_W64H_SHIFT, SQ_WAVE_XNACK_STATE_PRIV_REPLAY_W64H_SIZE), s_tmp
1332
1333	s_lshr_b32	s_tmp, ttmp11, TTMP11_FXPTR_SHIFT
1334	s_setreg_b32	hwreg(HW_REG_WAVE_XNACK_STATE_PRIV, SQ_WAVE_XNACK_STATE_PRIV_FXPTR_SHIFT, SQ_WAVE_XNACK_STATE_PRIV_FXPTR_SIZE), s_tmp
1335end
1336#endif
1337
	// Rendezvous of all trap-handler waves on the trap barriers.
	// Cluster builds: waves in a workgroup first meet at group barrier -2;
	// one wave per group (the "isfirst" winner, or any wave not in a WG)
	// signals trap cluster barrier -4, then everyone waits on -4.
	// Non-cluster builds: simple signal+wait on barrier -2.
	// Clobbers SCC and, with serialize_wa, both temp registers.
1338function wait_trap_barriers(s_tmp1, s_tmp2, serialize_wa)
1339#if HAVE_CLUSTER_BARRIER
1340	// If not in a WG then wave cannot use s_barrier_signal_isfirst.
1341	s_getreg_b32	s_tmp1, hwreg(HW_REG_WAVE_STATUS)
1342	s_bitcmp0_b32	s_tmp1, SQ_WAVE_STATUS_IN_WG_SHIFT
1343	s_cbranch_scc1	L_TRAP_CLUSTER_BARRIER_SIGNAL
1344
1345	s_barrier_signal_isfirst	-2
1346	s_barrier_wait	-2
1347
1348	// Only the first wave in the group signals the trap cluster barrier.
1349	s_cbranch_scc0	L_SKIP_TRAP_CLUSTER_BARRIER_SIGNAL
1350
1351L_TRAP_CLUSTER_BARRIER_SIGNAL:
1352	s_barrier_signal	-4
1353
1354L_SKIP_TRAP_CLUSTER_BARRIER_SIGNAL:
1355	s_barrier_wait	-4
1356
1357#if CLUSTER_BARRIER_SERIALIZE_WORKAROUND
1358if serialize_wa
1359	// Trap cluster barrier may complete with a user cluster barrier in-flight.
1360	// This is indicated if user cluster member count and signal count are equal.
	// Poll the cluster barrier state until the in-flight user barrier drains
	// (or the wave turns out not to be in a cluster at all).
1361L_WAIT_USER_CLUSTER_BARRIER_COMPLETE:
1362	s_sendmsg_rtn_b32	s_tmp1, sendmsg(MSG_RTN_GET_CLUSTER_BARRIER_STATE)
1363	s_wait_kmcnt	0
1364	s_bitcmp0_b32	s_tmp1, BARRIER_STATE_VALID_OFFSET
1365	s_cbranch_scc1	L_NOT_IN_CLUSTER
1366
1367	s_bfe_u32	s_tmp2, s_tmp1, (BARRIER_STATE_MEMBER_OFFSET | (BARRIER_STATE_MEMBER_SIZE << 0x10))
1368	s_bfe_u32	s_tmp1, s_tmp1, (BARRIER_STATE_SIGNAL_OFFSET | (BARRIER_STATE_SIGNAL_SIZE << 0x10))
1369	s_cmp_eq_u32	s_tmp1, s_tmp2
1370	s_cbranch_scc1	L_WAIT_USER_CLUSTER_BARRIER_COMPLETE
1371end
1372L_NOT_IN_CLUSTER:
1373#endif
1374
1375#else
1376	s_barrier_signal	-2
1377	s_barrier_wait	-2
1378#endif
1379end
1380
1381#if RELAXED_SCHEDULING_IN_TRAP
	// Write the scheduling mode saved in ttmp11's TTMP11_SCHED_MODE field
	// back into HW_REG_WAVE_SCHED_MODE.
1382function restore_sched_mode(s_tmp)
1383	s_bfe_u32	s_tmp, ttmp11, (TTMP11_SCHED_MODE_SHIFT | (TTMP11_SCHED_MODE_SIZE << 0x10))
1384	s_setreg_b32	hwreg(HW_REG_WAVE_SCHED_MODE), s_tmp
1385end
1386#endif
1387
	// Replay the saved signal count onto barrier `barrier_id`: extract the
	// SIGNAL field from s_restore_tmp (the saved barrier state), then issue
	// that many s_barrier_signal operations. The self-AND only sets
	// SCC = (count != 0) for the loop exit test. Consumes s_restore_tmp.
1388function restore_barrier_signal_count(barrier_id)
1389	// extract the saved signal count from s_restore_tmp
1390	s_lshr_b32	s_restore_tmp, s_restore_tmp, BARRIER_STATE_SIGNAL_OFFSET
1391
1392	// We need to call s_barrier_signal repeatedly to restore the signal count
1393	// of the group/cluster barrier. The member count is already initialized.
1394L_BARRIER_RESTORE_LOOP:
1395	s_and_b32	s_restore_tmp, s_restore_tmp, s_restore_tmp
1396	s_cbranch_scc0	L_BARRIER_RESTORE_DONE
1397	s_barrier_signal	barrier_id
1398	s_add_i32	s_restore_tmp, s_restore_tmp, -1
1399	s_branch	L_BARRIER_RESTORE_LOOP
1400
1401L_BARRIER_RESTORE_DONE:
1402end
1403
1404#if HAVE_INSTRUCTION_FIXUP
1405function fixup_instruction
1406	// PC read may fault if memory violation has been asserted.
1407	// In this case no further progress is expected so fixup is not needed.
1408	s_bitcmp1_b32	s_save_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_SHIFT
1409	s_cbranch_scc1	L_FIXUP_DONE
1410
1411	// ttmp[0:1]: {7b'0} PC[56:0]
1412	// ttmp2, 3, 10, 13, 14, 15: free
1413	s_load_b64	[ttmp14, ttmp15], [ttmp0, ttmp1], 0 scope:SCOPE_CU	// Load the 2 instruction DW we are returning to
1414	s_wait_kmcnt	0
1415	s_load_b64	[ttmp2, ttmp3], [ttmp0, ttmp1], 8 scope:SCOPE_CU	// Load the next 2 instruction DW, just in case
1416	s_and_b32	ttmp10, ttmp14, 0x80000000				// Check bit 31 in the first DWORD
1417										// SCC set if ttmp10 is != 0, i.e. if bit 31 == 1
1418	s_cbranch_scc1	L_FIXUP_NOT_VOP12C					// If bit 31 is 1, we are not VOP1, VOP2, or VOP3C
1419	// Fall through here means bit 31 == 0, meaning we are VOP1, VOP2, or VOPC
1420	// Size of instruction depends on Opcode or SRC0_9
1421	// Check for VOP2 opcode
1422	s_bfe_u32	ttmp10, ttmp14, (25 | (6 << 0x10))			// Check bits 30:25 for VOP2 Opcode
1423	// VOP2 V_FMAMK_F64 of V_FMAAK_F64 has implied 64-bit literature, 3 DW
1424	s_sub_co_i32	ttmp13, ttmp10, 0x23					// V_FMAMK_F64 is 0x23, V_FMAAK_F64 is 0x24
1425	s_cmp_le_u32	ttmp13, 0x1						// 0==0x23, 1==0x24
1426	s_cbranch_scc1	L_FIXUP_THREE_DWORD					// If either, this is 3 DWORD inst
1427	// VOP2 V_FMAMK_F32, V_FMAAK_F32, V_FMAMK_F16, V_FMAAK_F16, 2 DW
1428	s_sub_co_i32	ttmp13, ttmp10, 0x2c					// V_FMAMK_F32 is 0x2c, V_FMAAK_F32 is 0x2d
1429	s_cmp_le_u32	ttmp13, 0x1						// 0==0x2c, 1==0x2d
1430	s_cbranch_scc1	L_FIXUP_TWO_DWORD					// If either, this is 2 DWORD inst
1431	s_sub_co_i32	ttmp13, ttmp10, 0x37					// V_FMAMK_F16 is 0x37, V_FMAAK_F16 is 0x38
1432	s_cmp_le_u32	ttmp13, 0x1						// 0==0x37, 1==0x38
1433	s_cbranch_scc1	L_FIXUP_TWO_DWORD					// If either, this is 2 DWORD inst
1434	// Check SRC0_9 for VOP1, VOP2, and VOPC
1435	s_and_b32	ttmp10, ttmp14, 0x1ff					// Check bits 8:0 for SRC0_9
1436	// Literal constant 64 is 3 DWORDs
1437	s_cmp_eq_u32	ttmp10, 0xfe						// 0xfe == 254 == Literal constant64
1438	s_cbranch_scc1	L_FIXUP_THREE_DWORD					// 3 DWORD inst
1439	// Literal constant 32, DPP16, DPP8, and DPP8FI are 2 DWORDs
1440	s_cmp_eq_u32	ttmp10, 0xff						// 0xff == 255 = Literal constant32
1441	s_cbranch_scc1	L_FIXUP_TWO_DWORD					// 2 DWORD inst
1442	s_cmp_eq_u32	ttmp10, 0xfa						// 0xfa == 250 = DPP16
1443	s_cbranch_scc1	L_FIXUP_TWO_DWORD					// 2 DWORD inst
1444	s_sub_co_i32	ttmp13, ttmp10, 0xe9					// DPP8 is 0xe9, DPP8FI is 0xea
1445	s_cmp_le_u32	ttmp13, 0x1						// 0==0xe9, 1==0xea
1446	s_cbranch_scc1	L_FIXUP_TWO_DWORD					// If either, this is 2 DWORD inst
1447	// Instruction is 1 DWORD otherwise
1448
1449L_FIXUP_ONE_DWORD:
1450	// Check if TTMP15 contains the value for S_SET_VGPR_MSB instruction
1451	s_and_b32	ttmp10, ttmp15, 0xffff0000				// Check encoding in upper 16 bits
1452	s_cmp_eq_u32	ttmp10, 0xbf860000					// Check if SOPP (9b'10_1111111) and S_SET_VGPR_MSB (7b'0000110)
1453	s_cbranch_scc0	L_FIXUP_DONE						// No problem, no fixup needed
1454	// VALU op followed by a S_SET_VGPR_MSB. Need to pull SIMM[15:8] to fix up MODE.*_VGPR_MSB
1455	s_bfe_u32	ttmp10, ttmp15, (14 | (2 << 0x10))			// Shift SIMM[15:14] over to 1:0, Dst
1456	s_and_b32	ttmp13, ttmp15, 0x3f00					// Mask to get SIMM[13:8] only
1457	s_lshr_b32	ttmp13, ttmp13, 6					// Shift SIMM[13:8] into 7:2, Src2, Src1, Src0
1458	s_or_b32	ttmp10, ttmp10, ttmp13					// Src2, Src1, Src0, Dst --> format in MODE register
1459	s_setreg_b32	hwreg(HW_REG_WAVE_MODE, 12, 8), ttmp10			// Write value into MODE[19:12]
1460	s_branch	L_FIXUP_DONE
1461
// Instruction at PC is not VOP1/VOP2/VOPC. Classify the remaining VALU
// encodings (VOP3/VOP3SD, VOPD, VOPD3, VOP3P, VOP3PX) by their opcode
// fields to determine the instruction length, then dispatch to the
// matching L_FIXUP_* handler.
1462L_FIXUP_NOT_VOP12C:
1463	// ttmp[0:1]: {7b'0} PC[56:0]
1464	// ttmp2: PC+2 value (not waitcnt'ed yet)
1465	// ttmp3: PC+3 value (not waitcnt'ed yet)
1466	// ttmp10, ttmp13: free
1467	// ttmp14: PC+0 value
1468	// ttmp15: PC+1 value
1469	// Not VOP1, VOP2, or VOPC.
1470	// Check if we are VOP3 or VOP3SD
1471	s_and_b32	ttmp10, ttmp14, 0xfc000000				// Bits 31:26
1472	s_cmp_eq_u32	ttmp10, 0xd4000000					// If 31:26 = 0x35, this is VOP3 or VOP3SD
1473	s_cbranch_scc1	L_FIXUP_CHECK_VOP3					// If VOP3 or VOP3SD, need to check SRC2_9, SRC1_9, SRC0_9
1474	// Not VOP1, VOP2, VOPC, VOP3, or VOP3SD.
1475	// Check for VOPD
1476	s_cmp_eq_u32	ttmp10, 0xc8000000					// If 31:26 = 0x32, this is VOPD
1477	s_cbranch_scc1	L_FIXUP_CHECK_VOPD					// If VOPD, need to check OpX, OpY, SRCX0 and SRCY0
1478	// Not VOP1, VOP2, VOPC, VOP3, VOP3SD, VOPD.
1479	// Check if we are VOPD3
1480	s_and_b32	ttmp10, ttmp14, 0xff000000				// Bits 31:24
1481	s_cmp_eq_u32	ttmp10, 0xcf000000					// If 31:24 = 0xcf, this is VOPD3
1482	s_cbranch_scc1	L_FIXUP_THREE_DWORD					// If VOPD3, 3 DWORD inst
1483	// Not VOP1, VOP2, VOPC, VOP3, VOP3SD, VOPD, or VOPD3.
1484	// Check if we are in the middle of VOP3PX.
1485	s_and_b32	ttmp13, ttmp14, 0xffff0000				// Bits 31:16
1486	s_cmp_eq_u32	ttmp13, 0xcc330000					// If 31:16 = 0xcc33, this is 8 bytes past VOP3PX
1487	s_cbranch_scc1	L_FIXUP_VOP3PX_MIDDLE
1488	s_cmp_eq_u32	ttmp13, 0xcc880000					// If 31:16 = 0xcc88, this is 8 bytes past VOP3PX
1489	s_cbranch_scc1	L_FIXUP_VOP3PX_MIDDLE
1490	// Might be in VOP3P, but we must ensure we are not VOP3PX2
1491	s_cmp_eq_u32	ttmp13, 0xcc350000					// If 31:16 = 0xcc35, this is VOP3PX2
1492	s_cbranch_scc1	L_FIXUP_DONE						// If VOP3PX2, no fixup needed
1493	s_cmp_eq_u32	ttmp13, 0xcc3a0000					// If 31:16 = 0xcc3a, this is VOP3PX2
1494	s_cbranch_scc1	L_FIXUP_DONE						// If VOP3PX2, no fixup needed
1495	// Check if we are VOP3P
1496	s_cmp_eq_u32	ttmp10, 0xcc000000					// If 31:24 = 0xcc, this is VOP3P
1497	s_cbranch_scc0	L_FIXUP_DONE						// Not in VOP3P, so instruction is not VOP1, VOP2,
1498										// VOPC, VOP3, VOP3SD, VOP3P, VOPD, or VOPD3
1499										// No fixup needed.
1500	// Fall-through if we are in VOP3P to check SRC2_9, SRC1_9, and SRC0_9
// VOP3/VOP3SD/VOP3P (base size 2 DWORDs): scan the three 9-bit source
// operand fields in the second instruction DWORD (ttmp15). A literal
// constant (255) or a DPP16/DPP8/DPP8FI modifier (250/233/234) in any
// source extends the instruction to 3 DWORDs.
1501L_FIXUP_CHECK_VOP3:
1502	// Start with Src0, which is in bits 8:0 of second instruction DW, ttmp15
1503	s_and_b32	ttmp10, ttmp15, 0x1ff					// Mask out unused bits
1504	// Src0_9 == Literal constant 32, DPP16, DPP8, and DPP8FI means 3 DWORDs
1505	s_cmp_eq_u32	ttmp10, 0xff						// 0xff == 255 = Literal constant32
1506	s_cbranch_scc1	L_FIXUP_THREE_DWORD					// 3 DWORD inst
1507	s_cmp_eq_u32	ttmp10, 0xfa						// 0xfa == 250 = DPP16
1508	s_cbranch_scc1	L_FIXUP_THREE_DWORD					// 3 DWORD inst
1509	s_sub_co_i32	ttmp10, ttmp10, 0xe9					// DPP8 is 0xe9, DPP8FI is 0xea
1510	s_cmp_le_u32	ttmp10, 0x1						// 0==0xe9, 1==0xea
1511	s_cbranch_scc1	L_FIXUP_THREE_DWORD					// If either, this is 3 DWORD inst
1512	s_and_b32	ttmp10, ttmp15, 0x3fe00					// Next is Src1, which is in 17:9
1513	s_cmp_eq_u32	ttmp10, 0x1fe00						// 0xff == 255 = Literal constant32
1514	s_cbranch_scc1	L_FIXUP_THREE_DWORD					// 3 DWORD inst
1515	s_and_b32	ttmp10, ttmp15, 0x7fc0000				// Next is Src2, which is in 26:18
1516	s_cmp_eq_u32	ttmp10, 0x3fc0000					// 0xff == 255 = Literal constant32
1517	s_cbranch_scc1	L_FIXUP_THREE_DWORD					// 3 DWORD inst
1518	s_branch	L_FIXUP_TWO_DWORD					// No special encodings, VOP3* is 2 Dword
1519
// VOPD (base size 2 DWORDs): a V_DUAL_FMAAK/FMAMK opcode in either half,
// or a literal constant (255) in SRCX0/SRCY0, extends it to 3 DWORDs.
1520L_FIXUP_CHECK_VOPD:
1521	// OpX being V_DUAL_FMA*K_F32 means 3 DWORDs
1522	s_bfe_u32	ttmp10, ttmp14, (22 | (4 << 0x10))			// OPX is bits 25:22
1523	s_sub_co_i32	ttmp10, ttmp10, 0x1					// V_DUAL_FMAAK_F32 is 0x1, V_DUAL_FMAMK_F32 is 0x2
1524	s_cmp_le_u32	ttmp10, 0x1						// 0==0x1, 1==0x2
1525	s_cbranch_scc1	L_FIXUP_THREE_DWORD					// If either, this is 3 DWORD inst
1526	// OpY being V_DUAL_FMA*K_F32 means 3 DWORDs
1527	s_bfe_u32	ttmp10, ttmp14, (17 | (5 << 0x10))			// OPY is bits 21:17
1528	s_sub_co_i32	ttmp10, ttmp10, 0x1					// V_DUAL_FMAAK_F32 is 0x1, V_DUAL_FMAMK_F32 is 0x2
1529	s_cmp_le_u32	ttmp10, 0x1						// 0==0x1, 1==0x2
1530	s_cbranch_scc1	L_FIXUP_THREE_DWORD					// If either, this is 3 DWORD inst
1531	// SRCX0 == Literal constant 32 means 3 DWORDs
1532	s_and_b32	ttmp10, ttmp14, 0x1ff					// SRCX0 is in bits 8:0 of 1st DWORD
1533	s_cmp_eq_u32	ttmp10, 0xff						// 0xff == 255 = Literal constant32
1534	s_cbranch_scc1	L_FIXUP_THREE_DWORD					// 3 DWORD inst
1535	// SRCY0 == Literal constant 32 means 3 DWORDs
1536	s_and_b32	ttmp10, ttmp15, 0x1ff					// SRCY0 is in bits 8:0 of 2nd DWORD
1537	s_cmp_eq_u32	ttmp10, 0xff						// 0xff == 255 = Literal constant32
1538	s_cbranch_scc1	L_FIXUP_THREE_DWORD					// 3 DWORD inst
1539										// If otherwise, no special encodings. Default VOPD is 2 Dword
1540										// Fall-thru if true, because this is a 2 DWORD inst
// Instruction at PC is 2 DWORDs: the candidate S_SET_VGPR_MSB is the
// DWORD at PC+2, previously loaded into ttmp2.
1541L_FIXUP_TWO_DWORD:
1542	s_wait_kmcnt	0							// Wait for PC+2 and PC+3 to arrive in ttmp2 and ttmp3
1543	s_mov_b32	ttmp15, ttmp2						// Move possible S_SET_VGPR_MSB into ttmp15
1544	s_branch	L_FIXUP_ONE_DWORD					// Go to common logic that checks if it is S_SET_VGPR_MSB
1545
// Instruction at PC is 3 DWORDs: the candidate S_SET_VGPR_MSB is the
// DWORD at PC+3, previously loaded into ttmp3.
1546L_FIXUP_THREE_DWORD:
1547	s_wait_kmcnt	0							// Wait for PC+2 and PC+3 to arrive in ttmp2 and ttmp3
1548	s_mov_b32	ttmp15, ttmp3						// Move possible S_SET_VGPR_MSB into ttmp15
1549	s_branch	L_FIXUP_ONE_DWORD					// Go to common logic that checks if it is S_SET_VGPR_MSB
1550
// PC points 8 bytes into a 4-DWORD VOP3PX instruction: rewind the saved
// 64-bit PC (ttmp[0:1]) to the start of the instruction, then handle the
// remaining half as a 2-DWORD instruction.
1551L_FIXUP_VOP3PX_MIDDLE:
1552	s_sub_co_u32	ttmp0, ttmp0, 8						// Rewind PC 8 bytes to beginning of instruction
1553	s_sub_co_ci_u32	ttmp1, ttmp1, 0						// Propagate the borrow into the upper PC bits
1554	s_branch	L_FIXUP_TWO_DWORD					// 2 DWORD inst (2nd half of a 4 DWORD inst)
1555
// Fixup complete (or none needed). Drain outstanding scalar-memory loads
// before leaving so ttmp2/ttmp3 are not written after this point.
1556L_FIXUP_DONE:
1557	s_wait_kmcnt	0							// Ensure load of ttmp2 and ttmp3 is done
1558end
1559#endif
1560