/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

/* To compile this assembly code:
 *
 * gfx12:
 *   cpp -DASIC_FAMILY=CHIP_GFX12 cwsr_trap_handler_gfx12.asm -P -o gfx12.sp3
 *   sp3 gfx12.sp3 -hex gfx12.hex
 */

#define CHIP_GFX12 37

#define SINGLE_STEP_MISSED_WORKAROUND 1	//workaround for lost TRAP_AFTER_INST exception when SAVECTX raised
#define HAVE_VALU_SGPR_HAZARD (ASIC_FAMILY == CHIP_GFX12)

var SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK	= 0x4
var SQ_WAVE_STATE_PRIV_SCC_SHIFT		= 9
var SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK		= 0xC00
var SQ_WAVE_STATE_PRIV_HALT_MASK		= 0x4000
var SQ_WAVE_STATE_PRIV_POISON_ERR_MASK		= 0x8000
var SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT		= 15
var SQ_WAVE_STATUS_WAVE64_SHIFT			= 29
var SQ_WAVE_STATUS_WAVE64_SIZE			= 1
var SQ_WAVE_STATUS_NO_VGPRS_SHIFT		= 24
var SQ_WAVE_STATE_PRIV_ALWAYS_CLEAR_MASK	= SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK|SQ_WAVE_STATE_PRIV_POISON_ERR_MASK
var S_SAVE_PC_HI_TRAP_ID_MASK			= 0xF0000000

var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT		= 12
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE		= 9
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE		= 8
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT		= 12
var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT	= 24
var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE	= 4
var SQ_WAVE_LDS_ALLOC_GRANULARITY		= 9

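// Worked example for the LDS_SIZE decode (illustrative field value, not from
// the register spec): the field is shifted by SQ_WAVE_LDS_ALLOC_GRANULARITY
// to convert to bytes, so a field value of 16 decodes to 16 << 9 = 8192
// bytes, which is the byte count the LDS save/restore loops compare m0
// against.
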
var SQ_WAVE_EXCP_FLAG_PRIV_ADDR_WATCH_MASK	= 0xF
var SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_MASK	= 0x10
var SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT	= 5
var SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_MASK	= 0x20
var SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_MASK	= 0x40
var SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT	= 6
var SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK	= 0x80
var SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_SHIFT	= 7
var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_MASK	= 0x100
var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT	= 8
var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_MASK	= 0x200
var SQ_WAVE_EXCP_FLAG_PRIV_TRAP_AFTER_INST_MASK	= 0x800
var SQ_WAVE_TRAP_CTRL_ADDR_WATCH_MASK		= 0x80
var SQ_WAVE_TRAP_CTRL_TRAP_AFTER_INST_MASK	= 0x200

var SQ_WAVE_EXCP_FLAG_PRIV_NON_MASKABLE_EXCP_MASK= SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_MASK		|\
						  SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_MASK	|\
						  SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK		|\
						  SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_MASK	|\
						  SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_MASK		|\
						  SQ_WAVE_EXCP_FLAG_PRIV_TRAP_AFTER_INST_MASK
var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_1_SIZE	= SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT
var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SHIFT	= SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT
var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SIZE	= SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_SHIFT - SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT
var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT	= SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT
var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SIZE	= 32 - SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT
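
// Bit-layout sketch of the three restore parts above (values follow from the
// shifts defined in this file): part 1 writes bits [4:0], stopping below
// SAVE_CONTEXT (bit 5); part 2 writes only ILLEGAL_INST (bit 6), since its
// size is 7 - 6 = 1, which skips HOST_TRAP (bit 7); part 3 writes bits [31:8]
// from WAVE_START upward. The skipped bits are exactly the ones the restore
// path must not clobber because they may have changed since the save.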

var SQ_WAVE_SCHED_MODE_DEP_MODE_SHIFT		= 0
var SQ_WAVE_SCHED_MODE_DEP_MODE_SIZE		= 2

var BARRIER_STATE_SIGNAL_OFFSET			= 16
var BARRIER_STATE_VALID_OFFSET			= 0

var TTMP11_SCHED_MODE_SHIFT			= 26
var TTMP11_SCHED_MODE_SIZE			= 2
var TTMP11_SCHED_MODE_MASK			= 0xC000000
var TTMP11_DEBUG_TRAP_ENABLED_SHIFT		= 23
var TTMP11_DEBUG_TRAP_ENABLED_MASK		= 0x800000

// SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14]
// when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
var S_SAVE_BUF_RSRC_WORD1_STRIDE		= 0x00040000
var S_SAVE_BUF_RSRC_WORD3_MISC			= 0x10807FAC
var S_SAVE_SPI_INIT_FIRST_WAVE_MASK		= 0x04000000
var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT		= 26

var S_SAVE_PC_HI_FIRST_WAVE_MASK		= 0x80000000
var S_SAVE_PC_HI_FIRST_WAVE_SHIFT		= 31

var s_sgpr_save_num				= 108

var s_save_spi_init_lo				= exec_lo
var s_save_spi_init_hi				= exec_hi
var s_save_pc_lo				= ttmp0
var s_save_pc_hi				= ttmp1
var s_save_exec_lo				= ttmp2
var s_save_exec_hi				= ttmp3
var s_save_state_priv				= ttmp12
var s_save_excp_flag_priv			= ttmp15
var s_save_xnack_mask				= s_save_excp_flag_priv
var s_wave_size					= ttmp7
var s_save_buf_rsrc0				= ttmp8
var s_save_buf_rsrc1				= ttmp9
var s_save_buf_rsrc2				= ttmp10
var s_save_buf_rsrc3				= ttmp11
var s_save_mem_offset				= ttmp4
var s_save_alloc_size				= s_save_excp_flag_priv
var s_save_tmp					= ttmp14
var s_save_m0					= ttmp5
var s_save_ttmps_lo				= s_save_tmp
var s_save_ttmps_hi				= s_save_excp_flag_priv

var S_RESTORE_BUF_RSRC_WORD1_STRIDE		= S_SAVE_BUF_RSRC_WORD1_STRIDE
var S_RESTORE_BUF_RSRC_WORD3_MISC		= S_SAVE_BUF_RSRC_WORD3_MISC

var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK		= 0x04000000
var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT		= 26
var S_WAVE_SIZE					= 25

var s_restore_spi_init_lo			= exec_lo
var s_restore_spi_init_hi			= exec_hi
var s_restore_mem_offset			= ttmp12
var s_restore_alloc_size			= ttmp3
var s_restore_tmp				= ttmp2
var s_restore_mem_offset_save			= s_restore_tmp
var s_restore_m0				= s_restore_alloc_size
var s_restore_mode				= ttmp7
var s_restore_flat_scratch			= s_restore_tmp
var s_restore_pc_lo				= ttmp0
var s_restore_pc_hi				= ttmp1
var s_restore_exec_lo				= ttmp4
var s_restore_exec_hi				= ttmp5
var s_restore_state_priv			= ttmp14
var s_restore_excp_flag_priv			= ttmp15
var s_restore_xnack_mask			= ttmp13
var s_restore_buf_rsrc0				= ttmp8
var s_restore_buf_rsrc1				= ttmp9
var s_restore_buf_rsrc2				= ttmp10
var s_restore_buf_rsrc3				= ttmp11
var s_restore_size				= ttmp6
var s_restore_ttmps_lo				= s_restore_tmp
var s_restore_ttmps_hi				= s_restore_alloc_size
var s_restore_spi_init_hi_save			= s_restore_exec_hi

shader main
	asic(DEFAULT)
	type(CS)
	wave_size(32)

	s_branch	L_SKIP_RESTORE						//NOT restore. might be a regular trap or save

L_JUMP_TO_RESTORE:
	s_branch	L_RESTORE

L_SKIP_RESTORE:
	// Assume most relaxed scheduling mode is set. Save and revert to normal mode.
	s_getreg_b32	ttmp2, hwreg(HW_REG_WAVE_SCHED_MODE)
	s_wait_alu	0
	s_setreg_imm32_b32	hwreg(HW_REG_WAVE_SCHED_MODE, \
		SQ_WAVE_SCHED_MODE_DEP_MODE_SHIFT, SQ_WAVE_SCHED_MODE_DEP_MODE_SIZE), 0

	s_getreg_b32	s_save_state_priv, hwreg(HW_REG_WAVE_STATE_PRIV)	//save STATE_PRIV since we will change SCC

	// Save SCHED_MODE[1:0] into ttmp11[27:26].
	s_andn2_b32	ttmp11, ttmp11, TTMP11_SCHED_MODE_MASK
	s_lshl_b32	ttmp2, ttmp2, TTMP11_SCHED_MODE_SHIFT
	s_or_b32	ttmp11, ttmp11, ttmp2

	// Clear SPI_PRIO: do not save with elevated priority.
	// Clear ECC_ERR: prevents SQC store and triggers FATAL_HALT if setreg'd.
	s_andn2_b32	s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_ALWAYS_CLEAR_MASK

	s_getreg_b32	s_save_excp_flag_priv, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV)

	s_and_b32       ttmp2, s_save_state_priv, SQ_WAVE_STATE_PRIV_HALT_MASK
	s_cbranch_scc0	L_NOT_HALTED

L_HALTED:
	// Host trap may occur while wave is halted.
	s_and_b32	ttmp2, s_save_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK
	s_cbranch_scc1	L_FETCH_2ND_TRAP

L_CHECK_SAVE:
	s_and_b32	ttmp2, s_save_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_MASK
	s_cbranch_scc1	L_SAVE

	// Wave is halted but neither host trap nor SAVECTX is raised.
	// Caused by instruction fetch memory violation.
	// Spin wait until context saved to prevent interrupt storm.
	s_sleep		0x10
	s_getreg_b32	s_save_excp_flag_priv, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV)
	s_branch	L_CHECK_SAVE

L_NOT_HALTED:
	// Let second-level handle non-SAVECTX exception or trap.
	// Any concurrent SAVECTX will be handled upon re-entry once halted.

	// Check non-maskable exceptions. memory_violation, illegal_instruction
	// and xnack_error exceptions always cause the wave to enter the trap
	// handler.
	s_and_b32	ttmp2, s_save_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_NON_MASKABLE_EXCP_MASK
	s_cbranch_scc1	L_FETCH_2ND_TRAP

	// Check for maskable exceptions in trapsts.excp and trapsts.excp_hi.
	// Maskable exceptions only cause the wave to enter the trap handler if
	// their respective bit in mode.excp_en is set.
	s_getreg_b32	ttmp2, hwreg(HW_REG_WAVE_EXCP_FLAG_USER)
	s_and_b32	ttmp3, s_save_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_ADDR_WATCH_MASK
	s_cbranch_scc0	L_NOT_ADDR_WATCH
	s_or_b32	ttmp2, ttmp2, SQ_WAVE_TRAP_CTRL_ADDR_WATCH_MASK

L_NOT_ADDR_WATCH:
	s_getreg_b32	ttmp3, hwreg(HW_REG_WAVE_TRAP_CTRL)
	s_and_b32	ttmp2, ttmp3, ttmp2
	s_cbranch_scc1	L_FETCH_2ND_TRAP

L_CHECK_TRAP_ID:
	// Check trap_id != 0
	s_and_b32	ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK
	s_cbranch_scc1	L_FETCH_2ND_TRAP

#if SINGLE_STEP_MISSED_WORKAROUND
	// Prioritize single step exception over context save.
	// Second-level trap will halt wave and RFE, re-entering for SAVECTX.
	// WAVE_TRAP_CTRL is already in ttmp3.
	s_and_b32	ttmp3, ttmp3, SQ_WAVE_TRAP_CTRL_TRAP_AFTER_INST_MASK
	s_cbranch_scc1	L_FETCH_2ND_TRAP
#endif

	s_and_b32	ttmp2, s_save_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_MASK
	s_cbranch_scc1	L_SAVE

L_FETCH_2ND_TRAP:
	// Read second-level TBA/TMA from first-level TMA and jump if available.
	// ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data)
	// ttmp12 holds SQ_WAVE_STATUS
	s_sendmsg_rtn_b64       [ttmp14, ttmp15], sendmsg(MSG_RTN_GET_TMA)
	s_wait_idle
	s_lshl_b64	[ttmp14, ttmp15], [ttmp14, ttmp15], 0x8

	s_bitcmp1_b32	ttmp15, 0xF
	s_cbranch_scc0	L_NO_SIGN_EXTEND_TMA
	s_or_b32	ttmp15, ttmp15, 0xFFFF0000
L_NO_SIGN_EXTEND_TMA:
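	// Worked example of the reconstruction above (illustrative address):
	// MSG_RTN_GET_TMA returns the TMA byte address pre-shifted right by 8,
	// so TMA 0x8000_00000000 arrives as 0x80_00000000. After the s_lshl_b64
	// ttmp15 holds 0x00008000; bit 0xF (address bit 47) is set, so the OR
	// with 0xFFFF0000 sign-extends ttmp15 to 0xFFFF8000, yielding the
	// canonical 64-bit address 0xFFFF8000_00000000.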
#if ASIC_FAMILY == CHIP_GFX12
	// Move SCHED_MODE[1:0] from ttmp11 to unused bits in ttmp1[27:26] (return PC_HI).
	// The second-level trap will restore from ttmp1 for backwards compatibility.
	s_and_b32	ttmp2, ttmp11, TTMP11_SCHED_MODE_MASK
	s_andn2_b32	ttmp1, ttmp1, TTMP11_SCHED_MODE_MASK
	s_or_b32	ttmp1, ttmp1, ttmp2
#endif

	s_load_dword    ttmp2, [ttmp14, ttmp15], 0x10 scope:SCOPE_SYS		// debug trap enabled flag
	s_wait_idle
	s_lshl_b32      ttmp2, ttmp2, TTMP11_DEBUG_TRAP_ENABLED_SHIFT
	s_andn2_b32     ttmp11, ttmp11, TTMP11_DEBUG_TRAP_ENABLED_MASK
	s_or_b32        ttmp11, ttmp11, ttmp2

	s_load_dwordx2	[ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 scope:SCOPE_SYS	// second-level TBA
	s_wait_idle
	s_load_dwordx2	[ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 scope:SCOPE_SYS	// second-level TMA
	s_wait_idle

	s_and_b64	[ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3]
	s_cbranch_scc0	L_NO_NEXT_TRAP						// second-level trap handler has not been set
	s_setpc_b64	[ttmp2, ttmp3]						// jump to second-level trap handler

L_NO_NEXT_TRAP:
	// If not caused by trap then halt wave to prevent re-entry.
	s_and_b32	ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK
	s_cbranch_scc1	L_TRAP_CASE

	// Host trap will not cause trap re-entry.
	s_getreg_b32	ttmp2, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV)
	s_and_b32	ttmp2, ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK
	s_cbranch_scc1	L_EXIT_TRAP
	s_or_b32	s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_HALT_MASK

	// If the PC points to S_ENDPGM then context save will fail if STATE_PRIV.HALT is set.
	// Rewind the PC to prevent this from occurring.
	s_sub_u32	ttmp0, ttmp0, 0x8
	s_subb_u32	ttmp1, ttmp1, 0x0

	s_branch	L_EXIT_TRAP

L_TRAP_CASE:
	// Advance past trap instruction to prevent re-entry.
	s_add_u32	ttmp0, ttmp0, 0x4
	s_addc_u32	ttmp1, ttmp1, 0x0

L_EXIT_TRAP:
	s_and_b32	ttmp1, ttmp1, 0xFFFF

	// Restore SQ_WAVE_STATUS.
	s_and_b64	exec, exec, exec					// Restore STATUS.EXECZ, not writable by s_setreg_b32
	s_and_b64	vcc, vcc, vcc						// Restore STATUS.VCCZ, not writable by s_setreg_b32

	// STATE_PRIV.BARRIER_COMPLETE may have changed since we read it.
	// Only restore fields which the trap handler changes.
	s_lshr_b32	s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_SCC_SHIFT

	// Assume relaxed scheduling mode after this point.
	restore_sched_mode(ttmp2)

	s_setreg_b32	hwreg(HW_REG_WAVE_STATE_PRIV, SQ_WAVE_STATE_PRIV_SCC_SHIFT, \
		SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT - SQ_WAVE_STATE_PRIV_SCC_SHIFT + 1), s_save_state_priv
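
	// Field arithmetic for the s_setreg above: SCC is bit 9 and POISON_ERR
	// is bit 15 of STATE_PRIV, so the write spans bits [15:9] with width
	// 15 - 9 + 1 = 7; lower fields such as BARRIER_COMPLETE (bit 2) are
	// deliberately left untouched.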

	s_rfe_b64	[ttmp0, ttmp1]

L_SAVE:
	// If VGPRs have been deallocated then terminate the wavefront.
	// It has no remaining program to run and cannot save without VGPRs.
	s_getreg_b32	s_save_tmp, hwreg(HW_REG_WAVE_STATUS)
	s_bitcmp1_b32	s_save_tmp, SQ_WAVE_STATUS_NO_VGPRS_SHIFT
	s_cbranch_scc0	L_HAVE_VGPRS
	s_endpgm
L_HAVE_VGPRS:

	s_and_b32	s_save_pc_hi, s_save_pc_hi, 0x0000ffff			//pc[47:32]
	s_mov_b32	s_save_tmp, 0
	s_setreg_b32	hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV, SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT, 1), s_save_tmp	//clear saveCtx bit

	/* inform SPI of readiness and wait for SPI's go signal */
	s_mov_b32	s_save_exec_lo, exec_lo					//save EXEC and use EXEC for the go signal from SPI
	s_mov_b32	s_save_exec_hi, exec_hi
	s_mov_b64	exec, 0x0						//clear EXEC to get ready to receive

	s_sendmsg_rtn_b64       [exec_lo, exec_hi], sendmsg(MSG_RTN_SAVE_WAVE)
	s_wait_idle

	// Save first_wave flag so we can clear high bits of save address.
	s_and_b32	s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK
	s_lshl_b32	s_save_tmp, s_save_tmp, (S_SAVE_PC_HI_FIRST_WAVE_SHIFT - S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT)
	s_or_b32	s_save_pc_hi, s_save_pc_hi, s_save_tmp
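
	// Shift arithmetic for the flag move above: SPI reports first_wave in
	// bit 26 of s_save_spi_init_hi and the handler parks it in bit 31 of
	// s_save_pc_hi, a left shift of 31 - 26 = 5 bits; PC[47:32] occupies
	// only the low half of s_save_pc_hi, so bit 31 is free.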

	// Trap temporaries must be saved via VGPR but all VGPRs are in use.
	// There is no ttmp space to hold the resource constant for VGPR save.
	// Save v0 by itself since it requires only two SGPRs.
	s_mov_b32	s_save_ttmps_lo, exec_lo
	s_and_b32	s_save_ttmps_hi, exec_hi, 0xFFFF
	s_mov_b32	exec_lo, 0xFFFFFFFF
	s_mov_b32	exec_hi, 0xFFFFFFFF
	global_store_dword_addtid	v0, [s_save_ttmps_lo, s_save_ttmps_hi] scope:SCOPE_SYS
	v_mov_b32	v0, 0x0
	s_mov_b32	exec_lo, s_save_ttmps_lo
	s_mov_b32	exec_hi, s_save_ttmps_hi

	// Save trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
	// ttmp SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)+0x40
	get_wave_size2(s_save_ttmps_hi)
	get_vgpr_size_bytes(s_save_ttmps_lo, s_save_ttmps_hi)
	get_svgpr_size_bytes(s_save_ttmps_hi)
	s_add_u32	s_save_ttmps_lo, s_save_ttmps_lo, s_save_ttmps_hi
	s_and_b32	s_save_ttmps_hi, s_save_spi_init_hi, 0xFFFF
	s_add_u32	s_save_ttmps_lo, s_save_ttmps_lo, get_sgpr_size_bytes()
	s_add_u32	s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo
	s_addc_u32	s_save_ttmps_hi, s_save_ttmps_hi, 0x0
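
	// Offset sketch for the ttmp area (illustrative sizes, not fixed): for
	// a wave64 with vgpr_size 15 and no shared VGPRs this computes
	// size(VGPR) + size(SVGPR) + size(SGPR) = 16384 + 0 + 512 = 16896 bytes
	// past the SPI-provided base; the 0x40-byte HWREG block that precedes
	// the ttmps is added via the store offset below.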

	v_writelane_b32	v0, ttmp4, 0x4
	v_writelane_b32	v0, ttmp5, 0x5
	v_writelane_b32	v0, ttmp6, 0x6
	v_writelane_b32	v0, ttmp7, 0x7
	v_writelane_b32	v0, ttmp8, 0x8
	v_writelane_b32	v0, ttmp9, 0x9
	v_writelane_b32	v0, ttmp10, 0xA
	v_writelane_b32	v0, ttmp11, 0xB
	v_writelane_b32	v0, ttmp13, 0xD
	v_writelane_b32	v0, exec_lo, 0xE
	v_writelane_b32	v0, exec_hi, 0xF
	valu_sgpr_hazard()

	s_mov_b32	exec_lo, 0x3FFF
	s_mov_b32	exec_hi, 0x0
	global_store_dword_addtid	v0, [s_save_ttmps_lo, s_save_ttmps_hi] offset:0x40 scope:SCOPE_SYS
	v_readlane_b32	ttmp14, v0, 0xE
	v_readlane_b32	ttmp15, v0, 0xF
	s_mov_b32	exec_lo, ttmp14
	s_mov_b32	exec_hi, ttmp15
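
	// Lane-to-offset arithmetic for the store above: with exec_lo = 0x3FFF
	// the addtid store writes lanes 0-13 of v0 as consecutive dwords at
	// base + 0x40, so ttmp4 (lane 0x4) lands at 0x40 + 0x4*4 = 0x50 and
	// ttmp13 (lane 0xD) at 0x40 + 0xD*4 = 0x74, matching the offsets the
	// s_load instructions in the restore path use to fetch them back.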

	/* setup Resource Constants */
	s_mov_b32	s_save_buf_rsrc0, s_save_spi_init_lo			//base_addr_lo
	s_and_b32	s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF	//base_addr_hi
	s_or_b32	s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
	s_mov_b32	s_save_buf_rsrc2, 0					//NUM_RECORDS initial value = 0 (in bytes) although not necessarily initialized
	s_mov_b32	s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC

	s_mov_b32	s_save_m0, m0

	/* global mem offset */
	s_mov_b32	s_save_mem_offset, 0x0
	get_wave_size2(s_wave_size)

	/* save first 4 VGPRs, needed for SGPR save */
	s_mov_b32	exec_lo, 0xFFFFFFFF					//need every thread from now on
	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_ENABLE_SAVE_4VGPR_EXEC_HI
	s_mov_b32	exec_hi, 0x00000000
	s_branch	L_SAVE_4VGPR_WAVE32
L_ENABLE_SAVE_4VGPR_EXEC_HI:
	s_mov_b32	exec_hi, 0xFFFFFFFF
	s_branch	L_SAVE_4VGPR_WAVE64
L_SAVE_4VGPR_WAVE32:
	s_mov_b32	s_save_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

	// VGPR Allocated in 4-GPR granularity

	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128
	buffer_store_dword	v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128*2
	buffer_store_dword	v3, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128*3
	s_branch	L_SAVE_HWREG

L_SAVE_4VGPR_WAVE64:
	s_mov_b32	s_save_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

	// VGPR Allocated in 4-GPR granularity

	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256
	buffer_store_dword	v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256*2
	buffer_store_dword	v3, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256*3

	/* save HW registers */

L_SAVE_HWREG:
	// HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
	get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
	get_svgpr_size_bytes(s_save_tmp)
	s_add_u32	s_save_mem_offset, s_save_mem_offset, s_save_tmp
	s_add_u32	s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()
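
	// Save-area layout sketch (sizes illustrative): for a wave64 with
	// vgpr_size 15 and no shared VGPRs the HWREG block starts at
	// 16384 + 0 + 512 = 16896. Overall order is VGPRs at offset 0, then
	// shared VGPRs, then the 512-byte SGPR block, then this 128-byte HWREG
	// block, then LDS.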

	s_mov_b32	s_save_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

	v_mov_b32	v0, 0x0							//Offset[31:0] from buffer resource
	v_mov_b32	v1, 0x0							//Offset[63:32] from buffer resource
	v_mov_b32	v2, 0x0							//Set of SGPRs for TCP store

	// Ensure no further changes to barrier or LDS state.
	// STATE_PRIV.BARRIER_COMPLETE may change up to this point.
	s_barrier_signal	-2
	s_barrier_wait	-2

	// Re-read final state of BARRIER_COMPLETE field for save.
	s_getreg_b32	s_save_tmp, hwreg(HW_REG_WAVE_STATE_PRIV)
	s_and_b32	s_save_tmp, s_save_tmp, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK
	s_andn2_b32	s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK
	s_or_b32	s_save_state_priv, s_save_state_priv, s_save_tmp

	s_andn2_b32	s_save_tmp, s_save_pc_hi, S_SAVE_PC_HI_FIRST_WAVE_MASK
	v_writelane_b32	v2, s_save_m0, 0x0
	v_writelane_b32	v2, s_save_pc_lo, 0x1
	v_writelane_b32	v2, s_save_tmp, 0x2
	v_writelane_b32	v2, s_save_exec_lo, 0x3
	v_writelane_b32	v2, s_save_exec_hi, 0x4
	v_writelane_b32	v2, s_save_state_priv, 0x5
	v_writelane_b32	v2, s_save_xnack_mask, 0x7
	valu_sgpr_hazard()

	s_getreg_b32	s_save_tmp, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV)
	v_writelane_b32	v2, s_save_tmp, 0x6

	s_getreg_b32	s_save_tmp, hwreg(HW_REG_WAVE_MODE)
	v_writelane_b32	v2, s_save_tmp, 0x8

	s_getreg_b32	s_save_tmp, hwreg(HW_REG_WAVE_SCRATCH_BASE_LO)
	v_writelane_b32	v2, s_save_tmp, 0x9

	s_getreg_b32	s_save_tmp, hwreg(HW_REG_WAVE_SCRATCH_BASE_HI)
	v_writelane_b32	v2, s_save_tmp, 0xA

	s_getreg_b32	s_save_tmp, hwreg(HW_REG_WAVE_EXCP_FLAG_USER)
	v_writelane_b32	v2, s_save_tmp, 0xB

	s_getreg_b32	s_save_tmp, hwreg(HW_REG_WAVE_TRAP_CTRL)
	v_writelane_b32	v2, s_save_tmp, 0xC

	s_getreg_b32	s_save_tmp, hwreg(HW_REG_WAVE_STATUS)
	v_writelane_b32	v2, s_save_tmp, 0xD

	s_get_barrier_state s_save_tmp, -1
	s_wait_kmcnt (0)
	v_writelane_b32	v2, s_save_tmp, 0xE
	valu_sgpr_hazard()

	// Write HWREGs with 16 VGPR lanes. TTMPs occupy space after this.
	s_mov_b32       exec_lo, 0xFFFF
	s_mov_b32	exec_hi, 0x0
	buffer_store_dword	v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS

	// Write SGPRs with 32 VGPR lanes. This works in wave32 and wave64 mode.
	s_mov_b32       exec_lo, 0xFFFFFFFF

	/* save SGPRs */
	// Save SGPRs before LDS save, so that s0 to s4 can be used during LDS save...

	// SGPR SR memory offset : size(VGPR)+size(SVGPR)
	get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
	get_svgpr_size_bytes(s_save_tmp)
	s_add_u32	s_save_mem_offset, s_save_mem_offset, s_save_tmp
	s_mov_b32	s_save_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

	s_mov_b32	ttmp13, 0x0						//next VGPR lane to copy SGPR into

	s_mov_b32	m0, 0x0							//SGPR initial index value =0
	s_nop		0x0							//Manually inserted wait states
L_SAVE_SGPR_LOOP:
	// SGPR is allocated in 16 SGPR granularity
	s_movrels_b64	s0, s0							//s0 = s[0+m0], s1 = s[1+m0]
	s_movrels_b64	s2, s2							//s2 = s[2+m0], s3 = s[3+m0]
	s_movrels_b64	s4, s4							//s4 = s[4+m0], s5 = s[5+m0]
	s_movrels_b64	s6, s6							//s6 = s[6+m0], s7 = s[7+m0]
	s_movrels_b64	s8, s8							//s8 = s[8+m0], s9 = s[9+m0]
	s_movrels_b64	s10, s10						//s10 = s[10+m0], s11 = s[11+m0]
	s_movrels_b64	s12, s12						//s12 = s[12+m0], s13 = s[13+m0]
	s_movrels_b64	s14, s14						//s14 = s[14+m0], s15 = s[15+m0]

	s_cmp_eq_u32	ttmp13, 0x0
	s_cbranch_scc0	L_WRITE_V2_SECOND_HALF
	write_16sgpr_to_v2(s0, 0x0)
	s_branch	L_SAVE_SGPR_SKIP_TCP_STORE
L_WRITE_V2_SECOND_HALF:
	write_16sgpr_to_v2(s0, 0x10)

	buffer_store_dword	v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS
	s_add_u32	s_save_mem_offset, s_save_mem_offset, 0x80
	s_mov_b32	ttmp13, 0x0
	v_mov_b32	v2, 0x0
L_SAVE_SGPR_SKIP_TCP_STORE:

	s_add_u32	m0, m0, 16						//next sgpr index
	s_cmp_lt_u32	m0, 96							//scc = (m0 < 96) ? 1 : 0
	s_cbranch_scc1	L_SAVE_SGPR_LOOP					//first 96 SGPRs saved?

	//save the remaining 12 SGPRs
	s_movrels_b64	s0, s0							//s0 = s[0+m0], s1 = s[1+m0]
	s_movrels_b64	s2, s2							//s2 = s[2+m0], s3 = s[3+m0]
	s_movrels_b64	s4, s4							//s4 = s[4+m0], s5 = s[5+m0]
	s_movrels_b64	s6, s6							//s6 = s[6+m0], s7 = s[7+m0]
	s_movrels_b64	s8, s8							//s8 = s[8+m0], s9 = s[9+m0]
	s_movrels_b64	s10, s10						//s10 = s[10+m0], s11 = s[11+m0]
	write_12sgpr_to_v2(s0)

	buffer_store_dword	v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS

	/* save LDS */

L_SAVE_LDS:
	// Change EXEC to all threads...
	s_mov_b32	exec_lo, 0xFFFFFFFF					//need every thread from now on
	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_ENABLE_SAVE_LDS_EXEC_HI
	s_mov_b32	exec_hi, 0x00000000
	s_branch	L_SAVE_LDS_NORMAL
L_ENABLE_SAVE_LDS_EXEC_HI:
	s_mov_b32	exec_hi, 0xFFFFFFFF
L_SAVE_LDS_NORMAL:
	s_getreg_b32	s_save_alloc_size, hwreg(HW_REG_WAVE_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
	s_and_b32	s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF	//lds_size is zero?
	s_cbranch_scc0	L_SAVE_LDS_DONE						//no lds used? jump to L_SAVE_LDS_DONE

	s_and_b32	s_save_tmp, s_save_pc_hi, S_SAVE_PC_HI_FIRST_WAVE_MASK
	s_cbranch_scc0	L_SAVE_LDS_DONE

	// first wave does LDS save

	s_lshl_b32	s_save_alloc_size, s_save_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY
	s_mov_b32	s_save_buf_rsrc2, s_save_alloc_size			//NUM_RECORDS in bytes

	// LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)
	//
	get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
	get_svgpr_size_bytes(s_save_tmp)
	s_add_u32	s_save_mem_offset, s_save_mem_offset, s_save_tmp
	s_add_u32	s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()
	s_add_u32	s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()

	s_mov_b32	s_save_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

	//load 0~63*4(byte address) to vgpr v0
	v_mbcnt_lo_u32_b32	v0, -1, 0
	v_mbcnt_hi_u32_b32	v0, -1, v0
	v_mul_u32_u24	v0, 4, v0
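
	// v0 now holds each lane's LDS byte address: the two v_mbcnt steps
	// count the set exec bits below the lane to produce lane_id, and the
	// multiply by 4 converts that to a dword-aligned byte offset (lane 5
	// starts at LDS byte 20, for example).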

	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_mov_b32	m0, 0x0
	s_cbranch_scc1	L_SAVE_LDS_W64

L_SAVE_LDS_W32:
	s_mov_b32	s3, 128
	s_nop		0
	s_nop		0
	s_nop		0
L_SAVE_LDS_LOOP_W32:
	ds_read_b32	v1, v0
	s_wait_idle
	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS

	s_add_u32	m0, m0, s3						//every buffer_store_dword does 128 bytes
	s_add_u32	s_save_mem_offset, s_save_mem_offset, s3
	v_add_nc_u32	v0, v0, 128						//mem offset increased by 128 bytes
	s_cmp_lt_u32	m0, s_save_alloc_size					//scc=(m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_SAVE_LDS_LOOP_W32					//LDS save is complete?

	s_branch	L_SAVE_LDS_DONE

L_SAVE_LDS_W64:
	s_mov_b32	s3, 256
	s_nop		0
	s_nop		0
	s_nop		0
L_SAVE_LDS_LOOP_W64:
	ds_read_b32	v1, v0
	s_wait_idle
	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS

	s_add_u32	m0, m0, s3						//every buffer_store_dword does 256 bytes
	s_add_u32	s_save_mem_offset, s_save_mem_offset, s3
	v_add_nc_u32	v0, v0, 256						//mem offset increased by 256 bytes
	s_cmp_lt_u32	m0, s_save_alloc_size					//scc=(m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_SAVE_LDS_LOOP_W64					//LDS save is complete?

L_SAVE_LDS_DONE:
	/* save VGPRs - save the remaining VGPRs */
L_SAVE_VGPR:
	// VGPR SR memory offset: 0
	s_mov_b32	exec_lo, 0xFFFFFFFF					//need every thread from now on
	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_ENABLE_SAVE_VGPR_EXEC_HI
	s_mov_b32	s_save_mem_offset, (0+128*4)				// for the remaining VGPRs
	s_mov_b32	exec_hi, 0x00000000
	s_branch	L_SAVE_VGPR_NORMAL
L_ENABLE_SAVE_VGPR_EXEC_HI:
	s_mov_b32	s_save_mem_offset, (0+256*4)				// for the remaining VGPRs
	s_mov_b32	exec_hi, 0xFFFFFFFF
L_SAVE_VGPR_NORMAL:
	s_getreg_b32	s_save_alloc_size, hwreg(HW_REG_WAVE_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
	s_add_u32	s_save_alloc_size, s_save_alloc_size, 1
	s_lshl_b32	s_save_alloc_size, s_save_alloc_size, 2			//Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)
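
	// Decode example for the VGPR_SIZE field (illustrative value): a field
	// value of 15 means (15 + 1) * 4 = 64 allocated VGPRs, so the save loop
	// below runs m0 = 4, 8, ... while m0 < 64.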
	//determine whether it is wave32 or wave64
	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_SAVE_VGPR_WAVE64

	s_mov_b32	s_save_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

	// VGPR Allocated in 4-GPR granularity

	// VGPR store using dw burst
	s_mov_b32	m0, 0x4							//VGPR initial index value =4
	s_cmp_lt_u32	m0, s_save_alloc_size
	s_cbranch_scc0	L_SAVE_VGPR_END

L_SAVE_VGPR_W32_LOOP:
	v_movrels_b32	v0, v0							//v0 = v[0+m0]
	v_movrels_b32	v1, v1							//v1 = v[1+m0]
	v_movrels_b32	v2, v2							//v2 = v[2+m0]
	v_movrels_b32	v3, v3							//v3 = v[3+m0]

	buffer_store_dword	v0, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS
	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128
	buffer_store_dword	v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128*2
	buffer_store_dword	v3, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128*3

	s_add_u32	m0, m0, 4						//next vgpr index
	s_add_u32	s_save_mem_offset, s_save_mem_offset, 128*4		//every buffer_store_dword does 128 bytes
	s_cmp_lt_u32	m0, s_save_alloc_size					//scc = (m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_SAVE_VGPR_W32_LOOP					//VGPR save is complete?

	s_branch	L_SAVE_VGPR_END

L_SAVE_VGPR_WAVE64:
	s_mov_b32	s_save_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

	// VGPR store using dw burst
	s_mov_b32	m0, 0x4							//VGPR initial index value =4
	s_cmp_lt_u32	m0, s_save_alloc_size
	s_cbranch_scc0	L_SAVE_SHARED_VGPR

L_SAVE_VGPR_W64_LOOP:
	v_movrels_b32	v0, v0							//v0 = v[0+m0]
	v_movrels_b32	v1, v1							//v1 = v[1+m0]
	v_movrels_b32	v2, v2							//v2 = v[2+m0]
	v_movrels_b32	v3, v3							//v3 = v[3+m0]

	buffer_store_dword	v0, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS
	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256
	buffer_store_dword	v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256*2
	buffer_store_dword	v3, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256*3

	s_add_u32	m0, m0, 4						//next vgpr index
	s_add_u32	s_save_mem_offset, s_save_mem_offset, 256*4		//every buffer_store_dword does 256 bytes
	s_cmp_lt_u32	m0, s_save_alloc_size					//scc = (m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_SAVE_VGPR_W64_LOOP					//VGPR save is complete?

L_SAVE_SHARED_VGPR:
	s_getreg_b32	s_save_alloc_size, hwreg(HW_REG_WAVE_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)
	s_and_b32	s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF	//shared_vgpr_size is zero?
	s_cbranch_scc0	L_SAVE_VGPR_END						//no shared_vgpr used? jump to L_SAVE_VGPR_END
	s_lshl_b32	s_save_alloc_size, s_save_alloc_size, 3			//Number of SHARED_VGPRs = shared_vgpr_size * 8    (non-zero value)
	//m0 now holds the normal vgpr count; add the shared_vgpr count to m0 to get the total count.
	//shared_vgpr save will start from the index in m0
	s_add_u32	s_save_alloc_size, s_save_alloc_size, m0
	s_mov_b32	exec_lo, 0xFFFFFFFF
	s_mov_b32	exec_hi, 0x00000000

L_SAVE_SHARED_VGPR_WAVE64_LOOP:
	v_movrels_b32	v0, v0							//v0 = v[0+m0]
	buffer_store_dword	v0, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS
	s_add_u32	m0, m0, 1						//next vgpr index
	s_add_u32	s_save_mem_offset, s_save_mem_offset, 128
	s_cmp_lt_u32	m0, s_save_alloc_size					//scc = (m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_SAVE_SHARED_VGPR_WAVE64_LOOP				//SHARED_VGPR save is complete?

L_SAVE_VGPR_END:
	s_branch	L_END_PGM

L_RESTORE:
	/* Setup Resource Constants */
	s_mov_b32	s_restore_buf_rsrc0, s_restore_spi_init_lo		//base_addr_lo
	s_and_b32	s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF	//base_addr_hi
	s_or_b32	s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
	s_mov_b32	s_restore_buf_rsrc2, 0					//NUM_RECORDS initial value = 0 (in bytes)
	s_mov_b32	s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC

	// Save s_restore_spi_init_hi for later use.
	s_mov_b32 s_restore_spi_init_hi_save, s_restore_spi_init_hi

	//determine whether it is wave32 or wave64
	get_wave_size2(s_restore_size)

	s_and_b32	s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
	s_cbranch_scc0	L_RESTORE_VGPR

	/* restore LDS */
L_RESTORE_LDS:
	s_mov_b32	exec_lo, 0xFFFFFFFF					//need every thread from now on
	s_lshr_b32	m0, s_restore_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_ENABLE_RESTORE_LDS_EXEC_HI
	s_mov_b32	exec_hi, 0x00000000
	s_branch	L_RESTORE_LDS_NORMAL
L_ENABLE_RESTORE_LDS_EXEC_HI:
	s_mov_b32	exec_hi, 0xFFFFFFFF
L_RESTORE_LDS_NORMAL:
	s_getreg_b32	s_restore_alloc_size, hwreg(HW_REG_WAVE_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
	s_and_b32	s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF	//lds_size is zero?
	s_cbranch_scc0	L_RESTORE_VGPR						//no lds used? jump to L_RESTORE_VGPR
	s_lshl_b32	s_restore_alloc_size, s_restore_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY
	s_mov_b32	s_restore_buf_rsrc2, s_restore_alloc_size		//NUM_RECORDS in bytes

	// LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)
	//
	get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
	get_svgpr_size_bytes(s_restore_tmp)
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes()

	s_mov_b32	s_restore_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

	s_lshr_b32	m0, s_restore_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_mov_b32	m0, 0x0
	s_cbranch_scc1	L_RESTORE_LDS_LOOP_W64

L_RESTORE_LDS_LOOP_W32:
	buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset
	s_wait_idle
	ds_store_addtid_b32     v0
	s_add_u32	m0, m0, 128						// 128 bytes
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 128		//mem offset increased by 128 bytes
	s_cmp_lt_u32	m0, s_restore_alloc_size				//scc=(m0 < s_restore_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_RESTORE_LDS_LOOP_W32					//LDS restore is complete?
	s_branch	L_RESTORE_VGPR

L_RESTORE_LDS_LOOP_W64:
	buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset
	s_wait_idle
	ds_store_addtid_b32     v0
	s_add_u32	m0, m0, 256						// 256 bytes
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 256		//mem offset increased by 256 bytes
	s_cmp_lt_u32	m0, s_restore_alloc_size				//scc=(m0 < s_restore_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_RESTORE_LDS_LOOP_W64					//LDS restore is complete?

	/* restore VGPRs */
L_RESTORE_VGPR:
	// VGPR SR memory offset : 0
	s_mov_b32	s_restore_mem_offset, 0x0
	s_mov_b32	exec_lo, 0xFFFFFFFF					//need every thread from now on
	s_lshr_b32	m0, s_restore_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_ENABLE_RESTORE_VGPR_EXEC_HI
	s_mov_b32	exec_hi, 0x00000000
	s_branch	L_RESTORE_VGPR_NORMAL
L_ENABLE_RESTORE_VGPR_EXEC_HI:
	s_mov_b32	exec_hi, 0xFFFFFFFF
L_RESTORE_VGPR_NORMAL:
	s_getreg_b32	s_restore_alloc_size, hwreg(HW_REG_WAVE_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
	s_add_u32	s_restore_alloc_size, s_restore_alloc_size, 1
	s_lshl_b32	s_restore_alloc_size, s_restore_alloc_size, 2		//Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)
	//determine whether it is wave32 or wave64
	s_lshr_b32	m0, s_restore_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_RESTORE_VGPR_WAVE64

	s_mov_b32	s_restore_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

	// VGPR load using dw burst
	s_mov_b32	s_restore_mem_offset_save, s_restore_mem_offset		// restore start with v4, v0 will be the last
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 128*4
	s_mov_b32	m0, 4							//VGPR initial index value = 4
	s_cmp_lt_u32	m0, s_restore_alloc_size
	s_cbranch_scc0	L_RESTORE_SGPR

L_RESTORE_VGPR_WAVE32_LOOP:
	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS
	buffer_load_dword	v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:128
	buffer_load_dword	v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:128*2
	buffer_load_dword	v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:128*3
	s_wait_idle
	v_movreld_b32	v0, v0							//v[0+m0] = v0
	v_movreld_b32	v1, v1
	v_movreld_b32	v2, v2
	v_movreld_b32	v3, v3
	s_add_u32	m0, m0, 4						//next vgpr index
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 128*4	//every buffer_load_dword does 128 bytes
	s_cmp_lt_u32	m0, s_restore_alloc_size				//scc = (m0 < s_restore_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_RESTORE_VGPR_WAVE32_LOOP				//VGPR restore (except v0) is complete?

	/* VGPR restore on v0 */
	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS
	buffer_load_dword	v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:128
	buffer_load_dword	v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:128*2
	buffer_load_dword	v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:128*3
	s_wait_idle

	s_branch	L_RESTORE_SGPR

L_RESTORE_VGPR_WAVE64:
	s_mov_b32	s_restore_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

	// VGPR load using dw burst
	s_mov_b32	s_restore_mem_offset_save, s_restore_mem_offset		// restore start with v4, v0 will be the last
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 256*4
	s_mov_b32	m0, 4							//VGPR initial index value = 4
	s_cmp_lt_u32	m0, s_restore_alloc_size
	s_cbranch_scc0	L_RESTORE_SHARED_VGPR

L_RESTORE_VGPR_WAVE64_LOOP:
	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS
	buffer_load_dword	v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:256
	buffer_load_dword	v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:256*2
	buffer_load_dword	v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:256*3
	s_wait_idle
	v_movreld_b32	v0, v0							//v[0+m0] = v0
	v_movreld_b32	v1, v1
	v_movreld_b32	v2, v2
	v_movreld_b32	v3, v3
	s_add_u32	m0, m0, 4						//next vgpr index
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 256*4	//every buffer_load_dword does 256 bytes
	s_cmp_lt_u32	m0, s_restore_alloc_size				//scc = (m0 < s_restore_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_RESTORE_VGPR_WAVE64_LOOP				//VGPR restore (except v0) is complete?

L_RESTORE_SHARED_VGPR:
	s_getreg_b32	s_restore_alloc_size, hwreg(HW_REG_WAVE_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)	//shared_vgpr_size
	s_and_b32	s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF	//shared_vgpr_size is zero?
	s_cbranch_scc0	L_RESTORE_V0						//no shared_vgpr used?
	s_lshl_b32	s_restore_alloc_size, s_restore_alloc_size, 3		//Number of SHARED_VGPRs = shared_vgpr_size * 8    (non-zero value)
	//m0 now holds the normal vgpr count; add the shared_vgpr count to m0 to get the total count.
	//shared_vgpr restore will start from the index in m0
	s_add_u32	s_restore_alloc_size, s_restore_alloc_size, m0
	s_mov_b32	exec_lo, 0xFFFFFFFF
	s_mov_b32	exec_hi, 0x00000000
L_RESTORE_SHARED_VGPR_WAVE64_LOOP:
	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS
	s_wait_idle
	v_movreld_b32	v0, v0							//v[0+m0] = v0
	s_add_u32	m0, m0, 1						//next vgpr index
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 128
	s_cmp_lt_u32	m0, s_restore_alloc_size				//scc = (m0 < s_restore_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_RESTORE_SHARED_VGPR_WAVE64_LOOP			//VGPR restore (except v0) is complete?

	s_mov_b32	exec_hi, 0xFFFFFFFF					//restore back exec_hi before restoring V0!!

	/* VGPR restore on v0 */
L_RESTORE_V0:
	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS
	buffer_load_dword	v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:256
	buffer_load_dword	v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:256*2
	buffer_load_dword	v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:256*3
	s_wait_idle

	/* restore SGPRs */
	//will be 4+8+16*6
	// SGPR SR memory offset : size(VGPR)+size(SVGPR)
L_RESTORE_SGPR:
	get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
	get_svgpr_size_bytes(s_restore_tmp)
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
	s_sub_u32	s_restore_mem_offset, s_restore_mem_offset, 20*4	//s108~s127 are not saved
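
	// Offset arithmetic for the reads below: get_sgpr_size_bytes() covers
	// 128 SGPRs (512 bytes) but only s0-s107 are saved, so stepping back
	// 20*4 = 80 bytes leaves the offset at 108*4 = 432, the end of the
	// saved SGPR data; each read_*sgpr_from_mem call then pre-decrements
	// the offset before loading.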

	s_mov_b32	s_restore_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

	s_mov_b32	m0, s_sgpr_save_num

	read_4sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
	s_wait_idle

	s_sub_u32	m0, m0, 4						// Restore from S[0] to S[104]
	s_nop		0							// hazard SALU M0=> S_MOVREL

	s_movreld_b64	s0, s0							//s[0+m0] = s0
	s_movreld_b64	s2, s2

	read_8sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
	s_wait_idle

	s_sub_u32	m0, m0, 8						// Restore from S[0] to S[96]
	s_nop		0							// hazard SALU M0=> S_MOVREL

	s_movreld_b64	s0, s0							//s[0+m0] = s0
	s_movreld_b64	s2, s2
	s_movreld_b64	s4, s4
	s_movreld_b64	s6, s6

L_RESTORE_SGPR_LOOP:
	read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
	s_wait_idle

	s_sub_u32	m0, m0, 16						// Restore from S[n] to S[0]
	s_nop		0							// hazard SALU M0=> S_MOVREL

	s_movreld_b64	s0, s0							//s[0+m0] = s0
	s_movreld_b64	s2, s2
	s_movreld_b64	s4, s4
	s_movreld_b64	s6, s6
	s_movreld_b64	s8, s8
	s_movreld_b64	s10, s10
	s_movreld_b64	s12, s12
	s_movreld_b64	s14, s14

	s_cmp_eq_u32	m0, 0							//scc = (m0 == 0) ? 1 : 0
	s_cbranch_scc0	L_RESTORE_SGPR_LOOP

	// s_barrier with STATE_PRIV.TRAP_AFTER_INST=1, STATUS.PRIV=1 incorrectly asserts debug exception.
	// Clear DEBUG_EN before and restore MODE after the barrier.
	s_setreg_imm32_b32	hwreg(HW_REG_WAVE_MODE), 0

	/* restore HW registers */
L_RESTORE_HWREG:
	// HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
	get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
	get_svgpr_size_bytes(s_restore_tmp)
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()

	s_mov_b32	s_restore_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

	// Restore s_restore_spi_init_hi before the saved value gets clobbered.
	s_mov_b32 s_restore_spi_init_hi, s_restore_spi_init_hi_save

	read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_state_priv, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_excp_flag_priv, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset)
	s_wait_idle

	s_setreg_b32	hwreg(HW_REG_WAVE_SCRATCH_BASE_LO), s_restore_flat_scratch

	read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset)
	s_wait_idle

	s_setreg_b32	hwreg(HW_REG_WAVE_SCRATCH_BASE_HI), s_restore_flat_scratch

	read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset)
	s_wait_idle
	s_setreg_b32	hwreg(HW_REG_WAVE_EXCP_FLAG_USER), s_restore_tmp

	read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset)
	s_wait_idle
	s_setreg_b32	hwreg(HW_REG_WAVE_TRAP_CTRL), s_restore_tmp

	// Only the first wave needs to restore the workgroup barrier.
	s_and_b32	s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
	s_cbranch_scc0	L_SKIP_BARRIER_RESTORE

	// Skip over WAVE_STATUS, since there is no state to restore from it
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 4

	read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset)
	s_wait_idle

	s_bitcmp1_b32	s_restore_tmp, BARRIER_STATE_VALID_OFFSET
	s_cbranch_scc0	L_SKIP_BARRIER_RESTORE

	// extract the saved signal count from s_restore_tmp
	s_lshr_b32	s_restore_tmp, s_restore_tmp, BARRIER_STATE_SIGNAL_OFFSET

	// We need to call s_barrier_signal repeatedly to restore the signal
	// count of the work group barrier.  The member count is already
	// initialized with the number of waves in the work group.
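	// Sketch of the loop below: if the saved signal count is 3, the wave
	// issues s_barrier_signal -1 three times, decrementing s_restore_tmp
	// each pass; the s_and_b32 of the count with itself only serves to set
	// SCC so the loop exits at zero.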
L_BARRIER_RESTORE_LOOP:
	s_and_b32	s_restore_tmp, s_restore_tmp, s_restore_tmp
	s_cbranch_scc0	L_SKIP_BARRIER_RESTORE
	s_barrier_signal	-1
	s_add_i32	s_restore_tmp, s_restore_tmp, -1
	s_branch	L_BARRIER_RESTORE_LOOP

L_SKIP_BARRIER_RESTORE:

	s_mov_b32	m0, s_restore_m0
	s_mov_b32	exec_lo, s_restore_exec_lo
	s_mov_b32	exec_hi, s_restore_exec_hi

	// EXCP_FLAG_PRIV.SAVE_CONTEXT and HOST_TRAP may have changed.
	// Only restore the other fields to avoid clobbering them.
	s_setreg_b32	hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV, 0, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_1_SIZE), s_restore_excp_flag_priv
	s_lshr_b32	s_restore_excp_flag_priv, s_restore_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SHIFT
	s_setreg_b32	hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SHIFT, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SIZE), s_restore_excp_flag_priv
	s_lshr_b32	s_restore_excp_flag_priv, s_restore_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT - SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SHIFT
	s_setreg_b32	hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SIZE), s_restore_excp_flag_priv

	s_setreg_b32	hwreg(HW_REG_WAVE_MODE), s_restore_mode

	// Restore trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
	// ttmp SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)+0x40
	get_vgpr_size_bytes(s_restore_ttmps_lo, s_restore_size)
	get_svgpr_size_bytes(s_restore_ttmps_hi)
	s_add_u32	s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_ttmps_hi
	s_add_u32	s_restore_ttmps_lo, s_restore_ttmps_lo, get_sgpr_size_bytes()
	s_add_u32	s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0
	s_addc_u32	s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0
	s_and_b32	s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF
	s_load_dwordx4	[ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 scope:SCOPE_SYS
	s_load_dwordx4	[ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 scope:SCOPE_SYS
	s_load_dword	ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 scope:SCOPE_SYS
	s_wait_idle

	s_and_b32	s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff		//pc[47:32] //Do it here in order not to affect STATUS
	s_and_b64	exec, exec, exec					// Restore STATUS.EXECZ, not writable by s_setreg_b32
	s_and_b64	vcc, vcc, vcc						// Restore STATUS.VCCZ, not writable by s_setreg_b32

	// Assume relaxed scheduling mode after this point.
	restore_sched_mode(s_restore_tmp)

	s_setreg_b32	hwreg(HW_REG_WAVE_STATE_PRIV), s_restore_state_priv	// SCC is included, which is changed by previous salu

	// Make barrier and LDS state visible to all waves in the group.
	// STATE_PRIV.BARRIER_COMPLETE may change after this point.
	s_barrier_signal	-2
	s_barrier_wait	-2

	s_rfe_b64	s_restore_pc_lo						//Return to the main shader program and resume execution

L_END_PGM:
	// Make sure that no wave of the workgroup can exit the trap handler
	// before the workgroup barrier state is saved.
	s_barrier_signal	-2
	s_barrier_wait	-2
	s_endpgm_saved
end

function write_16sgpr_to_v2(s, lane_offset)
	// Copy into VGPR for later TCP store.
	for var sgpr_idx = 0; sgpr_idx < 16; sgpr_idx ++
		v_writelane_b32	v2, s[sgpr_idx], sgpr_idx + lane_offset
	end
	valu_sgpr_hazard()
	s_add_u32	ttmp13, ttmp13, 0x10
end

function write_12sgpr_to_v2(s)
	// Copy into VGPR for later TCP store.
	for var sgpr_idx = 0; sgpr_idx < 12; sgpr_idx ++
		v_writelane_b32	v2, s[sgpr_idx], sgpr_idx
	end
	valu_sgpr_hazard()
end

function read_hwreg_from_mem(s, s_rsrc, s_mem_offset)
	s_buffer_load_dword	s, s_rsrc, s_mem_offset scope:SCOPE_SYS
	s_add_u32	s_mem_offset, s_mem_offset, 4
end

function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset)
	s_sub_u32	s_mem_offset, s_mem_offset, 4*16
	s_buffer_load_dwordx16	s, s_rsrc, s_mem_offset scope:SCOPE_SYS
end

function read_8sgpr_from_mem(s, s_rsrc, s_mem_offset)
	s_sub_u32	s_mem_offset, s_mem_offset, 4*8
	s_buffer_load_dwordx8	s, s_rsrc, s_mem_offset scope:SCOPE_SYS
end

function read_4sgpr_from_mem(s, s_rsrc, s_mem_offset)
	s_sub_u32	s_mem_offset, s_mem_offset, 4*4
	s_buffer_load_dwordx4	s, s_rsrc, s_mem_offset scope:SCOPE_SYS
end

function get_vgpr_size_bytes(s_vgpr_size_byte, s_size)
	s_getreg_b32	s_vgpr_size_byte, hwreg(HW_REG_WAVE_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
	s_add_u32	s_vgpr_size_byte, s_vgpr_size_byte, 1
	s_bitcmp1_b32	s_size, S_WAVE_SIZE
	s_cbranch_scc1	L_ENABLE_SHIFT_W64
	s_lshl_b32	s_vgpr_size_byte, s_vgpr_size_byte, (2+7)		//size in bytes = (vgpr_size + 1) * 4 * 32 * 4   (non-zero value)
	s_branch	L_SHIFT_DONE
L_ENABLE_SHIFT_W64:
	s_lshl_b32	s_vgpr_size_byte, s_vgpr_size_byte, (2+8)		//size in bytes = (vgpr_size + 1) * 4 * 64 * 4   (non-zero value)
L_SHIFT_DONE:
end

function get_svgpr_size_bytes(s_svgpr_size_byte)
	s_getreg_b32	s_svgpr_size_byte, hwreg(HW_REG_WAVE_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)
	s_lshl_b32	s_svgpr_size_byte, s_svgpr_size_byte, (3+7)
end
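
// Worked example for get_svgpr_size_bytes (illustrative field value): a
// VGPR_SHARED_SIZE of 2 means 2 * 8 = 16 shared VGPRs; the (3+7) shift
// computes the same 2 << 10 = 2048 bytes, i.e. 16 registers stored as 32
// lanes of 4 bytes each (matching the 128-byte steps in the shared-VGPR
// save/restore loops).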

function get_sgpr_size_bytes
	return 512
end

function get_hwreg_size_bytes
	return 128
end

function get_wave_size2(s_reg)
	s_getreg_b32	s_reg, hwreg(HW_REG_WAVE_STATUS,SQ_WAVE_STATUS_WAVE64_SHIFT,SQ_WAVE_STATUS_WAVE64_SIZE)
	s_lshl_b32	s_reg, s_reg, S_WAVE_SIZE
end

function valu_sgpr_hazard
#if HAVE_VALU_SGPR_HAZARD
	for var rep = 0; rep < 8; rep ++
		ds_nop
	end
#endif
end

function restore_sched_mode(s_tmp)
	s_bfe_u32	s_tmp, ttmp11, (TTMP11_SCHED_MODE_SHIFT | (TTMP11_SCHED_MODE_SIZE << 0x10))
	s_setreg_b32	hwreg(HW_REG_WAVE_SCHED_MODE), s_tmp
end
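
// Encoding note for the s_bfe_u32 operand above (assuming the usual packed
// bitfield operand format: extract offset in bits [4:0], width in bits
// [22:16]): the constant evaluates to (26 | (2 << 16)) = 0x2001A, which
// extracts the 2-bit SCHED_MODE field from ttmp11[27:26] before it is
// written back to HW_REG_WAVE_SCHED_MODE.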