1/* 2 * Copyright 2018 Advanced Micro Devices, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 21 */ 22 23/* To compile this assembly code: 24 * 25 * Navi1x: 26 * cpp -DASIC_FAMILY=CHIP_NAVI10 cwsr_trap_handler_gfx10.asm -P -o nv1x.sp3 27 * sp3 nv1x.sp3 -hex nv1x.hex 28 * 29 * gfx10: 30 * cpp -DASIC_FAMILY=CHIP_SIENNA_CICHLID cwsr_trap_handler_gfx10.asm -P -o gfx10.sp3 31 * sp3 gfx10.sp3 -hex gfx10.hex 32 * 33 * gfx11: 34 * cpp -DASIC_FAMILY=CHIP_PLUM_BONITO cwsr_trap_handler_gfx10.asm -P -o gfx11.sp3 35 * sp3 gfx11.sp3 -hex gfx11.hex 36 * 37 * gfx12: 38 * cpp -DASIC_FAMILY=CHIP_GFX12 cwsr_trap_handler_gfx10.asm -P -o gfx12.sp3 39 * sp3 gfx12.sp3 -hex gfx12.hex 40 */ 41 42#define CHIP_NAVI10 26 43#define CHIP_SIENNA_CICHLID 30 44#define CHIP_PLUM_BONITO 36 45#define CHIP_GFX12 37 46 47#define NO_SQC_STORE (ASIC_FAMILY >= CHIP_SIENNA_CICHLID) 48#define HAVE_XNACK (ASIC_FAMILY < CHIP_SIENNA_CICHLID) 49#define HAVE_SENDMSG_RTN (ASIC_FAMILY >= CHIP_PLUM_BONITO) 50#define HAVE_BUFFER_LDS_LOAD (ASIC_FAMILY < CHIP_PLUM_BONITO) 51#define SW_SA_TRAP (ASIC_FAMILY >= CHIP_PLUM_BONITO && ASIC_FAMILY < CHIP_GFX12) 52#define SAVE_AFTER_XNACK_ERROR (HAVE_XNACK && !NO_SQC_STORE) // workaround for TCP store failure after XNACK error when ALLOW_REPLAY=0, for debugger 53#define SINGLE_STEP_MISSED_WORKAROUND 1 //workaround for lost MODE.DEBUG_EN exception when SAVECTX raised 54 55#if ASIC_FAMILY < CHIP_GFX12 56#define S_COHERENCE glc:1 57#define V_COHERENCE slc:1 glc:1 58#define S_WAITCNT_0 s_waitcnt 0 59#else 60#define S_COHERENCE scope:SCOPE_SYS 61#define V_COHERENCE scope:SCOPE_SYS 62#define S_WAITCNT_0 s_wait_idle 63 64#define HW_REG_SHADER_FLAT_SCRATCH_LO HW_REG_WAVE_SCRATCH_BASE_LO 65#define HW_REG_SHADER_FLAT_SCRATCH_HI HW_REG_WAVE_SCRATCH_BASE_HI 66#define HW_REG_GPR_ALLOC HW_REG_WAVE_GPR_ALLOC 67#define HW_REG_LDS_ALLOC HW_REG_WAVE_LDS_ALLOC 68#define HW_REG_MODE HW_REG_WAVE_MODE 69#endif 70 71#if ASIC_FAMILY < CHIP_GFX12 72var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006 73var SQ_WAVE_STATUS_HALT_MASK = 0x2000 74var SQ_WAVE_STATUS_ECC_ERR_MASK = 0x20000 75var SQ_WAVE_STATUS_TRAP_EN_SHIFT = 6 76var SQ_WAVE_IB_STS2_WAVE64_SHIFT = 11 77var SQ_WAVE_IB_STS2_WAVE64_SIZE = 1 78var SQ_WAVE_LDS_ALLOC_GRANULARITY = 8 79var S_STATUS_HWREG = HW_REG_STATUS 80var S_STATUS_ALWAYS_CLEAR_MASK = SQ_WAVE_STATUS_SPI_PRIO_MASK|SQ_WAVE_STATUS_ECC_ERR_MASK 81var S_STATUS_HALT_MASK = SQ_WAVE_STATUS_HALT_MASK 82var S_SAVE_PC_HI_TRAP_ID_MASK = 0x00FF0000 83var S_SAVE_PC_HI_HT_MASK = 0x01000000 84#else 85var SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK = 0x4 86var SQ_WAVE_STATE_PRIV_SCC_SHIFT = 9 87var SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK = 0xC00 88var SQ_WAVE_STATE_PRIV_HALT_MASK = 0x4000 89var SQ_WAVE_STATE_PRIV_POISON_ERR_MASK = 0x8000 90var SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT = 15 91var SQ_WAVE_STATUS_WAVE64_SHIFT = 29 92var SQ_WAVE_STATUS_WAVE64_SIZE = 1 93var SQ_WAVE_LDS_ALLOC_GRANULARITY = 9 94var S_STATUS_HWREG = HW_REG_WAVE_STATE_PRIV 95var S_STATUS_ALWAYS_CLEAR_MASK = SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK|SQ_WAVE_STATE_PRIV_POISON_ERR_MASK 96var S_STATUS_HALT_MASK = SQ_WAVE_STATE_PRIV_HALT_MASK 97var S_SAVE_PC_HI_TRAP_ID_MASK = 0xF0000000 98#endif 99 100var SQ_WAVE_STATUS_NO_VGPRS_SHIFT = 24 101var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 102var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 103var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 8 104var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT = 24 105var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE = 4 106 107#if ASIC_FAMILY < CHIP_PLUM_BONITO 108var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8 109#else 110var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 12 111#endif 112 113#if ASIC_FAMILY < CHIP_GFX12 114var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400 115var SQ_WAVE_TRAPSTS_EXCP_MASK = 0x1FF 116var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10 117var SQ_WAVE_TRAPSTS_ADDR_WATCH_MASK = 0x80 118var SQ_WAVE_TRAPSTS_ADDR_WATCH_SHIFT = 7 119var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100 120var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8 121var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK = 0x800 122var SQ_WAVE_TRAPSTS_EXCP_HI_MASK = 0x7000 123#if ASIC_FAMILY >= CHIP_PLUM_BONITO 124var SQ_WAVE_TRAPSTS_WAVE_START_MASK = 0x20000 125var SQ_WAVE_TRAPSTS_WAVE_END_MASK = 0x40000 126var SQ_WAVE_TRAPSTS_TRAP_AFTER_INST_MASK = 0x100000 127#endif 128var SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK = 0x10000000 129 130var SQ_WAVE_MODE_EXCP_EN_SHIFT = 12 131var SQ_WAVE_MODE_EXCP_EN_ADDR_WATCH_SHIFT = 19 132 133var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 134var SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT = 25 135var SQ_WAVE_IB_STS_REPLAY_W64H_MASK = 0x02000000 136var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK = 0x003F8000 137 138var SQ_WAVE_MODE_DEBUG_EN_MASK = 0x800 139 140#if ASIC_FAMILY < CHIP_PLUM_BONITO 141var S_TRAPSTS_NON_MASKABLE_EXCP_MASK = SQ_WAVE_TRAPSTS_MEM_VIOL_MASK|SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK 142#else 143var S_TRAPSTS_NON_MASKABLE_EXCP_MASK = SQ_WAVE_TRAPSTS_MEM_VIOL_MASK |\ 144 SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK |\ 145 SQ_WAVE_TRAPSTS_WAVE_START_MASK |\ 146 SQ_WAVE_TRAPSTS_WAVE_END_MASK |\ 147 SQ_WAVE_TRAPSTS_TRAP_AFTER_INST_MASK 148#endif 149var S_TRAPSTS_HWREG = HW_REG_TRAPSTS 150var S_TRAPSTS_SAVE_CONTEXT_MASK = SQ_WAVE_TRAPSTS_SAVECTX_MASK 151var S_TRAPSTS_SAVE_CONTEXT_SHIFT = SQ_WAVE_TRAPSTS_SAVECTX_SHIFT 152#else 153var SQ_WAVE_EXCP_FLAG_PRIV_ADDR_WATCH_MASK = 0xF 154var SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_MASK = 0x10 155var SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT = 5 156var SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_MASK = 0x20 157var SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_MASK = 0x40 158var SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT = 6 159var SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK = 0x80 160var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_MASK = 0x100 161var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT = 8 162var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_MASK = 0x200 163var SQ_WAVE_EXCP_FLAG_PRIV_TRAP_AFTER_INST_MASK = 0x800 164var SQ_WAVE_TRAP_CTRL_ADDR_WATCH_MASK = 0x80 165var SQ_WAVE_TRAP_CTRL_TRAP_AFTER_INST_MASK = 0x200 166 167var S_TRAPSTS_HWREG = HW_REG_WAVE_EXCP_FLAG_PRIV 168var S_TRAPSTS_SAVE_CONTEXT_MASK = SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_MASK 169var S_TRAPSTS_SAVE_CONTEXT_SHIFT = SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT 170var S_TRAPSTS_NON_MASKABLE_EXCP_MASK = SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_MASK |\ 171 SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_MASK |\ 172 SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK |\ 173 SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_MASK |\ 174 SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_MASK |\ 175 SQ_WAVE_EXCP_FLAG_PRIV_TRAP_AFTER_INST_MASK 176var BARRIER_STATE_SIGNAL_OFFSET = 16 177var BARRIER_STATE_VALID_OFFSET = 0 178#endif 179 180// bits [31:24] unused by SPI debug data 181var TTMP11_SAVE_REPLAY_W64H_SHIFT = 31 182var TTMP11_SAVE_REPLAY_W64H_MASK = 0x80000000 183var TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT = 24 184var TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK = 0x7F000000 185var TTMP11_DEBUG_TRAP_ENABLED_SHIFT = 23 186var TTMP11_DEBUG_TRAP_ENABLED_MASK = 0x800000 187 188// SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] 189// when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE 190var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 191var S_SAVE_BUF_RSRC_WORD3_MISC = 0x10807FAC 192var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 193var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26 194 195var S_SAVE_PC_HI_FIRST_WAVE_MASK = 0x80000000 196var S_SAVE_PC_HI_FIRST_WAVE_SHIFT = 31 197 198var s_sgpr_save_num = 108 199 200var s_save_spi_init_lo = exec_lo 201var s_save_spi_init_hi = exec_hi 202var s_save_pc_lo = ttmp0 203var s_save_pc_hi = ttmp1 204var s_save_exec_lo = ttmp2 205var s_save_exec_hi = ttmp3 206var s_save_status = ttmp12 207var s_save_trapsts = ttmp15 208var s_save_xnack_mask = s_save_trapsts 209var s_wave_size = ttmp7 210var s_save_buf_rsrc0 = ttmp8 211var s_save_buf_rsrc1 = ttmp9 212var s_save_buf_rsrc2 = ttmp10 213var s_save_buf_rsrc3 = ttmp11 214var s_save_mem_offset = ttmp4 215var s_save_alloc_size = s_save_trapsts 216var s_save_tmp = ttmp14 217var s_save_m0 = ttmp5 218var s_save_ttmps_lo = s_save_tmp 219var s_save_ttmps_hi = s_save_trapsts 220 221var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE 222var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC 223 224var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 225var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26 226var S_WAVE_SIZE = 25 227 228var s_restore_spi_init_lo = exec_lo 229var s_restore_spi_init_hi = exec_hi 230var s_restore_mem_offset = ttmp12 231var s_restore_alloc_size = ttmp3 232var s_restore_tmp = ttmp2 233var s_restore_mem_offset_save = s_restore_tmp 234var s_restore_m0 = s_restore_alloc_size 235var s_restore_mode = ttmp7 236var s_restore_flat_scratch = s_restore_tmp 237var s_restore_pc_lo = ttmp0 238var s_restore_pc_hi = ttmp1 239var s_restore_exec_lo = ttmp4 240var s_restore_exec_hi = ttmp5 241var s_restore_status = ttmp14 242var s_restore_trapsts = ttmp15 243var s_restore_xnack_mask = ttmp13 244var s_restore_buf_rsrc0 = ttmp8 245var s_restore_buf_rsrc1 = ttmp9 246var s_restore_buf_rsrc2 = ttmp10 247var s_restore_buf_rsrc3 = ttmp11 248var s_restore_size = ttmp6 249var s_restore_ttmps_lo = s_restore_tmp 250var s_restore_ttmps_hi = s_restore_alloc_size 251var s_restore_spi_init_hi_save = s_restore_exec_hi 252 253shader main 254 asic(DEFAULT) 255 type(CS) 256 wave_size(32) 257 258 s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save 259 260L_JUMP_TO_RESTORE: 261 s_branch L_RESTORE 262 263L_SKIP_RESTORE: 264 s_getreg_b32 s_save_status, hwreg(S_STATUS_HWREG) //save STATUS since we will change SCC 265 266 // Clear SPI_PRIO: do not save with elevated priority. 267 // Clear ECC_ERR: prevents SQC store and triggers FATAL_HALT if setreg'd. 268 s_andn2_b32 s_save_status, s_save_status, S_STATUS_ALWAYS_CLEAR_MASK 269 270 s_getreg_b32 s_save_trapsts, hwreg(S_TRAPSTS_HWREG) 271 272#if SW_SA_TRAP 273 // If ttmp1[30] is set then issue s_barrier to unblock dependent waves. 274 s_bitcmp1_b32 s_save_pc_hi, 30 275 s_cbranch_scc0 L_TRAP_NO_BARRIER 276 s_barrier 277 278L_TRAP_NO_BARRIER: 279 // If ttmp1[31] is set then trap may occur early. 280 // Spin wait until SAVECTX exception is raised. 281 s_bitcmp1_b32 s_save_pc_hi, 31 282 s_cbranch_scc1 L_CHECK_SAVE 283#endif 284 285 s_and_b32 ttmp2, s_save_status, S_STATUS_HALT_MASK 286 s_cbranch_scc0 L_NOT_HALTED 287 288L_HALTED: 289 // Host trap may occur while wave is halted. 290#if ASIC_FAMILY < CHIP_GFX12 291 s_and_b32 ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK 292#else 293 s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK 294#endif 295 s_cbranch_scc1 L_FETCH_2ND_TRAP 296 297L_CHECK_SAVE: 298 s_and_b32 ttmp2, s_save_trapsts, S_TRAPSTS_SAVE_CONTEXT_MASK 299 s_cbranch_scc1 L_SAVE 300 301 // Wave is halted but neither host trap nor SAVECTX is raised. 302 // Caused by instruction fetch memory violation. 303 // Spin wait until context saved to prevent interrupt storm. 304 s_sleep 0x10 305 s_getreg_b32 s_save_trapsts, hwreg(S_TRAPSTS_HWREG) 306 s_branch L_CHECK_SAVE 307 308L_NOT_HALTED: 309 // Let second-level handle non-SAVECTX exception or trap. 310 // Any concurrent SAVECTX will be handled upon re-entry once halted. 311 312 // Check non-maskable exceptions. memory_violation, illegal_instruction 313 // and xnack_error exceptions always cause the wave to enter the trap 314 // handler. 315 s_and_b32 ttmp2, s_save_trapsts, S_TRAPSTS_NON_MASKABLE_EXCP_MASK 316 s_cbranch_scc1 L_FETCH_2ND_TRAP 317 318 // Check for maskable exceptions in trapsts.excp and trapsts.excp_hi. 319 // Maskable exceptions only cause the wave to enter the trap handler if 320 // their respective bit in mode.excp_en is set. 321#if ASIC_FAMILY < CHIP_GFX12 322 s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCP_MASK|SQ_WAVE_TRAPSTS_EXCP_HI_MASK 323 s_cbranch_scc0 L_CHECK_TRAP_ID 324 325 s_and_b32 ttmp3, s_save_trapsts, SQ_WAVE_TRAPSTS_ADDR_WATCH_MASK|SQ_WAVE_TRAPSTS_EXCP_HI_MASK 326 s_cbranch_scc0 L_NOT_ADDR_WATCH 327 s_bitset1_b32 ttmp2, SQ_WAVE_TRAPSTS_ADDR_WATCH_SHIFT // Check all addr_watch[123] exceptions against excp_en.addr_watch 328 329L_NOT_ADDR_WATCH: 330 s_getreg_b32 ttmp3, hwreg(HW_REG_MODE) 331 s_lshl_b32 ttmp2, ttmp2, SQ_WAVE_MODE_EXCP_EN_SHIFT 332 s_and_b32 ttmp2, ttmp2, ttmp3 333 s_cbranch_scc1 L_FETCH_2ND_TRAP 334#else 335 s_getreg_b32 ttmp2, hwreg(HW_REG_WAVE_EXCP_FLAG_USER) 336 s_and_b32 ttmp3, s_save_trapsts, SQ_WAVE_EXCP_FLAG_PRIV_ADDR_WATCH_MASK 337 s_cbranch_scc0 L_NOT_ADDR_WATCH 338 s_or_b32 ttmp2, ttmp2, SQ_WAVE_TRAP_CTRL_ADDR_WATCH_MASK 339 340L_NOT_ADDR_WATCH: 341 s_getreg_b32 ttmp3, hwreg(HW_REG_WAVE_TRAP_CTRL) 342 s_and_b32 ttmp2, ttmp3, ttmp2 343 s_cbranch_scc1 L_FETCH_2ND_TRAP 344#endif 345 346L_CHECK_TRAP_ID: 347 // Check trap_id != 0 348 s_and_b32 ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK 349 s_cbranch_scc1 L_FETCH_2ND_TRAP 350 351#if SINGLE_STEP_MISSED_WORKAROUND 352 // Prioritize single step exception over context save. 353 // Second-level trap will halt wave and RFE, re-entering for SAVECTX. 354#if ASIC_FAMILY < CHIP_GFX12 355 s_getreg_b32 ttmp2, hwreg(HW_REG_MODE) 356 s_and_b32 ttmp2, ttmp2, SQ_WAVE_MODE_DEBUG_EN_MASK 357#else 358 // WAVE_TRAP_CTRL is already in ttmp3. 359 s_and_b32 ttmp3, ttmp3, SQ_WAVE_TRAP_CTRL_TRAP_AFTER_INST_MASK 360#endif 361 s_cbranch_scc1 L_FETCH_2ND_TRAP 362#endif 363 364 s_and_b32 ttmp2, s_save_trapsts, S_TRAPSTS_SAVE_CONTEXT_MASK 365 s_cbranch_scc1 L_SAVE 366 367L_FETCH_2ND_TRAP: 368#if HAVE_XNACK 369 save_and_clear_ib_sts(ttmp14, ttmp15) 370#endif 371 372 // Read second-level TBA/TMA from first-level TMA and jump if available. 373 // ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data) 374 // ttmp12 holds SQ_WAVE_STATUS 375#if HAVE_SENDMSG_RTN 376 s_sendmsg_rtn_b64 [ttmp14, ttmp15], sendmsg(MSG_RTN_GET_TMA) 377 S_WAITCNT_0 378#else 379 s_getreg_b32 ttmp14, hwreg(HW_REG_SHADER_TMA_LO) 380 s_getreg_b32 ttmp15, hwreg(HW_REG_SHADER_TMA_HI) 381#endif 382 s_lshl_b64 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 383 384 s_bitcmp1_b32 ttmp15, 0xF 385 s_cbranch_scc0 L_NO_SIGN_EXTEND_TMA 386 s_or_b32 ttmp15, ttmp15, 0xFFFF0000 387L_NO_SIGN_EXTEND_TMA: 388 389 s_load_dword ttmp2, [ttmp14, ttmp15], 0x10 S_COHERENCE // debug trap enabled flag 390 S_WAITCNT_0 391 s_lshl_b32 ttmp2, ttmp2, TTMP11_DEBUG_TRAP_ENABLED_SHIFT 392 s_andn2_b32 ttmp11, ttmp11, TTMP11_DEBUG_TRAP_ENABLED_MASK 393 s_or_b32 ttmp11, ttmp11, ttmp2 394 395 s_load_dwordx2 [ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 S_COHERENCE // second-level TBA 396 S_WAITCNT_0 397 s_load_dwordx2 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 S_COHERENCE // second-level TMA 398 S_WAITCNT_0 399 400 s_and_b64 [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3] 401 s_cbranch_scc0 L_NO_NEXT_TRAP // second-level trap handler not been set 402 s_setpc_b64 [ttmp2, ttmp3] // jump to second-level trap handler 403 404L_NO_NEXT_TRAP: 405 // If not caused by trap then halt wave to prevent re-entry. 406 s_and_b32 ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK 407 s_cbranch_scc1 L_TRAP_CASE 408 409 // Host trap will not cause trap re-entry. 410#if ASIC_FAMILY < CHIP_GFX12 411 s_and_b32 ttmp2, s_save_pc_hi, S_SAVE_PC_HI_HT_MASK 412#else 413 s_getreg_b32 ttmp2, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV) 414 s_and_b32 ttmp2, ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK 415#endif 416 s_cbranch_scc1 L_EXIT_TRAP 417 s_or_b32 s_save_status, s_save_status, S_STATUS_HALT_MASK 418 419 // If the PC points to S_ENDPGM then context save will fail if STATUS.HALT is set. 420 // Rewind the PC to prevent this from occurring. 421 s_sub_u32 ttmp0, ttmp0, 0x8 422 s_subb_u32 ttmp1, ttmp1, 0x0 423 424 s_branch L_EXIT_TRAP 425 426L_TRAP_CASE: 427 // Advance past trap instruction to prevent re-entry. 428 s_add_u32 ttmp0, ttmp0, 0x4 429 s_addc_u32 ttmp1, ttmp1, 0x0 430 431L_EXIT_TRAP: 432 s_and_b32 ttmp1, ttmp1, 0xFFFF 433 434#if HAVE_XNACK 435 restore_ib_sts(ttmp14, ttmp15) 436#endif 437 438 // Restore SQ_WAVE_STATUS. 439 s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 440 s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 441 442#if ASIC_FAMILY < CHIP_GFX12 443 s_setreg_b32 hwreg(S_STATUS_HWREG), s_save_status 444#else 445 // STATE_PRIV.BARRIER_COMPLETE may have changed since we read it. 446 // Only restore fields which the trap handler changes. 447 s_lshr_b32 s_save_status, s_save_status, SQ_WAVE_STATE_PRIV_SCC_SHIFT 448 s_setreg_b32 hwreg(S_STATUS_HWREG, SQ_WAVE_STATE_PRIV_SCC_SHIFT, \ 449 SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT - SQ_WAVE_STATE_PRIV_SCC_SHIFT + 1), s_save_status 450#endif 451 452 s_rfe_b64 [ttmp0, ttmp1] 453 454L_SAVE: 455 // If VGPRs have been deallocated then terminate the wavefront. 456 // It has no remaining program to run and cannot save without VGPRs. 457#if ASIC_FAMILY == CHIP_PLUM_BONITO 458 s_bitcmp1_b32 s_save_status, SQ_WAVE_STATUS_NO_VGPRS_SHIFT 459 s_cbranch_scc0 L_HAVE_VGPRS 460 s_endpgm 461L_HAVE_VGPRS: 462#endif 463#if ASIC_FAMILY >= CHIP_GFX12 464 s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_STATUS) 465 s_bitcmp1_b32 s_save_tmp, SQ_WAVE_STATUS_NO_VGPRS_SHIFT 466 s_cbranch_scc0 L_HAVE_VGPRS 467 s_endpgm 468L_HAVE_VGPRS: 469#endif 470 471 s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] 472 s_mov_b32 s_save_tmp, 0 473 s_setreg_b32 hwreg(S_TRAPSTS_HWREG, S_TRAPSTS_SAVE_CONTEXT_SHIFT, 1), s_save_tmp //clear saveCtx bit 474 475#if HAVE_XNACK 476 save_and_clear_ib_sts(s_save_tmp, s_save_trapsts) 477#endif 478 479 /* inform SPI the readiness and wait for SPI's go signal */ 480 s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI 481 s_mov_b32 s_save_exec_hi, exec_hi 482 s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive 483 484#if HAVE_SENDMSG_RTN 485 s_sendmsg_rtn_b64 [exec_lo, exec_hi], sendmsg(MSG_RTN_SAVE_WAVE) 486#else 487 s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC 488#endif 489 490#if ASIC_FAMILY < CHIP_SIENNA_CICHLID 491L_SLEEP: 492 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause 493 // SQ hang, since the 7,8th wave could not get arbit to exec inst, while 494 // other waves are stuck into the sleep-loop and waiting for wrexec!=0 495 s_sleep 0x2 496 s_cbranch_execz L_SLEEP 497#else 498 S_WAITCNT_0 499#endif 500 501 // Save first_wave flag so we can clear high bits of save address. 502 s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK 503 s_lshl_b32 s_save_tmp, s_save_tmp, (S_SAVE_PC_HI_FIRST_WAVE_SHIFT - S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT) 504 s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp 505 506#if NO_SQC_STORE 507#if ASIC_FAMILY <= CHIP_SIENNA_CICHLID 508 // gfx10: If there was a VALU exception, the exception state must be 509 // cleared before executing the VALU instructions below. 510 v_clrexcp 511#endif 512 513 // Trap temporaries must be saved via VGPR but all VGPRs are in use. 514 // There is no ttmp space to hold the resource constant for VGPR save. 515 // Save v0 by itself since it requires only two SGPRs. 516 s_mov_b32 s_save_ttmps_lo, exec_lo 517 s_and_b32 s_save_ttmps_hi, exec_hi, 0xFFFF 518 s_mov_b32 exec_lo, 0xFFFFFFFF 519 s_mov_b32 exec_hi, 0xFFFFFFFF 520 global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] V_COHERENCE 521 v_mov_b32 v0, 0x0 522 s_mov_b32 exec_lo, s_save_ttmps_lo 523 s_mov_b32 exec_hi, s_save_ttmps_hi 524#endif 525 526 // Save trap temporaries 4-11, 13 initialized by SPI debug dispatch logic 527 // ttmp SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)+0x40 528 get_wave_size2(s_save_ttmps_hi) 529 get_vgpr_size_bytes(s_save_ttmps_lo, s_save_ttmps_hi) 530 get_svgpr_size_bytes(s_save_ttmps_hi) 531 s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_ttmps_hi 532 s_and_b32 s_save_ttmps_hi, s_save_spi_init_hi, 0xFFFF 533 s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, get_sgpr_size_bytes() 534 s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo 535 s_addc_u32 s_save_ttmps_hi, s_save_ttmps_hi, 0x0 536 537#if NO_SQC_STORE 538 v_writelane_b32 v0, ttmp4, 0x4 539 v_writelane_b32 v0, ttmp5, 0x5 540 v_writelane_b32 v0, ttmp6, 0x6 541 v_writelane_b32 v0, ttmp7, 0x7 542 v_writelane_b32 v0, ttmp8, 0x8 543 v_writelane_b32 v0, ttmp9, 0x9 544 v_writelane_b32 v0, ttmp10, 0xA 545 v_writelane_b32 v0, ttmp11, 0xB 546 v_writelane_b32 v0, ttmp13, 0xD 547 v_writelane_b32 v0, exec_lo, 0xE 548 v_writelane_b32 v0, exec_hi, 0xF 549 550 s_mov_b32 exec_lo, 0x3FFF 551 s_mov_b32 exec_hi, 0x0 552 global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] inst_offset:0x40 V_COHERENCE 553 v_readlane_b32 ttmp14, v0, 0xE 554 v_readlane_b32 ttmp15, v0, 0xF 555 s_mov_b32 exec_lo, ttmp14 556 s_mov_b32 exec_hi, ttmp15 557#else 558 s_store_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x50 S_COHERENCE 559 s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x60 S_COHERENCE 560 s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x74 S_COHERENCE 561#endif 562 563 /* setup Resource Contants */ 564 s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo 565 s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi 566 s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE 567 s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited 568 s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC 569 570 s_mov_b32 s_save_m0, m0 571 572 /* global mem offset */ 573 s_mov_b32 s_save_mem_offset, 0x0 574 get_wave_size2(s_wave_size) 575 576#if HAVE_XNACK 577 // Save and clear vector XNACK state late to free up SGPRs. 578 s_getreg_b32 s_save_xnack_mask, hwreg(HW_REG_SHADER_XNACK_MASK) 579 s_setreg_imm32_b32 hwreg(HW_REG_SHADER_XNACK_MASK), 0x0 580#endif 581 582 /* save first 4 VGPRs, needed for SGPR save */ 583 s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on 584 s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE 585 s_and_b32 m0, m0, 1 586 s_cmp_eq_u32 m0, 1 587 s_cbranch_scc1 L_ENABLE_SAVE_4VGPR_EXEC_HI 588 s_mov_b32 exec_hi, 0x00000000 589 s_branch L_SAVE_4VGPR_WAVE32 590L_ENABLE_SAVE_4VGPR_EXEC_HI: 591 s_mov_b32 exec_hi, 0xFFFFFFFF 592 s_branch L_SAVE_4VGPR_WAVE64 593L_SAVE_4VGPR_WAVE32: 594 s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes 595 596 // VGPR Allocated in 4-GPR granularity 597 598#if SAVE_AFTER_XNACK_ERROR 599 check_if_tcp_store_ok() 600 s_cbranch_scc1 L_SAVE_FIRST_VGPRS32_WITH_TCP 601 602 write_vgprs_to_mem_with_sqc_w32(v0, 4, s_save_buf_rsrc0, s_save_mem_offset) 603 s_branch L_SAVE_HWREG 604 605L_SAVE_FIRST_VGPRS32_WITH_TCP: 606#endif 607 608#if !NO_SQC_STORE 609 buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE 610#endif 611 buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128 612 buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128*2 613 buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128*3 614 s_branch L_SAVE_HWREG 615 616L_SAVE_4VGPR_WAVE64: 617 s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes 618 619 // VGPR Allocated in 4-GPR granularity 620 621#if SAVE_AFTER_XNACK_ERROR 622 check_if_tcp_store_ok() 623 s_cbranch_scc1 L_SAVE_FIRST_VGPRS64_WITH_TCP 624 625 write_vgprs_to_mem_with_sqc_w64(v0, 4, s_save_buf_rsrc0, s_save_mem_offset) 626 s_branch L_SAVE_HWREG 627 628L_SAVE_FIRST_VGPRS64_WITH_TCP: 629#endif 630 631#if !NO_SQC_STORE 632 buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE 633#endif 634 buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256 635 buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256*2 636 buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256*3 637 638 /* save HW registers */ 639 640L_SAVE_HWREG: 641 // HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR) 642 get_vgpr_size_bytes(s_save_mem_offset, s_wave_size) 643 get_svgpr_size_bytes(s_save_tmp) 644 s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp 645 s_add_u32 s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes() 646 647 s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes 648 649#if NO_SQC_STORE 650 v_mov_b32 v0, 0x0 //Offset[31:0] from buffer resource 651 v_mov_b32 v1, 0x0 //Offset[63:32] from buffer resource 652 v_mov_b32 v2, 0x0 //Set of SGPRs for TCP store 653 s_mov_b32 m0, 0x0 //Next lane of v2 to write to 654#endif 655 656#if ASIC_FAMILY >= CHIP_GFX12 657 // Ensure no further changes to barrier or LDS state. 658 // STATE_PRIV.BARRIER_COMPLETE may change up to this point. 659 s_barrier_signal -2 660 s_barrier_wait -2 661 662 // Re-read final state of BARRIER_COMPLETE field for save. 663 s_getreg_b32 s_save_tmp, hwreg(S_STATUS_HWREG) 664 s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK 665 s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK 666 s_or_b32 s_save_status, s_save_status, s_save_tmp 667#endif 668 669 write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) 670 write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) 671 s_andn2_b32 s_save_tmp, s_save_pc_hi, S_SAVE_PC_HI_FIRST_WAVE_MASK 672 write_hwreg_to_mem(s_save_tmp, s_save_buf_rsrc0, s_save_mem_offset) 673 write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) 674 write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset) 675 write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) 676 677 s_getreg_b32 s_save_tmp, hwreg(S_TRAPSTS_HWREG) 678 write_hwreg_to_mem(s_save_tmp, s_save_buf_rsrc0, s_save_mem_offset) 679 680 // Not used on Sienna_Cichlid but keep layout same for debugger. 681 write_hwreg_to_mem(s_save_xnack_mask, s_save_buf_rsrc0, s_save_mem_offset) 682 683 s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) 684 write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) 685 686 s_getreg_b32 s_save_m0, hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO) 687 write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) 688 689 s_getreg_b32 s_save_m0, hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI) 690 write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) 691 692#if ASIC_FAMILY >= CHIP_GFX12 693 s_getreg_b32 s_save_m0, hwreg(HW_REG_WAVE_EXCP_FLAG_USER) 694 write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) 695 696 s_getreg_b32 s_save_m0, hwreg(HW_REG_WAVE_TRAP_CTRL) 697 write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) 698 699 s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_STATUS) 700 write_hwreg_to_mem(s_save_tmp, s_save_buf_rsrc0, s_save_mem_offset) 701 702 s_get_barrier_state s_save_tmp, -1 703 s_wait_kmcnt (0) 704 write_hwreg_to_mem(s_save_tmp, s_save_buf_rsrc0, s_save_mem_offset) 705#endif 706 707#if NO_SQC_STORE 708 // Write HWREGs with 16 VGPR lanes. TTMPs occupy space after this. 709 s_mov_b32 exec_lo, 0xFFFF 710 s_mov_b32 exec_hi, 0x0 711 buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE 712 713 // Write SGPRs with 32 VGPR lanes. This works in wave32 and wave64 mode. 714 s_mov_b32 exec_lo, 0xFFFFFFFF 715#endif 716 717 /* save SGPRs */ 718 // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save... 719 720 // SGPR SR memory offset : size(VGPR)+size(SVGPR) 721 get_vgpr_size_bytes(s_save_mem_offset, s_wave_size) 722 get_svgpr_size_bytes(s_save_tmp) 723 s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp 724 s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes 725 726#if NO_SQC_STORE 727 s_mov_b32 ttmp13, 0x0 //next VGPR lane to copy SGPR into 728#else 729 // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0 730 s_mov_b32 s_save_xnack_mask, s_save_buf_rsrc0 731 s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset 732 s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0 733#endif 734 735 s_mov_b32 m0, 0x0 //SGPR initial index value =0 736 s_nop 0x0 //Manually inserted wait states 737L_SAVE_SGPR_LOOP: 738 // SGPR is allocated in 16 SGPR granularity 739 s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0] 740 s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0] 741 s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0] 742 s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0] 743 s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0] 744 s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0] 745 s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0] 746 s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0] 747 748 write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) 749 750#if NO_SQC_STORE 751 s_cmp_eq_u32 ttmp13, 0x20 //have 32 VGPR lanes filled? 752 s_cbranch_scc0 L_SAVE_SGPR_SKIP_TCP_STORE 753 754 buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE 755 s_add_u32 s_save_mem_offset, s_save_mem_offset, 0x80 756 s_mov_b32 ttmp13, 0x0 757 v_mov_b32 v2, 0x0 758L_SAVE_SGPR_SKIP_TCP_STORE: 759#endif 760 761 s_add_u32 m0, m0, 16 //next sgpr index 762 s_cmp_lt_u32 m0, 96 //scc = (m0 < first 96 SGPR) ? 1 : 0 763 s_cbranch_scc1 L_SAVE_SGPR_LOOP //first 96 SGPR save is complete? 764 765 //save the rest 12 SGPR 766 s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0] 767 s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0] 768 s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0] 769 s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0] 770 s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0] 771 s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0] 772 write_12sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) 773 774#if NO_SQC_STORE 775 buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE 776#else 777 // restore s_save_buf_rsrc0,1 778 s_mov_b32 s_save_buf_rsrc0, s_save_xnack_mask 779#endif 780 781 /* save LDS */ 782 783L_SAVE_LDS: 784 // Change EXEC to all threads... 785 s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on 786 s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE 787 s_and_b32 m0, m0, 1 788 s_cmp_eq_u32 m0, 1 789 s_cbranch_scc1 L_ENABLE_SAVE_LDS_EXEC_HI 790 s_mov_b32 exec_hi, 0x00000000 791 s_branch L_SAVE_LDS_NORMAL 792L_ENABLE_SAVE_LDS_EXEC_HI: 793 s_mov_b32 exec_hi, 0xFFFFFFFF 794L_SAVE_LDS_NORMAL: 795 s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) 796 s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero? 797 s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_DONE 798 799#if ASIC_FAMILY < CHIP_GFX12 800 s_barrier //LDS is used? wait for other waves in the same TG 801#endif 802 s_and_b32 s_save_tmp, s_save_pc_hi, S_SAVE_PC_HI_FIRST_WAVE_MASK 803 s_cbranch_scc0 L_SAVE_LDS_DONE 804 805 // first wave do LDS save; 806 807 s_lshl_b32 s_save_alloc_size, s_save_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY 808 s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes 809 810 // LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG) 811 // 812 get_vgpr_size_bytes(s_save_mem_offset, s_wave_size) 813 get_svgpr_size_bytes(s_save_tmp) 814 s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp 815 s_add_u32 s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes() 816 s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes() 817 818 s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes 819 820 //load 0~63*4(byte address) to vgpr v0 821 v_mbcnt_lo_u32_b32 v0, -1, 0 822 v_mbcnt_hi_u32_b32 v0, -1, v0 823 v_mul_u32_u24 v0, 4, v0 824 825 s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE 826 s_and_b32 m0, m0, 1 827 s_cmp_eq_u32 m0, 1 828 s_mov_b32 m0, 0x0 829 s_cbranch_scc1 L_SAVE_LDS_W64 830 831L_SAVE_LDS_W32: 832#if SAVE_AFTER_XNACK_ERROR 833 check_if_tcp_store_ok() 834 s_cbranch_scc1 L_SAVE_LDS_WITH_TCP_W32 835 836L_SAVE_LDS_LOOP_SQC_W32: 837 ds_read_b32 v1, v0 838 S_WAITCNT_0 839 840 write_vgprs_to_mem_with_sqc_w32(v1, 1, s_save_buf_rsrc0, s_save_mem_offset) 841 842 s_add_u32 m0, m0, 128 //every buffer_store_lds does 128 bytes 843 v_add_nc_u32 v0, v0, 128 //mem offset increased by 128 bytes 844 s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 845 s_cbranch_scc1 L_SAVE_LDS_LOOP_SQC_W32 //LDS save is complete? 846 847 s_branch L_SAVE_LDS_DONE 848 849L_SAVE_LDS_WITH_TCP_W32: 850#endif 851 852 s_mov_b32 s3, 128 853 s_nop 0 854 s_nop 0 855 s_nop 0 856L_SAVE_LDS_LOOP_W32: 857 ds_read_b32 v1, v0 858 S_WAITCNT_0 859 buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE 860 861 s_add_u32 m0, m0, s3 //every buffer_store_lds does 128 bytes 862 s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 863 v_add_nc_u32 v0, v0, 128 //mem offset increased by 128 bytes 864 s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 865 s_cbranch_scc1 L_SAVE_LDS_LOOP_W32 //LDS save is complete? 866 867 s_branch L_SAVE_LDS_DONE 868 869L_SAVE_LDS_W64: 870#if SAVE_AFTER_XNACK_ERROR 871 check_if_tcp_store_ok() 872 s_cbranch_scc1 L_SAVE_LDS_WITH_TCP_W64 873 874L_SAVE_LDS_LOOP_SQC_W64: 875 ds_read_b32 v1, v0 876 S_WAITCNT_0 877 878 write_vgprs_to_mem_with_sqc_w64(v1, 1, s_save_buf_rsrc0, s_save_mem_offset) 879 880 s_add_u32 m0, m0, 256 //every buffer_store_lds does 256 bytes 881 v_add_nc_u32 v0, v0, 256 //mem offset increased by 256 bytes 882 s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 883 s_cbranch_scc1 L_SAVE_LDS_LOOP_SQC_W64 //LDS save is complete? 884 885 s_branch L_SAVE_LDS_DONE 886 887L_SAVE_LDS_WITH_TCP_W64: 888#endif 889 890 s_mov_b32 s3, 256 891 s_nop 0 892 s_nop 0 893 s_nop 0 894L_SAVE_LDS_LOOP_W64: 895 ds_read_b32 v1, v0 896 S_WAITCNT_0 897 buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE 898 899 s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes 900 s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 901 v_add_nc_u32 v0, v0, 256 //mem offset increased by 256 bytes 902 s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 903 s_cbranch_scc1 L_SAVE_LDS_LOOP_W64 //LDS save is complete? 904 905L_SAVE_LDS_DONE: 906 /* save VGPRs - set the Rest VGPRs */ 907L_SAVE_VGPR: 908 // VGPR SR memory offset: 0 909 s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on 910 s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE 911 s_and_b32 m0, m0, 1 912 s_cmp_eq_u32 m0, 1 913 s_cbranch_scc1 L_ENABLE_SAVE_VGPR_EXEC_HI 914 s_mov_b32 s_save_mem_offset, (0+128*4) // for the rest VGPRs 915 s_mov_b32 exec_hi, 0x00000000 916 s_branch L_SAVE_VGPR_NORMAL 917L_ENABLE_SAVE_VGPR_EXEC_HI: 918 s_mov_b32 s_save_mem_offset, (0+256*4) // for the rest VGPRs 919 s_mov_b32 exec_hi, 0xFFFFFFFF 920L_SAVE_VGPR_NORMAL: 921 s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) 922 s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 923 s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) 924 //determine it is wave32 or wave64 925 s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE 926 s_and_b32 m0, m0, 1 927 s_cmp_eq_u32 m0, 1 928 s_cbranch_scc1 L_SAVE_VGPR_WAVE64 929 930 s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes 931 932 // VGPR Allocated in 4-GPR granularity 933 934 // VGPR store using dw burst 935 s_mov_b32 m0, 0x4 //VGPR initial index value =4 936 s_cmp_lt_u32 m0, s_save_alloc_size 937 s_cbranch_scc0 L_SAVE_VGPR_END 938 939#if SAVE_AFTER_XNACK_ERROR 940 check_if_tcp_store_ok() 941 s_cbranch_scc1 L_SAVE_VGPR_W32_LOOP 942 943L_SAVE_VGPR_LOOP_SQC_W32: 944 v_movrels_b32 v0, v0 //v0 = v[0+m0] 945 v_movrels_b32 v1, v1 //v1 = v[1+m0] 946 v_movrels_b32 v2, v2 //v2 = v[2+m0] 947 v_movrels_b32 v3, v3 //v3 = v[3+m0] 948 949 write_vgprs_to_mem_with_sqc_w32(v0, 4, s_save_buf_rsrc0, s_save_mem_offset) 950 951 s_add_u32 m0, m0, 4 952 s_cmp_lt_u32 m0, s_save_alloc_size 953 s_cbranch_scc1 L_SAVE_VGPR_LOOP_SQC_W32 954 955 s_branch L_SAVE_VGPR_END 956#endif 957 958L_SAVE_VGPR_W32_LOOP: 959 v_movrels_b32 v0, v0 //v0 = v[0+m0] 960 v_movrels_b32 v1, v1 //v1 = v[1+m0] 961 v_movrels_b32 v2, v2 //v2 = v[2+m0] 962 v_movrels_b32 v3, v3 //v3 = v[3+m0] 963 964 buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE 965 buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128 966 buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128*2 967 buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128*3 968 969 s_add_u32 m0, m0, 4 //next vgpr index 970 s_add_u32 s_save_mem_offset, s_save_mem_offset, 128*4 //every buffer_store_dword does 128 bytes 971 s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 972 s_cbranch_scc1 L_SAVE_VGPR_W32_LOOP //VGPR save is complete? 973 974 s_branch L_SAVE_VGPR_END 975 976L_SAVE_VGPR_WAVE64: 977 s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes 978 979 // VGPR store using dw burst 980 s_mov_b32 m0, 0x4 //VGPR initial index value =4 981 s_cmp_lt_u32 m0, s_save_alloc_size 982 s_cbranch_scc0 L_SAVE_SHARED_VGPR 983 984#if SAVE_AFTER_XNACK_ERROR 985 check_if_tcp_store_ok() 986 s_cbranch_scc1 L_SAVE_VGPR_W64_LOOP 987 988L_SAVE_VGPR_LOOP_SQC_W64: 989 v_movrels_b32 v0, v0 //v0 = v[0+m0] 990 v_movrels_b32 v1, v1 //v1 = v[1+m0] 991 v_movrels_b32 v2, v2 //v2 = v[2+m0] 992 v_movrels_b32 v3, v3 //v3 = v[3+m0] 993 994 write_vgprs_to_mem_with_sqc_w64(v0, 4, s_save_buf_rsrc0, s_save_mem_offset) 995 996 s_add_u32 m0, m0, 4 997 s_cmp_lt_u32 m0, s_save_alloc_size 998 s_cbranch_scc1 L_SAVE_VGPR_LOOP_SQC_W64 999 1000 s_branch L_SAVE_VGPR_END 1001#endif 1002 1003L_SAVE_VGPR_W64_LOOP: 1004 v_movrels_b32 v0, v0 //v0 = v[0+m0] 1005 v_movrels_b32 v1, v1 //v1 = v[1+m0] 1006 v_movrels_b32 v2, v2 //v2 = v[2+m0] 1007 v_movrels_b32 v3, v3 //v3 = v[3+m0] 1008 1009 buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE 1010 buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256 1011 buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256*2 1012 buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256*3 1013 1014 s_add_u32 m0, m0, 4 //next vgpr index 1015 s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes 1016 s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 1017 s_cbranch_scc1 L_SAVE_VGPR_W64_LOOP //VGPR save is complete? 1018 1019L_SAVE_SHARED_VGPR: 1020 //Below part will be the save shared vgpr part (new for gfx10) 1021 s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE) 1022 s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //shared_vgpr_size is zero? 1023 s_cbranch_scc0 L_SAVE_VGPR_END //no shared_vgpr used? jump to L_SAVE_LDS 1024 s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 3 //Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value) 1025 //m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count. 1026 //save shared_vgpr will start from the index of m0 1027 s_add_u32 s_save_alloc_size, s_save_alloc_size, m0 1028 s_mov_b32 exec_lo, 0xFFFFFFFF 1029 s_mov_b32 exec_hi, 0x00000000 1030 1031#if SAVE_AFTER_XNACK_ERROR 1032 check_if_tcp_store_ok() 1033 s_cbranch_scc1 L_SAVE_SHARED_VGPR_WAVE64_LOOP 1034 1035L_SAVE_SHARED_VGPR_WAVE64_LOOP_SQC: 1036 v_movrels_b32 v0, v0 1037 1038 write_vgprs_to_mem_with_sqc_w64(v0, 1, s_save_buf_rsrc0, s_save_mem_offset) 1039 1040 s_add_u32 m0, m0, 1 1041 s_cmp_lt_u32 m0, s_save_alloc_size 1042 s_cbranch_scc1 L_SAVE_SHARED_VGPR_WAVE64_LOOP_SQC 1043 1044 s_branch L_SAVE_VGPR_END 1045#endif 1046 1047L_SAVE_SHARED_VGPR_WAVE64_LOOP: 1048 v_movrels_b32 v0, v0 //v0 = v[0+m0] 1049 buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE 1050 s_add_u32 m0, m0, 1 //next vgpr index 1051 s_add_u32 s_save_mem_offset, s_save_mem_offset, 128 1052 s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 1053 s_cbranch_scc1 L_SAVE_SHARED_VGPR_WAVE64_LOOP //SHARED_VGPR save is complete? 1054 1055L_SAVE_VGPR_END: 1056 s_branch L_END_PGM 1057 1058L_RESTORE: 1059 /* Setup Resource Contants */ 1060 s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo 1061 s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi 1062 s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE 1063 s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) 1064 s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC 1065 1066#if ASIC_FAMILY >= CHIP_GFX12 1067 // Save s_restore_spi_init_hi for later use. 1068 s_mov_b32 s_restore_spi_init_hi_save, s_restore_spi_init_hi 1069#endif 1070 1071 //determine it is wave32 or wave64 1072 get_wave_size2(s_restore_size) 1073 1074 s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK 1075 s_cbranch_scc0 L_RESTORE_VGPR 1076 1077 /* restore LDS */ 1078L_RESTORE_LDS: 1079 s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on 1080 s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE 1081 s_and_b32 m0, m0, 1 1082 s_cmp_eq_u32 m0, 1 1083 s_cbranch_scc1 L_ENABLE_RESTORE_LDS_EXEC_HI 1084 s_mov_b32 exec_hi, 0x00000000 1085 s_branch L_RESTORE_LDS_NORMAL 1086L_ENABLE_RESTORE_LDS_EXEC_HI: 1087 s_mov_b32 exec_hi, 0xFFFFFFFF 1088L_RESTORE_LDS_NORMAL: 1089 s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) 1090 s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero? 1091 s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR 1092 s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY 1093 s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes 1094 1095 // LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG) 1096 // 1097 get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size) 1098 get_svgpr_size_bytes(s_restore_tmp) 1099 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp 1100 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes() 1101 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() 1102 1103 s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes 1104 1105 s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE 1106 s_and_b32 m0, m0, 1 1107 s_cmp_eq_u32 m0, 1 1108 s_mov_b32 m0, 0x0 1109 s_cbranch_scc1 L_RESTORE_LDS_LOOP_W64 1110 1111L_RESTORE_LDS_LOOP_W32: 1112#if HAVE_BUFFER_LDS_LOAD 1113 buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW 1114#else 1115 buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset 1116 S_WAITCNT_0 1117 ds_store_addtid_b32 v0 1118#endif 1119 s_add_u32 m0, m0, 128 // 128 DW 1120 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128 //mem offset increased by 128DW 1121 s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 1122 s_cbranch_scc1 L_RESTORE_LDS_LOOP_W32 //LDS restore is complete? 1123 s_branch L_RESTORE_VGPR 1124 1125L_RESTORE_LDS_LOOP_W64: 1126#if HAVE_BUFFER_LDS_LOAD 1127 buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW 1128#else 1129 buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset 1130 S_WAITCNT_0 1131 ds_store_addtid_b32 v0 1132#endif 1133 s_add_u32 m0, m0, 256 // 256 DW 1134 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256 //mem offset increased by 256DW 1135 s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 1136 s_cbranch_scc1 L_RESTORE_LDS_LOOP_W64 //LDS restore is complete? 1137 1138 /* restore VGPRs */ 1139L_RESTORE_VGPR: 1140 // VGPR SR memory offset : 0 1141 s_mov_b32 s_restore_mem_offset, 0x0 1142 s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on 1143 s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE 1144 s_and_b32 m0, m0, 1 1145 s_cmp_eq_u32 m0, 1 1146 s_cbranch_scc1 L_ENABLE_RESTORE_VGPR_EXEC_HI 1147 s_mov_b32 exec_hi, 0x00000000 1148 s_branch L_RESTORE_VGPR_NORMAL 1149L_ENABLE_RESTORE_VGPR_EXEC_HI: 1150 s_mov_b32 exec_hi, 0xFFFFFFFF 1151L_RESTORE_VGPR_NORMAL: 1152 s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) 1153 s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 1154 s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) 1155 //determine it is wave32 or wave64 1156 s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE 1157 s_and_b32 m0, m0, 1 1158 s_cmp_eq_u32 m0, 1 1159 s_cbranch_scc1 L_RESTORE_VGPR_WAVE64 1160 1161 s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes 1162 1163 // VGPR load using dw burst 1164 s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last 1165 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128*4 1166 s_mov_b32 m0, 4 //VGPR initial index value = 4 1167 s_cmp_lt_u32 m0, s_restore_alloc_size 1168 s_cbranch_scc0 L_RESTORE_SGPR 1169 1170L_RESTORE_VGPR_WAVE32_LOOP: 1171 buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE 1172 buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:128 1173 buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:128*2 1174 buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:128*3 1175 S_WAITCNT_0 1176 v_movreld_b32 v0, v0 //v[0+m0] = v0 1177 v_movreld_b32 v1, v1 1178 v_movreld_b32 v2, v2 1179 v_movreld_b32 v3, v3 1180 s_add_u32 m0, m0, 4 //next vgpr index 1181 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128*4 //every buffer_load_dword does 128 bytes 1182 s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 1183 s_cbranch_scc1 L_RESTORE_VGPR_WAVE32_LOOP //VGPR restore (except v0) is complete? 1184 1185 /* VGPR restore on v0 */ 1186 buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE 1187 buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:128 1188 buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:128*2 1189 buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:128*3 1190 S_WAITCNT_0 1191 1192 s_branch L_RESTORE_SGPR 1193 1194L_RESTORE_VGPR_WAVE64: 1195 s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes 1196 1197 // VGPR load using dw burst 1198 s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v4, v0 will be the last 1199 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 1200 s_mov_b32 m0, 4 //VGPR initial index value = 4 1201 s_cmp_lt_u32 m0, s_restore_alloc_size 1202 s_cbranch_scc0 L_RESTORE_SHARED_VGPR 1203 1204L_RESTORE_VGPR_WAVE64_LOOP: 1205 buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE 1206 buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:256 1207 buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:256*2 1208 buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:256*3 1209 S_WAITCNT_0 1210 v_movreld_b32 v0, v0 //v[0+m0] = v0 1211 v_movreld_b32 v1, v1 1212 v_movreld_b32 v2, v2 1213 v_movreld_b32 v3, v3 1214 s_add_u32 m0, m0, 4 //next vgpr index 1215 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes 1216 s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 1217 s_cbranch_scc1 L_RESTORE_VGPR_WAVE64_LOOP //VGPR restore (except v0) is complete? 1218 1219L_RESTORE_SHARED_VGPR: 1220 //Below part will be the restore shared vgpr part (new for gfx10) 1221 s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE) //shared_vgpr_size 1222 s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //shared_vgpr_size is zero? 1223 s_cbranch_scc0 L_RESTORE_V0 //no shared_vgpr used? 1224 s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 3 //Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value) 1225 //m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count. 1226 //restore shared_vgpr will start from the index of m0 1227 s_add_u32 s_restore_alloc_size, s_restore_alloc_size, m0 1228 s_mov_b32 exec_lo, 0xFFFFFFFF 1229 s_mov_b32 exec_hi, 0x00000000 1230L_RESTORE_SHARED_VGPR_WAVE64_LOOP: 1231 buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE 1232 S_WAITCNT_0 1233 v_movreld_b32 v0, v0 //v[0+m0] = v0 1234 s_add_u32 m0, m0, 1 //next vgpr index 1235 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128 1236 s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 1237 s_cbranch_scc1 L_RESTORE_SHARED_VGPR_WAVE64_LOOP //VGPR restore (except v0) is complete? 1238 1239 s_mov_b32 exec_hi, 0xFFFFFFFF //restore back exec_hi before restoring V0!! 1240 1241 /* VGPR restore on v0 */ 1242L_RESTORE_V0: 1243 buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE 1244 buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:256 1245 buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:256*2 1246 buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:256*3 1247 S_WAITCNT_0 1248 1249 /* restore SGPRs */ 1250 //will be 2+8+16*6 1251 // SGPR SR memory offset : size(VGPR)+size(SVGPR) 1252L_RESTORE_SGPR: 1253 get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size) 1254 get_svgpr_size_bytes(s_restore_tmp) 1255 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp 1256 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes() 1257 s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 20*4 //s108~s127 is not saved 1258 1259 s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes 1260 1261 s_mov_b32 m0, s_sgpr_save_num 1262 1263 read_4sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) 1264 S_WAITCNT_0 1265 1266 s_sub_u32 m0, m0, 4 // Restore from S[0] to S[104] 1267 s_nop 0 // hazard SALU M0=> S_MOVREL 1268 1269 s_movreld_b64 s0, s0 //s[0+m0] = s0 1270 s_movreld_b64 s2, s2 1271 1272 read_8sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) 1273 S_WAITCNT_0 1274 1275 s_sub_u32 m0, m0, 8 // Restore from S[0] to S[96] 1276 s_nop 0 // hazard SALU M0=> S_MOVREL 1277 1278 s_movreld_b64 s0, s0 //s[0+m0] = s0 1279 s_movreld_b64 s2, s2 1280 s_movreld_b64 s4, s4 1281 s_movreld_b64 s6, s6 1282 1283 L_RESTORE_SGPR_LOOP: 1284 read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) 1285 S_WAITCNT_0 1286 1287 s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0] 1288 s_nop 0 // hazard SALU M0=> S_MOVREL 1289 1290 s_movreld_b64 s0, s0 //s[0+m0] = s0 1291 s_movreld_b64 s2, s2 1292 s_movreld_b64 s4, s4 1293 s_movreld_b64 s6, s6 1294 s_movreld_b64 s8, s8 1295 s_movreld_b64 s10, s10 1296 s_movreld_b64 s12, s12 1297 s_movreld_b64 s14, s14 1298 1299 s_cmp_eq_u32 m0, 0 //scc = (m0 < s_sgpr_save_num) ? 1 : 0 1300 s_cbranch_scc0 L_RESTORE_SGPR_LOOP 1301 1302 // s_barrier with MODE.DEBUG_EN=1, STATUS.PRIV=1 incorrectly asserts debug exception. 1303 // Clear DEBUG_EN before and restore MODE after the barrier. 1304 s_setreg_imm32_b32 hwreg(HW_REG_MODE), 0 1305#if ASIC_FAMILY < CHIP_GFX12 1306 s_barrier //barrier to ensure the readiness of LDS before access attemps from any other wave in the same TG 1307#endif 1308 1309 /* restore HW registers */ 1310L_RESTORE_HWREG: 1311 // HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR) 1312 get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size) 1313 get_svgpr_size_bytes(s_restore_tmp) 1314 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp 1315 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes() 1316 1317 s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes 1318 1319#if ASIC_FAMILY >= CHIP_GFX12 1320 // Restore s_restore_spi_init_hi before the saved value gets clobbered. 1321 s_mov_b32 s_restore_spi_init_hi, s_restore_spi_init_hi_save 1322#endif 1323 1324 read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) 1325 read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) 1326 read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset) 1327 read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) 1328 read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset) 1329 read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset) 1330 read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset) 1331 read_hwreg_from_mem(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset) 1332 read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) 1333 read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset) 1334 S_WAITCNT_0 1335 1336 s_setreg_b32 hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO), s_restore_flat_scratch 1337 1338 read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset) 1339 S_WAITCNT_0 1340 1341 s_setreg_b32 hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI), s_restore_flat_scratch 1342 1343#if ASIC_FAMILY >= CHIP_GFX12 1344 read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset) 1345 S_WAITCNT_0 1346 s_setreg_b32 hwreg(HW_REG_WAVE_EXCP_FLAG_USER), s_restore_tmp 1347 1348 read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset) 1349 S_WAITCNT_0 1350 s_setreg_b32 hwreg(HW_REG_WAVE_TRAP_CTRL), s_restore_tmp 1351 1352 // Only the first wave needs to restore the workgroup barrier. 1353 s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK 1354 s_cbranch_scc0 L_SKIP_BARRIER_RESTORE 1355 1356 // Skip over WAVE_STATUS, since there is no state to restore from it 1357 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 4 1358 1359 read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset) 1360 S_WAITCNT_0 1361 1362 s_bitcmp1_b32 s_restore_tmp, BARRIER_STATE_VALID_OFFSET 1363 s_cbranch_scc0 L_SKIP_BARRIER_RESTORE 1364 1365 // extract the saved signal count from s_restore_tmp 1366 s_lshr_b32 s_restore_tmp, s_restore_tmp, BARRIER_STATE_SIGNAL_OFFSET 1367 1368 // We need to call s_barrier_signal repeatedly to restore the signal 1369 // count of the work group barrier. The member count is already 1370 // initialized with the number of waves in the work group. 1371L_BARRIER_RESTORE_LOOP: 1372 s_and_b32 s_restore_tmp, s_restore_tmp, s_restore_tmp 1373 s_cbranch_scc0 L_SKIP_BARRIER_RESTORE 1374 s_barrier_signal -1 1375 s_add_i32 s_restore_tmp, s_restore_tmp, -1 1376 s_branch L_BARRIER_RESTORE_LOOP 1377 1378L_SKIP_BARRIER_RESTORE: 1379#endif 1380 1381 s_mov_b32 m0, s_restore_m0 1382 s_mov_b32 exec_lo, s_restore_exec_lo 1383 s_mov_b32 exec_hi, s_restore_exec_hi 1384 1385#if HAVE_XNACK 1386 s_setreg_b32 hwreg(HW_REG_SHADER_XNACK_MASK), s_restore_xnack_mask 1387#endif 1388 1389#if ASIC_FAMILY < CHIP_GFX12 1390 s_setreg_b32 hwreg(S_TRAPSTS_HWREG), s_restore_trapsts 1391#else 1392 // EXCP_FLAG_PRIV.SAVE_CONTEXT and HOST_TRAP may have changed. 1393 // Only restore the other fields to avoid clobbering them. 1394 s_setreg_b32 hwreg(S_TRAPSTS_HWREG, 0, SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT), s_restore_trapsts 1395 s_lshr_b32 s_restore_trapsts, s_restore_trapsts, SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT 1396 s_setreg_b32 hwreg(S_TRAPSTS_HWREG, SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT, 1), s_restore_trapsts 1397 s_lshr_b32 s_restore_trapsts, s_restore_trapsts, SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT - SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT 1398 s_setreg_b32 hwreg(S_TRAPSTS_HWREG, SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT, 32 - SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT), s_restore_trapsts 1399#endif 1400 s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode 1401 1402 // Restore trap temporaries 4-11, 13 initialized by SPI debug dispatch logic 1403 // ttmp SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)+0x40 1404 get_vgpr_size_bytes(s_restore_ttmps_lo, s_restore_size) 1405 get_svgpr_size_bytes(s_restore_ttmps_hi) 1406 s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_ttmps_hi 1407 s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, get_sgpr_size_bytes() 1408 s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0 1409 s_addc_u32 s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0 1410 s_and_b32 s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF 1411 s_load_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 S_COHERENCE 1412 s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 S_COHERENCE 1413 s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 S_COHERENCE 1414 S_WAITCNT_0 1415 1416#if HAVE_XNACK 1417 restore_ib_sts(s_restore_tmp, s_restore_m0) 1418#endif 1419 1420 s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS 1421 s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 1422 s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 1423 1424#if SW_SA_TRAP 1425 // If traps are enabled then return to the shader with PRIV=0. 1426 // Otherwise retain PRIV=1 for subsequent context save requests. 1427 s_getreg_b32 s_restore_tmp, hwreg(HW_REG_STATUS) 1428 s_bitcmp1_b32 s_restore_tmp, SQ_WAVE_STATUS_TRAP_EN_SHIFT 1429 s_cbranch_scc1 L_RETURN_WITHOUT_PRIV 1430 1431 s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu 1432 s_setpc_b64 [s_restore_pc_lo, s_restore_pc_hi] 1433L_RETURN_WITHOUT_PRIV: 1434#endif 1435 1436 s_setreg_b32 hwreg(S_STATUS_HWREG), s_restore_status // SCC is included, which is changed by previous salu 1437 1438#if ASIC_FAMILY >= CHIP_GFX12 1439 // Make barrier and LDS state visible to all waves in the group. 1440 // STATE_PRIV.BARRIER_COMPLETE may change after this point. 1441 s_barrier_signal -2 1442 s_barrier_wait -2 1443#endif 1444 1445 s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution 1446 1447L_END_PGM: 1448 s_endpgm_saved 1449end 1450 1451function write_hwreg_to_mem(s, s_rsrc, s_mem_offset) 1452#if NO_SQC_STORE 1453 // Copy into VGPR for later TCP store. 1454 v_writelane_b32 v2, s, m0 1455 s_add_u32 m0, m0, 0x1 1456#else 1457 s_mov_b32 exec_lo, m0 1458 s_mov_b32 m0, s_mem_offset 1459 s_buffer_store_dword s, s_rsrc, m0 S_COHERENCE 1460 s_add_u32 s_mem_offset, s_mem_offset, 4 1461 s_mov_b32 m0, exec_lo 1462#endif 1463end 1464 1465 1466function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset) 1467#if NO_SQC_STORE 1468 // Copy into VGPR for later TCP store. 1469 for var sgpr_idx = 0; sgpr_idx < 16; sgpr_idx ++ 1470 v_writelane_b32 v2, s[sgpr_idx], ttmp13 1471 s_add_u32 ttmp13, ttmp13, 0x1 1472 end 1473#else 1474 s_buffer_store_dwordx4 s[0], s_rsrc, 0 S_COHERENCE 1475 s_buffer_store_dwordx4 s[4], s_rsrc, 16 S_COHERENCE 1476 s_buffer_store_dwordx4 s[8], s_rsrc, 32 S_COHERENCE 1477 s_buffer_store_dwordx4 s[12], s_rsrc, 48 S_COHERENCE 1478 s_add_u32 s_rsrc[0], s_rsrc[0], 4*16 1479 s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 1480#endif 1481end 1482 1483function write_12sgpr_to_mem(s, s_rsrc, s_mem_offset) 1484#if NO_SQC_STORE 1485 // Copy into VGPR for later TCP store. 1486 for var sgpr_idx = 0; sgpr_idx < 12; sgpr_idx ++ 1487 v_writelane_b32 v2, s[sgpr_idx], ttmp13 1488 s_add_u32 ttmp13, ttmp13, 0x1 1489 end 1490#else 1491 s_buffer_store_dwordx4 s[0], s_rsrc, 0 S_COHERENCE 1492 s_buffer_store_dwordx4 s[4], s_rsrc, 16 S_COHERENCE 1493 s_buffer_store_dwordx4 s[8], s_rsrc, 32 S_COHERENCE 1494 s_add_u32 s_rsrc[0], s_rsrc[0], 4*12 1495 s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 1496#endif 1497end 1498 1499function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) 1500 s_buffer_load_dword s, s_rsrc, s_mem_offset S_COHERENCE 1501 s_add_u32 s_mem_offset, s_mem_offset, 4 1502end 1503 1504function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset) 1505 s_sub_u32 s_mem_offset, s_mem_offset, 4*16 1506 s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset S_COHERENCE 1507end 1508 1509function read_8sgpr_from_mem(s, s_rsrc, s_mem_offset) 1510 s_sub_u32 s_mem_offset, s_mem_offset, 4*8 1511 s_buffer_load_dwordx8 s, s_rsrc, s_mem_offset S_COHERENCE 1512end 1513 1514function read_4sgpr_from_mem(s, s_rsrc, s_mem_offset) 1515 s_sub_u32 s_mem_offset, s_mem_offset, 4*4 1516 s_buffer_load_dwordx4 s, s_rsrc, s_mem_offset S_COHERENCE 1517end 1518 1519#if SAVE_AFTER_XNACK_ERROR 1520function check_if_tcp_store_ok 1521 // If TRAPSTS.XNACK_ERROR=1 then TCP stores will fail. 1522 s_getreg_b32 s_save_tmp, hwreg(HW_REG_TRAPSTS) 1523 s_andn2_b32 s_save_tmp, SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK, s_save_tmp 1524 1525L_TCP_STORE_CHECK_DONE: 1526end 1527 1528function write_vgpr_to_mem_with_sqc(vgpr, n_lanes, s_rsrc, s_mem_offset) 1529 s_mov_b32 s4, 0 1530 1531L_WRITE_VGPR_LANE_LOOP: 1532 for var lane = 0; lane < 4; ++lane 1533 v_readlane_b32 s[lane], vgpr, s4 1534 s_add_u32 s4, s4, 1 1535 end 1536 1537 s_buffer_store_dwordx4 s[0:3], s_rsrc, s_mem_offset glc:1 1538 1539 s_add_u32 s_mem_offset, s_mem_offset, 0x10 1540 s_cmp_eq_u32 s4, n_lanes 1541 s_cbranch_scc0 L_WRITE_VGPR_LANE_LOOP 1542end 1543 1544function write_vgprs_to_mem_with_sqc_w32(vgpr0, n_vgprs, s_rsrc, s_mem_offset) 1545 for var vgpr = 0; vgpr < n_vgprs; ++vgpr 1546 write_vgpr_to_mem_with_sqc(vgpr0[vgpr], 32, s_rsrc, s_mem_offset) 1547 end 1548end 1549 1550function write_vgprs_to_mem_with_sqc_w64(vgpr0, n_vgprs, s_rsrc, s_mem_offset) 1551 for var vgpr = 0; vgpr < n_vgprs; ++vgpr 1552 write_vgpr_to_mem_with_sqc(vgpr0[vgpr], 64, s_rsrc, s_mem_offset) 1553 end 1554end 1555#endif 1556 1557function get_vgpr_size_bytes(s_vgpr_size_byte, s_size) 1558 s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) 1559 s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 1560 s_bitcmp1_b32 s_size, S_WAVE_SIZE 1561 s_cbranch_scc1 L_ENABLE_SHIFT_W64 1562 s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+7) //Number of VGPRs = (vgpr_size + 1) * 4 * 32 * 4 (non-zero value) 1563 s_branch L_SHIFT_DONE 1564L_ENABLE_SHIFT_W64: 1565 s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) 1566L_SHIFT_DONE: 1567end 1568 1569function get_svgpr_size_bytes(s_svgpr_size_byte) 1570 s_getreg_b32 s_svgpr_size_byte, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE) 1571 s_lshl_b32 s_svgpr_size_byte, s_svgpr_size_byte, (3+7) 1572end 1573 1574function get_sgpr_size_bytes 1575 return 512 1576end 1577 1578function get_hwreg_size_bytes 1579 return 128 1580end 1581 1582function get_wave_size2(s_reg) 1583#if ASIC_FAMILY < CHIP_GFX12 1584 s_getreg_b32 s_reg, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) 1585#else 1586 s_getreg_b32 s_reg, hwreg(HW_REG_WAVE_STATUS,SQ_WAVE_STATUS_WAVE64_SHIFT,SQ_WAVE_STATUS_WAVE64_SIZE) 1587#endif 1588 s_lshl_b32 s_reg, s_reg, S_WAVE_SIZE 1589end 1590 1591#if HAVE_XNACK 1592function save_and_clear_ib_sts(tmp1, tmp2) 1593 // Preserve and clear scalar XNACK state before issuing scalar loads. 1594 // Save IB_STS.REPLAY_W64H[25], RCNT[21:16], FIRST_REPLAY[15] into 1595 // unused space ttmp11[31:24]. 1596 s_andn2_b32 ttmp11, ttmp11, (TTMP11_SAVE_REPLAY_W64H_MASK | TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK) 1597 s_getreg_b32 tmp1, hwreg(HW_REG_IB_STS) 1598 s_and_b32 tmp2, tmp1, SQ_WAVE_IB_STS_REPLAY_W64H_MASK 1599 s_lshl_b32 tmp2, tmp2, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT) 1600 s_or_b32 ttmp11, ttmp11, tmp2 1601 s_and_b32 tmp2, tmp1, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK 1602 s_lshl_b32 tmp2, tmp2, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) 1603 s_or_b32 ttmp11, ttmp11, tmp2 1604 s_andn2_b32 tmp1, tmp1, (SQ_WAVE_IB_STS_REPLAY_W64H_MASK | SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK) 1605 s_setreg_b32 hwreg(HW_REG_IB_STS), tmp1 1606end 1607 1608function restore_ib_sts(tmp1, tmp2) 1609 s_lshr_b32 tmp1, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) 1610 s_and_b32 tmp2, tmp1, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK 1611 s_lshr_b32 tmp1, ttmp11, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT) 1612 s_and_b32 tmp1, tmp1, SQ_WAVE_IB_STS_REPLAY_W64H_MASK 1613 s_or_b32 tmp1, tmp1, tmp2 1614 s_setreg_b32 hwreg(HW_REG_IB_STS), tmp1 1615end 1616#endif 1617