/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

/* To compile this assembly code:
 *
 * Navi1x:
 *   cpp -DASIC_FAMILY=CHIP_NAVI10 cwsr_trap_handler_gfx10.asm -P -o nv1x.sp3
 *   sp3 nv1x.sp3 -hex nv1x.hex
 *
 * gfx10:
 *   cpp -DASIC_FAMILY=CHIP_SIENNA_CICHLID cwsr_trap_handler_gfx10.asm -P -o gfx10.sp3
 *   sp3 gfx10.sp3 -hex gfx10.hex
 *
 * gfx11:
 *   cpp -DASIC_FAMILY=CHIP_PLUM_BONITO cwsr_trap_handler_gfx10.asm -P -o gfx11.sp3
 *   sp3 gfx11.sp3 -hex gfx11.hex
 */

#define CHIP_NAVI10 26
#define CHIP_SIENNA_CICHLID 30
#define CHIP_PLUM_BONITO 36

#define NO_SQC_STORE (ASIC_FAMILY >= CHIP_SIENNA_CICHLID)
#define HAVE_XNACK (ASIC_FAMILY < CHIP_SIENNA_CICHLID)
#define HAVE_SENDMSG_RTN (ASIC_FAMILY >= CHIP_PLUM_BONITO)
#define HAVE_BUFFER_LDS_LOAD (ASIC_FAMILY < CHIP_PLUM_BONITO)
#define SW_SA_TRAP (ASIC_FAMILY == CHIP_PLUM_BONITO)
#define SAVE_AFTER_XNACK_ERROR (HAVE_XNACK && !NO_SQC_STORE) // workaround for TCP store failure after XNACK error when ALLOW_REPLAY=0, for debugger
#define SINGLE_STEP_MISSED_WORKAROUND 1 // workaround for lost MODE.DEBUG_EN exception when SAVECTX raised

#define S_COHERENCE glc:1
#define V_COHERENCE slc:1 glc:1
#define S_WAITCNT_0 s_waitcnt 0
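// For reference (derived from the #defines above), the per-ASIC feature
// selection works out to:
//
//                            NAVI10  SIENNA_CICHLID  PLUM_BONITO
//   NO_SQC_STORE                0          1              1
//   HAVE_XNACK                  1          0              0
//   HAVE_SENDMSG_RTN            0          0              1
//   HAVE_BUFFER_LDS_LOAD        1          1              0
//   SW_SA_TRAP                  0          0              1
//   SAVE_AFTER_XNACK_ERROR      1          0              0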
var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006
var SQ_WAVE_STATUS_HALT_MASK = 0x2000
var SQ_WAVE_STATUS_ECC_ERR_MASK = 0x20000
var SQ_WAVE_STATUS_TRAP_EN_SHIFT = 6
var SQ_WAVE_IB_STS2_WAVE64_SHIFT = 11
var SQ_WAVE_IB_STS2_WAVE64_SIZE = 1
var SQ_WAVE_LDS_ALLOC_GRANULARITY = 8
var S_STATUS_HWREG = HW_REG_STATUS
var S_STATUS_ALWAYS_CLEAR_MASK = SQ_WAVE_STATUS_SPI_PRIO_MASK|SQ_WAVE_STATUS_ECC_ERR_MASK
var S_STATUS_HALT_MASK = SQ_WAVE_STATUS_HALT_MASK
var S_SAVE_PC_HI_TRAP_ID_MASK = 0x00FF0000
var S_SAVE_PC_HI_HT_MASK = 0x01000000

var SQ_WAVE_STATUS_NO_VGPRS_SHIFT = 24
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 8
var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT = 24
var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE = 4

#if ASIC_FAMILY < CHIP_PLUM_BONITO
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8
#else
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 12
#endif

var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400
var SQ_WAVE_TRAPSTS_EXCP_MASK = 0x1FF
var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10
var SQ_WAVE_TRAPSTS_ADDR_WATCH_MASK = 0x80
var SQ_WAVE_TRAPSTS_ADDR_WATCH_SHIFT = 7
var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100
var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8
var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK = 0x800
var SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT = 11
var SQ_WAVE_TRAPSTS_EXCP_HI_MASK = 0x7000
#if ASIC_FAMILY >= CHIP_PLUM_BONITO
var SQ_WAVE_TRAPSTS_HOST_TRAP_SHIFT = 16
var SQ_WAVE_TRAPSTS_WAVE_START_MASK = 0x20000
var SQ_WAVE_TRAPSTS_WAVE_START_SHIFT = 17
var SQ_WAVE_TRAPSTS_WAVE_END_MASK = 0x40000
var SQ_WAVE_TRAPSTS_TRAP_AFTER_INST_MASK = 0x100000
#endif
var SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK = 0x10000000

var SQ_WAVE_MODE_EXCP_EN_SHIFT = 12
var SQ_WAVE_MODE_EXCP_EN_ADDR_WATCH_SHIFT = 19

var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15
var SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT = 25
var SQ_WAVE_IB_STS_REPLAY_W64H_MASK = 0x02000000
var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK = 0x003F8000

var SQ_WAVE_MODE_DEBUG_EN_MASK = 0x800

var S_TRAPSTS_RESTORE_PART_1_SIZE = SQ_WAVE_TRAPSTS_SAVECTX_SHIFT
var S_TRAPSTS_RESTORE_PART_2_SHIFT = SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT

#if ASIC_FAMILY < CHIP_PLUM_BONITO
var S_TRAPSTS_NON_MASKABLE_EXCP_MASK = SQ_WAVE_TRAPSTS_MEM_VIOL_MASK|SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK
var S_TRAPSTS_RESTORE_PART_2_SIZE = 32 - S_TRAPSTS_RESTORE_PART_2_SHIFT
var S_TRAPSTS_RESTORE_PART_3_SHIFT = 0
var S_TRAPSTS_RESTORE_PART_3_SIZE = 0
#else
var S_TRAPSTS_NON_MASKABLE_EXCP_MASK = SQ_WAVE_TRAPSTS_MEM_VIOL_MASK |\
		SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK |\
		SQ_WAVE_TRAPSTS_WAVE_START_MASK |\
		SQ_WAVE_TRAPSTS_WAVE_END_MASK |\
		SQ_WAVE_TRAPSTS_TRAP_AFTER_INST_MASK
var S_TRAPSTS_RESTORE_PART_2_SIZE = SQ_WAVE_TRAPSTS_HOST_TRAP_SHIFT - SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT
var S_TRAPSTS_RESTORE_PART_3_SHIFT = SQ_WAVE_TRAPSTS_WAVE_START_SHIFT
var S_TRAPSTS_RESTORE_PART_3_SIZE = 32 - S_TRAPSTS_RESTORE_PART_3_SHIFT
#endif
var S_TRAPSTS_HWREG = HW_REG_TRAPSTS
var S_TRAPSTS_SAVE_CONTEXT_MASK = SQ_WAVE_TRAPSTS_SAVECTX_MASK
var S_TRAPSTS_SAVE_CONTEXT_SHIFT = SQ_WAVE_TRAPSTS_SAVECTX_SHIFT

// bits [31:24] unused by SPI debug data
var TTMP11_SAVE_REPLAY_W64H_SHIFT = 31
var TTMP11_SAVE_REPLAY_W64H_MASK = 0x80000000
var TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT = 24
var TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK = 0x7F000000
var TTMP11_DEBUG_TRAP_ENABLED_SHIFT = 23
var TTMP11_DEBUG_TRAP_ENABLED_MASK = 0x800000

// SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14]
// when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000
var S_SAVE_BUF_RSRC_WORD3_MISC = 0x10807FAC
var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000
var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26

var S_SAVE_PC_HI_FIRST_WAVE_MASK = 0x80000000
var S_SAVE_PC_HI_FIRST_WAVE_SHIFT = 31

var s_sgpr_save_num = 108

var s_save_spi_init_lo = exec_lo
var s_save_spi_init_hi = exec_hi
var s_save_pc_lo = ttmp0
var s_save_pc_hi = ttmp1
var s_save_exec_lo = ttmp2
var s_save_exec_hi = ttmp3
var s_save_status = ttmp12
var s_save_trapsts = ttmp15
var s_save_xnack_mask = s_save_trapsts
var s_wave_size = ttmp7
var s_save_buf_rsrc0 = ttmp8
var s_save_buf_rsrc1 = ttmp9
var s_save_buf_rsrc2 = ttmp10
var s_save_buf_rsrc3 = ttmp11
var s_save_mem_offset = ttmp4
var s_save_alloc_size = s_save_trapsts
var s_save_tmp = ttmp14
var s_save_m0 = ttmp5
var s_save_ttmps_lo = s_save_tmp
var s_save_ttmps_hi = s_save_trapsts
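// Note on the aliases above: several save-side names share one ttmp (e.g.
// s_save_xnack_mask, s_save_alloc_size and s_save_ttmps_hi all map onto
// s_save_trapsts/ttmp15). This is safe because each value is consumed or
// written out to the save area before the register is reused for the next
// purpose.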
var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE
var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC

var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000
var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26
var S_WAVE_SIZE = 25

var s_restore_spi_init_lo = exec_lo
var s_restore_spi_init_hi = exec_hi
var s_restore_mem_offset = ttmp12
var s_restore_alloc_size = ttmp3
var s_restore_tmp = ttmp2
var s_restore_mem_offset_save = s_restore_tmp
var s_restore_m0 = s_restore_alloc_size
var s_restore_mode = ttmp7
var s_restore_flat_scratch = s_restore_tmp
var s_restore_pc_lo = ttmp0
var s_restore_pc_hi = ttmp1
var s_restore_exec_lo = ttmp4
var s_restore_exec_hi = ttmp5
var s_restore_status = ttmp14
var s_restore_trapsts = ttmp15
var s_restore_xnack_mask = ttmp13
var s_restore_buf_rsrc0 = ttmp8
var s_restore_buf_rsrc1 = ttmp9
var s_restore_buf_rsrc2 = ttmp10
var s_restore_buf_rsrc3 = ttmp11
var s_restore_size = ttmp6
var s_restore_ttmps_lo = s_restore_tmp
var s_restore_ttmps_hi = s_restore_alloc_size
var s_restore_spi_init_hi_save = s_restore_exec_hi

shader main
	asic(DEFAULT)
	type(CS)
	wave_size(32)

	s_branch L_SKIP_RESTORE	// not a restore; might be a regular trap or save

L_JUMP_TO_RESTORE:
	s_branch L_RESTORE

L_SKIP_RESTORE:
	s_getreg_b32 s_save_status, hwreg(S_STATUS_HWREG)	// save STATUS since we will change SCC

	// Clear SPI_PRIO: do not save with elevated priority.
	// Clear ECC_ERR: prevents SQC store and triggers FATAL_HALT if setreg'd.
	s_andn2_b32 s_save_status, s_save_status, S_STATUS_ALWAYS_CLEAR_MASK

	s_getreg_b32 s_save_trapsts, hwreg(S_TRAPSTS_HWREG)

#if SW_SA_TRAP
	// If ttmp1[30] is set then issue s_barrier to unblock dependent waves.
	s_bitcmp1_b32 s_save_pc_hi, 30
	s_cbranch_scc0 L_TRAP_NO_BARRIER
	s_barrier

L_TRAP_NO_BARRIER:
	// If ttmp1[31] is set then trap may occur early.
	// Spin wait until SAVECTX exception is raised.
	s_bitcmp1_b32 s_save_pc_hi, 31
	s_cbranch_scc1 L_CHECK_SAVE
#endif

	s_and_b32 ttmp2, s_save_status, S_STATUS_HALT_MASK
	s_cbranch_scc0 L_NOT_HALTED

L_HALTED:
	// Host trap may occur while wave is halted.
	s_and_b32 ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK
	s_cbranch_scc1 L_FETCH_2ND_TRAP

L_CHECK_SAVE:
	s_and_b32 ttmp2, s_save_trapsts, S_TRAPSTS_SAVE_CONTEXT_MASK
	s_cbranch_scc1 L_SAVE

	// Wave is halted but neither host trap nor SAVECTX is raised.
	// Caused by instruction fetch memory violation.
	// Spin wait until context saved to prevent interrupt storm.
	s_sleep 0x10
	s_getreg_b32 s_save_trapsts, hwreg(S_TRAPSTS_HWREG)
	s_branch L_CHECK_SAVE

L_NOT_HALTED:
	// Let second-level handle non-SAVECTX exception or trap.
	// Any concurrent SAVECTX will be handled upon re-entry once halted.

	// Check non-maskable exceptions. memory_violation, illegal_instruction
	// and xnack_error exceptions always cause the wave to enter the trap
	// handler.
	s_and_b32 ttmp2, s_save_trapsts, S_TRAPSTS_NON_MASKABLE_EXCP_MASK
	s_cbranch_scc1 L_FETCH_2ND_TRAP

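	// For reference: the maskable-exception check below shifts the
	// TRAPSTS.EXCP bits left by SQ_WAVE_MODE_EXCP_EN_SHIFT so that excp
	// bit i lines up with mode.excp_en bit i+12. For example, addr_watch
	// is trapsts bit 7 and lands on mode bit 19, which is
	// SQ_WAVE_MODE_EXCP_EN_ADDR_WATCH_SHIFT.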
	// Check for maskable exceptions in trapsts.excp and trapsts.excp_hi.
	// Maskable exceptions only cause the wave to enter the trap handler if
	// their respective bit in mode.excp_en is set.
	s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCP_MASK|SQ_WAVE_TRAPSTS_EXCP_HI_MASK
	s_cbranch_scc0 L_CHECK_TRAP_ID

	s_and_b32 ttmp3, s_save_trapsts, SQ_WAVE_TRAPSTS_ADDR_WATCH_MASK|SQ_WAVE_TRAPSTS_EXCP_HI_MASK
	s_cbranch_scc0 L_NOT_ADDR_WATCH
	s_bitset1_b32 ttmp2, SQ_WAVE_TRAPSTS_ADDR_WATCH_SHIFT	// Check all addr_watch[123] exceptions against excp_en.addr_watch

L_NOT_ADDR_WATCH:
	s_getreg_b32 ttmp3, hwreg(HW_REG_MODE)
	s_lshl_b32 ttmp2, ttmp2, SQ_WAVE_MODE_EXCP_EN_SHIFT
	s_and_b32 ttmp2, ttmp2, ttmp3
	s_cbranch_scc1 L_FETCH_2ND_TRAP

L_CHECK_TRAP_ID:
	// Check trap_id != 0
	s_and_b32 ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK
	s_cbranch_scc1 L_FETCH_2ND_TRAP

#if SINGLE_STEP_MISSED_WORKAROUND
	// Prioritize single step exception over context save.
	// Second-level trap will halt wave and RFE, re-entering for SAVECTX.
	s_getreg_b32 ttmp2, hwreg(HW_REG_MODE)
	s_and_b32 ttmp2, ttmp2, SQ_WAVE_MODE_DEBUG_EN_MASK
	s_cbranch_scc1 L_FETCH_2ND_TRAP
#endif

	s_and_b32 ttmp2, s_save_trapsts, S_TRAPSTS_SAVE_CONTEXT_MASK
	s_cbranch_scc1 L_SAVE

L_FETCH_2ND_TRAP:
#if HAVE_XNACK
	save_and_clear_ib_sts(ttmp14, ttmp15)
#endif

	// Read second-level TBA/TMA from first-level TMA and jump if available.
	// ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data)
	// ttmp12 holds SQ_WAVE_STATUS
#if HAVE_SENDMSG_RTN
	s_sendmsg_rtn_b64 [ttmp14, ttmp15], sendmsg(MSG_RTN_GET_TMA)
	S_WAITCNT_0
#else
	s_getreg_b32 ttmp14, hwreg(HW_REG_SHADER_TMA_LO)
	s_getreg_b32 ttmp15, hwreg(HW_REG_SHADER_TMA_HI)
#endif
	s_lshl_b64 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8

	s_bitcmp1_b32 ttmp15, 0xF
	s_cbranch_scc0 L_NO_SIGN_EXTEND_TMA
	s_or_b32 ttmp15, ttmp15, 0xFFFF0000
L_NO_SIGN_EXTEND_TMA:

	s_load_dword ttmp2, [ttmp14, ttmp15], 0x10 S_COHERENCE	// debug trap enabled flag
	S_WAITCNT_0
	s_lshl_b32 ttmp2, ttmp2, TTMP11_DEBUG_TRAP_ENABLED_SHIFT
	s_andn2_b32 ttmp11, ttmp11, TTMP11_DEBUG_TRAP_ENABLED_MASK
	s_or_b32 ttmp11, ttmp11, ttmp2

	s_load_dwordx2 [ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 S_COHERENCE	// second-level TBA
	S_WAITCNT_0
	s_load_dwordx2 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 S_COHERENCE	// second-level TMA
	S_WAITCNT_0

	s_and_b64 [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3]
	s_cbranch_scc0 L_NO_NEXT_TRAP	// second-level trap handler has not been set
	s_setpc_b64 [ttmp2, ttmp3]	// jump to second-level trap handler

L_NO_NEXT_TRAP:
	// If not caused by trap then halt wave to prevent re-entry.
	s_and_b32 ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK
	s_cbranch_scc1 L_TRAP_CASE

	// Host trap will not cause trap re-entry.
	s_and_b32 ttmp2, s_save_pc_hi, S_SAVE_PC_HI_HT_MASK
	s_cbranch_scc1 L_EXIT_TRAP
	s_or_b32 s_save_status, s_save_status, S_STATUS_HALT_MASK

	// If the PC points to S_ENDPGM then context save will fail if STATUS.HALT is set.
	// Rewind the PC to prevent this from occurring.
	s_sub_u32 ttmp0, ttmp0, 0x8
	s_subb_u32 ttmp1, ttmp1, 0x0

	s_branch L_EXIT_TRAP

L_TRAP_CASE:
	// Advance past trap instruction to prevent re-entry.
	s_add_u32 ttmp0, ttmp0, 0x4
	s_addc_u32 ttmp1, ttmp1, 0x0

L_EXIT_TRAP:
	s_and_b32 ttmp1, ttmp1, 0xFFFF

#if HAVE_XNACK
	restore_ib_sts(ttmp14, ttmp15)
#endif

	// Restore SQ_WAVE_STATUS.
	s_and_b64 exec, exec, exec	// Restore STATUS.EXECZ, not writable by s_setreg_b32
	s_and_b64 vcc, vcc, vcc		// Restore STATUS.VCCZ, not writable by s_setreg_b32

	s_setreg_b32 hwreg(S_STATUS_HWREG), s_save_status
	s_rfe_b64 [ttmp0, ttmp1]

L_SAVE:
	// If VGPRs have been deallocated then terminate the wavefront.
	// It has no remaining program to run and cannot save without VGPRs.
#if ASIC_FAMILY == CHIP_PLUM_BONITO
	s_bitcmp1_b32 s_save_status, SQ_WAVE_STATUS_NO_VGPRS_SHIFT
	s_cbranch_scc0 L_HAVE_VGPRS
	s_endpgm
L_HAVE_VGPRS:
#endif
	s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff	// pc[47:32]
	s_mov_b32 s_save_tmp, 0
	s_setreg_b32 hwreg(S_TRAPSTS_HWREG, S_TRAPSTS_SAVE_CONTEXT_SHIFT, 1), s_save_tmp	// clear saveCtx bit

#if HAVE_XNACK
	save_and_clear_ib_sts(s_save_tmp, s_save_trapsts)
#endif

	/* inform SPI the readiness and wait for SPI's go signal */
	s_mov_b32 s_save_exec_lo, exec_lo	// save EXEC and use EXEC for the go signal from SPI
	s_mov_b32 s_save_exec_hi, exec_hi
	s_mov_b64 exec, 0x0			// clear EXEC to get ready to receive

#if HAVE_SENDMSG_RTN
	s_sendmsg_rtn_b64 [exec_lo, exec_hi], sendmsg(MSG_RTN_SAVE_WAVE)
#else
	s_sendmsg sendmsg(MSG_SAVEWAVE)		// send SPI a message and wait for SPI's write to EXEC
#endif

#if ASIC_FAMILY < CHIP_SIENNA_CICHLID
L_SLEEP:
	// Sleep 1 (64 clk) is not enough with 8 waves per SIMD and would hang
	// SQ: the 7th/8th waves cannot win arbitration to execute an
	// instruction while the other waves sit in this sleep loop waiting
	// for wrexec != 0.
	s_sleep 0x2
	s_cbranch_execz L_SLEEP
#else
	S_WAITCNT_0
#endif

	// Save first_wave flag so we can clear high bits of save address.
	s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK
	s_lshl_b32 s_save_tmp, s_save_tmp, (S_SAVE_PC_HI_FIRST_WAVE_SHIFT - S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT)
	s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp

#if NO_SQC_STORE
#if ASIC_FAMILY <= CHIP_SIENNA_CICHLID
	// gfx10: If there was a VALU exception, the exception state must be
	// cleared before executing the VALU instructions below.
	v_clrexcp
#endif

	// Trap temporaries must be saved via VGPR but all VGPRs are in use.
	// There is no ttmp space to hold the resource constant for VGPR save.
	// Save v0 by itself since it requires only two SGPRs.
	s_mov_b32 s_save_ttmps_lo, exec_lo
	s_and_b32 s_save_ttmps_hi, exec_hi, 0xFFFF
	s_mov_b32 exec_lo, 0xFFFFFFFF
	s_mov_b32 exec_hi, 0xFFFFFFFF
	global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] V_COHERENCE
	v_mov_b32 v0, 0x0
	s_mov_b32 exec_lo, s_save_ttmps_lo
	s_mov_b32 exec_hi, s_save_ttmps_hi
#endif

	// Save trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
	// ttmp SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)+0x40
	get_wave_size2(s_save_ttmps_hi)
	get_vgpr_size_bytes(s_save_ttmps_lo, s_save_ttmps_hi)
	get_svgpr_size_bytes(s_save_ttmps_hi)
	s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_ttmps_hi
	s_and_b32 s_save_ttmps_hi, s_save_spi_init_hi, 0xFFFF
	s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, get_sgpr_size_bytes()
	s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo
	s_addc_u32 s_save_ttmps_hi, s_save_ttmps_hi, 0x0

#if NO_SQC_STORE
	v_writelane_b32 v0, ttmp4, 0x4
	v_writelane_b32 v0, ttmp5, 0x5
	v_writelane_b32 v0, ttmp6, 0x6
	v_writelane_b32 v0, ttmp7, 0x7
	v_writelane_b32 v0, ttmp8, 0x8
	v_writelane_b32 v0, ttmp9, 0x9
	v_writelane_b32 v0, ttmp10, 0xA
	v_writelane_b32 v0, ttmp11, 0xB
	v_writelane_b32 v0, ttmp13, 0xD
	v_writelane_b32 v0, exec_lo, 0xE
	v_writelane_b32 v0, exec_hi, 0xF

	s_mov_b32 exec_lo, 0x3FFF
	s_mov_b32 exec_hi, 0x0
	global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] inst_offset:0x40 V_COHERENCE
	v_readlane_b32 ttmp14, v0, 0xE
	v_readlane_b32 ttmp15, v0, 0xF
	s_mov_b32 exec_lo, ttmp14
	s_mov_b32 exec_hi, ttmp15
#else
	s_store_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x50 S_COHERENCE
	s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x60 S_COHERENCE
	s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x74 S_COHERENCE
#endif

	/* setup Resource Constants */
	s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo		// base_addr_lo
	s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF	// base_addr_hi
	s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
	s_mov_b32 s_save_buf_rsrc2, 0	// NUM_RECORDS initial value = 0 (in bytes), though not necessarily initialized
	s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC

	s_mov_b32 s_save_m0, m0

	/* global mem offset */
	s_mov_b32 s_save_mem_offset, 0x0
	get_wave_size2(s_wave_size)

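	// For reference, the save-area layout assembled from here on (offsets
	// are derived from the get_*_size_bytes helpers at the end of this
	// file):
	//
	//   [0]             VGPRs (v0..vN)
	//   [+size(VGPR)]   shared VGPRs (wave64 only)
	//   [+size(SVGPR)]  SGPRs (512 bytes: s0..s107 plus padding)
	//   [+size(SGPR)]   HWREGs (128 bytes; ttmps stored at +0x40)
	//   [+size(HWREG)]  LDS (first wave in the threadgroup only)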
#if HAVE_XNACK
	// Save and clear vector XNACK state late to free up SGPRs.
	s_getreg_b32 s_save_xnack_mask, hwreg(HW_REG_SHADER_XNACK_MASK)
	s_setreg_imm32_b32 hwreg(HW_REG_SHADER_XNACK_MASK), 0x0
#endif

	/* save first 4 VGPRs, needed for SGPR save */
	s_mov_b32 exec_lo, 0xFFFFFFFF	// need every thread from now on
	s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE
	s_and_b32 m0, m0, 1
	s_cmp_eq_u32 m0, 1
	s_cbranch_scc1 L_ENABLE_SAVE_4VGPR_EXEC_HI
	s_mov_b32 exec_hi, 0x00000000
	s_branch L_SAVE_4VGPR_WAVE32
L_ENABLE_SAVE_4VGPR_EXEC_HI:
	s_mov_b32 exec_hi, 0xFFFFFFFF
	s_branch L_SAVE_4VGPR_WAVE64
L_SAVE_4VGPR_WAVE32:
	s_mov_b32 s_save_buf_rsrc2, 0x1000000	// NUM_RECORDS in bytes

	// VGPR Allocated in 4-GPR granularity

#if SAVE_AFTER_XNACK_ERROR
	check_if_tcp_store_ok()
	s_cbranch_scc1 L_SAVE_FIRST_VGPRS32_WITH_TCP

	write_vgprs_to_mem_with_sqc_w32(v0, 4, s_save_buf_rsrc0, s_save_mem_offset)
	s_branch L_SAVE_HWREG

L_SAVE_FIRST_VGPRS32_WITH_TCP:
#endif

#if !NO_SQC_STORE
	buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE
#endif
	buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128
	buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128*2
	buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128*3
	s_branch L_SAVE_HWREG

L_SAVE_4VGPR_WAVE64:
	s_mov_b32 s_save_buf_rsrc2, 0x1000000	// NUM_RECORDS in bytes

	// VGPR Allocated in 4-GPR granularity

#if SAVE_AFTER_XNACK_ERROR
	check_if_tcp_store_ok()
	s_cbranch_scc1 L_SAVE_FIRST_VGPRS64_WITH_TCP

	write_vgprs_to_mem_with_sqc_w64(v0, 4, s_save_buf_rsrc0, s_save_mem_offset)
	s_branch L_SAVE_HWREG

L_SAVE_FIRST_VGPRS64_WITH_TCP:
#endif

#if !NO_SQC_STORE
	buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE
#endif
	buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256
	buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256*2
	buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256*3

	/* save HW registers */

L_SAVE_HWREG:
	// HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
	get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
	get_svgpr_size_bytes(s_save_tmp)
	s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
	s_add_u32 s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()

	s_mov_b32 s_save_buf_rsrc2, 0x1000000	// NUM_RECORDS in bytes

#if NO_SQC_STORE
	v_mov_b32 v0, 0x0	// Offset[31:0] from buffer resource
	v_mov_b32 v1, 0x0	// Offset[63:32] from buffer resource
	v_mov_b32 v2, 0x0	// Set of SGPRs for TCP store
	s_mov_b32 m0, 0x0	// Next lane of v2 to write to
#endif

	write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
	write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset)
	s_andn2_b32 s_save_tmp, s_save_pc_hi, S_SAVE_PC_HI_FIRST_WAVE_MASK
	write_hwreg_to_mem(s_save_tmp, s_save_buf_rsrc0, s_save_mem_offset)
	write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset)
	write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset)
	write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset)

	s_getreg_b32 s_save_tmp, hwreg(S_TRAPSTS_HWREG)
	write_hwreg_to_mem(s_save_tmp, s_save_buf_rsrc0, s_save_mem_offset)

	// Not used on Sienna_Cichlid but keep layout same for debugger.
	write_hwreg_to_mem(s_save_xnack_mask, s_save_buf_rsrc0, s_save_mem_offset)

	s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE)
	write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)

	s_getreg_b32 s_save_m0, hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO)
	write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)

	s_getreg_b32 s_save_m0, hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI)
	write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)

#if NO_SQC_STORE
	// Write HWREGs with 16 VGPR lanes. TTMPs occupy space after this.
	s_mov_b32 exec_lo, 0xFFFF
	s_mov_b32 exec_hi, 0x0
	buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE

	// Write SGPRs with 32 VGPR lanes. This works in wave32 and wave64 mode.
	s_mov_b32 exec_lo, 0xFFFFFFFF
#endif

	/* save SGPRs */
	// Save SGPRs before the LDS save, so s0 to s4 can be used during the LDS save...

	// SGPR SR memory offset : size(VGPR)+size(SVGPR)
	get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
	get_svgpr_size_bytes(s_save_tmp)
	s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
	s_mov_b32 s_save_buf_rsrc2, 0x1000000	// NUM_RECORDS in bytes

#if NO_SQC_STORE
	s_mov_b32 ttmp13, 0x0	// next VGPR lane to copy SGPR into
#else
	// back up s_save_buf_rsrc0 to s_save_xnack_mask, since the
	// write_16sgpr_to_mem function will change rsrc0
	s_mov_b32 s_save_xnack_mask, s_save_buf_rsrc0
	s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset
	s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0
#endif

	s_mov_b32 m0, 0x0	// SGPR initial index value = 0
	s_nop 0x0		// Manually inserted wait states
L_SAVE_SGPR_LOOP:
	// SGPR is allocated in 16 SGPR granularity
	s_movrels_b64 s0, s0	// s0 = s[0+m0], s1 = s[1+m0]
	s_movrels_b64 s2, s2	// s2 = s[2+m0], s3 = s[3+m0]
	s_movrels_b64 s4, s4	// s4 = s[4+m0], s5 = s[5+m0]
	s_movrels_b64 s6, s6	// s6 = s[6+m0], s7 = s[7+m0]
	s_movrels_b64 s8, s8	// s8 = s[8+m0], s9 = s[9+m0]
	s_movrels_b64 s10, s10	// s10 = s[10+m0], s11 = s[11+m0]
	s_movrels_b64 s12, s12	// s12 = s[12+m0], s13 = s[13+m0]
	s_movrels_b64 s14, s14	// s14 = s[14+m0], s15 = s[15+m0]

	write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset)

#if NO_SQC_STORE
	s_cmp_eq_u32 ttmp13, 0x20	// have 32 VGPR lanes filled?
	s_cbranch_scc0 L_SAVE_SGPR_SKIP_TCP_STORE

	buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE
	s_add_u32 s_save_mem_offset, s_save_mem_offset, 0x80
	s_mov_b32 ttmp13, 0x0
	v_mov_b32 v2, 0x0
L_SAVE_SGPR_SKIP_TCP_STORE:
#endif

	s_add_u32 m0, m0, 16	// next sgpr index
	s_cmp_lt_u32 m0, 96	// scc = (m0 < 96) ? 1 : 0
	s_cbranch_scc1 L_SAVE_SGPR_LOOP	// first 96 SGPR save is complete?
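	// To recap a pass through the loop above: with m0 = 16, each
	// s_movrels_b64 copies s[m0+n : m0+n+1] into s[n : n+1], so s0-s15
	// hold s16-s31 before being written out. m0 steps through
	// 0, 16, ..., 80, covering s0-s95; the remaining 12 SGPRs (s96-s107)
	// are handled below, for s_sgpr_save_num = 108 in total.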
	// save the remaining 12 SGPRs
	s_movrels_b64 s0, s0	// s0 = s[0+m0], s1 = s[1+m0]
	s_movrels_b64 s2, s2	// s2 = s[2+m0], s3 = s[3+m0]
	s_movrels_b64 s4, s4	// s4 = s[4+m0], s5 = s[5+m0]
	s_movrels_b64 s6, s6	// s6 = s[6+m0], s7 = s[7+m0]
	s_movrels_b64 s8, s8	// s8 = s[8+m0], s9 = s[9+m0]
	s_movrels_b64 s10, s10	// s10 = s[10+m0], s11 = s[11+m0]
	write_12sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset)

#if NO_SQC_STORE
	buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE
#else
	// restore s_save_buf_rsrc0,1
	s_mov_b32 s_save_buf_rsrc0, s_save_xnack_mask
#endif

	/* save LDS */

L_SAVE_LDS:
	// Change EXEC to all threads...
	s_mov_b32 exec_lo, 0xFFFFFFFF	// need every thread from now on
	s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE
	s_and_b32 m0, m0, 1
	s_cmp_eq_u32 m0, 1
	s_cbranch_scc1 L_ENABLE_SAVE_LDS_EXEC_HI
	s_mov_b32 exec_hi, 0x00000000
	s_branch L_SAVE_LDS_NORMAL
L_ENABLE_SAVE_LDS_EXEC_HI:
	s_mov_b32 exec_hi, 0xFFFFFFFF
L_SAVE_LDS_NORMAL:
	s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
	s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF	// lds_size is zero?
	s_cbranch_scc0 L_SAVE_LDS_DONE	// no lds used? jump to L_SAVE_LDS_DONE

	s_barrier	// LDS is used? wait for other waves in the same TG
	s_and_b32 s_save_tmp, s_save_pc_hi, S_SAVE_PC_HI_FIRST_WAVE_MASK
	s_cbranch_scc0 L_SAVE_LDS_DONE

	// first wave does the LDS save;

	s_lshl_b32 s_save_alloc_size, s_save_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY
	s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size	// NUM_RECORDS in bytes

	// LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)
	get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
	get_svgpr_size_bytes(s_save_tmp)
	s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
	s_add_u32 s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()
	s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()

	s_mov_b32 s_save_buf_rsrc2, 0x1000000	// NUM_RECORDS in bytes

	// v0 = lane_id * 4 (per-lane LDS byte address)
	v_mbcnt_lo_u32_b32 v0, -1, 0
	v_mbcnt_hi_u32_b32 v0, -1, v0
	v_mul_u32_u24 v0, 4, v0

	s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE
	s_and_b32 m0, m0, 1
	s_cmp_eq_u32 m0, 1
	s_mov_b32 m0, 0x0
	s_cbranch_scc1 L_SAVE_LDS_W64

L_SAVE_LDS_W32:
#if SAVE_AFTER_XNACK_ERROR
	check_if_tcp_store_ok()
	s_cbranch_scc1 L_SAVE_LDS_WITH_TCP_W32

L_SAVE_LDS_LOOP_SQC_W32:
	ds_read_b32 v1, v0
	S_WAITCNT_0

	write_vgprs_to_mem_with_sqc_w32(v1, 1, s_save_buf_rsrc0, s_save_mem_offset)

	s_add_u32 m0, m0, 128		// every pass stores 128 bytes
	v_add_nc_u32 v0, v0, 128	// mem offset increased by 128 bytes
	s_cmp_lt_u32 m0, s_save_alloc_size	// scc = (m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1 L_SAVE_LDS_LOOP_SQC_W32	// LDS save is complete?

	s_branch L_SAVE_LDS_DONE

L_SAVE_LDS_WITH_TCP_W32:
#endif

	s_mov_b32 s3, 128
	s_nop 0
	s_nop 0
	s_nop 0
L_SAVE_LDS_LOOP_W32:
	ds_read_b32 v1, v0
	S_WAITCNT_0
	buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE

	s_add_u32 m0, m0, s3	// every buffer_store_dword pass covers 128 bytes
	s_add_u32 s_save_mem_offset, s_save_mem_offset, s3
	v_add_nc_u32 v0, v0, 128	// mem offset increased by 128 bytes
	s_cmp_lt_u32 m0, s_save_alloc_size	// scc = (m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1 L_SAVE_LDS_LOOP_W32	// LDS save is complete?

	s_branch L_SAVE_LDS_DONE

L_SAVE_LDS_W64:
#if SAVE_AFTER_XNACK_ERROR
	check_if_tcp_store_ok()
	s_cbranch_scc1 L_SAVE_LDS_WITH_TCP_W64

L_SAVE_LDS_LOOP_SQC_W64:
	ds_read_b32 v1, v0
	S_WAITCNT_0

	write_vgprs_to_mem_with_sqc_w64(v1, 1, s_save_buf_rsrc0, s_save_mem_offset)

	s_add_u32 m0, m0, 256		// every pass stores 256 bytes
	v_add_nc_u32 v0, v0, 256	// mem offset increased by 256 bytes
	s_cmp_lt_u32 m0, s_save_alloc_size	// scc = (m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1 L_SAVE_LDS_LOOP_SQC_W64	// LDS save is complete?

	s_branch L_SAVE_LDS_DONE

L_SAVE_LDS_WITH_TCP_W64:
#endif

	s_mov_b32 s3, 256
	s_nop 0
	s_nop 0
	s_nop 0
L_SAVE_LDS_LOOP_W64:
	ds_read_b32 v1, v0
	S_WAITCNT_0
	buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE

	s_add_u32 m0, m0, s3	// every buffer_store_dword pass covers 256 bytes
	s_add_u32 s_save_mem_offset, s_save_mem_offset, s3
	v_add_nc_u32 v0, v0, 256	// mem offset increased by 256 bytes
	s_cmp_lt_u32 m0, s_save_alloc_size	// scc = (m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1 L_SAVE_LDS_LOOP_W64	// LDS save is complete?
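	// Note on the strides above: each pass moves one dword per lane, so a
	// wave32 pass covers 32 * 4 = 128 bytes of LDS and a wave64 pass
	// covers 64 * 4 = 256 bytes.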

L_SAVE_LDS_DONE:
	/* save VGPRs - save the remaining VGPRs */
L_SAVE_VGPR:
	// VGPR SR memory offset: 0
	s_mov_b32 exec_lo, 0xFFFFFFFF	// need every thread from now on
	s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE
	s_and_b32 m0, m0, 1
	s_cmp_eq_u32 m0, 1
	s_cbranch_scc1 L_ENABLE_SAVE_VGPR_EXEC_HI
	s_mov_b32 s_save_mem_offset, (0+128*4)	// offset for the remaining VGPRs
	s_mov_b32 exec_hi, 0x00000000
	s_branch L_SAVE_VGPR_NORMAL
L_ENABLE_SAVE_VGPR_EXEC_HI:
	s_mov_b32 s_save_mem_offset, (0+256*4)	// offset for the remaining VGPRs
	s_mov_b32 exec_hi, 0xFFFFFFFF
L_SAVE_VGPR_NORMAL:
	s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
	s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
	s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2	// Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
	// determine whether it is wave32 or wave64
	s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE
	s_and_b32 m0, m0, 1
	s_cmp_eq_u32 m0, 1
	s_cbranch_scc1 L_SAVE_VGPR_WAVE64

	s_mov_b32 s_save_buf_rsrc2, 0x1000000	// NUM_RECORDS in bytes

	// VGPR Allocated in 4-GPR granularity

	// VGPR store using dw burst
	s_mov_b32 m0, 0x4	// VGPR initial index value = 4
	s_cmp_lt_u32 m0, s_save_alloc_size
	s_cbranch_scc0 L_SAVE_VGPR_END

#if SAVE_AFTER_XNACK_ERROR
	check_if_tcp_store_ok()
	s_cbranch_scc1 L_SAVE_VGPR_W32_LOOP

L_SAVE_VGPR_LOOP_SQC_W32:
	v_movrels_b32 v0, v0	// v0 = v[0+m0]
	v_movrels_b32 v1, v1	// v1 = v[1+m0]
	v_movrels_b32 v2, v2	// v2 = v[2+m0]
	v_movrels_b32 v3, v3	// v3 = v[3+m0]

	write_vgprs_to_mem_with_sqc_w32(v0, 4, s_save_buf_rsrc0, s_save_mem_offset)

	s_add_u32 m0, m0, 4
	s_cmp_lt_u32 m0, s_save_alloc_size
	s_cbranch_scc1 L_SAVE_VGPR_LOOP_SQC_W32

	s_branch L_SAVE_VGPR_END
#endif

L_SAVE_VGPR_W32_LOOP:
	v_movrels_b32 v0, v0	// v0 = v[0+m0]
	v_movrels_b32 v1, v1	// v1 = v[1+m0]
	v_movrels_b32 v2, v2	// v2 = v[2+m0]
	v_movrels_b32 v3, v3	// v3 = v[3+m0]

	buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE
	buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128
	buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128*2
	buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128*3

	s_add_u32 m0, m0, 4	// next vgpr index
	s_add_u32 s_save_mem_offset, s_save_mem_offset, 128*4	// every buffer_store_dword does 128 bytes
	s_cmp_lt_u32 m0, s_save_alloc_size	// scc = (m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1 L_SAVE_VGPR_W32_LOOP	// VGPR save is complete?

	s_branch L_SAVE_VGPR_END

L_SAVE_VGPR_WAVE64:
	s_mov_b32 s_save_buf_rsrc2, 0x1000000	// NUM_RECORDS in bytes

	// VGPR store using dw burst
	s_mov_b32 m0, 0x4	// VGPR initial index value = 4
	s_cmp_lt_u32 m0, s_save_alloc_size
	s_cbranch_scc0 L_SAVE_SHARED_VGPR

#if SAVE_AFTER_XNACK_ERROR
	check_if_tcp_store_ok()
	s_cbranch_scc1 L_SAVE_VGPR_W64_LOOP

L_SAVE_VGPR_LOOP_SQC_W64:
	v_movrels_b32 v0, v0	// v0 = v[0+m0]
	v_movrels_b32 v1, v1	// v1 = v[1+m0]
	v_movrels_b32 v2, v2	// v2 = v[2+m0]
	v_movrels_b32 v3, v3	// v3 = v[3+m0]

	write_vgprs_to_mem_with_sqc_w64(v0, 4, s_save_buf_rsrc0, s_save_mem_offset)

	s_add_u32 m0, m0, 4
	s_cmp_lt_u32 m0, s_save_alloc_size
	s_cbranch_scc1 L_SAVE_VGPR_LOOP_SQC_W64

	s_branch L_SAVE_VGPR_END
#endif

L_SAVE_VGPR_W64_LOOP:
	v_movrels_b32 v0, v0	// v0 = v[0+m0]
	v_movrels_b32 v1, v1	// v1 = v[1+m0]
	v_movrels_b32 v2, v2	// v2 = v[2+m0]
	v_movrels_b32 v3, v3	// v3 = v[3+m0]

	buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE
	buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256
	buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256*2
	buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256*3

	s_add_u32 m0, m0, 4	// next vgpr index
	s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4	// every buffer_store_dword does 256 bytes
	s_cmp_lt_u32 m0, s_save_alloc_size	// scc = (m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1 L_SAVE_VGPR_W64_LOOP	// VGPR save is complete?

L_SAVE_SHARED_VGPR:
	// Below: save shared VGPRs (new for gfx10)
	s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)
	s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF	// shared_vgpr_size is zero?
	s_cbranch_scc0 L_SAVE_VGPR_END	// no shared_vgpr used? jump to L_SAVE_VGPR_END
	s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 3	// Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value)
	// m0 already holds the normal VGPR count; add the shared VGPR count to get the total.
	// Saving shared VGPRs starts at index m0.
	s_add_u32 s_save_alloc_size, s_save_alloc_size, m0
	s_mov_b32 exec_lo, 0xFFFFFFFF
	s_mov_b32 exec_hi, 0x00000000

#if SAVE_AFTER_XNACK_ERROR
	check_if_tcp_store_ok()
	s_cbranch_scc1 L_SAVE_SHARED_VGPR_WAVE64_LOOP

L_SAVE_SHARED_VGPR_WAVE64_LOOP_SQC:
	v_movrels_b32 v0, v0

	write_vgprs_to_mem_with_sqc_w64(v0, 1, s_save_buf_rsrc0, s_save_mem_offset)

	s_add_u32 m0, m0, 1
	s_cmp_lt_u32 m0, s_save_alloc_size
	s_cbranch_scc1 L_SAVE_SHARED_VGPR_WAVE64_LOOP_SQC

	s_branch L_SAVE_VGPR_END
#endif

L_SAVE_SHARED_VGPR_WAVE64_LOOP:
	v_movrels_b32 v0, v0	// v0 = v[0+m0]
	buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE
	s_add_u32 m0, m0, 1	// next vgpr index
	s_add_u32 s_save_mem_offset, s_save_mem_offset, 128
	s_cmp_lt_u32 m0, s_save_alloc_size	// scc = (m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1 L_SAVE_SHARED_VGPR_WAVE64_LOOP	// SHARED_VGPR save is complete?
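	// Note: shared VGPRs exist only in wave64 mode and are allocated in
	// groups of 8. Only the low half of exec is enabled above, so each
	// store covers 32 lanes * 4 bytes = 128 bytes, matching the 128-byte
	// offset step in the loop.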

L_SAVE_VGPR_END:
	s_branch L_END_PGM

L_RESTORE:
	/* Setup Resource Constants */
	s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo	// base_addr_lo
	s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF	// base_addr_hi
	s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
	s_mov_b32 s_restore_buf_rsrc2, 0	// NUM_RECORDS initial value = 0 (in bytes)
	s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC

	// determine whether it is wave32 or wave64
	get_wave_size2(s_restore_size)

	s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
	s_cbranch_scc0 L_RESTORE_VGPR

	/* restore LDS */
L_RESTORE_LDS:
	s_mov_b32 exec_lo, 0xFFFFFFFF	// need every thread from now on
	s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE
	s_and_b32 m0, m0, 1
	s_cmp_eq_u32 m0, 1
	s_cbranch_scc1 L_ENABLE_RESTORE_LDS_EXEC_HI
	s_mov_b32 exec_hi, 0x00000000
	s_branch L_RESTORE_LDS_NORMAL
L_ENABLE_RESTORE_LDS_EXEC_HI:
	s_mov_b32 exec_hi, 0xFFFFFFFF
L_RESTORE_LDS_NORMAL:
	s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
	s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF	// lds_size is zero?
	s_cbranch_scc0 L_RESTORE_VGPR	// no lds used? jump to L_RESTORE_VGPR
	s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY
	s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size	// NUM_RECORDS in bytes

	// LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)
	get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
	get_svgpr_size_bytes(s_restore_tmp)
	s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
	s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
	s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes()

	s_mov_b32 s_restore_buf_rsrc2, 0x1000000	// NUM_RECORDS in bytes

	s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE
	s_and_b32 m0, m0, 1
	s_cmp_eq_u32 m0, 1
	s_mov_b32 m0, 0x0
	s_cbranch_scc1 L_RESTORE_LDS_LOOP_W64

L_RESTORE_LDS_LOOP_W32:
#if HAVE_BUFFER_LDS_LOAD
	buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1	// first 64DW
#else
	buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset
	S_WAITCNT_0
	ds_store_addtid_b32 v0
#endif
	s_add_u32 m0, m0, 128	// 128 bytes
	s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128	// mem offset increased by 128 bytes
	s_cmp_lt_u32 m0, s_restore_alloc_size	// scc = (m0 < s_restore_alloc_size) ? 1 : 0
	s_cbranch_scc1 L_RESTORE_LDS_LOOP_W32	// LDS restore is complete?
	s_branch L_RESTORE_VGPR

L_RESTORE_LDS_LOOP_W64:
#if HAVE_BUFFER_LDS_LOAD
	buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1	// first 64DW
#else
	buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset
	S_WAITCNT_0
	ds_store_addtid_b32 v0
#endif
	s_add_u32 m0, m0, 256	// 256 bytes
	s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256	// mem offset increased by 256 bytes
	s_cmp_lt_u32 m0, s_restore_alloc_size	// scc = (m0 < s_restore_alloc_size) ? 1 : 0
	s_cbranch_scc1 L_RESTORE_LDS_LOOP_W64	// LDS restore is complete?
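	// Note: on ASICs without HAVE_BUFFER_LDS_LOAD (gfx11) the direct
	// buffer-to-LDS path (lds:1) is unavailable, so the loops above load
	// into v0 and then write LDS with ds_store_addtid_b32 instead.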

	/* restore VGPRs */
L_RESTORE_VGPR:
	// VGPR SR memory offset : 0
	s_mov_b32 s_restore_mem_offset, 0x0
	s_mov_b32 exec_lo, 0xFFFFFFFF	// need every thread from now on
	s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE
	s_and_b32 m0, m0, 1
	s_cmp_eq_u32 m0, 1
	s_cbranch_scc1 L_ENABLE_RESTORE_VGPR_EXEC_HI
	s_mov_b32 exec_hi, 0x00000000
	s_branch L_RESTORE_VGPR_NORMAL
L_ENABLE_RESTORE_VGPR_EXEC_HI:
	s_mov_b32 exec_hi, 0xFFFFFFFF
L_RESTORE_VGPR_NORMAL:
	s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
	s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
	s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2	// Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
	// determine whether it is wave32 or wave64
	s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE
	s_and_b32 m0, m0, 1
	s_cmp_eq_u32 m0, 1
	s_cbranch_scc1 L_RESTORE_VGPR_WAVE64

	s_mov_b32 s_restore_buf_rsrc2, 0x1000000	// NUM_RECORDS in bytes

	// VGPR load using dw burst
	s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset	// restore starts with v4; v0 will be the last
	s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128*4
	s_mov_b32 m0, 4	// VGPR initial index value = 4
	s_cmp_lt_u32 m0, s_restore_alloc_size
	s_cbranch_scc0 L_RESTORE_SGPR

L_RESTORE_VGPR_WAVE32_LOOP:
	buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE
	buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:128
	buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:128*2
	buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:128*3
	S_WAITCNT_0
	v_movreld_b32 v0, v0	// v[0+m0] = v0
	v_movreld_b32 v1, v1
	v_movreld_b32 v2, v2
	v_movreld_b32 v3, v3
	s_add_u32 m0, m0, 4	// next vgpr index
	s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128*4	// every buffer_load_dword does 128 bytes
	s_cmp_lt_u32 m0, s_restore_alloc_size	// scc = (m0 < s_restore_alloc_size) ? 1 : 0
	s_cbranch_scc1 L_RESTORE_VGPR_WAVE32_LOOP	// VGPR restore (except v0) is complete?
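	// v0-v3 are reloaded last (below) because the loop above still uses
	// them as the staging registers for v_movreld; restoring them first
	// would let the loop clobber their final values.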

	/* VGPR restore on v0 */
	buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE
	buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:128
	buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:128*2
	buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:128*3
	S_WAITCNT_0

	s_branch L_RESTORE_SGPR

L_RESTORE_VGPR_WAVE64:
	s_mov_b32 s_restore_buf_rsrc2, 0x1000000	// NUM_RECORDS in bytes

	// VGPR load using dw burst
	s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset	// restore starts with v4; v0 will be the last
	s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
	s_mov_b32 m0, 4	// VGPR initial index value = 4
	s_cmp_lt_u32 m0, s_restore_alloc_size
	s_cbranch_scc0 L_RESTORE_SHARED_VGPR

L_RESTORE_VGPR_WAVE64_LOOP:
	buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE
	buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:256
	buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:256*2
	buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:256*3
	S_WAITCNT_0
	v_movreld_b32 v0, v0	// v[0+m0] = v0
	v_movreld_b32 v1, v1
	v_movreld_b32 v2, v2
	v_movreld_b32 v3, v3
	s_add_u32 m0, m0, 4	// next vgpr index
	s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4	// every buffer_load_dword does 256 bytes
	s_cmp_lt_u32 m0, s_restore_alloc_size	// scc = (m0 < s_restore_alloc_size) ? 1 : 0
	s_cbranch_scc1 L_RESTORE_VGPR_WAVE64_LOOP	// VGPR restore (except v0) is complete?

L_RESTORE_SHARED_VGPR:
	// Below: restore shared VGPRs (new for gfx10)
	s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)	// shared_vgpr_size
	s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF	// shared_vgpr_size is zero?
	s_cbranch_scc0 L_RESTORE_V0	// no shared_vgpr used?
	s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 3	// Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value)
	// m0 already holds the normal VGPR count; add the shared VGPR count to get the total.
	// Restoring shared VGPRs starts at index m0.
	s_add_u32 s_restore_alloc_size, s_restore_alloc_size, m0
	s_mov_b32 exec_lo, 0xFFFFFFFF
	s_mov_b32 exec_hi, 0x00000000
L_RESTORE_SHARED_VGPR_WAVE64_LOOP:
	buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE
	S_WAITCNT_0
	v_movreld_b32 v0, v0	// v[0+m0] = v0
	s_add_u32 m0, m0, 1	// next vgpr index
	s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128
	s_cmp_lt_u32 m0, s_restore_alloc_size	// scc = (m0 < s_restore_alloc_size) ? 1 : 0
	s_cbranch_scc1 L_RESTORE_SHARED_VGPR_WAVE64_LOOP	// SHARED_VGPR restore is complete?

	s_mov_b32 exec_hi, 0xFFFFFFFF	// restore exec_hi before restoring v0!

	/* VGPR restore on v0 */
L_RESTORE_V0:
	buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE
	buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:256
	buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:256*2
	buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:256*3
	S_WAITCNT_0

	/* restore SGPRs */
	// restored as 4 + 8 + 16*6 = 108 SGPRs
	// SGPR SR memory offset : size(VGPR)+size(SVGPR)
L_RESTORE_SGPR:
	get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
	get_svgpr_size_bytes(s_restore_tmp)
	s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
	s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
	s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 20*4	// s108~s127 are not saved

	s_mov_b32 s_restore_buf_rsrc2, 0x1000000	// NUM_RECORDS in bytes

	s_mov_b32 m0, s_sgpr_save_num

	read_4sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
	S_WAITCNT_0

	s_sub_u32 m0, m0, 4	// Restore from S[0] to S[104]
	s_nop 0			// hazard SALU M0 => S_MOVREL

	s_movreld_b64 s0, s0	// s[0+m0] = s0
	s_movreld_b64 s2, s2

	read_8sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
	S_WAITCNT_0

	s_sub_u32 m0, m0, 8	// Restore from S[0] to S[96]
	s_nop 0			// hazard SALU M0 => S_MOVREL

	s_movreld_b64 s0, s0	// s[0+m0] = s0
	s_movreld_b64 s2, s2
	s_movreld_b64 s4, s4
	s_movreld_b64 s6, s6

L_RESTORE_SGPR_LOOP:
	read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
	S_WAITCNT_0

	s_sub_u32 m0, m0, 16	// Restore from S[n] to S[0]
	s_nop 0			// hazard SALU M0 => S_MOVREL

	s_movreld_b64 s0, s0	// s[0+m0] = s0
	s_movreld_b64 s2, s2
	s_movreld_b64 s4, s4
	s_movreld_b64 s6, s6
	s_movreld_b64 s8, s8
	s_movreld_b64 s10, s10
	s_movreld_b64 s12, s12
	s_movreld_b64 s14, s14

	s_cmp_eq_u32 m0, 0	// scc = (m0 == 0) ? 1 : 0
	s_cbranch_scc0 L_RESTORE_SGPR_LOOP
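	// To recap the countdown above: m0 started at s_sgpr_save_num (108);
	// the 4-SGPR read restored s[104:107], the 8-SGPR read restored
	// s[96:103], and each 16-SGPR loop iteration then restored the next
	// 16 down (s[80:95], ..., s[0:15]) until m0 reached 0.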

	// s_barrier with MODE.DEBUG_EN=1, STATUS.PRIV=1 incorrectly asserts debug exception.
	// Clear DEBUG_EN before and restore MODE after the barrier.
	s_setreg_imm32_b32 hwreg(HW_REG_MODE), 0
	s_barrier	// barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG

	/* restore HW registers */
L_RESTORE_HWREG:
	// HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
	get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
	get_svgpr_size_bytes(s_restore_tmp)
	s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
	s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()

	s_mov_b32 s_restore_buf_rsrc2, 0x1000000	// NUM_RECORDS in bytes

	read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset)
	S_WAITCNT_0

	s_setreg_b32 hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO), s_restore_flat_scratch

	read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset)
	S_WAITCNT_0

	s_setreg_b32 hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI), s_restore_flat_scratch

	s_mov_b32 m0, s_restore_m0
	s_mov_b32 exec_lo, s_restore_exec_lo
	s_mov_b32 exec_hi, s_restore_exec_hi

#if HAVE_XNACK
	s_setreg_b32 hwreg(HW_REG_SHADER_XNACK_MASK), s_restore_xnack_mask
#endif

	// {TRAPSTS/EXCP_FLAG_PRIV}.SAVE_CONTEXT and HOST_TRAP may have changed.
	// Only restore the other fields to avoid clobbering them.
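	// For reference, the split below skips the volatile bits: part 1
	// restores bits [0,10), part 2 restores from bit 11 (skipping
	// SAVE_CONTEXT at bit 10), and on gfx11 part 3 restores from bit 17
	// (skipping HOST_TRAP at bit 16).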
	s_setreg_b32 hwreg(S_TRAPSTS_HWREG, 0, S_TRAPSTS_RESTORE_PART_1_SIZE), s_restore_trapsts
	s_lshr_b32 s_restore_trapsts, s_restore_trapsts, S_TRAPSTS_RESTORE_PART_2_SHIFT
	s_setreg_b32 hwreg(S_TRAPSTS_HWREG, S_TRAPSTS_RESTORE_PART_2_SHIFT, S_TRAPSTS_RESTORE_PART_2_SIZE), s_restore_trapsts

if S_TRAPSTS_RESTORE_PART_3_SIZE > 0
	s_lshr_b32 s_restore_trapsts, s_restore_trapsts, S_TRAPSTS_RESTORE_PART_3_SHIFT - S_TRAPSTS_RESTORE_PART_2_SHIFT
	s_setreg_b32 hwreg(S_TRAPSTS_HWREG, S_TRAPSTS_RESTORE_PART_3_SHIFT, S_TRAPSTS_RESTORE_PART_3_SIZE), s_restore_trapsts
end

	s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode

	// Restore trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
	// ttmp SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)+0x40
	get_vgpr_size_bytes(s_restore_ttmps_lo, s_restore_size)
	get_svgpr_size_bytes(s_restore_ttmps_hi)
	s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_ttmps_hi
	s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, get_sgpr_size_bytes()
	s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0
	s_addc_u32 s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0
	s_and_b32 s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF
	s_load_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 S_COHERENCE
	s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 S_COHERENCE
	s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 S_COHERENCE
	S_WAITCNT_0

#if HAVE_XNACK
	restore_ib_sts(s_restore_tmp, s_restore_m0)
#endif

	s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff	// pc[47:32]; do it here in order not to affect STATUS
	s_and_b64 exec, exec, exec	// Restore STATUS.EXECZ, not writable by s_setreg_b32
	s_and_b64 vcc, vcc, vcc		// Restore STATUS.VCCZ, not writable by s_setreg_b32

#if SW_SA_TRAP
	// If traps are enabled then return to the shader with PRIV=0.
	// Otherwise retain PRIV=1 for subsequent context save requests.
	s_getreg_b32 s_restore_tmp, hwreg(HW_REG_STATUS)
	s_bitcmp1_b32 s_restore_tmp, SQ_WAVE_STATUS_TRAP_EN_SHIFT
	s_cbranch_scc1 L_RETURN_WITHOUT_PRIV

	s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status	// SCC is included; it was changed by the preceding SALU
	s_setpc_b64 [s_restore_pc_lo, s_restore_pc_hi]
L_RETURN_WITHOUT_PRIV:
#endif

	s_setreg_b32 hwreg(S_STATUS_HWREG), s_restore_status	// SCC is included; it was changed by the preceding SALU

	s_rfe_b64 s_restore_pc_lo	// Return to the main shader program and resume execution

L_END_PGM:
	s_endpgm_saved
end

function write_hwreg_to_mem(s, s_rsrc, s_mem_offset)
#if NO_SQC_STORE
	// Copy into VGPR for later TCP store.
	v_writelane_b32 v2, s, m0
	s_add_u32 m0, m0, 0x1
#else
	s_mov_b32 exec_lo, m0
	s_mov_b32 m0, s_mem_offset
	s_buffer_store_dword s, s_rsrc, m0 S_COHERENCE
	s_add_u32 s_mem_offset, s_mem_offset, 4
	s_mov_b32 m0, exec_lo
#endif
end

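// Usage note: L_SAVE_HWREG above calls write_hwreg_to_mem 11 times (m0,
// pc_lo, pc_hi, exec_lo, exec_hi, status, trapsts, xnack_mask, mode,
// flat_scratch_lo, flat_scratch_hi). In NO_SQC_STORE builds the values
// accumulate in lanes 0-10 of v2 and are flushed with a single 16-lane
// buffer_store_dword.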
function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset)
#if NO_SQC_STORE
	// Copy into VGPR for later TCP store.
	for var sgpr_idx = 0; sgpr_idx < 16; sgpr_idx ++
		v_writelane_b32 v2, s[sgpr_idx], ttmp13
		s_add_u32 ttmp13, ttmp13, 0x1
	end
#else
	s_buffer_store_dwordx4 s[0], s_rsrc, 0 S_COHERENCE
	s_buffer_store_dwordx4 s[4], s_rsrc, 16 S_COHERENCE
	s_buffer_store_dwordx4 s[8], s_rsrc, 32 S_COHERENCE
	s_buffer_store_dwordx4 s[12], s_rsrc, 48 S_COHERENCE
	s_add_u32 s_rsrc[0], s_rsrc[0], 4*16
	s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0
#endif
end

function write_12sgpr_to_mem(s, s_rsrc, s_mem_offset)
#if NO_SQC_STORE
	// Copy into VGPR for later TCP store.
	for var sgpr_idx = 0; sgpr_idx < 12; sgpr_idx ++
		v_writelane_b32 v2, s[sgpr_idx], ttmp13
		s_add_u32 ttmp13, ttmp13, 0x1
	end
#else
	s_buffer_store_dwordx4 s[0], s_rsrc, 0 S_COHERENCE
	s_buffer_store_dwordx4 s[4], s_rsrc, 16 S_COHERENCE
	s_buffer_store_dwordx4 s[8], s_rsrc, 32 S_COHERENCE
	s_add_u32 s_rsrc[0], s_rsrc[0], 4*12
	s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0
#endif
end

function read_hwreg_from_mem(s, s_rsrc, s_mem_offset)
	s_buffer_load_dword s, s_rsrc, s_mem_offset S_COHERENCE
	s_add_u32 s_mem_offset, s_mem_offset, 4
end

function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset)
	s_sub_u32 s_mem_offset, s_mem_offset, 4*16
	s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset S_COHERENCE
end

function read_8sgpr_from_mem(s, s_rsrc, s_mem_offset)
	s_sub_u32 s_mem_offset, s_mem_offset, 4*8
	s_buffer_load_dwordx8 s, s_rsrc, s_mem_offset S_COHERENCE
end

function read_4sgpr_from_mem(s, s_rsrc, s_mem_offset)
	s_sub_u32 s_mem_offset, s_mem_offset, 4*4
	s_buffer_load_dwordx4 s, s_rsrc, s_mem_offset S_COHERENCE
end

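// Note: the read_*sgpr_from_mem helpers decrement s_mem_offset before each
// load, so the SGPR restore walks the 108-SGPR block from the top down,
// matching the m0 countdown in L_RESTORE_SGPR.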
#if SAVE_AFTER_XNACK_ERROR
function check_if_tcp_store_ok
	// If TRAPSTS.XNACK_ERROR=1 then TCP stores will fail.
	s_getreg_b32 s_save_tmp, hwreg(HW_REG_TRAPSTS)
	s_andn2_b32 s_save_tmp, SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK, s_save_tmp

L_TCP_STORE_CHECK_DONE:
end

function write_vgpr_to_mem_with_sqc(vgpr, n_lanes, s_rsrc, s_mem_offset)
	s_mov_b32 s4, 0

L_WRITE_VGPR_LANE_LOOP:
	for var lane = 0; lane < 4; ++lane
		v_readlane_b32 s[lane], vgpr, s4
		s_add_u32 s4, s4, 1
	end

	s_buffer_store_dwordx4 s[0:3], s_rsrc, s_mem_offset glc:1

	s_add_u32 s_mem_offset, s_mem_offset, 0x10
	s_cmp_eq_u32 s4, n_lanes
	s_cbranch_scc0 L_WRITE_VGPR_LANE_LOOP
end

function write_vgprs_to_mem_with_sqc_w32(vgpr0, n_vgprs, s_rsrc, s_mem_offset)
	for var vgpr = 0; vgpr < n_vgprs; ++vgpr
		write_vgpr_to_mem_with_sqc(vgpr0[vgpr], 32, s_rsrc, s_mem_offset)
	end
end

function write_vgprs_to_mem_with_sqc_w64(vgpr0, n_vgprs, s_rsrc, s_mem_offset)
	for var vgpr = 0; vgpr < n_vgprs; ++vgpr
		write_vgpr_to_mem_with_sqc(vgpr0[vgpr], 64, s_rsrc, s_mem_offset)
	end
end
#endif

function get_vgpr_size_bytes(s_vgpr_size_byte, s_size)
	s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
	s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1
	s_bitcmp1_b32 s_size, S_WAVE_SIZE
	s_cbranch_scc1 L_ENABLE_SHIFT_W64
	s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+7)	// bytes = (vgpr_size + 1) * 4 VGPRs * 32 lanes * 4 bytes (non-zero value)
	s_branch L_SHIFT_DONE
L_ENABLE_SHIFT_W64:
	s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8)	// bytes = (vgpr_size + 1) * 4 VGPRs * 64 lanes * 4 bytes (non-zero value)
L_SHIFT_DONE:
end

function get_svgpr_size_bytes(s_svgpr_size_byte)
	s_getreg_b32 s_svgpr_size_byte, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)
	s_lshl_b32 s_svgpr_size_byte, s_svgpr_size_byte, (3+7)
end

function get_sgpr_size_bytes
	return 512
end

function get_hwreg_size_bytes
	return 128
end

function get_wave_size2(s_reg)
	s_getreg_b32 s_reg, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE)
	s_lshl_b32 s_reg, s_reg, S_WAVE_SIZE
end

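// Worked example (hypothetical allocation): with vgpr_size = 7, a wave64
// owns (7+1)*4 = 32 VGPRs, i.e. 32 * 64 lanes * 4 bytes = 8192 bytes,
// which is the << (2+8) in get_vgpr_size_bytes above. get_svgpr_size_bytes
// is analogous: shared_vgpr_size * 8 registers * 32 lanes * 4 bytes
// = size << (3+7).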
#if HAVE_XNACK
function save_and_clear_ib_sts(tmp1, tmp2)
	// Preserve and clear scalar XNACK state before issuing scalar loads.
	// Save IB_STS.REPLAY_W64H[25], RCNT[21:16], FIRST_REPLAY[15] into
	// unused space ttmp11[31:24].
	s_andn2_b32 ttmp11, ttmp11, (TTMP11_SAVE_REPLAY_W64H_MASK | TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK)
	s_getreg_b32 tmp1, hwreg(HW_REG_IB_STS)
	s_and_b32 tmp2, tmp1, SQ_WAVE_IB_STS_REPLAY_W64H_MASK
	s_lshl_b32 tmp2, tmp2, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT)
	s_or_b32 ttmp11, ttmp11, tmp2
	s_and_b32 tmp2, tmp1, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
	s_lshl_b32 tmp2, tmp2, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
	s_or_b32 ttmp11, ttmp11, tmp2
	s_andn2_b32 tmp1, tmp1, (SQ_WAVE_IB_STS_REPLAY_W64H_MASK | SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK)
	s_setreg_b32 hwreg(HW_REG_IB_STS), tmp1
end

function restore_ib_sts(tmp1, tmp2)
	s_lshr_b32 tmp1, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
	s_and_b32 tmp2, tmp1, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
	s_lshr_b32 tmp1, ttmp11, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT)
	s_and_b32 tmp1, tmp1, SQ_WAVE_IB_STS_REPLAY_W64H_MASK
	s_or_b32 tmp1, tmp1, tmp2
	s_setreg_b32 hwreg(HW_REG_IB_STS), tmp1
end
#endif