/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

/* To compile this assembly code:
 *
 * gfx12:
 *   cpp -DASIC_FAMILY=CHIP_GFX12 cwsr_trap_handler_gfx12.asm -P -o gfx12.sp3
 *   sp3 gfx12.sp3 -hex gfx12.hex
 */

/* ASIC_FAMILY is supplied on the cpp command line and must be one of: */
#define CHIP_GFX12 37
#define CHIP_GC_12_0_3 38

/* Per-ASIC feature switches. Each evaluates to 0 or 1 (or a count). */
#define HAVE_XNACK (ASIC_FAMILY == CHIP_GC_12_0_3)
#define HAVE_57BIT_ADDRESS (ASIC_FAMILY == CHIP_GC_12_0_3)
#define HAVE_BANKED_VGPRS (ASIC_FAMILY == CHIP_GC_12_0_3)
#define NUM_NAMED_BARRIERS (ASIC_FAMILY == CHIP_GC_12_0_3 ? 0x10 : 0)
#define HAVE_CLUSTER_BARRIER (ASIC_FAMILY == CHIP_GC_12_0_3)
#define CLUSTER_BARRIER_SERIALIZE_WORKAROUND (ASIC_FAMILY == CHIP_GC_12_0_3)
#define RELAXED_SCHEDULING_IN_TRAP (ASIC_FAMILY == CHIP_GFX12)
#define HAVE_INSTRUCTION_FIXUP (ASIC_FAMILY == CHIP_GC_12_0_3)

#define SINGLE_STEP_MISSED_WORKAROUND 1	//workaround for lost TRAP_AFTER_INST exception when SAVECTX raised
#define HAVE_VALU_SGPR_HAZARD (ASIC_FAMILY == CHIP_GFX12)
#define WAVE32_ONLY (ASIC_FAMILY == CHIP_GC_12_0_3)
#define SAVE_TTMPS_IN_SGPR_BLOCK (ASIC_FAMILY >= CHIP_GC_12_0_3)

/* XNACK support is only implemented for wave32-only parts. */
#if HAVE_XNACK && !WAVE32_ONLY
# error
#endif

/* Number of implemented address bits in the high dword, and the matching mask. */
#define ADDRESS_HI32_NUM_BITS ((HAVE_57BIT_ADDRESS ? 57 : 48) - 32)
#define ADDRESS_HI32_MASK ((1 << ADDRESS_HI32_NUM_BITS) - 1)

// STATE_PRIV / STATUS field positions and masks.
var SQ_WAVE_STATE_PRIV_ALL_BARRIER_COMPLETE_MASK = 0x4 | (NUM_NAMED_BARRIERS ? 0x8 : 0) | (HAVE_CLUSTER_BARRIER ? 0x10000 : 0)
var SQ_WAVE_STATE_PRIV_SCC_SHIFT = 9
var SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK = 0xC00
var SQ_WAVE_STATE_PRIV_HALT_MASK = 0x4000
var SQ_WAVE_STATE_PRIV_POISON_ERR_MASK = 0x8000
var SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT = 15
var SQ_WAVE_STATUS_WAVE64_SHIFT = 29
var SQ_WAVE_STATUS_WAVE64_SIZE = 1
var SQ_WAVE_STATUS_NO_VGPRS_SHIFT = 24
var SQ_WAVE_STATUS_IN_WG_SHIFT = 11
// Bits removed from the saved STATE_PRIV copy on trap entry.
var SQ_WAVE_STATE_PRIV_ALWAYS_CLEAR_MASK = SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK|SQ_WAVE_STATE_PRIV_POISON_ERR_MASK
// Trap ID bits in the high dword of the saved PC (ttmp1); non-zero means a trap instruction was the cause.
var S_SAVE_PC_HI_TRAP_ID_MASK = 0xF0000000

// LDS_ALLOC / GPR_ALLOC field positions.
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 8
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 12

// log2 of the LDS allocation unit in bytes; LDS_SIZE is shifted left by this amount.
#if ASIC_FAMILY < CHIP_GC_12_0_3
var SQ_WAVE_LDS_ALLOC_GRANULARITY = 9
#else
var SQ_WAVE_LDS_ALLOC_GRANULARITY = 10
#endif

// EXCP_FLAG_PRIV and TRAP_CTRL field positions and masks.
var SQ_WAVE_EXCP_FLAG_PRIV_ADDR_WATCH_MASK = 0xF
var SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_SHIFT = 4
var SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_MASK = 0x10
var SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT = 5
var SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_MASK = 0x20
var SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_MASK = 0x40
var SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT = 6
var SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK = 0x80
var SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_SHIFT = 7
var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_MASK = 0x100
var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT = 8
var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_MASK = 0x200
var SQ_WAVE_EXCP_FLAG_PRIV_TRAP_AFTER_INST_MASK = 0x800
var SQ_WAVE_TRAP_CTRL_ADDR_WATCH_MASK = 0x80
var SQ_WAVE_TRAP_CTRL_TRAP_AFTER_INST_MASK = 0x200

// Exceptions that are forwarded to the second-level handler without consulting TRAP_CTRL.
var SQ_WAVE_EXCP_FLAG_PRIV_NON_MASKABLE_EXCP_MASK = SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_MASK |\
	SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_MASK |\
	SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK |\
	SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_MASK |\
	SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_MASK |\
	SQ_WAVE_EXCP_FLAG_PRIV_TRAP_AFTER_INST_MASK
// EXCP_FLAG_PRIV is described as three bit ranges that skip SAVE_CONTEXT (bit 5) and HOST_TRAP (bit 7).
var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_1_SIZE = SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT
var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SHIFT = SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT
var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SIZE = SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_SHIFT - SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT
var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT = SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT
var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SIZE = 32 - SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT

var SQ_WAVE_SCHED_MODE_DEP_MODE_SHIFT = 0
var SQ_WAVE_SCHED_MODE_DEP_MODE_SIZE = 2

var BARRIER_STATE_SIGNAL_OFFSET = 16
var BARRIER_STATE_SIGNAL_SIZE = 7
var BARRIER_STATE_MEMBER_OFFSET = 4
var BARRIER_STATE_MEMBER_SIZE = 7
var BARRIER_STATE_VALID_OFFSET = 0

var NAMED_BARRIERS_SR_OFFSET_FROM_HWREG = 0x80
var S_BARRIER_INIT_MEMBERCNT_MASK = 0x7F0000
var S_BARRIER_INIT_MEMBERCNT_SHIFT = 0x10

var SQ_WAVE_XNACK_STATE_PRIV_FIRST_REPLAY_SHIFT = 18
var SQ_WAVE_XNACK_STATE_PRIV_FIRST_REPLAY_SIZE = 1
var SQ_WAVE_XNACK_STATE_PRIV_REPLAY_W64H_SHIFT = 16
var SQ_WAVE_XNACK_STATE_PRIV_REPLAY_W64H_SIZE = 1
var SQ_WAVE_XNACK_STATE_PRIV_FXPTR_SHIFT = 0
var SQ_WAVE_XNACK_STATE_PRIV_FXPTR_SIZE = 7

#if HAVE_BANKED_VGPRS
var SQ_WAVE_MODE_DST_SRC0_SRC1_VGPR_MSB_SHIFT = 12
var SQ_WAVE_MODE_DST_SRC0_SRC1_VGPR_MSB_SIZE = 6
#endif

// Fields packed into ttmp11.
// TTMP11_SCHED_MODE_* is defined exactly once, unconditionally; it is only
// referenced by code guarded with RELAXED_SCHEDULING_IN_TRAP.
var TTMP11_SCHED_MODE_SHIFT = 26
var TTMP11_SCHED_MODE_SIZE = 2
var TTMP11_SCHED_MODE_MASK = 0xC000000
var TTMP11_DEBUG_TRAP_ENABLED_SHIFT = 23
var TTMP11_DEBUG_TRAP_ENABLED_MASK = 0x800000
var TTMP11_FIRST_REPLAY_SHIFT = 22
var TTMP11_FIRST_REPLAY_MASK = 0x400000
var TTMP11_REPLAY_W64H_SHIFT = 21
var TTMP11_REPLAY_W64H_MASK = 0x200000
var TTMP11_FXPTR_SHIFT = 14
var TTMP11_FXPTR_MASK = 0x1FC000

// SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14]
// when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000
var S_SAVE_BUF_RSRC_WORD3_MISC = 0x10807FAC
var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000
var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26

// The first_wave flag is parked in bit 31 of s_save_pc_hi during the save.
var S_SAVE_PC_HI_FIRST_WAVE_MASK = 0x80000000
var S_SAVE_PC_HI_FIRST_WAVE_SHIFT = 31

#if HAVE_BANKED_VGPRS
// The VGPR MSB (bank select) field of MODE is parked in s_save_pc_hi[30:25] during the save.
var S_SAVE_PC_HI_DST_SRC0_SRC1_VGPR_MSB_SHIFT = 25
var S_SAVE_PC_HI_DST_SRC0_SRC1_VGPR_MSB_SIZE = 6
#endif

var s_sgpr_save_num = 108

// Register aliases used on the save path.
// Several names share one ttmp, so their live ranges must not overlap:
//   s_save_xnack_mask                     aliases s_save_exec_hi        (ttmp3)
//   s_save_alloc_size and s_save_ttmps_hi alias   s_save_excp_flag_priv (ttmp15)
//   s_save_ttmps_lo                       aliases s_save_tmp            (ttmp14)
var s_save_spi_init_lo = exec_lo
var s_save_spi_init_hi = exec_hi
var s_save_pc_lo = ttmp0
var s_save_pc_hi = ttmp1
var s_save_exec_lo = ttmp2
var s_save_exec_hi = ttmp3
var s_save_state_priv = ttmp12
var s_save_excp_flag_priv = ttmp15
var s_save_xnack_mask = s_save_exec_hi
var s_wave_size = ttmp7
var s_save_base_addr_lo = ttmp8
var s_save_base_addr_hi = ttmp9
var s_save_addr_lo = ttmp10
var s_save_addr_hi = ttmp11
var s_save_mem_offset = ttmp4
var s_save_alloc_size = s_save_excp_flag_priv
var s_save_tmp = ttmp14
var s_save_m0 = ttmp5
var s_save_ttmps_lo = s_save_tmp
var s_save_ttmps_hi = s_save_excp_flag_priv

var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000
var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26
// Bit of s_wave_size / s_restore_size that the save and restore code tests
// (shift right by S_WAVE_SIZE, AND with 1) to select the wave64 path.
var S_WAVE_SIZE = 25

// Register aliases used on the restore path.
// Shared registers:
//   s_restore_mem_offset_save, s_restore_flat_scratch and s_restore_ttmps_lo alias s_restore_tmp        (ttmp2)
//   s_restore_m0 and s_restore_ttmps_hi                                      alias s_restore_alloc_size (ttmp3)
//   s_restore_spi_init_hi_save                                               aliases s_restore_exec_hi  (ttmp5)
var s_restore_spi_init_lo = exec_lo
var s_restore_spi_init_hi = exec_hi
var s_restore_mem_offset = ttmp12
var s_restore_alloc_size = ttmp3
var s_restore_tmp = ttmp2
var s_restore_mem_offset_save = s_restore_tmp
var s_restore_m0 = s_restore_alloc_size
var s_restore_mode = ttmp7
var s_restore_flat_scratch = s_restore_tmp
var s_restore_pc_lo = ttmp0
var s_restore_pc_hi = ttmp1
var s_restore_exec_lo = ttmp4
var s_restore_exec_hi = ttmp5
var s_restore_state_priv = ttmp14
var s_restore_excp_flag_priv = ttmp15
var s_restore_xnack_mask = ttmp13
var s_restore_base_addr_lo = ttmp8
var s_restore_base_addr_hi = ttmp9
var s_restore_addr_lo = ttmp10
var s_restore_addr_hi = ttmp11
var s_restore_size = ttmp6
var s_restore_ttmps_lo = s_restore_tmp
var s_restore_ttmps_hi = s_restore_alloc_size
var s_restore_spi_init_hi_save = s_restore_exec_hi

// Byte offset of the ttmp save slot, added to size(VGPR)+size(SGPR) on the save path.
#if SAVE_TTMPS_IN_SGPR_BLOCK
var TTMP_SR_OFFSET_FROM_HWREG = -0x40
#else
var TTMP_SR_OFFSET_FROM_HWREG = 0x40
#endif

shader main
	asic(DEFAULT)
	type(CS)
	wave_size(32)

	s_branch	L_SKIP_RESTORE  //NOT restore. might be a regular trap or save

L_JUMP_TO_RESTORE:
	s_branch	L_RESTORE

L_SKIP_RESTORE:
#if RELAXED_SCHEDULING_IN_TRAP
	// Assume most relaxed scheduling mode is set. Save and revert to normal mode.
// Read the current SCHED_MODE into ttmp2, then force its DEP_MODE field (bits [1:0]) to 0.
	s_getreg_b32	ttmp2, hwreg(HW_REG_WAVE_SCHED_MODE)
	s_wait_alu	0
	s_setreg_imm32_b32	hwreg(HW_REG_WAVE_SCHED_MODE, \
		SQ_WAVE_SCHED_MODE_DEP_MODE_SHIFT, SQ_WAVE_SCHED_MODE_DEP_MODE_SIZE), 0
#endif

	s_getreg_b32	s_save_state_priv, hwreg(HW_REG_WAVE_STATE_PRIV)  //save STATE_PRIV since we will change SCC

#if RELAXED_SCHEDULING_IN_TRAP
	// Save SCHED_MODE[1:0] into ttmp11[27:26].
	s_andn2_b32	ttmp11, ttmp11, TTMP11_SCHED_MODE_MASK
	s_lshl_b32	ttmp2, ttmp2, TTMP11_SCHED_MODE_SHIFT
	s_or_b32	ttmp11, ttmp11, ttmp2
#endif

	// Clear SYS_PRIO: do not save with elevated priority.
	// Clear POISON_ERR: prevents SQC store and triggers FATAL_HALT if setreg'd.
	s_andn2_b32	s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_ALWAYS_CLEAR_MASK

	s_getreg_b32	s_save_excp_flag_priv, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV)

	// SCC = 1 if the saved STATE_PRIV has HALT set.
	s_and_b32	ttmp2, s_save_state_priv, SQ_WAVE_STATE_PRIV_HALT_MASK
	s_cbranch_scc0	L_NOT_HALTED

L_HALTED:
	// Host trap may occur while wave is halted.
	s_and_b32	ttmp2, s_save_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK
	s_cbranch_scc1	L_FETCH_2ND_TRAP

L_CHECK_SAVE:
	s_and_b32	ttmp2, s_save_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_MASK
	s_cbranch_scc1	L_SAVE

	// Wave is halted but neither host trap nor SAVECTX is raised.
	// Caused by instruction fetch memory violation.
	// Spin wait until context saved to prevent interrupt storm.
	// EXCP_FLAG_PRIV is re-read on every iteration so a later SAVECTX is seen.
	s_sleep		0x10
	s_getreg_b32	s_save_excp_flag_priv, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV)
	s_branch	L_CHECK_SAVE

L_NOT_HALTED:
	// Let second-level handle non-SAVECTX exception or trap.
	// Any concurrent SAVECTX will be handled upon re-entry once halted.

	// Check non-maskable exceptions. memory_violation, illegal_instruction
	// and xnack_error exceptions always cause the wave to enter the trap
	// handler.
	s_and_b32	ttmp2, s_save_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_NON_MASKABLE_EXCP_MASK
	s_cbranch_scc1	L_FETCH_2ND_TRAP

	// Check for maskable exceptions in EXCP_FLAG_USER.
	// Maskable exceptions only cause the wave to enter the trap handler if
	// their respective bit in TRAP_CTRL is set.
	// Address watch is flagged in EXCP_FLAG_PRIV; if any watch bit is set it is
	// folded into ttmp2 at the TRAP_CTRL.ADDR_WATCH position before the AND below.
	s_getreg_b32	ttmp2, hwreg(HW_REG_WAVE_EXCP_FLAG_USER)
	s_and_b32	ttmp3, s_save_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_ADDR_WATCH_MASK
	s_cbranch_scc0	L_NOT_ADDR_WATCH
	s_or_b32	ttmp2, ttmp2, SQ_WAVE_TRAP_CTRL_ADDR_WATCH_MASK

L_NOT_ADDR_WATCH:
	// ttmp3 = TRAP_CTRL; SCC = 1 if any raised exception is also enabled.
	s_getreg_b32	ttmp3, hwreg(HW_REG_WAVE_TRAP_CTRL)
	s_and_b32	ttmp2, ttmp3, ttmp2
	s_cbranch_scc1	L_FETCH_2ND_TRAP

L_CHECK_TRAP_ID:
	// Check trap_id != 0
	s_and_b32	ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK
	s_cbranch_scc1	L_FETCH_2ND_TRAP

#if SINGLE_STEP_MISSED_WORKAROUND
	// Prioritize single step exception over context save.
	// Second-level trap will halt wave and RFE, re-entering for SAVECTX.
	// WAVE_TRAP_CTRL is already in ttmp3.
	s_and_b32	ttmp3, ttmp3, SQ_WAVE_TRAP_CTRL_TRAP_AFTER_INST_MASK
	s_cbranch_scc1	L_FETCH_2ND_TRAP
#endif

	s_and_b32	ttmp2, s_save_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_MASK
	s_cbranch_scc1	L_SAVE

	// No cause was identified above: fall through to the second-level fetch.
L_FETCH_2ND_TRAP:
#if HAVE_XNACK
	save_and_clear_xnack_state_priv(ttmp14)
#endif

	// Read second-level TBA/TMA from first-level TMA and jump if available.
	// ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data)
	// ttmp12 holds the saved STATE_PRIV (s_save_state_priv)
	// The value returned by MSG_RTN_GET_TMA is shifted left by 8 before being
	// used as the base address of the loads below.
	s_sendmsg_rtn_b64	[ttmp14, ttmp15], sendmsg(MSG_RTN_GET_TMA)
	s_wait_idle
	s_lshl_b64	[ttmp14, ttmp15], [ttmp14, ttmp15], 0x8

	// Sign-extend from the highest implemented address bit into the upper bits of ttmp15.
	s_bitcmp1_b32	ttmp15, (ADDRESS_HI32_NUM_BITS - 1)
	s_cbranch_scc0	L_NO_SIGN_EXTEND_TMA
	s_or_b32	ttmp15, ttmp15, ~ADDRESS_HI32_MASK
L_NO_SIGN_EXTEND_TMA:
#if RELAXED_SCHEDULING_IN_TRAP
	// Move SCHED_MODE[1:0] from ttmp11 to unused bits in ttmp1[27:26] (return PC_HI).
	// The second-level trap will restore from ttmp1 for backwards compatibility.
	s_and_b32	ttmp2, ttmp11, TTMP11_SCHED_MODE_MASK
	s_andn2_b32	ttmp1, ttmp1, TTMP11_SCHED_MODE_MASK
	s_or_b32	ttmp1, ttmp1, ttmp2
#endif

	// The loaded dword is shifted to bit 23 and replaces ttmp11.DEBUG_TRAP_ENABLED.
	s_load_dword	ttmp2, [ttmp14, ttmp15], 0x10 scope:SCOPE_SYS  // debug trap enabled flag
	s_wait_idle
	s_lshl_b32	ttmp2, ttmp2, TTMP11_DEBUG_TRAP_ENABLED_SHIFT
	s_andn2_b32	ttmp11, ttmp11, TTMP11_DEBUG_TRAP_ENABLED_MASK
	s_or_b32	ttmp11, ttmp11, ttmp2

	s_load_dwordx2	[ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 scope:SCOPE_SYS  // second-level TBA
	s_wait_idle
	s_load_dwordx2	[ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 scope:SCOPE_SYS  // second-level TMA
	s_wait_idle

	// AND of the TBA with itself sets SCC = (TBA != 0).
	s_and_b64	[ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3]
	s_cbranch_scc0	L_NO_NEXT_TRAP  // second-level trap handler not been set
	s_setpc_b64	[ttmp2, ttmp3]  // jump to second-level trap handler

L_NO_NEXT_TRAP:
	// If not caused by trap then halt wave to prevent re-entry.
	s_and_b32	ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK
	s_cbranch_scc1	L_TRAP_CASE

	// Host trap will not cause trap re-entry.
	s_getreg_b32	ttmp2, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV)
	s_and_b32	ttmp2, ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK
	s_cbranch_scc1	L_EXIT_TRAP
	s_or_b32	s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_HALT_MASK

	// If the PC points to S_ENDPGM then context save will fail if STATE_PRIV.HALT is set.
	// Rewind the PC to prevent this from occurring.
	s_sub_u32	ttmp0, ttmp0, 0x8
	s_subb_u32	ttmp1, ttmp1, 0x0

	s_branch	L_EXIT_TRAP

L_TRAP_CASE:
	// Advance past trap instruction to prevent re-entry.
	s_add_u32	ttmp0, ttmp0, 0x4
	s_addc_u32	ttmp1, ttmp1, 0x0

L_EXIT_TRAP:
	// Keep only the implemented address bits of PC_HI; this drops the trap ID
	// and any flags parked in the upper bits of ttmp1.
	s_and_b32	ttmp1, ttmp1, ADDRESS_HI32_MASK

#if HAVE_INSTRUCTION_FIXUP
	s_getreg_b32	s_save_excp_flag_priv, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV)
	fixup_instruction()
#endif

#if HAVE_XNACK
	restore_xnack_state_priv(s_save_tmp)
#endif

	// Restore SQ_WAVE_STATUS.
	s_and_b64	exec, exec, exec  // Restore STATUS.EXECZ, not writable by s_setreg_b32
	s_and_b64	vcc, vcc, vcc  // Restore STATUS.VCCZ, not writable by s_setreg_b32

	// STATE_PRIV.*BARRIER_COMPLETE may have changed since we read it.
	// Only restore fields which the trap handler changes.
	// The saved value is shifted down so that its bit 0 lines up with the SCC
	// field; the s_setreg_b32 below writes the SCC..POISON_ERR range only.
	s_lshr_b32	s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_SCC_SHIFT

#if RELAXED_SCHEDULING_IN_TRAP
	// Assume relaxed scheduling mode after this point.
	restore_sched_mode(ttmp2)
#endif

	s_setreg_b32	hwreg(HW_REG_WAVE_STATE_PRIV, SQ_WAVE_STATE_PRIV_SCC_SHIFT, \
		SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT - SQ_WAVE_STATE_PRIV_SCC_SHIFT + 1), s_save_state_priv

	s_rfe_b64	[ttmp0, ttmp1]

L_SAVE:
	// If VGPRs have been deallocated then terminate the wavefront.
	// It has no remaining program to run and cannot save without VGPRs.
// Test STATUS.NO_VGPRS; the wave ends here if the bit is set.
	s_getreg_b32	s_save_tmp, hwreg(HW_REG_WAVE_STATUS)
	s_bitcmp1_b32	s_save_tmp, SQ_WAVE_STATUS_NO_VGPRS_SHIFT
	s_cbranch_scc0	L_HAVE_VGPRS
	s_endpgm
L_HAVE_VGPRS:
	// Strip everything above the implemented address bits from PC_HI so the
	// upper bits can hold save-path flags (first_wave, VGPR MSB).
	s_and_b32	s_save_pc_hi, s_save_pc_hi, ADDRESS_HI32_MASK
	s_mov_b32	s_save_tmp, 0
	s_setreg_b32	hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV, SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT, 1), s_save_tmp  //clear saveCtx bit

#if HAVE_XNACK
	save_and_clear_xnack_state_priv(s_save_tmp)
#endif

#if HAVE_INSTRUCTION_FIXUP
	fixup_instruction()
#endif

	/* inform SPI the readiness and wait for SPI's go signal */
	s_mov_b32	s_save_exec_lo, exec_lo  //save EXEC and use EXEC for the go signal from SPI
	s_mov_b32	s_save_exec_hi, exec_hi
	s_mov_b64	exec, 0x0  //clear EXEC to get ready to receive

	// The message reply lands in exec_lo/exec_hi, which the save path reads
	// through the s_save_spi_init_lo/hi aliases.
	s_sendmsg_rtn_b64	[exec_lo, exec_hi], sendmsg(MSG_RTN_SAVE_WAVE)
	s_wait_idle

	// Save first_wave flag so we can clear high bits of save address.
	s_and_b32	s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK
	s_lshl_b32	s_save_tmp, s_save_tmp, (S_SAVE_PC_HI_FIRST_WAVE_SHIFT - S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT)
	s_or_b32	s_save_pc_hi, s_save_pc_hi, s_save_tmp

#if HAVE_XNACK
	// s_save_xnack_mask aliases s_save_exec_hi (ttmp3).
	s_getreg_b32	s_save_xnack_mask, hwreg(HW_REG_WAVE_XNACK_MASK)
	s_setreg_imm32_b32	hwreg(HW_REG_WAVE_XNACK_MASK), 0
#endif

#if HAVE_BANKED_VGPRS
	// Save and clear shader's DST/SRC0/SRC1 VGPR bank selection so we can use v[0-255].
	// The field is parked in s_save_pc_hi[30:25] until the MODE register is written out.
	s_getreg_b32	s_save_tmp, hwreg(HW_REG_WAVE_MODE, SQ_WAVE_MODE_DST_SRC0_SRC1_VGPR_MSB_SHIFT, SQ_WAVE_MODE_DST_SRC0_SRC1_VGPR_MSB_SIZE)
	s_lshl_b32	s_save_tmp, s_save_tmp, S_SAVE_PC_HI_DST_SRC0_SRC1_VGPR_MSB_SHIFT
	s_or_b32	s_save_pc_hi, s_save_pc_hi, s_save_tmp
	s_mov_b32	s_save_tmp, 0
	s_setreg_b32	hwreg(HW_REG_WAVE_MODE, SQ_WAVE_MODE_DST_SRC0_SRC1_VGPR_MSB_SHIFT, SQ_WAVE_MODE_DST_SRC0_SRC1_VGPR_MSB_SIZE), s_save_tmp
#endif

	// Trap temporaries must be saved via VGPR but all VGPRs are in use.
	// There is no ttmp space to hold the resource constant for VGPR save.
	// Save v0 by itself since it requires only two SGPRs.
	// s_save_ttmps_lo/hi hold the SPI-provided address (hi masked to the
	// implemented address bits) while EXEC is temporarily all ones; EXEC is
	// then reloaded from that pair.
	s_mov_b32	s_save_ttmps_lo, exec_lo
	s_and_b32	s_save_ttmps_hi, exec_hi, ADDRESS_HI32_MASK
	s_mov_b32	exec_lo, 0xFFFFFFFF
	s_mov_b32	exec_hi, 0xFFFFFFFF
	global_store_dword_addtid	v0, [s_save_ttmps_lo, s_save_ttmps_hi] scope:SCOPE_SYS
	v_mov_b32	v0, 0x0
	s_mov_b32	exec_lo, s_save_ttmps_lo
	s_mov_b32	exec_hi, s_save_ttmps_hi

	// Save trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
	// ttmp SR memory offset:
	// - gfx12: size(VGPR)+size(SGPR)+0x40
	// - gfx12.5: size(VGPR)+size(SGPR)-0x40
	// NOTE(review): get_wave_size2, get_vgpr_size_bytes and get_sgpr_size_bytes
	// are defined outside this section; their exact outputs are not visible here.
	get_wave_size2(s_save_ttmps_hi)
	get_vgpr_size_bytes(s_save_ttmps_lo, s_save_ttmps_hi)
	s_and_b32	s_save_ttmps_hi, s_save_spi_init_hi, ADDRESS_HI32_MASK
	s_add_u32	s_save_ttmps_lo, s_save_ttmps_lo, (get_sgpr_size_bytes() + TTMP_SR_OFFSET_FROM_HWREG)
	s_add_u32	s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo
	s_addc_u32	s_save_ttmps_hi, s_save_ttmps_hi, 0x0

	// ttmpN goes to lane N of v0. Lanes 0xE/0xF carry exec_lo/exec_hi so that
	// EXEC can be recovered after it is overwritten for the store below.
	v_writelane_b32	v0, ttmp4, 0x4
	v_writelane_b32	v0, ttmp5, 0x5
	v_writelane_b32	v0, ttmp6, 0x6
	v_writelane_b32	v0, ttmp7, 0x7
	v_writelane_b32	v0, ttmp8, 0x8
	v_writelane_b32	v0, ttmp9, 0x9
	v_writelane_b32	v0, ttmp10, 0xA
	v_writelane_b32	v0, ttmp11, 0xB
	v_writelane_b32	v0, ttmp13, 0xD
	v_writelane_b32	v0, exec_lo, 0xE
	v_writelane_b32	v0, exec_hi, 0xF

	// Store lanes 0-13 only (EXEC = 0x3FFF); lanes 0xE/0xF are not written to memory.
	s_mov_b32	exec_lo, 0x3FFF
	s_mov_b32	exec_hi, 0x0
	global_store_dword_addtid	v0, [s_save_ttmps_lo, s_save_ttmps_hi] scope:SCOPE_SYS
	v_readlane_b32	ttmp14, v0, 0xE
	v_readlane_b32	ttmp15, v0, 0xF
	s_mov_b32	exec_lo, ttmp14
	s_mov_b32	exec_hi, ttmp15

	// From here on the save base address lives in ttmp8/ttmp9 and the
	// shader's m0 in s_save_m0.
	s_mov_b32	s_save_base_addr_lo, s_save_spi_init_lo
	s_and_b32	s_save_base_addr_hi, s_save_spi_init_hi, ADDRESS_HI32_MASK
	s_mov_b32	s_save_m0, m0

	get_wave_size2(s_wave_size)

	/* save first 4 VGPRs, needed for SGPR save */
// Enable all lanes: exec_lo is always all ones; exec_hi is all ones only when
// bit S_WAVE_SIZE of s_wave_size is set (wave64 path), otherwise zero.
	s_mov_b32	exec_lo, 0xFFFFFFFF  //need every thread from now on
	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_ENABLE_SAVE_4VGPR_EXEC_HI
	s_mov_b32	exec_hi, 0x00000000
	s_branch	L_SAVE_4VGPR_WAVE32
L_ENABLE_SAVE_4VGPR_EXEC_HI:
	s_mov_b32	exec_hi, 0xFFFFFFFF
	s_branch	L_SAVE_4VGPR_WAVE64
L_SAVE_4VGPR_WAVE32:
	// VGPR Allocated in 4-GPR granularity
	// v1-v3 go to consecutive 128-byte slots after the slot at offset 0
	// (v0 itself is stored earlier in the save path).
	global_store_addtid_b32	v1, [s_save_base_addr_lo, s_save_base_addr_hi] scope:SCOPE_SYS offset:128
	global_store_addtid_b32	v2, [s_save_base_addr_lo, s_save_base_addr_hi] scope:SCOPE_SYS offset:128*2
	global_store_addtid_b32	v3, [s_save_base_addr_lo, s_save_base_addr_hi] scope:SCOPE_SYS offset:128*3
	s_branch	L_SAVE_HWREG

L_SAVE_4VGPR_WAVE64:
	// VGPR Allocated in 4-GPR granularity
	// Same layout as the wave32 case with 256-byte slots.
	global_store_addtid_b32	v1, [s_save_base_addr_lo, s_save_base_addr_hi] scope:SCOPE_SYS offset:256
	global_store_addtid_b32	v2, [s_save_base_addr_lo, s_save_base_addr_hi] scope:SCOPE_SYS offset:256*2
	global_store_addtid_b32	v3, [s_save_base_addr_lo, s_save_base_addr_hi] scope:SCOPE_SYS offset:256*3

	/* save HW registers */

L_SAVE_HWREG:
	// HWREG SR memory offset : size(VGPR)+size(SGPR)
	get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
	s_add_u32	s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()

	v_mov_b32	v0, 0x0  //Offset[31:0] from buffer resource
	v_mov_b32	v1, 0x0  //Offset[63:32] from buffer resource
	v_mov_b32	v2, 0x0  //Set of SGPRs for TCP store
	s_mov_b32	m0, 0x0  //Next lane of v2 to write to

	// First saved value is the shader's m0 (captured in s_save_m0 earlier).
	// NOTE(review): write_hwreg_to_v2 is defined outside this section; it
	// presumably writes its argument to lane m0 of v2 and advances m0 - confirm.
	write_hwreg_to_v2(s_save_m0)

	// Ensure no further changes to barrier or LDS state.
	// STATE_PRIV.*BARRIER_COMPLETE may change up to this point.
	wait_trap_barriers(s_save_tmp, s_save_m0, 1)

	// Re-read final state of *BARRIER_COMPLETE fields for save.
// Replace the *BARRIER_COMPLETE bits of the saved STATE_PRIV copy with their current hardware values.
	s_getreg_b32	s_save_tmp, hwreg(HW_REG_WAVE_STATE_PRIV)
	s_and_b32	s_save_tmp, s_save_tmp, SQ_WAVE_STATE_PRIV_ALL_BARRIER_COMPLETE_MASK
	s_andn2_b32	s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_ALL_BARRIER_COMPLETE_MASK
	s_or_b32	s_save_state_priv, s_save_state_priv, s_save_tmp

	// Values are appended to v2 in this order (after m0, written before this
	// point): PC_LO, PC_HI (address bits only), EXEC_LO, EXEC_HI (0 on
	// wave32-only parts), STATE_PRIV, EXCP_FLAG_PRIV, XNACK_MASK (0 without
	// XNACK), MODE, SCRATCH_BASE_LO, SCRATCH_BASE_HI, EXCP_FLAG_USER,
	// TRAP_CTRL, STATUS, barrier state, then the optional cluster barrier
	// state and SCHED_MODE.
	write_hwreg_to_v2(s_save_pc_lo)
	s_and_b32	s_save_tmp, s_save_pc_hi, ADDRESS_HI32_MASK
	write_hwreg_to_v2(s_save_tmp)
	write_hwreg_to_v2(s_save_exec_lo)
#if WAVE32_ONLY
	s_mov_b32	s_save_tmp, 0
	write_hwreg_to_v2(s_save_tmp)
#else
	write_hwreg_to_v2(s_save_exec_hi)
#endif
	write_hwreg_to_v2(s_save_state_priv)

	s_getreg_b32	s_save_tmp, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV)
	write_hwreg_to_v2(s_save_tmp)

#if HAVE_XNACK
	write_hwreg_to_v2(s_save_xnack_mask)
#else
	s_mov_b32	s_save_tmp, 0
	write_hwreg_to_v2(s_save_tmp)
#endif

	// s_save_m0 is reused as scratch from here on; the shader's m0 has already been written to v2.
	s_getreg_b32	s_save_m0, hwreg(HW_REG_WAVE_MODE)

#if HAVE_BANKED_VGPRS
	// The VGPR MSB field was cleared in the MODE register earlier in the save
	// path; merge the copy parked in s_save_pc_hi back into the saved MODE value.
	s_bfe_u32	s_save_tmp, s_save_pc_hi, (S_SAVE_PC_HI_DST_SRC0_SRC1_VGPR_MSB_SHIFT | (S_SAVE_PC_HI_DST_SRC0_SRC1_VGPR_MSB_SIZE << 0x10))
	s_lshl_b32	s_save_tmp, s_save_tmp, SQ_WAVE_MODE_DST_SRC0_SRC1_VGPR_MSB_SHIFT
	s_or_b32	s_save_m0, s_save_m0, s_save_tmp
#endif

	write_hwreg_to_v2(s_save_m0)

	s_getreg_b32	s_save_m0, hwreg(HW_REG_WAVE_SCRATCH_BASE_LO)
	write_hwreg_to_v2(s_save_m0)

	s_getreg_b32	s_save_m0, hwreg(HW_REG_WAVE_SCRATCH_BASE_HI)
	write_hwreg_to_v2(s_save_m0)

	s_getreg_b32	s_save_m0, hwreg(HW_REG_WAVE_EXCP_FLAG_USER)
	write_hwreg_to_v2(s_save_m0)

	s_getreg_b32	s_save_m0, hwreg(HW_REG_WAVE_TRAP_CTRL)
	write_hwreg_to_v2(s_save_m0)

	s_getreg_b32	s_save_tmp, hwreg(HW_REG_WAVE_STATUS)
	write_hwreg_to_v2(s_save_tmp)

	// Barrier state for barrier ID -1; the result is consumed only after s_wait_kmcnt.
	s_get_barrier_state s_save_tmp, -1
	s_wait_kmcnt (0)
	write_hwreg_to_v2(s_save_tmp)

#if HAVE_CLUSTER_BARRIER
	s_sendmsg_rtn_b32	s_save_tmp, sendmsg(MSG_RTN_GET_CLUSTER_BARRIER_STATE)
	s_wait_kmcnt 0
	write_hwreg_to_v2(s_save_tmp)
#endif

#if ASIC_FAMILY >= CHIP_GC_12_0_3
	s_getreg_b32	s_save_tmp, hwreg(HW_REG_WAVE_SCHED_MODE)
	write_hwreg_to_v2(s_save_tmp)
#endif

#if ! SAVE_TTMPS_IN_SGPR_BLOCK
	// Write HWREGs with 16 VGPR lanes. TTMPs occupy space after this.
	s_mov_b32	exec_lo, 0xFFFF
#else
	// All 128 bytes are available for HWREGs.
	s_mov_b32	exec_lo, 0xFFFFFFFF
#endif
	s_mov_b32	exec_hi, 0x0
	// s_save_addr = save base + HWREG offset; this pair is reused by the named-barrier store below.
	s_add_u32	s_save_addr_lo, s_save_base_addr_lo, s_save_mem_offset
	s_addc_u32	s_save_addr_hi, s_save_base_addr_hi, 0x0
	global_store_addtid_b32	v2, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS

	// Write SGPRs with 32 VGPR lanes. This works in wave32 and wave64 mode.
	s_mov_b32	exec_lo, 0xFFFFFFFF

#if NUM_NAMED_BARRIERS
	// State of named barrier (bar_idx + 1) goes to lane bar_idx of v2; the
	// vector is stored NAMED_BARRIERS_SR_OFFSET_FROM_HWREG bytes after the HWREG block.
	v_mov_b32	v2, 0

	for var bar_idx = 0; bar_idx < NUM_NAMED_BARRIERS; bar_idx ++
		s_get_barrier_state	s_save_tmp, (bar_idx + 1)
		s_wait_kmcnt	0
		v_writelane_b32	v2, s_save_tmp, bar_idx
	end

	global_store_addtid_b32	v2, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS offset:NAMED_BARRIERS_SR_OFFSET_FROM_HWREG
#endif

	/* save SGPRs */
	// Save SGPR before LDS save, then the s0 to s4 can be used during LDS save...
// SGPR save: s[m0 .. m0+15] are copied down into s0-s15 and appended to v2;
// v2 is stored each time ttmp13 reaches 32.

	// SGPR SR memory offset : size(VGPR)
	get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)

	s_mov_b32	ttmp13, 0x0  //next VGPR lane to copy SGPR into

	s_mov_b32	m0, 0x0  //SGPR initial index value =0
	s_nop		0x0  //Manually inserted wait states
L_SAVE_SGPR_LOOP:
	// SGPR is allocated in 16 SGPR granularity
	s_movrels_b64	s0, s0  //s0 = s[0+m0], s1 = s[1+m0]
	s_movrels_b64	s2, s2  //s2 = s[2+m0], s3 = s[3+m0]
	s_movrels_b64	s4, s4  //s4 = s[4+m0], s5 = s[5+m0]
	s_movrels_b64	s6, s6  //s6 = s[6+m0], s7 = s[7+m0]
	s_movrels_b64	s8, s8  //s8 = s[8+m0], s9 = s[9+m0]
	s_movrels_b64	s10, s10  //s10 = s[10+m0], s11 = s[11+m0]
	s_movrels_b64	s12, s12  //s12 = s[12+m0], s13 = s[13+m0]
	s_movrels_b64	s14, s14  //s14 = s[14+m0], s15 = s[15+m0]

	// NOTE(review): write_16sgpr_to_v2 is defined outside this section; it
	// presumably writes s0-s15 into v2 starting at lane ttmp13 and advances
	// ttmp13 by 16 - confirm.
	write_16sgpr_to_v2(s0)

	s_cmp_eq_u32	ttmp13, 0x20  //have 32 VGPR lanes filled?
	s_cbranch_scc0	L_SAVE_SGPR_SKIP_TCP_STORE

	// Flush 32 dwords (0x80 bytes) of SGPR data and start a fresh v2.
	s_add_u32	s_save_addr_lo, s_save_base_addr_lo, s_save_mem_offset
	s_addc_u32	s_save_addr_hi, s_save_base_addr_hi, 0x0
	global_store_addtid_b32	v2, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS
	s_add_u32	s_save_mem_offset, s_save_mem_offset, 0x80
	s_mov_b32	ttmp13, 0x0
	v_mov_b32	v2, 0x0
L_SAVE_SGPR_SKIP_TCP_STORE:

	s_add_u32	m0, m0, 16  //next sgpr index
	s_cmp_lt_u32	m0, 96  //scc = (m0 < first 96 SGPR) ? 1 : 0
	s_cbranch_scc1	L_SAVE_SGPR_LOOP  //first 96 SGPR save is complete?

	//save the rest 12 SGPR
	// m0 is 96 here, so these moves fetch s96-s107.
	s_movrels_b64	s0, s0  //s0 = s[0+m0], s1 = s[1+m0]
	s_movrels_b64	s2, s2  //s2 = s[2+m0], s3 = s[3+m0]
	s_movrels_b64	s4, s4  //s4 = s[4+m0], s5 = s[5+m0]
	s_movrels_b64	s6, s6  //s6 = s[6+m0], s7 = s[7+m0]
	s_movrels_b64	s8, s8  //s8 = s[8+m0], s9 = s[9+m0]
	s_movrels_b64	s10, s10  //s10 = s[10+m0], s11 = s[11+m0]
	write_12sgpr_to_v2(s0)

#if SAVE_TTMPS_IN_SGPR_BLOCK
	// Last 16 dwords of the SGPR block already contain the TTMPS. Make
	// sure to not override them.
// Limit the final SGPR store to lanes 0-15 when the ttmps share the SGPR block.
	s_mov_b32	exec_lo, 0xFFFF
#endif
	// Store the last (partial) vector of SGPRs.
	s_add_u32	s_save_addr_lo, s_save_base_addr_lo, s_save_mem_offset
	s_addc_u32	s_save_addr_hi, s_save_base_addr_hi, 0x0
	global_store_addtid_b32	v2, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS

	/* save LDS */

L_SAVE_LDS:
	// Change EXEC to all threads...
	s_mov_b32	exec_lo, 0xFFFFFFFF  //need every thread from now on
	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_ENABLE_SAVE_LDS_EXEC_HI
	s_mov_b32	exec_hi, 0x00000000
	s_branch	L_SAVE_LDS_NORMAL
L_ENABLE_SAVE_LDS_EXEC_HI:
	s_mov_b32	exec_hi, 0xFFFFFFFF
L_SAVE_LDS_NORMAL:
	// s_save_alloc_size = LDS_ALLOC.LDS_SIZE; the AND with all ones only sets SCC = (size != 0).
	s_getreg_b32	s_save_alloc_size, hwreg(HW_REG_WAVE_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
	s_and_b32	s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF  //lds_size is zero?
	s_cbranch_scc0	L_SAVE_LDS_DONE  //no lds used? jump to L_SAVE_LDS_DONE

	// Only a wave with the first_wave flag (parked in s_save_pc_hi bit 31) saves LDS.
	s_and_b32	s_save_tmp, s_save_pc_hi, S_SAVE_PC_HI_FIRST_WAVE_MASK
	s_cbranch_scc0	L_SAVE_LDS_DONE

	// first wave do LDS save;

	// Convert LDS_SIZE from allocation units to bytes.
	s_lshl_b32	s_save_alloc_size, s_save_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY

	// LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
	//
	get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
	s_add_u32	s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()
	s_add_u32	s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()

	//load 0~63*4(byte address) to vgpr v0
	v_mbcnt_lo_u32_b32	v0, -1, 0
	v_mbcnt_hi_u32_b32	v0, -1, v0
	v_mul_u32_u24	v0, 4, v0

	// m0 is zeroed between the compare and the branch; the branch still
	// consumes the SCC produced by s_cmp_eq_u32.
	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_mov_b32	m0, 0x0
	s_cbranch_scc1	L_SAVE_LDS_W64

L_SAVE_LDS_W32:
	// s3 = bytes copied per iteration (user SGPRs were saved before this point).
	s_mov_b32	s3, 128
	s_nop		0
	s_nop		0
	s_nop		0
L_SAVE_LDS_LOOP_W32:
	// Each iteration reads one dword per lane from LDS at byte address v0 and
	// stores it at save base + s_save_mem_offset.
	ds_read_b32	v1, v0
	s_wait_idle
	s_add_u32	s_save_addr_lo, s_save_base_addr_lo, s_save_mem_offset
	s_addc_u32	s_save_addr_hi, s_save_base_addr_hi, 0x0
	global_store_addtid_b32	v1, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS

	s_add_u32	m0, m0, s3  //each iteration copies 128 bytes; m0 counts bytes done
	s_add_u32	s_save_mem_offset, s_save_mem_offset, s3
	v_add_nc_u32	v0, v0, 128  //LDS read address increased by 128 bytes
	s_cmp_lt_u32	m0, s_save_alloc_size  //scc=(m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_SAVE_LDS_LOOP_W32  //LDS save is complete?

	s_branch	L_SAVE_LDS_DONE

L_SAVE_LDS_W64:
	// Same loop as the wave32 case with a 256-byte step.
	s_mov_b32	s3, 256
	s_nop		0
	s_nop		0
	s_nop		0
L_SAVE_LDS_LOOP_W64:
	ds_read_b32	v1, v0
	s_wait_idle
	s_add_u32	s_save_addr_lo, s_save_base_addr_lo, s_save_mem_offset
	s_addc_u32	s_save_addr_hi, s_save_base_addr_hi, 0x0
	global_store_addtid_b32	v1, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS

	s_add_u32	m0, m0, s3  //each iteration copies 256 bytes; m0 counts bytes done
	s_add_u32	s_save_mem_offset, s_save_mem_offset, s3
	v_add_nc_u32	v0, v0, 256  //LDS read address increased by 256 bytes
	s_cmp_lt_u32	m0, s_save_alloc_size  //scc=(m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_SAVE_LDS_LOOP_W64  //LDS save is complete?
// Save the remaining VGPRs (v4 and up) four at a time, then leave through L_END_PGM.

L_SAVE_LDS_DONE:
	/* save VGPRs - set the Rest VGPRs */
L_SAVE_VGPR:
	// VGPR SR memory offset: 0
	// s_save_mem_offset starts just past the first four VGPR slots
	// (4 x 128 bytes on the wave32 path, 4 x 256 bytes on the wave64 path).
	s_mov_b32	exec_lo, 0xFFFFFFFF  //need every thread from now on
	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_ENABLE_SAVE_VGPR_EXEC_HI
	s_mov_b32	s_save_mem_offset, (0+128*4)  // for the rest VGPRs
	s_mov_b32	exec_hi, 0x00000000
	s_branch	L_SAVE_VGPR_NORMAL
L_ENABLE_SAVE_VGPR_EXEC_HI:
	s_mov_b32	s_save_mem_offset, (0+256*4)  // for the rest VGPRs
	s_mov_b32	exec_hi, 0xFFFFFFFF
L_SAVE_VGPR_NORMAL:
	s_getreg_b32	s_save_alloc_size, hwreg(HW_REG_WAVE_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
	s_add_u32	s_save_alloc_size, s_save_alloc_size, 1
	s_lshl_b32	s_save_alloc_size, s_save_alloc_size, 2  //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
	//determine it is wave32 or wave64
	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_SAVE_VGPR_WAVE64

	// VGPR Allocated in 4-GPR granularity

	// VGPR store using dw burst
	// The loop is skipped entirely when only four VGPRs are allocated.
	s_mov_b32	m0, 0x4  //VGPR initial index value =4
	s_cmp_lt_u32	m0, s_save_alloc_size
	s_cbranch_scc0	L_SAVE_VGPR_END

L_SAVE_VGPR_W32_LOOP:
	v_movrels_b32	v0, v0  //v0 = v[0+m0]
	v_movrels_b32	v1, v1  //v1 = v[1+m0]
	v_movrels_b32	v2, v2  //v2 = v[2+m0]
	v_movrels_b32	v3, v3  //v3 = v[3+m0]

	s_add_u32	s_save_addr_lo, s_save_base_addr_lo, s_save_mem_offset
	s_addc_u32	s_save_addr_hi, s_save_base_addr_hi, 0x0
	global_store_addtid_b32	v0, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS
	global_store_addtid_b32	v1, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS offset:128
	global_store_addtid_b32	v2, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS offset:128*2
	global_store_addtid_b32	v3, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS offset:128*3

	s_add_u32	m0, m0, 4  //next vgpr index
	s_add_u32	s_save_mem_offset, s_save_mem_offset, 128*4  //four stores of 128 bytes each per iteration
	s_cmp_lt_u32	m0, s_save_alloc_size  //scc = (m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_SAVE_VGPR_W32_LOOP  //VGPR save is complete?

	s_branch	L_SAVE_VGPR_END

L_SAVE_VGPR_WAVE64:
	// VGPR store using dw burst
	s_mov_b32	m0, 0x4  //VGPR initial index value =4
	s_cmp_lt_u32	m0, s_save_alloc_size
	s_cbranch_scc0	L_SAVE_VGPR_END

L_SAVE_VGPR_W64_LOOP:
	v_movrels_b32	v0, v0  //v0 = v[0+m0]
	v_movrels_b32	v1, v1  //v1 = v[1+m0]
	v_movrels_b32	v2, v2  //v2 = v[2+m0]
	v_movrels_b32	v3, v3  //v3 = v[3+m0]

	s_add_u32	s_save_addr_lo, s_save_base_addr_lo, s_save_mem_offset
	s_addc_u32	s_save_addr_hi, s_save_base_addr_hi, 0x0
	global_store_addtid_b32	v0, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS
	global_store_addtid_b32	v1, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS offset:256
	global_store_addtid_b32	v2, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS offset:256*2
	global_store_addtid_b32	v3, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS offset:256*3

	s_add_u32	m0, m0, 4  //next vgpr index
	s_add_u32	s_save_mem_offset, s_save_mem_offset, 256*4  //four stores of 256 bytes each per iteration
	s_cmp_lt_u32	m0, s_save_alloc_size  //scc = (m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_SAVE_VGPR_W64_LOOP  //VGPR save is complete?

L_SAVE_VGPR_END:
	s_branch	L_END_PGM

L_RESTORE:
	// The restore base address comes from exec_lo/exec_hi (the
	// s_restore_spi_init_lo/hi aliases), with the high dword masked to the
	// implemented address bits.
	s_mov_b32	s_restore_base_addr_lo, s_restore_spi_init_lo
	s_and_b32	s_restore_base_addr_hi, s_restore_spi_init_hi, ADDRESS_HI32_MASK

	// Save s_restore_spi_init_hi for later use.
859 s_mov_b32 s_restore_spi_init_hi_save, s_restore_spi_init_hi 860 861 //determine it is wave32 or wave64 862 get_wave_size2(s_restore_size) 863 864 s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK 865 s_cbranch_scc0 L_RESTORE_VGPR 866 867 /* restore LDS */ 868L_RESTORE_LDS: 869 s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on 870 s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE 871 s_and_b32 m0, m0, 1 872 s_cmp_eq_u32 m0, 1 873 s_cbranch_scc1 L_ENABLE_RESTORE_LDS_EXEC_HI 874 s_mov_b32 exec_hi, 0x00000000 875 s_branch L_RESTORE_LDS_NORMAL 876L_ENABLE_RESTORE_LDS_EXEC_HI: 877 s_mov_b32 exec_hi, 0xFFFFFFFF 878L_RESTORE_LDS_NORMAL: 879 s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_WAVE_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) 880 s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero? 881 s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR 882 s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY 883 884 // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) 885 // 886 get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size) 887 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes() 888 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() 889 890 s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE 891 s_and_b32 m0, m0, 1 892 s_cmp_eq_u32 m0, 1 893 s_mov_b32 m0, 0x0 894 895 v_mbcnt_lo_u32_b32 v1, -1, 0 896 v_mbcnt_hi_u32_b32 v1, -1, v1 897 v_lshlrev_b32 v1, 2, v1 // 0, 4, 8, ... 
124 (W32) or 252 (W64) 898 899 s_cbranch_scc1 L_RESTORE_LDS_LOOP_W64 900 901L_RESTORE_LDS_LOOP_W32: 902 s_add_u32 s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset 903 s_addc_u32 s_restore_addr_hi, s_restore_base_addr_hi, 0x0 904 global_load_addtid_b32 v0, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS 905 s_wait_idle 906 ds_store_b32 v1, v0 907 v_add_nc_u32 v1, v1, 128 908 s_add_u32 m0, m0, 128 // 128 DW 909 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128 //mem offset increased by 128DW 910 s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 911 s_cbranch_scc1 L_RESTORE_LDS_LOOP_W32 //LDS restore is complete? 912 s_branch L_RESTORE_VGPR 913 914L_RESTORE_LDS_LOOP_W64: 915 s_add_u32 s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset 916 s_addc_u32 s_restore_addr_hi, s_restore_base_addr_hi, 0x0 917 global_load_addtid_b32 v0, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS 918 s_wait_idle 919 ds_store_b32 v1, v0 920 v_add_nc_u32 v1, v1, 256 921 s_add_u32 m0, m0, 256 // 256 DW 922 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256 //mem offset increased by 256DW 923 s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 924 s_cbranch_scc1 L_RESTORE_LDS_LOOP_W64 //LDS restore is complete? 
/* restore VGPRs */
// Reloads every allocated VGPR from the save area (VGPR block is at offset 0).
// v4 and above are restored first, four registers per iteration, by loading
// into v0..v3 and moving them to v[m0..m0+3] with v_movreld. v0..v3 themselves
// are reloaded last because they are used as the staging registers.
// Per-register stride in memory is 128 bytes for wave32 and 256 bytes for wave64.
L_RESTORE_VGPR:
	// VGPR SR memory offset : 0
	s_mov_b32	s_restore_mem_offset, 0x0
	s_mov_b32	exec_lo, 0xFFFFFFFF						//need every thread from now on
	s_lshr_b32	m0, s_restore_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_ENABLE_RESTORE_VGPR_EXEC_HI
	s_mov_b32	exec_hi, 0x00000000
	s_branch	L_RESTORE_VGPR_NORMAL
L_ENABLE_RESTORE_VGPR_EXEC_HI:
	s_mov_b32	exec_hi, 0xFFFFFFFF
L_RESTORE_VGPR_NORMAL:
	s_getreg_b32	s_restore_alloc_size, hwreg(HW_REG_WAVE_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
	s_add_u32	s_restore_alloc_size, s_restore_alloc_size, 1
	s_lshl_b32	s_restore_alloc_size, s_restore_alloc_size, 2		//Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
	//determine it is wave32 or wave64
	s_lshr_b32	m0, s_restore_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_RESTORE_VGPR_WAVE64

	// VGPR load using dw burst
	s_mov_b32	s_restore_mem_offset_save, s_restore_mem_offset		// restore start with v4, v0 will be the last
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 128*4
	s_mov_b32	m0, 4							//VGPR initial index value = 4
	// If only v0..v3 are allocated there is nothing above v3 to restore.
	// Without this check the loop below would run once and write v4..v7,
	// which lie outside the allocation (same guard as the wave64 path).
	s_cmp_lt_u32	m0, s_restore_alloc_size
	s_cbranch_scc0	L_RESTORE_V0_W32

L_RESTORE_VGPR_WAVE32_LOOP:
	s_add_u32	s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset
	s_addc_u32	s_restore_addr_hi, s_restore_base_addr_hi, 0x0
	global_load_addtid_b32	v0, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS
	global_load_addtid_b32	v1, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:128
	global_load_addtid_b32	v2, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:128*2
	global_load_addtid_b32	v3, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:128*3
	s_wait_idle
	v_movreld_b32	v0, v0							//v[0+m0] = v0
	v_movreld_b32	v1, v1
	v_movreld_b32	v2, v2
	v_movreld_b32	v3, v3
	s_add_u32	m0, m0, 4						//next vgpr index
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 128*4	//each register occupies 128 bytes, 4 registers per iteration
	s_cmp_lt_u32	m0, s_restore_alloc_size				//scc = (m0 < s_restore_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_RESTORE_VGPR_WAVE32_LOOP				//VGPR restore (except v0) is complete?

	/* VGPR restore on v0 */
L_RESTORE_V0_W32:
	s_add_u32	s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset_save
	s_addc_u32	s_restore_addr_hi, s_restore_base_addr_hi, 0x0
	global_load_addtid_b32	v0, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS
	global_load_addtid_b32	v1, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:128
	global_load_addtid_b32	v2, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:128*2
	global_load_addtid_b32	v3, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:128*3
	s_wait_idle

	s_branch	L_RESTORE_SGPR

L_RESTORE_VGPR_WAVE64:
	// VGPR load using dw burst
	s_mov_b32	s_restore_mem_offset_save, s_restore_mem_offset		// restore start with v4, v0 will be the last
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 256*4
	s_mov_b32	m0, 4							//VGPR initial index value = 4
	s_cmp_lt_u32	m0, s_restore_alloc_size
	s_cbranch_scc0	L_RESTORE_V0

L_RESTORE_VGPR_WAVE64_LOOP:
	s_add_u32	s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset
	s_addc_u32	s_restore_addr_hi, s_restore_base_addr_hi, 0x0
	global_load_addtid_b32	v0, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS
	global_load_addtid_b32	v1, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:256
	global_load_addtid_b32	v2, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:256*2
	global_load_addtid_b32	v3, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:256*3
	s_wait_idle
	v_movreld_b32	v0, v0							//v[0+m0] = v0
	v_movreld_b32	v1, v1
	v_movreld_b32	v2, v2
	v_movreld_b32	v3, v3
	s_add_u32	m0, m0, 4						//next vgpr index
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 256*4	//each register occupies 256 bytes, 4 registers per iteration
	s_cmp_lt_u32	m0, s_restore_alloc_size				//scc = (m0 < s_restore_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_RESTORE_VGPR_WAVE64_LOOP				//VGPR restore (except v0) is complete?

	/* VGPR restore on v0 */
L_RESTORE_V0:
	s_add_u32	s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset_save
	s_addc_u32	s_restore_addr_hi, s_restore_base_addr_hi, 0x0
	global_load_addtid_b32	v0, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS
	global_load_addtid_b32	v1, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:256
	global_load_addtid_b32	v2, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:256*2
	global_load_addtid_b32	v3, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:256*3
	s_wait_idle

	/* restore SGPRs */
	//will be 2+8+16*6
	// SGPR SR memory offset : size(VGPR)
	// The SGPR block is walked from its top downwards: first s[104:107],
	// then s[96:103], then 16 registers at a time until m0 reaches 0.
L_RESTORE_SGPR:
	get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
	s_sub_u32	s_restore_mem_offset, s_restore_mem_offset, 24*4	// s[104:107]
	s_add_u32	s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset
	s_addc_u32	s_restore_addr_hi, s_restore_base_addr_hi, 0x0

	s_mov_b32	m0, s_sgpr_save_num

	s_load_b128	s0, [s_restore_addr_lo, s_restore_addr_hi], 0x0 scope:SCOPE_SYS
	s_wait_idle

	s_sub_u32	m0, m0, 4						// Restore from S[0] to S[104]
	s_nop		0							// hazard SALU M0=> S_MOVREL

	s_movreld_b64	s0, s0							//s[0+m0] = s0
	s_movreld_b64	s2, s2

	s_sub_co_u32	s_restore_addr_lo, s_restore_addr_lo, 8*4		// s[96:103]
	s_sub_co_ci_u32	s_restore_addr_hi, s_restore_addr_hi, 0
	s_load_b256	s0, [s_restore_addr_lo, s_restore_addr_hi], 0x0 scope:SCOPE_SYS
	s_wait_idle

	s_sub_u32	m0, m0, 8						// Restore from S[0] to S[96]
	s_nop		0							// hazard SALU M0=> S_MOVREL

s_movreld_b64 s0, s0 //s[0+m0] = s0 1047 s_movreld_b64 s2, s2 1048 s_movreld_b64 s4, s4 1049 s_movreld_b64 s6, s6 1050 1051 L_RESTORE_SGPR_LOOP: 1052 s_sub_co_u32 s_restore_addr_lo, s_restore_addr_lo, 16*4 // s[0,16,32,48,64,80] 1053 s_sub_co_ci_u32 s_restore_addr_hi, s_restore_addr_hi, 0 1054 s_load_b512 s0, [s_restore_addr_lo, s_restore_addr_hi], 0x0 scope:SCOPE_SYS 1055 s_wait_idle 1056 1057 s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0] 1058 s_nop 0 // hazard SALU M0=> S_MOVREL 1059 1060 s_movreld_b64 s0, s0 //s[0+m0] = s0 1061 s_movreld_b64 s2, s2 1062 s_movreld_b64 s4, s4 1063 s_movreld_b64 s6, s6 1064 s_movreld_b64 s8, s8 1065 s_movreld_b64 s10, s10 1066 s_movreld_b64 s12, s12 1067 s_movreld_b64 s14, s14 1068 1069 s_cmp_eq_u32 m0, 0 //scc = (m0 < s_sgpr_save_num) ? 1 : 0 1070 s_cbranch_scc0 L_RESTORE_SGPR_LOOP 1071 1072 // s_barrier with STATE_PRIV.TRAP_AFTER_INST=1, STATUS.PRIV=1 incorrectly asserts debug exception. 1073 // Clear DEBUG_EN before and restore MODE after the barrier. 1074 s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE), 0 1075 1076 /* restore HW registers */ 1077L_RESTORE_HWREG: 1078 // HWREG SR memory offset : size(VGPR)+size(SGPR) 1079 get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size) 1080 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes() 1081 s_add_u32 s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset 1082 s_addc_u32 s_restore_addr_hi, s_restore_base_addr_hi, 0x0 1083 1084 // Restore s_restore_spi_init_hi before the saved value gets clobbered. 
1085 s_mov_b32 s_restore_spi_init_hi, s_restore_spi_init_hi_save 1086 1087 s_load_b32 s_restore_m0, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS 1088 s_load_b32 s_restore_pc_lo, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x4 1089 s_load_b32 s_restore_pc_hi, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x8 1090 s_load_b32 s_restore_exec_lo, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0xC 1091 s_load_b32 s_restore_exec_hi, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x10 1092 s_load_b32 s_restore_state_priv, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x14 1093 s_load_b32 s_restore_excp_flag_priv, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x18 1094 s_load_b32 s_restore_xnack_mask, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x1C 1095 s_load_b32 s_restore_mode, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x20 1096 s_load_b32 s_restore_flat_scratch, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x24 1097 s_wait_idle 1098 1099 s_setreg_b32 hwreg(HW_REG_WAVE_SCRATCH_BASE_LO), s_restore_flat_scratch 1100 1101 s_load_b32 s_restore_flat_scratch, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x28 1102 s_wait_idle 1103 1104 s_setreg_b32 hwreg(HW_REG_WAVE_SCRATCH_BASE_HI), s_restore_flat_scratch 1105 1106 s_load_b32 s_restore_tmp, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x2C 1107 s_wait_idle 1108 s_setreg_b32 hwreg(HW_REG_WAVE_EXCP_FLAG_USER), s_restore_tmp 1109 1110 s_load_b32 s_restore_tmp, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x30 1111 s_wait_idle 1112 s_setreg_b32 hwreg(HW_REG_WAVE_TRAP_CTRL), s_restore_tmp 1113 1114 // Only the first wave needs to restore group barriers. 
1115 s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK 1116 s_cbranch_scc0 L_SKIP_GROUP_BARRIER_RESTORE 1117 1118 // Skip over WAVE_STATUS, since there is no state to restore from it 1119 1120 s_load_b32 s_restore_tmp, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x38 1121 s_wait_idle 1122 1123 // Skip group barriers if wave is not part of a group. 1124 s_bitcmp1_b32 s_restore_tmp, BARRIER_STATE_VALID_OFFSET 1125 s_cbranch_scc0 L_SKIP_GROUP_BARRIER_RESTORE 1126 1127 // Restore workgroup barrier signal count. 1128 restore_barrier_signal_count(-1) 1129 1130#if NUM_NAMED_BARRIERS 1131 s_mov_b32 s_restore_mem_offset, NAMED_BARRIERS_SR_OFFSET_FROM_HWREG 1132 s_mov_b32 m0, 1 1133 1134L_RESTORE_NAMED_BARRIER_LOOP: 1135 s_load_b32 s_restore_tmp, [s_restore_addr_lo, s_restore_addr_hi], s_restore_mem_offset scope:SCOPE_SYS 1136 s_wait_kmcnt 0 1137 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 0x4 1138 1139 // Restore named barrier member count. 1140 s_bfe_u32 exec_lo, s_restore_tmp, (BARRIER_STATE_MEMBER_OFFSET | (BARRIER_STATE_MEMBER_SIZE << 16)) 1141 s_lshl_b32 exec_lo, exec_lo, S_BARRIER_INIT_MEMBERCNT_SHIFT 1142 s_or_b32 m0, m0, exec_lo 1143 s_barrier_init m0 1144 s_andn2_b32 m0, m0, S_BARRIER_INIT_MEMBERCNT_MASK 1145 1146 // Restore named barrier signal count. 1147 restore_barrier_signal_count(m0) 1148 1149 s_add_u32 m0, m0, 1 1150 s_cmp_gt_u32 m0, NUM_NAMED_BARRIERS 1151 s_cbranch_scc0 L_RESTORE_NAMED_BARRIER_LOOP 1152#endif 1153 1154L_SKIP_GROUP_BARRIER_RESTORE: 1155#if HAVE_CLUSTER_BARRIER 1156 s_load_b32 s_restore_tmp, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x3C 1157 s_wait_kmcnt 0 1158 1159 // Skip cluster barrier restore if wave is not part of a cluster. 1160 s_bitcmp1_b32 s_restore_tmp, BARRIER_STATE_VALID_OFFSET 1161 s_cbranch_scc0 L_SKIP_CLUSTER_BARRIER_RESTORE 1162 1163 // Only the first wave in the group signals the trap cluster barrier. 
1164 s_bitcmp1_b32 s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT 1165 s_cbranch_scc0 L_SKIP_TRAP_CLUSTER_BARRIER_SIGNAL 1166 1167 // Clear SCC: s_barrier_signal_isfirst -4 writes SCC=>1 but not SCC=>0. 1168 s_cmp_eq_u32 0, 1 1169 s_barrier_signal_isfirst -4 1170L_SKIP_TRAP_CLUSTER_BARRIER_SIGNAL: 1171 s_barrier_wait -4 1172 1173 // Only the first wave in the cluster restores the barrier. 1174 s_cbranch_scc0 L_SKIP_CLUSTER_BARRIER_RESTORE 1175 1176 // Restore cluster barrier signal count. 1177 restore_barrier_signal_count(-3) 1178L_SKIP_CLUSTER_BARRIER_RESTORE: 1179#endif 1180 1181#if ASIC_FAMILY >= CHIP_GC_12_0_3 1182 s_load_b32 s_restore_tmp, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x40 1183 s_wait_kmcnt 0 1184 s_setreg_b32 hwreg(HW_REG_WAVE_SCHED_MODE), s_restore_tmp 1185#endif 1186 1187 s_mov_b32 m0, s_restore_m0 1188 s_mov_b32 exec_lo, s_restore_exec_lo 1189 s_mov_b32 exec_hi, s_restore_exec_hi 1190 1191#if HAVE_XNACK 1192 s_setreg_b32 hwreg(HW_REG_WAVE_XNACK_MASK), s_restore_xnack_mask 1193#endif 1194 1195 // EXCP_FLAG_PRIV.SAVE_CONTEXT and HOST_TRAP may have changed. 1196 // Only restore the other fields to avoid clobbering them. 
// EXCP_FLAG_PRIV is written back as three separate bit ranges (PART_1..PART_3).
// The source register is shifted right before each write so that the range
// being written sits in its low bits.
	s_setreg_b32	hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV, 0, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_1_SIZE), s_restore_excp_flag_priv
	s_lshr_b32	s_restore_excp_flag_priv, s_restore_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SHIFT
	s_setreg_b32	hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SHIFT, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SIZE), s_restore_excp_flag_priv
	s_lshr_b32	s_restore_excp_flag_priv, s_restore_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT - SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SHIFT
	s_setreg_b32	hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SIZE), s_restore_excp_flag_priv

	s_setreg_b32	hwreg(HW_REG_WAVE_MODE), s_restore_mode

	// Restore trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
	// ttmp SR memory offset :
	// - gfx12: size(VGPR)+size(SGPR)+0x40
	// - gfx12.5: size(VGPR)+size(SGPR)-0x40
	get_vgpr_size_bytes(s_restore_ttmps_lo, s_restore_size)
	s_add_u32	s_restore_ttmps_lo, s_restore_ttmps_lo, (get_sgpr_size_bytes() + TTMP_SR_OFFSET_FROM_HWREG)
	s_add_u32	s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_base_addr_lo
	s_addc_u32	s_restore_ttmps_hi, s_restore_base_addr_hi, 0x0
	s_load_dwordx4	[ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x10 scope:SCOPE_SYS
	s_load_dwordx4	[ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x20 scope:SCOPE_SYS
	s_load_dword	ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x34 scope:SCOPE_SYS
	s_wait_idle

#if HAVE_XNACK
	restore_xnack_state_priv(s_restore_tmp)
#endif

	s_and_b32	s_restore_pc_hi, s_restore_pc_hi, ADDRESS_HI32_MASK		//Do it here in order not to affect STATUS
	s_and_b64	exec, exec, exec						// Restore STATUS.EXECZ, not writable by s_setreg_b32
	s_and_b64	vcc, vcc, vcc							// Restore STATUS.VCCZ, not writable by s_setreg_b32

#if RELAXED_SCHEDULING_IN_TRAP
	// Assume relaxed scheduling mode after this point.
	restore_sched_mode(s_restore_tmp)
#endif

	s_setreg_b32	hwreg(HW_REG_WAVE_STATE_PRIV), s_restore_state_priv	// SCC is included, which is changed by previous salu

	// Make barrier and LDS state visible to all waves in the group/cluster.
	// STATE_PRIV.*BARRIER_COMPLETE may change after this point.
	wait_trap_barriers(s_restore_tmp, 0, 0)

#if HAVE_CLUSTER_BARRIER
	// SCC is changed by wait_trap_barriers, restore it separately.
	s_lshr_b32	s_restore_state_priv, s_restore_state_priv, SQ_WAVE_STATE_PRIV_SCC_SHIFT
	s_setreg_b32	hwreg(HW_REG_WAVE_STATE_PRIV, SQ_WAVE_STATE_PRIV_SCC_SHIFT, 1), s_restore_state_priv
#endif

	s_rfe_b64	s_restore_pc_lo							//Return to the main shader program and resume execution

// Common exit of the save path (reached via s_branch L_END_PGM).
L_END_PGM:
	// Make sure that no wave of the group/cluster can exit the trap handler
	// before the group/cluster barrier state is saved.
	wait_trap_barriers(s_restore_tmp, 0, 0)

	s_endpgm_saved
end

// write_hwreg_to_v2(s)
// Writes scalar s into v2 at the lane index held in m0, then increments m0.
// Side effects: modifies v2, m0 and SCC (s_add_u32).
function write_hwreg_to_v2(s)
	// Copy into VGPR for later TCP store.
	v_writelane_b32	v2, s, m0
	s_add_u32	m0, m0, 0x1
end


// write_16sgpr_to_v2(s)
// Assembly-time unrolled 16 times: writes s[0]..s[15] into v2 at lane ttmp13,
// incrementing ttmp13 after each write.
// Side effects: modifies v2, ttmp13 and SCC.
function write_16sgpr_to_v2(s)
	// Copy into VGPR for later TCP store.
	for var sgpr_idx = 0; sgpr_idx < 16; sgpr_idx ++
		v_writelane_b32	v2, s[sgpr_idx], ttmp13
		s_add_u32	ttmp13, ttmp13, 0x1
	end
end

// write_12sgpr_to_v2(s)
// Same as write_16sgpr_to_v2 but for 12 registers, s[0]..s[11].
function write_12sgpr_to_v2(s)
	// Copy into VGPR for later TCP store.
	for var sgpr_idx = 0; sgpr_idx < 12; sgpr_idx ++
		v_writelane_b32	v2, s[sgpr_idx], ttmp13
		s_add_u32	ttmp13, ttmp13, 0x1
	end
end

// get_vgpr_size_bytes(s_vgpr_size_byte, s_size)
// Computes the size in bytes of the VGPR save area into s_vgpr_size_byte:
//   (GPR_ALLOC.VGPR_SIZE + 1) << 9   when bit S_WAVE_SIZE of s_size is clear (wave32)
//   (GPR_ALLOC.VGPR_SIZE + 1) << 10  when it is set (wave64)
// s_size is only read. SCC is modified.
function get_vgpr_size_bytes(s_vgpr_size_byte, s_size)
	s_getreg_b32	s_vgpr_size_byte, hwreg(HW_REG_WAVE_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
	s_add_u32	s_vgpr_size_byte, s_vgpr_size_byte, 1
	s_bitcmp1_b32	s_size, S_WAVE_SIZE
	s_cbranch_scc1	L_ENABLE_SHIFT_W64
	s_lshl_b32	s_vgpr_size_byte, s_vgpr_size_byte, (2+7)		//Number of VGPRs = (vgpr_size + 1) * 4 * 32 * 4 (non-zero value)
	s_branch	L_SHIFT_DONE
L_ENABLE_SHIFT_W64:
	s_lshl_b32	s_vgpr_size_byte, s_vgpr_size_byte, (2+8)		//Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value)
L_SHIFT_DONE:
end

// Size in bytes of the SGPR save area (assembly-time constant).
function get_sgpr_size_bytes
	return 512
end

// Size in bytes of the HWREG save area (assembly-time constant):
// 512 when ASIC_FAMILY >= CHIP_GC_12_0_3, otherwise 128.
function get_hwreg_size_bytes
#if ASIC_FAMILY >= CHIP_GC_12_0_3
	return 512
#else
	return 128
#endif
end

// get_wave_size2(s_reg)
// Reads the WAVE64 field of WAVE_STATUS into s_reg and shifts it left to bit
// position S_WAVE_SIZE, so callers can test that bit for wave64.
// SCC is modified by the shift.
function get_wave_size2(s_reg)
	s_getreg_b32	s_reg, hwreg(HW_REG_WAVE_STATUS,SQ_WAVE_STATUS_WAVE64_SHIFT,SQ_WAVE_STATUS_WAVE64_SIZE)
	s_lshl_b32	s_reg, s_reg, S_WAVE_SIZE
end

#if HAVE_XNACK
// save_and_clear_xnack_state_priv(s_tmp)
// s_tmp: scratch SGPR, clobbered. Modifies ttmp11 and SCC, and zeroes
// XNACK_STATE_PRIV.
function save_and_clear_xnack_state_priv(s_tmp)
	// Preserve and clear XNACK state before issuing further translations.
	// Save XNACK_STATE_PRIV.{FIRST_REPLAY, REPLAY_W64H, FXPTR} into ttmp11[22:14].
	s_andn2_b32	ttmp11, ttmp11, (TTMP11_FIRST_REPLAY_MASK | TTMP11_REPLAY_W64H_MASK | TTMP11_FXPTR_MASK)

	s_getreg_b32	s_tmp, hwreg(HW_REG_WAVE_XNACK_STATE_PRIV, SQ_WAVE_XNACK_STATE_PRIV_FIRST_REPLAY_SHIFT, SQ_WAVE_XNACK_STATE_PRIV_FIRST_REPLAY_SIZE)
	s_lshl_b32	s_tmp, s_tmp, TTMP11_FIRST_REPLAY_SHIFT
	s_or_b32	ttmp11, ttmp11, s_tmp

	s_getreg_b32	s_tmp, hwreg(HW_REG_WAVE_XNACK_STATE_PRIV, SQ_WAVE_XNACK_STATE_PRIV_REPLAY_W64H_SHIFT, SQ_WAVE_XNACK_STATE_PRIV_REPLAY_W64H_SIZE)
	s_lshl_b32	s_tmp, s_tmp, TTMP11_REPLAY_W64H_SHIFT
	s_or_b32	ttmp11, ttmp11, s_tmp

	s_getreg_b32	s_tmp, hwreg(HW_REG_WAVE_XNACK_STATE_PRIV, SQ_WAVE_XNACK_STATE_PRIV_FXPTR_SHIFT, SQ_WAVE_XNACK_STATE_PRIV_FXPTR_SIZE)
	s_lshl_b32	s_tmp, s_tmp, TTMP11_FXPTR_SHIFT
	s_or_b32	ttmp11, ttmp11, s_tmp

	s_setreg_imm32_b32	hwreg(HW_REG_WAVE_XNACK_STATE_PRIV), 0
end

// restore_xnack_state_priv(s_tmp)
// Writes the FIRST_REPLAY, REPLAY_W64H and FXPTR fields held in ttmp11 back
// into XNACK_STATE_PRIV. Each field is shifted down to bit 0 of s_tmp and then
// written with a hwreg() selector of that field's offset and size.
// s_tmp: scratch SGPR, clobbered. SCC is modified.
function restore_xnack_state_priv(s_tmp)
	s_lshr_b32	s_tmp, ttmp11, TTMP11_FIRST_REPLAY_SHIFT
	s_setreg_b32	hwreg(HW_REG_WAVE_XNACK_STATE_PRIV, SQ_WAVE_XNACK_STATE_PRIV_FIRST_REPLAY_SHIFT, SQ_WAVE_XNACK_STATE_PRIV_FIRST_REPLAY_SIZE), s_tmp

	s_lshr_b32	s_tmp, ttmp11, TTMP11_REPLAY_W64H_SHIFT
	s_setreg_b32	hwreg(HW_REG_WAVE_XNACK_STATE_PRIV, SQ_WAVE_XNACK_STATE_PRIV_REPLAY_W64H_SHIFT, SQ_WAVE_XNACK_STATE_PRIV_REPLAY_W64H_SIZE), s_tmp

	s_lshr_b32	s_tmp, ttmp11, TTMP11_FXPTR_SHIFT
	s_setreg_b32	hwreg(HW_REG_WAVE_XNACK_STATE_PRIV, SQ_WAVE_XNACK_STATE_PRIV_FXPTR_SHIFT, SQ_WAVE_XNACK_STATE_PRIV_FXPTR_SIZE), s_tmp
end
#endif

// wait_trap_barriers(s_tmp1, s_tmp2, serialize_wa)
// Signals and waits on barrier -2, and with HAVE_CLUSTER_BARRIER also on
// barrier -4.
// NOTE(review): -2 is presumably the trap workgroup barrier and -4 the trap
// cluster barrier, going by the comments in this function - confirm.
//   s_tmp1       scratch SGPR, clobbered when HAVE_CLUSTER_BARRIER is set
//   s_tmp2       scratch SGPR, used only inside the serialize_wa section
//   serialize_wa assembly-time flag; when non-zero (and
//                CLUSTER_BARRIER_SERIALIZE_WORKAROUND is set) the code polls
//                the cluster barrier state until member count != signal count
// SCC is modified in the HAVE_CLUSTER_BARRIER variant.
function wait_trap_barriers(s_tmp1, s_tmp2, serialize_wa)
#if HAVE_CLUSTER_BARRIER
	// If not in a WG then wave cannot use s_barrier_signal_isfirst.
	s_getreg_b32	s_tmp1, hwreg(HW_REG_WAVE_STATUS)
	s_bitcmp0_b32	s_tmp1, SQ_WAVE_STATUS_IN_WG_SHIFT
	s_cbranch_scc1	L_TRAP_CLUSTER_BARRIER_SIGNAL

	s_barrier_signal_isfirst	-2
	s_barrier_wait	-2

	// Only the first wave in the group signals the trap cluster barrier.
	s_cbranch_scc0	L_SKIP_TRAP_CLUSTER_BARRIER_SIGNAL

L_TRAP_CLUSTER_BARRIER_SIGNAL:
	s_barrier_signal	-4

L_SKIP_TRAP_CLUSTER_BARRIER_SIGNAL:
	s_barrier_wait	-4

#if CLUSTER_BARRIER_SERIALIZE_WORKAROUND
if serialize_wa
	// Trap cluster barrier may complete with a user cluster barrier in-flight.
	// This is indicated if user cluster member count and signal count are equal.
L_WAIT_USER_CLUSTER_BARRIER_COMPLETE:
	s_sendmsg_rtn_b32	s_tmp1, sendmsg(MSG_RTN_GET_CLUSTER_BARRIER_STATE)
	s_wait_kmcnt	0
	s_bitcmp0_b32	s_tmp1, BARRIER_STATE_VALID_OFFSET
	s_cbranch_scc1	L_NOT_IN_CLUSTER

	s_bfe_u32	s_tmp2, s_tmp1, (BARRIER_STATE_MEMBER_OFFSET | (BARRIER_STATE_MEMBER_SIZE << 0x10))
	s_bfe_u32	s_tmp1, s_tmp1, (BARRIER_STATE_SIGNAL_OFFSET | (BARRIER_STATE_SIGNAL_SIZE << 0x10))
	s_cmp_eq_u32	s_tmp1, s_tmp2
	s_cbranch_scc1	L_WAIT_USER_CLUSTER_BARRIER_COMPLETE
end
L_NOT_IN_CLUSTER:
#endif

#else
	s_barrier_signal	-2
	s_barrier_wait	-2
#endif
end

#if RELAXED_SCHEDULING_IN_TRAP
// restore_sched_mode(s_tmp)
// Extracts the SCHED_MODE field from ttmp11 and writes it to WAVE_SCHED_MODE.
// s_tmp: scratch SGPR, clobbered. SCC is modified by s_bfe_u32.
function restore_sched_mode(s_tmp)
	s_bfe_u32	s_tmp, ttmp11, (TTMP11_SCHED_MODE_SHIFT | (TTMP11_SCHED_MODE_SIZE << 0x10))
	s_setreg_b32	hwreg(HW_REG_WAVE_SCHED_MODE), s_tmp
end
#endif

// restore_barrier_signal_count(barrier_id)
// Input:  s_restore_tmp holds the saved barrier state word.
// Effect: issues s_barrier_signal barrier_id once per saved signal count.
// Clobbers s_restore_tmp (zero on exit) and SCC.
// NOTE(review): the count is obtained with a plain right shift, so any bits
// above the signal field would be counted too - confirm they are always zero.
function restore_barrier_signal_count(barrier_id)
	// extract the saved signal count from s_restore_tmp
	s_lshr_b32	s_restore_tmp, s_restore_tmp, BARRIER_STATE_SIGNAL_OFFSET

	// We need to call s_barrier_signal repeatedly to restore the signal count
	// of the group/cluster barrier. The member count is already initialized.
L_BARRIER_RESTORE_LOOP:
	s_and_b32	s_restore_tmp, s_restore_tmp, s_restore_tmp		// sets SCC = (s_restore_tmp != 0)
	s_cbranch_scc0	L_BARRIER_RESTORE_DONE
	s_barrier_signal	barrier_id
	s_add_i32	s_restore_tmp, s_restore_tmp, -1
	s_branch	L_BARRIER_RESTORE_LOOP

L_BARRIER_RESTORE_DONE:
end

#if HAVE_INSTRUCTION_FIXUP
// fixup_instruction
// Decodes the instruction at the return PC (ttmp[0:1]) far enough to learn its
// length (1, 2 or 3 DWORDs), then checks whether the DWORD that follows it is
// S_SET_VGPR_MSB. If so, SIMM[15:8] of that instruction is rearranged and
// written to MODE[19:12].
// Skipped entirely when EXCP_FLAG_PRIV.MEM_VIOL is set in s_save_excp_flag_priv.
// Clobbers ttmp2, ttmp3, ttmp10, ttmp13, ttmp14, ttmp15 and SCC.
// May rewind ttmp[0:1] by 8 bytes (L_FIXUP_VOP3PX_MIDDLE).
function fixup_instruction
	// PC read may fault if memory violation has been asserted.
	// In this case no further progress is expected so fixup is not needed.
	s_bitcmp1_b32	s_save_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_SHIFT
	s_cbranch_scc1	L_FIXUP_DONE

	// ttmp[0:1]: {7b'0} PC[56:0]
	// ttmp2, 3, 10, 13, 14, 15: free
	s_load_b64	[ttmp14, ttmp15], [ttmp0, ttmp1], 0 scope:SCOPE_CU	// Load the 2 instruction DW we are returning to
	s_wait_kmcnt	0
	s_load_b64	[ttmp2, ttmp3], [ttmp0, ttmp1], 8 scope:SCOPE_CU	// Load the next 2 instruction DW, just in case
	s_and_b32	ttmp10, ttmp14, 0x80000000				// Check bit 31 in the first DWORD
										// SCC set if ttmp10 is != 0, i.e. if bit 31 == 1
	s_cbranch_scc1	L_FIXUP_NOT_VOP12C					// If bit 31 is 1, we are not VOP1, VOP2, or VOPC
	// Fall through here means bit 31 == 0, meaning we are VOP1, VOP2, or VOPC
	// Size of instruction depends on Opcode or SRC0_9
	// Check for VOP2 opcode
	s_bfe_u32	ttmp10, ttmp14, (25 | (6 << 0x10))			// Check bits 30:25 for VOP2 Opcode
	// VOP2 V_FMAMK_F64 or V_FMAAK_F64 has implied 64-bit literal, 3 DW
	s_sub_co_i32	ttmp13, ttmp10, 0x23					// V_FMAMK_F64 is 0x23, V_FMAAK_F64 is 0x24
	s_cmp_le_u32	ttmp13, 0x1						// 0==0x23, 1==0x24
	s_cbranch_scc1	L_FIXUP_THREE_DWORD					// If either, this is 3 DWORD inst
	// VOP2 V_FMAMK_F32, V_FMAAK_F32, V_FMAMK_F16, V_FMAAK_F16, 2 DW
	s_sub_co_i32	ttmp13, ttmp10, 0x2c					// V_FMAMK_F32 is 0x2c, V_FMAAK_F32 is 0x2d
	s_cmp_le_u32	ttmp13, 0x1						// 0==0x2c, 1==0x2d
	s_cbranch_scc1	L_FIXUP_TWO_DWORD					// If either, this is 2 DWORD inst
	s_sub_co_i32	ttmp13, ttmp10, 0x37					// V_FMAMK_F16 is 0x37, V_FMAAK_F16 is 0x38
	s_cmp_le_u32	ttmp13, 0x1						// 0==0x37, 1==0x38
	s_cbranch_scc1	L_FIXUP_TWO_DWORD					// If either, this is 2 DWORD inst
	// Check SRC0_9 for VOP1, VOP2, and VOPC
	s_and_b32	ttmp10, ttmp14, 0x1ff					// Check bits 8:0 for SRC0_9
	// Literal constant 64 is 3 DWORDs
	s_cmp_eq_u32	ttmp10, 0xfe						// 0xfe == 254 == Literal constant64
	s_cbranch_scc1	L_FIXUP_THREE_DWORD					// 3 DWORD inst
	// Literal constant 32, DPP16, DPP8, and DPP8FI are 2 DWORDs
	s_cmp_eq_u32	ttmp10, 0xff						// 0xff == 255 = Literal constant32
	s_cbranch_scc1	L_FIXUP_TWO_DWORD					// 2 DWORD inst
	s_cmp_eq_u32	ttmp10, 0xfa						// 0xfa == 250 = DPP16
	s_cbranch_scc1	L_FIXUP_TWO_DWORD					// 2 DWORD inst
	s_sub_co_i32	ttmp13, ttmp10, 0xe9					// DPP8 is 0xe9, DPP8FI is 0xea
	s_cmp_le_u32	ttmp13, 0x1						// 0==0xe9, 1==0xea
	s_cbranch_scc1	L_FIXUP_TWO_DWORD					// If either, this is 2 DWORD inst
	// Instruction is 1 DWORD otherwise

	// Common tail: on entry ttmp15 holds the DWORD that follows the decoded
	// instruction (set by the one/two/three-DWORD paths).
L_FIXUP_ONE_DWORD:
	// Check if TTMP15 contains the value for S_SET_VGPR_MSB instruction
	s_and_b32	ttmp10, ttmp15, 0xffff0000				// Check encoding in upper 16 bits
	s_cmp_eq_u32	ttmp10, 0xbf860000					// Check if SOPP (9b'10_1111111) and S_SET_VGPR_MSB (7b'0000110)
	s_cbranch_scc0	L_FIXUP_DONE						// No problem, no fixup needed
	// VALU op followed by a S_SET_VGPR_MSB. Need to pull SIMM[15:8] to fix up MODE.*_VGPR_MSB
	s_bfe_u32	ttmp10, ttmp15, (14 | (2 << 0x10))			// Shift SIMM[15:14] over to 1:0, Dst
	s_and_b32	ttmp13, ttmp15, 0x3f00					// Mask to get SIMM[13:8] only
	s_lshr_b32	ttmp13, ttmp13, 6					// Shift SIMM[13:8] into 7:2, Src2, Src1, Src0
	s_or_b32	ttmp10, ttmp10, ttmp13					// Src2, Src1, Src0, Dst --> format in MODE register
	s_setreg_b32	hwreg(HW_REG_WAVE_MODE, 12, 8), ttmp10			// Write value into MODE[19:12]
	s_branch	L_FIXUP_DONE

L_FIXUP_NOT_VOP12C:
	// ttmp[0:1]: {7b'0} PC[56:0]
	// ttmp2: PC+2 value (not waitcnt'ed yet)
	// ttmp3: PC+3 value (not waitcnt'ed yet)
	// ttmp10, ttmp13: free
	// ttmp14: PC+0 value
	// ttmp15: PC+1 value
	// Not VOP1, VOP2, or VOPC.
	// Check if we are VOP3 or VOP3SD
	s_and_b32	ttmp10, ttmp14, 0xfc000000				// Bits 31:26
	s_cmp_eq_u32	ttmp10, 0xd4000000					// If 31:26 = 0x35, this is VOP3 or VOP3SD
	s_cbranch_scc1	L_FIXUP_CHECK_VOP3					// If VOP3 or VOP3SD, need to check SRC2_9, SRC1_9, SRC0_9
	// Not VOP1, VOP2, VOPC, VOP3, or VOP3SD.
	// Check for VOPD
	s_cmp_eq_u32	ttmp10, 0xc8000000					// If 31:26 = 0x32, this is VOPD
	s_cbranch_scc1	L_FIXUP_CHECK_VOPD					// If VOPD, need to check OpX, OpY, SRCX0 and SRCY0
	// Not VOP1, VOP2, VOPC, VOP3, VOP3SD, VOPD.
	// Check if we are VOPD3
	s_and_b32	ttmp10, ttmp14, 0xff000000				// Bits 31:24
	s_cmp_eq_u32	ttmp10, 0xcf000000					// If 31:24 = 0xcf, this is VOPD3
	s_cbranch_scc1	L_FIXUP_THREE_DWORD					// If VOPD3, 3 DWORD inst
	// Not VOP1, VOP2, VOPC, VOP3, VOP3SD, VOPD, or VOPD3.
	// Check if we are in the middle of VOP3PX.
	s_and_b32	ttmp13, ttmp14, 0xffff0000				// Bits 31:16
	s_cmp_eq_u32	ttmp13, 0xcc330000					// If 31:16 = 0xcc33, this is 8 bytes past VOP3PX
	s_cbranch_scc1	L_FIXUP_VOP3PX_MIDDLE
	s_cmp_eq_u32	ttmp13, 0xcc880000					// If 31:16 = 0xcc88, this is 8 bytes past VOP3PX
	s_cbranch_scc1	L_FIXUP_VOP3PX_MIDDLE
	// Might be in VOP3P, but we must ensure we are not VOP3PX2
	s_cmp_eq_u32	ttmp13, 0xcc350000					// If 31:16 = 0xcc35, this is VOP3PX2
	s_cbranch_scc1	L_FIXUP_DONE						// If VOP3PX2, no fixup needed
	s_cmp_eq_u32	ttmp13, 0xcc3a0000					// If 31:16 = 0xcc3a, this is VOP3PX2
	s_cbranch_scc1	L_FIXUP_DONE						// If VOP3PX2, no fixup needed
	// Check if we are VOP3P
	s_cmp_eq_u32	ttmp10, 0xcc000000					// If 31:24 = 0xcc, this is VOP3P
	s_cbranch_scc0	L_FIXUP_DONE						// Not in VOP3P, so instruction is not VOP1, VOP2,
										// VOPC, VOP3, VOP3SD, VOP3P, VOPD, or VOPD3
										// No fixup needed.
	// Fall-through if we are in VOP3P to check SRC2_9, SRC1_9, and SRC0_9
L_FIXUP_CHECK_VOP3:
	// Start with Src0, which is in bits 8:0 of second instruction DW, ttmp15
	s_and_b32	ttmp10, ttmp15, 0x1ff					// Mask out unused bits
	// Src0_9 == Literal constant 32, DPP16, DPP8, and DPP8FI means 3 DWORDs
	s_cmp_eq_u32	ttmp10, 0xff						// 0xff == 255 = Literal constant32
	s_cbranch_scc1	L_FIXUP_THREE_DWORD					// 3 DWORD inst
	s_cmp_eq_u32	ttmp10, 0xfa						// 0xfa == 250 = DPP16
	s_cbranch_scc1	L_FIXUP_THREE_DWORD					// 3 DWORD inst
	s_sub_co_i32	ttmp10, ttmp10, 0xe9					// DPP8 is 0xe9, DPP8FI is 0xea
	s_cmp_le_u32	ttmp10, 0x1						// 0==0xe9, 1==0xea
	s_cbranch_scc1	L_FIXUP_THREE_DWORD					// If either, this is 3 DWORD inst
	s_and_b32	ttmp10, ttmp15, 0x3fe00					// Next is Src1, which is in 17:9
	s_cmp_eq_u32	ttmp10, 0x1fe00						// 0xff == 255 = Literal constant32
	s_cbranch_scc1	L_FIXUP_THREE_DWORD					// 3 DWORD inst
	s_and_b32	ttmp10, ttmp15, 0x7fc0000				// Next is Src2, which is in 26:18
	s_cmp_eq_u32	ttmp10, 0x3fc0000					// 0xff == 255 = Literal constant32
	s_cbranch_scc1	L_FIXUP_THREE_DWORD					// 3 DWORD inst
	s_branch	L_FIXUP_TWO_DWORD					// No special encodings, VOP3* is 2 Dword

L_FIXUP_CHECK_VOPD:
	// OpX being V_DUAL_FMA*K_F32 means 3 DWORDs
	s_bfe_u32	ttmp10, ttmp14, (22 | (4 << 0x10))			// OPX is bits 25:22
	s_sub_co_i32	ttmp10, ttmp10, 0x1					// V_DUAL_FMAAK_F32 is 0x1, V_DUAL_FMAMK_F32 is 0x2
	s_cmp_le_u32	ttmp10, 0x1						// 0==0x1, 1==0x2
	s_cbranch_scc1	L_FIXUP_THREE_DWORD					// If either, this is 3 DWORD inst
	// OpY being V_DUAL_FMA*K_F32 means 3 DWORDs
	s_bfe_u32	ttmp10, ttmp14, (17 | (5 << 0x10))			// OPY is bits 21:17
	s_sub_co_i32	ttmp10, ttmp10, 0x1					// V_DUAL_FMAAK_F32 is 0x1, V_DUAL_FMAMK_F32 is 0x2
	s_cmp_le_u32	ttmp10, 0x1						// 0==0x1, 1==0x2
	s_cbranch_scc1	L_FIXUP_THREE_DWORD					// If either, this is 3 DWORD inst
	// SRCX0 == Literal constant 32 means 3 DWORDs
	s_and_b32	ttmp10, ttmp14, 0x1ff					// SRCX0 is in bits 8:0 of 1st DWORD
	s_cmp_eq_u32	ttmp10, 0xff						// 0xff == 255 = Literal constant32
	s_cbranch_scc1	L_FIXUP_THREE_DWORD					// 3 DWORD inst
	// SRCY0 == Literal constant 32 means 3 DWORDs
	s_and_b32	ttmp10, ttmp15, 0x1ff					// SRCY0 is in bits 8:0 of 2nd DWORD
	s_cmp_eq_u32	ttmp10, 0xff						// 0xff == 255 = Literal constant32
	s_cbranch_scc1	L_FIXUP_THREE_DWORD					// 3 DWORD inst
	// If otherwise, no special encodings. Default VOPD is 2 Dword
	// Fall-thru if true, because this is a 2 DWORD inst
L_FIXUP_TWO_DWORD:
	s_wait_kmcnt	0							// Wait for PC+2 and PC+3 to arrive in ttmp2 and ttmp3
	s_mov_b32	ttmp15, ttmp2						// Move possible S_SET_VGPR_MSB into ttmp15
	s_branch	L_FIXUP_ONE_DWORD					// Go to common logic that checks if it is S_SET_VGPR_MSB

L_FIXUP_THREE_DWORD:
	s_wait_kmcnt	0							// Wait for PC+2 and PC+3 to arrive in ttmp2 and ttmp3
	s_mov_b32	ttmp15, ttmp3						// Move possible S_SET_VGPR_MSB into ttmp15
	s_branch	L_FIXUP_ONE_DWORD					// Go to common logic that checks if it is S_SET_VGPR_MSB

L_FIXUP_VOP3PX_MIDDLE:
	s_sub_co_u32	ttmp0, ttmp0, 8						// Rewind PC 8 bytes to beginning of instruction
	s_sub_co_ci_u32	ttmp1, ttmp1, 0
	s_branch	L_FIXUP_TWO_DWORD					// 2 DWORD inst (2nd half of a 4 DWORD inst)

L_FIXUP_DONE:
	s_wait_kmcnt	0							// Ensure load of ttmp2 and ttmp3 is done
end
#endif
