xref: /linux/drivers/gpu/drm/amd/amdgpu/gfx_v10_1_10_cleaner_shader.asm (revision e332935a540eb76dd656663ca908eb0544d96757)
1/* SPDX-License-Identifier: MIT */
2/*
3 * Copyright 2025 Advanced Micro Devices, Inc.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21 * OTHER DEALINGS IN THE SOFTWARE.
22 */
23
24// This shader is to clean LDS, SGPRs and VGPRs. It is  first 64 Dwords or 256 bytes of 256 Dwords cleaner shader.
25
26// GFX10.1 : Clear SGPRs, VGPRs and LDS
27//   Launch 32 waves per CU (16 per SIMD) as a workgroup (threadgroup) to fill every wave slot
28//   Waves are "wave32" and have 64 VGPRs each, which uses all 1024 VGPRs per SIMD
29//   Waves are launched in "CU" mode, and the workgroup shares 64KB of LDS (half of the WGP's LDS)
30//      It takes 2 workgroups to use all of LDS: one on each CU of the WGP
31//   Each wave clears SGPRs 0 - 107
32//   Each wave clears VGPRs 0 - 63
33//   The first wave of the workgroup clears its 64KB of LDS
34//   The shader starts with "S_BARRIER" to ensure SPI has launched all waves of the workgroup
35//       before any wave in the workgroup could end.  Without this, it is possible not all SGPRs get cleared.
36
37
shader main
  asic(GFX10.1)
  type(CS)
  wave_size(32)
// Note: original source code from SQ team
//
// Create 32 waves in a threadgroup (CS waves)
// Each allocates 64 VGPRs
// The workgroup allocates all of LDS (64kbytes)
//
// Takes about 2500 clocks to run.
//   (theoretical fastest = 1024clks vgpr + 640lds = 1660 clks)
//
// Control flow of this shader:
//   s0 != 1                  -> skip the VGPR and LDS clears, clear scalar state only
//   s0 == 1                  -> clear v0-v63; then, if bit 31 of s1 is set, clear LDS
//   every path               -> clear s0-s107, vcc and ttmp0-ttmp15, then end the wave
//
// Loop idiom used throughout: s_sub_u32 sets SCC on unsigned borrow, so
// "s_sub_u32 ; s_cbranch_scc0 <top>" keeps looping until the counter wraps below zero.
//
  S_BARRIER                                         // Wait until every wave of the workgroup has been launched
  s_cmp_eq_u32 s0, 1                                // scc = (s0 == 1); FW sets COMPUTE_USER_DATA_0 (s0) to 1 to request VGPR and LDS clearing
  s_cbranch_scc0  label_0023                        // s0 != 1: go straight to the SGPR clear

  s_mov_b32     s2, 0x00000038  // Next VGPR base (56). Loop 64/8=8 times  (loop unrolled for performance)
  s_mov_b32     m0, 0           // First pass clears v0-v7
  //
  // CLEAR VGPRs
  //
  // v_movreld_b32 vN, 0 writes 0 to VGPR[N + m0]. m0 takes the values
  // 0, 56, 48, 40, 32, 24, 16, 8 over the 8 passes, covering v0-v63.
  //
label_0005:
  v_movreld_b32     v0, 0
  v_movreld_b32     v1, 0
  v_movreld_b32     v2, 0
  v_movreld_b32     v3, 0
  v_movreld_b32     v4, 0
  v_movreld_b32     v5, 0
  v_movreld_b32     v6, 0
  v_movreld_b32     v7, 0
  s_mov_b32         m0, s2      // Base register index for the next pass
  s_sub_u32     s2, s2, 8       // Borrow (scc = 1) once s2 drops below 0
  s_cbranch_scc0  label_0005
  //
  // Only the first wave of the workgroup clears LDS.
  // Note: the test reads s1, not s0 (s0 holds the user data compared above).
  // Per the original note, the ucode enables only COMPUTE_PGM_RSRC2.tg_size_en,
  // so the tg_size (first_wave) term is expected in s1 -- confirm against the dispatch setup.
  //
  s_mov_b32     s2, 0x80000000                       // Bit31 is first_wave
  s_and_b32     s2, s2, s1                           // scc = (bit 31 of s1 is set)
  s_cbranch_scc0  label_0023                         // Not the first wave: skip the LDS clear
  //
  // CLEAR LDS
  //
  s_mov_b32 exec_lo, 0xffffffff                      // Enable all lanes
  s_mov_b32 exec_hi, 0xffffffff
  v_mbcnt_lo_u32_b32  v1, exec_hi, 0         // v1 = lane ID; with wave_size(32) this is 0..31
  v_mbcnt_hi_u32_b32  v1, exec_lo, v1        // Adds the count for lanes 32 and above (adds 0 for lanes 0..31)
  v_mul_u32_u24  v1, 0x00000008, v1          // * 8, so each thread is a double-dword address (8byte)
  s_mov_b32     s2, 0x0000003f                       // 64 loop iterations
  s_mov_b32     m0, 0xffffffff                       // m0 = all ones before the DS writes (presumably the LDS bound; confirm for GFX10)
  // Clear all of LDS space
  // Each FirstWave of WorkGroup clears 64kbyte block
  //
  // v[2:3] and v[4:5] are zero here: this point is only reached after the VGPR clear above.
  // Each pass stores four 8-byte zeros per lane at byte offsets 0, 256, 512 and 768 from v1
  // (ds_write2_b64 offsets are in 8-byte units), i.e. 1 KB per pass for 32 lanes,
  // and 64 passes x 1 KB = 64 KB.

label_001F:
  ds_write2_b64  v1, v[2:3], v[2:3] offset1:32
  ds_write2_b64  v1, v[4:5], v[4:5] offset0:64 offset1:96
  v_add_co_u32     v1, vcc, 0x00000400, v1           // Advance every lane by 1 KB
  s_sub_u32     s2, s2, 1
  s_cbranch_scc0  label_001F

  //
  // CLEAR SGPRs
  //
  // s_movreld_b32 sN, 0 writes 0 to SGPR[N + m0]. m0 steps 104, 100, ..., 0, so the
  // 27 passes cover s0-s107. The source operand must be the literal 0, not an SGPR:
  // s0 still holds the user-data word at this point, and using it as the source would
  // copy that word into every SGPR instead of clearing them.
  //
label_0023:
  s_mov_b32     m0, 0x00000068  // Loop 108/4=27 times  (loop unrolled for performance)
label_sgpr_loop:
  s_movreld_b32     s0, 0
  s_movreld_b32     s1, 0
  s_movreld_b32     s2, 0
  s_movreld_b32     s3, 0
  s_sub_u32         m0, m0, 4   // Borrow (scc = 1) after the pass with m0 == 0
  s_cbranch_scc0  label_sgpr_loop

  //clear vcc
  s_mov_b64 vcc, 0          //clear vcc
  //s_setreg_imm32_b32 hw_reg_shader_flat_scratch_lo, 0   //clear  flat scratch lo SGPR
  //s_setreg_imm32_b32 hw_reg_shader_flat_scratch_hi, 0    //clear  flat scratch hi SGPR
  s_mov_b64 ttmp0, 0        //Clear ttmp0 and ttmp1
  s_mov_b64 ttmp2, 0        //Clear ttmp2 and ttmp3
  s_mov_b64 ttmp4, 0        //Clear ttmp4 and ttmp5
  s_mov_b64 ttmp6, 0        //Clear ttmp6 and ttmp7
  s_mov_b64 ttmp8, 0        //Clear ttmp8 and ttmp9
  s_mov_b64 ttmp10, 0       //Clear ttmp10 and ttmp11
  s_mov_b64 ttmp12, 0       //Clear ttmp12 and ttmp13
  s_mov_b64 ttmp14, 0       //Clear ttmp14 and ttmp15

  s_endpgm

end
124
125
126