xref: /linux/drivers/gpu/drm/amd/amdgpu/sdma_v7_1.c (revision e50a6ecebe0841d3dfa4d9415d4fae80bb5d91e8)
1 /*
2  * Copyright 2025 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  *
22  */
23 
24 #include <linux/delay.h>
25 #include <linux/firmware.h>
26 #include <linux/module.h>
27 #include <linux/pci.h>
28 
29 #include "amdgpu.h"
30 #include "amdgpu_ucode.h"
31 #include "amdgpu_trace.h"
32 
33 #include "gc/gc_12_1_0_offset.h"
34 #include "gc/gc_12_1_0_sh_mask.h"
35 #include "ivsrcid/gfx/irqsrcs_gfx_12_1_0.h"
36 
37 #include "soc15_common.h"
38 #include "soc15.h"
39 #include "sdma_v7_1_0_pkt_open.h"
40 #include "nbio_v4_3.h"
41 #include "sdma_common.h"
42 #include "sdma_v7_1.h"
43 #include "v12_structs.h"
44 #include "mes_userqueue.h"
45 
46 MODULE_FIRMWARE("amdgpu/sdma_7_1_0.bin");
47 
/*
 * Register-offset strides applied per SDMA instance by
 * sdma_v7_1_get_reg_offset():
 *  - offsets below SDMA0_SDMA_IDX_0_END advance by SDMA1_REG_OFFSET for
 *    each instance position inside an XCC;
 *  - offsets at or above SDMA0_SDMA_IDX_0_END advance by
 *    SDMA1_HYP_DEC_REG_OFFSET for each instance position inside an XCC.
 */
#define SDMA1_REG_OFFSET 0x600
#define SDMA0_SDMA_IDX_0_END 0x450
#define SDMA1_HYP_DEC_REG_OFFSET 0x30
51 
/*
 * Table of SDMA0 register entries (plus GRBM_STATUS2), each described with
 * SOC15_REG_ENTRY_STR() for GC instance 0: engine status, UTCL1 status,
 * and the ring-buffer / indirect-buffer registers of queues 0, 1 and 2.
 * NOTE(review): presumably consumed by the IP register dump code outside
 * this view -- confirm against the users of this table.
 */
static const struct amdgpu_hwip_reg_entry sdma_reg_list_7_1[] = {
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_STATUS_REG),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_STATUS1_REG),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_STATUS2_REG),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_STATUS3_REG),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_STATUS4_REG),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_STATUS5_REG),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_STATUS6_REG),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_UCODE_REV),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_RB_RPTR_FETCH_HI),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_RB_RPTR_FETCH),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_UTCL1_RD_STATUS),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_UTCL1_WR_STATUS),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_UTCL1_RD_XNACK0),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_UTCL1_RD_XNACK1),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_UTCL1_WR_XNACK0),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_UTCL1_WR_XNACK1),
	/* queue 0 */
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE0_RB_CNTL),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE0_RB_RPTR),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE0_RB_RPTR_HI),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE0_RB_WPTR),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE0_RB_WPTR_HI),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE0_IB_OFFSET),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE0_IB_BASE_LO),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE0_IB_BASE_HI),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE0_IB_CNTL),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE0_IB_RPTR),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE0_IB_SUB_REMAIN),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE0_DUMMY_REG),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE_STATUS0),
	/* queue 1 */
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE1_RB_CNTL),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE1_RB_RPTR),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE1_RB_RPTR_HI),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE1_RB_WPTR),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE1_RB_WPTR_HI),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE1_IB_OFFSET),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE1_IB_BASE_LO),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE1_IB_BASE_HI),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE1_IB_RPTR),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE1_IB_SUB_REMAIN),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE1_DUMMY_REG),
	/* queue 2 */
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE2_RB_CNTL),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE2_RB_RPTR),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE2_RB_RPTR_HI),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE2_RB_WPTR),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE2_RB_WPTR_HI),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE2_IB_OFFSET),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE2_IB_BASE_LO),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE2_IB_BASE_HI),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE2_IB_RPTR),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE2_IB_SUB_REMAIN),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE2_DUMMY_REG),
	/* engine-wide interrupt / VM / GRBM state */
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_INT_STATUS),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_VM_CNTL),
	SOC15_REG_ENTRY_STR(GC, 0, regGRBM_STATUS2),
	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_CHICKEN_BITS),
};
109 
/*
 * Forward declarations for static functions that are referenced before
 * their definitions; e.g. sdma_v7_1_inst_start() is called from
 * sdma_v7_1_soft_reset() ahead of its body.
 */
static void sdma_v7_1_set_ring_funcs(struct amdgpu_device *adev);
static void sdma_v7_1_set_buffer_funcs(struct amdgpu_device *adev);
static void sdma_v7_1_set_vm_pte_funcs(struct amdgpu_device *adev);
static void sdma_v7_1_set_irq_funcs(struct amdgpu_device *adev);
static int sdma_v7_1_inst_start(struct amdgpu_device *adev,
				uint32_t inst_mask);
116 
117 static u32 sdma_v7_1_get_reg_offset(struct amdgpu_device *adev, u32 instance, u32 internal_offset)
118 {
119 	u32 base;
120 	u32 dev_inst = GET_INST(SDMA0, instance);
121 	int xcc_id = adev->sdma.instance[instance].xcc_id;
122 	int xcc_inst = dev_inst % adev->sdma.num_inst_per_xcc;
123 
124 	if (internal_offset >= SDMA0_SDMA_IDX_0_END) {
125 		base = adev->reg_offset[GC_HWIP][xcc_id][1];
126 		if (xcc_inst != 0)
127 			internal_offset += SDMA1_HYP_DEC_REG_OFFSET * xcc_inst;
128 	} else {
129 		base = adev->reg_offset[GC_HWIP][xcc_id][0];
130 		if (xcc_inst != 0)
131 			internal_offset += SDMA1_REG_OFFSET * xcc_inst;
132 	}
133 
134 	return base + internal_offset;
135 }
136 
137 static unsigned sdma_v7_1_ring_init_cond_exec(struct amdgpu_ring *ring,
138 					      uint64_t addr)
139 {
140 	unsigned ret;
141 
142 	amdgpu_ring_write(ring, SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_COND_EXE));
143 	amdgpu_ring_write(ring, lower_32_bits(addr));
144 	amdgpu_ring_write(ring, upper_32_bits(addr));
145 	amdgpu_ring_write(ring, 1);
146 	/* this is the offset we need patch later */
147 	ret = ring->wptr & ring->buf_mask;
148 	/* insert dummy here and patch it later */
149 	amdgpu_ring_write(ring, 0);
150 
151 	return ret;
152 }
153 
154 /**
155  * sdma_v7_1_ring_get_rptr - get the current read pointer
156  *
157  * @ring: amdgpu ring pointer
158  *
159  * Get the current rptr from the hardware.
160  */
161 static uint64_t sdma_v7_1_ring_get_rptr(struct amdgpu_ring *ring)
162 {
163 	u64 *rptr;
164 
165 	/* XXX check if swapping is necessary on BE */
166 	rptr = (u64 *)ring->rptr_cpu_addr;
167 
168 	DRM_DEBUG("rptr before shift == 0x%016llx\n", *rptr);
169 	return ((*rptr) >> 2);
170 }
171 
172 /**
173  * sdma_v7_1_ring_get_wptr - get the current write pointer
174  *
175  * @ring: amdgpu ring pointer
176  *
177  * Get the current wptr from the hardware.
178  */
179 static uint64_t sdma_v7_1_ring_get_wptr(struct amdgpu_ring *ring)
180 {
181 	u64 wptr = 0;
182 
183 	if (ring->use_doorbell) {
184 		/* XXX check if swapping is necessary on BE */
185 		wptr = READ_ONCE(*((u64 *)ring->wptr_cpu_addr));
186 		DRM_DEBUG("wptr/doorbell before shift == 0x%016llx\n", wptr);
187 	}
188 
189 	return wptr >> 2;
190 }
191 
192 /**
193  * sdma_v7_1_ring_set_wptr - commit the write pointer
194  *
195  * @ring: amdgpu ring pointer
196  *
197  * Write the wptr back to the hardware.
198  */
199 static void sdma_v7_1_ring_set_wptr(struct amdgpu_ring *ring)
200 {
201 	struct amdgpu_device *adev = ring->adev;
202 
203 	DRM_DEBUG("Setting write pointer\n");
204 
205 	if (ring->use_doorbell) {
206 		DRM_DEBUG("Using doorbell -- "
207 			  "wptr_offs == 0x%08x "
208 			  "lower_32_bits(ring->wptr) << 2 == 0x%08x "
209 			  "upper_32_bits(ring->wptr) << 2 == 0x%08x\n",
210 			  ring->wptr_offs,
211 			  lower_32_bits(ring->wptr << 2),
212 			  upper_32_bits(ring->wptr << 2));
213 		/* XXX check if swapping is necessary on BE */
214 		atomic64_set((atomic64_t *)ring->wptr_cpu_addr,
215 			     ring->wptr << 2);
216 		DRM_DEBUG("calling WDOORBELL64(0x%08x, 0x%016llx)\n",
217 			  ring->doorbell_index, ring->wptr << 2);
218 		WDOORBELL64(ring->doorbell_index, ring->wptr << 2);
219 	} else {
220 		DRM_DEBUG("Not using doorbell -- "
221 			  "regSDMA%i_GFX_RB_WPTR == 0x%08x "
222 			  "regSDMA%i_GFX_RB_WPTR_HI == 0x%08x\n",
223 			  ring->me,
224 			  lower_32_bits(ring->wptr << 2),
225 			  ring->me,
226 			  upper_32_bits(ring->wptr << 2));
227 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev,
228 							     ring->me,
229 							     regSDMA0_SDMA_QUEUE0_RB_WPTR),
230 				lower_32_bits(ring->wptr << 2));
231 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev,
232 							     ring->me,
233 							     regSDMA0_SDMA_QUEUE0_RB_WPTR_HI),
234 				upper_32_bits(ring->wptr << 2));
235 	}
236 }
237 
238 static void sdma_v7_1_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
239 {
240 	struct amdgpu_sdma_instance *sdma = amdgpu_sdma_get_instance_from_ring(ring);
241 	int i;
242 
243 	for (i = 0; i < count; i++)
244 		if (sdma && sdma->burst_nop && (i == 0))
245 			amdgpu_ring_write(ring, ring->funcs->nop |
246 				SDMA_PKT_NOP_HEADER_COUNT(count - 1));
247 		else
248 			amdgpu_ring_write(ring, ring->funcs->nop);
249 }
250 
251 /**
252  * sdma_v7_1_ring_emit_ib - Schedule an IB on the DMA engine
253  *
254  * @ring: amdgpu ring pointer
255  * @job: job to retrieve vmid from
256  * @ib: IB object to schedule
257  * @flags: unused
258  *
259  * Schedule an IB in the DMA ring.
260  */
261 static void sdma_v7_1_ring_emit_ib(struct amdgpu_ring *ring,
262 				   struct amdgpu_job *job,
263 				   struct amdgpu_ib *ib,
264 				   uint32_t flags)
265 {
266 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
267 	uint64_t csa_mc_addr = amdgpu_sdma_get_csa_mc_addr(ring, vmid);
268 
269 	/* An IB packet must end on a 8 DW boundary--the next dword
270 	 * must be on a 8-dword boundary. Our IB packet below is 6
271 	 * dwords long, thus add x number of NOPs, such that, in
272 	 * modular arithmetic,
273 	 * wptr + 6 + x = 8k, k >= 0, which in C is,
274 	 * (wptr + 6 + x) % 8 = 0.
275 	 * The expression below, is a solution of x.
276 	 */
277 	sdma_v7_1_ring_insert_nop(ring, (2 - lower_32_bits(ring->wptr)) & 7);
278 
279 	amdgpu_ring_write(ring, SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_INDIRECT) |
280 			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
281 	/* base must be 32 byte aligned */
282 	amdgpu_ring_write(ring, lower_32_bits(ib->gpu_addr) & 0xffffffe0);
283 	amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr));
284 	amdgpu_ring_write(ring, ib->length_dw);
285 	amdgpu_ring_write(ring, lower_32_bits(csa_mc_addr));
286 	amdgpu_ring_write(ring, upper_32_bits(csa_mc_addr));
287 }
288 
289 /**
290  * sdma_v7_1_ring_emit_mem_sync - flush the IB by graphics cache rinse
291  *
292  * @ring: amdgpu ring pointer
293  *
294  * flush the IB by graphics cache rinse.
295  */
296 static void sdma_v7_1_ring_emit_mem_sync(struct amdgpu_ring *ring)
297 {
298 	uint32_t gcr_cntl = SDMA_GCR_GL2_INV | SDMA_GCR_GL2_WB | SDMA_GCR_GLM_INV |
299 		SDMA_GCR_GL1_INV | SDMA_GCR_GLV_INV | SDMA_GCR_GLK_INV |
300 		SDMA_GCR_GLI_INV(1);
301 
302 	/* flush entire cache L0/L1/L2, this can be optimized by performance requirement */
303 	amdgpu_ring_write(ring, SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_GCR_REQ));
304 	amdgpu_ring_write(ring, SDMA_PKT_GCR_REQ_PAYLOAD1_BASE_VA_31_7(0));
305 	amdgpu_ring_write(ring, SDMA_PKT_GCR_REQ_PAYLOAD2_BASE_VA_56_32(0));
306 	amdgpu_ring_write(ring, SDMA_PKT_GCR_REQ_PAYLOAD3_GCR_CONTROL_18_0(gcr_cntl) |
307 			  SDMA_PKT_GCR_REQ_PAYLOAD3_LIMIT_VA_15_7(0));
308 	amdgpu_ring_write(ring, SDMA_PKT_GCR_REQ_PAYLOAD4_LIMIT_VA_47_16(0));
309 	amdgpu_ring_write(ring, SDMA_PKT_GCR_REQ_PAYLOAD5_LIMIT_VA_56_48(0) |
310 			  SDMA_PKT_GCR_REQ_PAYLOAD5_VMID(0));
311 }
312 
313 
314 /**
315  * sdma_v7_1_ring_emit_fence - emit a fence on the DMA ring
316  *
317  * @ring: amdgpu ring pointer
318  * @addr: address
319  * @seq: fence seq number
320  * @flags: fence flags
321  *
322  * Add a DMA fence packet to the ring to write
323  * the fence seq number and DMA trap packet to generate
324  * an interrupt if needed.
325  */
326 static void sdma_v7_1_ring_emit_fence(struct amdgpu_ring *ring, u64 addr, u64 seq,
327 				      unsigned flags)
328 {
329 	bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
330 	/* write the fence */
331 	amdgpu_ring_write(ring, SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_FENCE) |
332 			  SDMA_PKT_FENCE_HEADER_MTYPE(0x3)); /* Ucached(UC) */
333 	/* zero in first two bits */
334 	BUG_ON(addr & 0x3);
335 	amdgpu_ring_write(ring, lower_32_bits(addr));
336 	amdgpu_ring_write(ring, upper_32_bits(addr));
337 	amdgpu_ring_write(ring, lower_32_bits(seq));
338 
339 	/* optionally write high bits as well */
340 	if (write64bit) {
341 		addr += 4;
342 		amdgpu_ring_write(ring, SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_FENCE) |
343 				  SDMA_PKT_FENCE_HEADER_MTYPE(0x3));
344 		/* zero in first two bits */
345 		BUG_ON(addr & 0x3);
346 		amdgpu_ring_write(ring, lower_32_bits(addr));
347 		amdgpu_ring_write(ring, upper_32_bits(addr));
348 		amdgpu_ring_write(ring, upper_32_bits(seq));
349 	}
350 
351 	if (flags & AMDGPU_FENCE_FLAG_INT) {
352 		/* generate an interrupt */
353 		amdgpu_ring_write(ring, SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_TRAP));
354 		amdgpu_ring_write(ring, SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(0));
355 	}
356 }
357 
358 /**
359  * sdma_v7_1_inst_gfx_stop - stop the gfx async dma engines
360  *
361  * @adev: amdgpu_device pointer
362  * @inst_mask: mask of dma engine instances to be disabled
363  *
364  * Stop the gfx async dma ring buffers.
365  */
366 static void sdma_v7_1_inst_gfx_stop(struct amdgpu_device *adev,
367 				    uint32_t inst_mask)
368 {
369 	u32 rb_cntl, ib_cntl;
370 	int i;
371 
372 	for (i = 0; i < NUM_XCC(inst_mask); i++) {
373 		rb_cntl = RREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_CNTL));
374 		rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_SDMA_QUEUE0_RB_CNTL, RB_ENABLE, 0);
375 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_CNTL), rb_cntl);
376 		ib_cntl = RREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_IB_CNTL));
377 		ib_cntl = REG_SET_FIELD(ib_cntl, SDMA0_SDMA_QUEUE0_IB_CNTL, IB_ENABLE, 0);
378 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_IB_CNTL), ib_cntl);
379 	}
380 }
381 
/**
 * sdma_v7_1_inst_rlc_stop - stop the compute async dma engines
 *
 * @adev: amdgpu_device pointer
 * @inst_mask: mask of dma engine instances to be disabled
 *
 * Placeholder for stopping the compute async dma queues; the body is
 * currently empty, so calling it has no effect.
 */
static void sdma_v7_1_inst_rlc_stop(struct amdgpu_device *adev,
				    uint32_t inst_mask)
{
	/* XXX todo */
}
395 
396 /**
397  * sdma_v7_1_inst_ctx_switch_enable - stop the async dma engines context switch
398  *
399  * @adev: amdgpu_device pointer
400  * @enable: enable/disable the DMA MEs context switch.
401  * @inst_mask: mask of dma engine instances to be enabled
402  *
403  * Halt or unhalt the async dma engines context switch.
404  */
405 static void sdma_v7_1_inst_ctx_switch_enable(struct amdgpu_device *adev,
406 					     bool enable, uint32_t inst_mask)
407 {
408 	int i;
409 
410 	for_each_inst(i, inst_mask) {
411 		WREG32_SOC15_IP(GC,
412 			sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_UTCL1_TIMEOUT), 0x80);
413 	}
414 }
415 
416 /**
417  * sdma_v7_1_inst_enable - stop the async dma engines
418  *
419  * @adev: amdgpu_device pointer
420  * @enable: enable/disable the DMA MEs.
421  * @inst_mask: mask of dma engine instances to be enabled
422  *
423  * Halt or unhalt the async dma engines.
424  */
425 static void sdma_v7_1_inst_enable(struct amdgpu_device *adev,
426 				  bool enable, uint32_t inst_mask)
427 {
428 	u32 mcu_cntl;
429 	int i;
430 
431 	if (!enable) {
432 		sdma_v7_1_inst_gfx_stop(adev, inst_mask);
433 		sdma_v7_1_inst_rlc_stop(adev, inst_mask);
434 	}
435 
436 	if (amdgpu_sriov_vf(adev))
437 		return;
438 
439 	for (i = 0; i < NUM_XCC(inst_mask); i++) {
440 		mcu_cntl = RREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_MCU_CNTL));
441 		mcu_cntl = REG_SET_FIELD(mcu_cntl, SDMA0_SDMA_MCU_CNTL, HALT, enable ? 0 : 1);
442 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_MCU_CNTL), mcu_cntl);
443 	}
444 }
445 
446 /**
447  * sdma_v7_1_gfx_resume_instance - start/restart a certain sdma engine
448  *
449  * @adev: amdgpu_device pointer
450  * @i: instance
451  * @restore: used to restore wptr when restart
452  *
453  * Set up the gfx DMA ring buffers and enable them. On restart, we will restore wptr and rptr.
454  * Return 0 for success.
455  */
456 static int sdma_v7_1_gfx_resume_instance(struct amdgpu_device *adev, int i, bool restore)
457 {
458 	struct amdgpu_ring *ring;
459 	u32 rb_cntl, ib_cntl;
460 	u32 rb_bufsz;
461 	u32 doorbell;
462 	u32 doorbell_offset;
463 	u32 temp;
464 	u64 wptr_gpu_addr;
465 	int r;
466 
467 	ring = &adev->sdma.instance[i].ring;
468 
469 	/* Set ring buffer size in dwords */
470 	rb_bufsz = order_base_2(ring->ring_size / 4);
471 	rb_cntl = RREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_CNTL));
472 	rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_SDMA_QUEUE0_RB_CNTL, RB_SIZE, rb_bufsz);
473 #ifdef __BIG_ENDIAN
474 	rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_SDMA_QUEUE0_RB_CNTL, RB_SWAP_ENABLE, 1);
475 	rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_SDMA_QUEUE0_RB_CNTL,
476 				RPTR_WRITEBACK_SWAP_ENABLE, 1);
477 #endif
478 	rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_SDMA_QUEUE0_RB_CNTL, RB_PRIV, 1);
479 	WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_CNTL), rb_cntl);
480 
481 	/* Initialize the ring buffer's read and write pointers */
482 	if (restore) {
483 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_RPTR), lower_32_bits(ring->wptr << 2));
484 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_RPTR_HI), upper_32_bits(ring->wptr << 2));
485 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_WPTR), lower_32_bits(ring->wptr << 2));
486 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_WPTR_HI), upper_32_bits(ring->wptr << 2));
487 	} else {
488 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_RPTR), 0);
489 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_RPTR_HI), 0);
490 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_WPTR), 0);
491 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_WPTR_HI), 0);
492 	}
493 	/* setup the wptr shadow polling */
494 	wptr_gpu_addr = ring->wptr_gpu_addr;
495 	WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_WPTR_POLL_ADDR_LO),
496 	       lower_32_bits(wptr_gpu_addr));
497 	WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_WPTR_POLL_ADDR_HI),
498 	       upper_32_bits(wptr_gpu_addr));
499 
500 	/* set the wb address whether it's enabled or not */
501 	WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_RPTR_ADDR_HI),
502 	       upper_32_bits(ring->rptr_gpu_addr) & 0xFFFFFFFF);
503 	WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_RPTR_ADDR_LO),
504 	       lower_32_bits(ring->rptr_gpu_addr) & 0xFFFFFFFC);
505 
506 	rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_SDMA_QUEUE0_RB_CNTL, RPTR_WRITEBACK_ENABLE, 1);
507 	if (amdgpu_sriov_vf(adev))
508 		rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_SDMA_QUEUE0_RB_CNTL, WPTR_POLL_ENABLE, 1);
509 	else
510 		rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_SDMA_QUEUE0_RB_CNTL, WPTR_POLL_ENABLE, 0);
511 
512 	rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_SDMA_QUEUE0_RB_CNTL, MCU_WPTR_POLL_ENABLE, 1);
513 
514 	WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_BASE), ring->gpu_addr >> 8);
515 	WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_BASE_HI), ring->gpu_addr >> 40);
516 
517 	if (!restore)
518 		ring->wptr = 0;
519 
520 	/* before programing wptr to a less value, need set minor_ptr_update first */
521 	WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_MINOR_PTR_UPDATE), 1);
522 
523 	if (!amdgpu_sriov_vf(adev)) { /* only bare-metal use register write for wptr */
524 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_WPTR), lower_32_bits(ring->wptr) << 2);
525 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_WPTR_HI), upper_32_bits(ring->wptr) << 2);
526 	}
527 
528 	doorbell = RREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_DOORBELL));
529 	doorbell_offset = RREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_DOORBELL_OFFSET));
530 
531 	if (ring->use_doorbell) {
532 		doorbell = REG_SET_FIELD(doorbell, SDMA0_SDMA_QUEUE0_DOORBELL, ENABLE, 1);
533 		doorbell_offset = REG_SET_FIELD(doorbell_offset, SDMA0_SDMA_QUEUE0_DOORBELL_OFFSET,
534 				OFFSET, ring->doorbell_index);
535 	} else {
536 		doorbell = REG_SET_FIELD(doorbell, SDMA0_SDMA_QUEUE0_DOORBELL, ENABLE, 0);
537 	}
538 	WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_DOORBELL), doorbell);
539 	WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_DOORBELL_OFFSET), doorbell_offset);
540 
541 	if (i == 0)
542 		adev->nbio.funcs->sdma_doorbell_range(adev, i, ring->use_doorbell,
543 					      ring->doorbell_index,
544 					      adev->doorbell_index.sdma_doorbell_range * adev->sdma.num_instances);
545 
546 	if (amdgpu_sriov_vf(adev))
547 		sdma_v7_1_ring_set_wptr(ring);
548 
549 	/* set minor_ptr_update to 0 after wptr programed */
550 	WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_MINOR_PTR_UPDATE), 0);
551 
552 	/* Set up sdma hang watchdog */
553 	temp = RREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_WATCHDOG_CNTL));
554 	/* 100ms per unit */
555 	temp = REG_SET_FIELD(temp, SDMA0_SDMA_WATCHDOG_CNTL, QUEUE_HANG_COUNT,
556 			     max(adev->usec_timeout/100000, 1));
557 	WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_WATCHDOG_CNTL), temp);
558 
559 	/* Set up RESP_MODE to non-copy addresses */
560 	temp = RREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_UTCL1_CNTL));
561 	temp = REG_SET_FIELD(temp, SDMA0_SDMA_UTCL1_CNTL, RESP_MODE, 3);
562 	temp = REG_SET_FIELD(temp, SDMA0_SDMA_UTCL1_CNTL, REDO_DELAY, 9);
563 	WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_UTCL1_CNTL), temp);
564 
565 	/* program default cache read and write policy */
566 	temp = RREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_UTCL1_PAGE));
567 	/* clean read policy and write policy bits */
568 	temp &= 0xFF0FFF;
569 	temp |= ((CACHE_READ_POLICY_L2__DEFAULT << 12) |
570 		 (CACHE_WRITE_POLICY_L2__DEFAULT << 14));
571 	WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_UTCL1_PAGE), temp);
572 
573 	if (!amdgpu_sriov_vf(adev)) {
574 		/* unhalt engine */
575 		temp = RREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_MCU_CNTL));
576 		temp = REG_SET_FIELD(temp, SDMA0_SDMA_MCU_CNTL, HALT, 0);
577 		temp = REG_SET_FIELD(temp, SDMA0_SDMA_MCU_CNTL, RESET, 0);
578 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_MCU_CNTL), temp);
579 	}
580 
581 	/* enable DMA RB */
582 	rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_SDMA_QUEUE0_RB_CNTL, RB_ENABLE, 1);
583 	WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_CNTL), rb_cntl);
584 
585 	ib_cntl = RREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_IB_CNTL));
586 	ib_cntl = REG_SET_FIELD(ib_cntl, SDMA0_SDMA_QUEUE0_IB_CNTL, IB_ENABLE, 1);
587 #ifdef __BIG_ENDIAN
588 	ib_cntl = REG_SET_FIELD(ib_cntl, SDMA0_SDMA_QUEUE0_IB_CNTL, IB_SWAP_ENABLE, 1);
589 #endif
590 	/* enable DMA IBs */
591 	WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_IB_CNTL), ib_cntl);
592 	ring->sched.ready = true;
593 
594 	if (amdgpu_sriov_vf(adev)) { /* bare-metal sequence doesn't need below to lines */
595 		sdma_v7_1_inst_ctx_switch_enable(adev, true, i);
596 		sdma_v7_1_inst_enable(adev, true, i);
597 	}
598 
599 	r = amdgpu_ring_test_helper(ring);
600 	if (r)
601 		ring->sched.ready = false;
602 
603 	return r;
604 }
605 
606 /**
607  * sdma_v7_1_inst_gfx_resume - setup and start the async dma engines
608  *
609  * @adev: amdgpu_device pointer
610  * @inst_mask: mask of dma engine instances to be enabled
611  *
612  * Set up the gfx DMA ring buffers and enable them.
613  * Returns 0 for success, error for failure.
614  */
615 static int sdma_v7_1_inst_gfx_resume(struct amdgpu_device *adev,
616 				     uint32_t inst_mask)
617 {
618 	int i, r;
619 
620 	for (i = 0; i < NUM_XCC(inst_mask); i++) {
621 		r = sdma_v7_1_gfx_resume_instance(adev, i, false);
622 		if (r)
623 			return r;
624 	}
625 
626 	return 0;
627 
628 }
629 
/**
 * sdma_v7_1_inst_rlc_resume - setup and start the compute async dma queues
 *
 * @adev: amdgpu_device pointer
 * @inst_mask: mask of dma engine instances to be enabled
 *
 * Placeholder for setting up the compute DMA queues; no hardware is
 * programmed here at present.
 * Always returns 0.
 */
static int sdma_v7_1_inst_rlc_resume(struct amdgpu_device *adev,
				     uint32_t inst_mask)
{
	return 0;
}
644 
645 static void sdma_v7_1_inst_free_ucode_buffer(struct amdgpu_device *adev,
646 					     uint32_t inst_mask)
647 {
648 	int i;
649 
650 	for (i = 0; i < NUM_XCC(inst_mask); i++) {
651 		amdgpu_bo_free_kernel(&adev->sdma.instance[i].sdma_fw_obj,
652 				      &adev->sdma.instance[i].sdma_fw_gpu_addr,
653 				      (void **)&adev->sdma.instance[i].sdma_fw_ptr);
654 	}
655 }
656 
657 /**
658  * sdma_v7_1_inst_load_microcode - load the sDMA ME ucode
659  *
660  * @adev: amdgpu_device pointer
661  * @inst_mask: mask of dma engine instances to be enabled
662  *
663  * Loads the sDMA0/1 ucode.
664  * Returns 0 for success, -EINVAL if the ucode is not available.
665  */
666 static int sdma_v7_1_inst_load_microcode(struct amdgpu_device *adev,
667 					 uint32_t inst_mask)
668 {
669 	const struct sdma_firmware_header_v3_0 *hdr;
670 	const __le32 *fw_data;
671 	u32 fw_size;
672 	uint32_t tmp, sdma_status, ic_op_cntl;
673 	int i, r, j;
674 
675 	/* halt the MEs */
676 	sdma_v7_1_inst_enable(adev, false, inst_mask);
677 
678 	if (!adev->sdma.instance[0].fw)
679 		return -EINVAL;
680 
681 	hdr = (const struct sdma_firmware_header_v3_0 *)
682 		adev->sdma.instance[0].fw->data;
683 	amdgpu_ucode_print_sdma_hdr(&hdr->header);
684 
685 	fw_data = (const __le32 *)(adev->sdma.instance[0].fw->data +
686 			le32_to_cpu(hdr->ucode_offset_bytes));
687 	fw_size = le32_to_cpu(hdr->ucode_size_bytes);
688 
689 	for (i = 0; i < NUM_XCC(inst_mask); i++) {
690 		r = amdgpu_bo_create_reserved(adev, fw_size,
691 					      PAGE_SIZE,
692 					      AMDGPU_GEM_DOMAIN_VRAM,
693 					      &adev->sdma.instance[i].sdma_fw_obj,
694 					      &adev->sdma.instance[i].sdma_fw_gpu_addr,
695 					      (void **)&adev->sdma.instance[i].sdma_fw_ptr);
696 		if (r) {
697 			dev_err(adev->dev, "(%d) failed to create sdma ucode bo\n", r);
698 			return r;
699 		}
700 
701 		memcpy(adev->sdma.instance[i].sdma_fw_ptr, fw_data, fw_size);
702 
703 		amdgpu_bo_kunmap(adev->sdma.instance[i].sdma_fw_obj);
704 		amdgpu_bo_unreserve(adev->sdma.instance[i].sdma_fw_obj);
705 
706 		tmp = RREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_IC_CNTL));
707 		tmp = REG_SET_FIELD(tmp, SDMA0_SDMA_IC_CNTL, GPA, 0);
708 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_IC_CNTL), tmp);
709 
710 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_IC_BASE_LO),
711 			lower_32_bits(adev->sdma.instance[i].sdma_fw_gpu_addr));
712 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_IC_BASE_HI),
713 			upper_32_bits(adev->sdma.instance[i].sdma_fw_gpu_addr));
714 
715 		tmp = RREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_IC_OP_CNTL));
716 		tmp = REG_SET_FIELD(tmp, SDMA0_SDMA_IC_OP_CNTL, PRIME_ICACHE, 1);
717 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_IC_OP_CNTL), tmp);
718 
719 		/* Wait for sdma ucode init complete */
720 		for (j = 0; j < adev->usec_timeout; j++) {
721 			ic_op_cntl = RREG32_SOC15_IP(GC,
722 					sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_IC_OP_CNTL));
723 			sdma_status = RREG32_SOC15_IP(GC,
724 					sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_STATUS_REG));
725 			if ((REG_GET_FIELD(ic_op_cntl, SDMA0_SDMA_IC_OP_CNTL, ICACHE_PRIMED) == 1) &&
726 			    (REG_GET_FIELD(sdma_status, SDMA0_SDMA_STATUS_REG, UCODE_INIT_DONE) == 1))
727 				break;
728 			udelay(1);
729 		}
730 
731 		if (j >= adev->usec_timeout) {
732 			dev_err(adev->dev, "failed to init sdma ucode\n");
733 			return -EINVAL;
734 		}
735 	}
736 
737 	return 0;
738 }
739 
740 static int sdma_v7_1_soft_reset(struct amdgpu_ip_block *ip_block)
741 {
742 	struct amdgpu_device *adev = ip_block->adev;
743 	uint32_t inst_mask;
744 	u32 tmp;
745 	int i;
746 
747 	inst_mask = adev->sdma.sdma_mask;
748 	sdma_v7_1_inst_gfx_stop(adev, inst_mask);
749 
750 	for (i = 0; i < NUM_XCC(inst_mask); i++) {
751 		//tmp = RREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_FREEZE));
752 		//tmp |= SDMA0_SDMA_FREEZE__FREEZE_MASK;
753 		//WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_FREEZE), tmp);
754 		tmp = RREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_MCU_CNTL));
755 		tmp |= SDMA0_SDMA_MCU_CNTL__HALT_MASK;
756 		tmp |= SDMA0_SDMA_MCU_CNTL__RESET_MASK;
757 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_MCU_CNTL), tmp);
758 
759 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_PREEMPT), 0);
760 
761 		udelay(100);
762 
763 		tmp = GRBM_SOFT_RESET__SOFT_RESET_SDMA0_MASK << i;
764 		WREG32_SOC15(GC, 0, regGRBM_SOFT_RESET, tmp);
765 		tmp = RREG32_SOC15(GC, 0, regGRBM_SOFT_RESET);
766 
767 		udelay(100);
768 
769 		WREG32_SOC15(GC, 0, regGRBM_SOFT_RESET, 0);
770 		tmp = RREG32_SOC15(GC, 0, regGRBM_SOFT_RESET);
771 
772 		udelay(100);
773 	}
774 
775 	return sdma_v7_1_inst_start(adev, inst_mask);
776 }
777 
778 static bool sdma_v7_1_check_soft_reset(struct amdgpu_ip_block *ip_block)
779 {
780 	struct amdgpu_device *adev = ip_block->adev;
781 	struct amdgpu_ring *ring;
782 	int i, r;
783 	long tmo = msecs_to_jiffies(1000);
784 
785 	for (i = 0; i < adev->sdma.num_instances; i++) {
786 		ring = &adev->sdma.instance[i].ring;
787 		r = amdgpu_ring_test_ib(ring, tmo);
788 		if (r)
789 			return true;
790 	}
791 
792 	return false;
793 }
794 
795 static int sdma_v7_1_reset_queue(struct amdgpu_ring *ring,
796 				 unsigned int vmid,
797 				 struct amdgpu_fence *timedout_fence)
798 {
799 	struct amdgpu_device *adev = ring->adev;
800 	int r;
801 
802 	if (ring->me >= adev->sdma.num_instances) {
803 		dev_err(adev->dev, "sdma instance not found\n");
804 		return -EINVAL;
805 	}
806 
807 	amdgpu_ring_reset_helper_begin(ring, timedout_fence);
808 
809 	r = amdgpu_mes_reset_legacy_queue(adev, ring, vmid, true, 0);
810 	if (r)
811 		return r;
812 
813 	r = sdma_v7_1_gfx_resume_instance(adev, ring->me, true);
814 	if (r)
815 		return r;
816 
817 	return amdgpu_ring_reset_helper_end(ring, timedout_fence);
818 }
819 
820 /**
821  * sdma_v7_1_inst_start - setup and start the async dma engines
822  *
823  * @adev: amdgpu_device pointer
824  * @inst_mask: mask of dma engine instances to be enabled
825  *
826  * Set up the DMA engines and enable them.
827  * Returns 0 for success, error for failure.
828  */
829 static int sdma_v7_1_inst_start(struct amdgpu_device *adev,
830 				uint32_t inst_mask)
831 {
832 	int r = 0;
833 
834 	if (amdgpu_sriov_vf(adev)) {
835 		sdma_v7_1_inst_ctx_switch_enable(adev, false, inst_mask);
836 		sdma_v7_1_inst_enable(adev, false, inst_mask);
837 
838 		/* set RB registers */
839 		r = sdma_v7_1_inst_gfx_resume(adev, inst_mask);
840 		return r;
841 	}
842 
843 	if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT) {
844 		r = sdma_v7_1_inst_load_microcode(adev, inst_mask);
845 		if (r) {
846 			sdma_v7_1_inst_free_ucode_buffer(adev, inst_mask);
847 			return r;
848 		}
849 
850 		if (amdgpu_emu_mode == 1)
851 			msleep(1000);
852 	}
853 
854 	/* unhalt the MEs */
855 	sdma_v7_1_inst_enable(adev, true, inst_mask);
856 	/* enable sdma ring preemption */
857 	sdma_v7_1_inst_ctx_switch_enable(adev, true, inst_mask);
858 
859 	/* start the gfx rings and rlc compute queues */
860 	r = sdma_v7_1_inst_gfx_resume(adev, inst_mask);
861 	if (r)
862 		return r;
863 	r = sdma_v7_1_inst_rlc_resume(adev, inst_mask);
864 
865 	return r;
866 }
867 
868 static int sdma_v7_1_mqd_init(struct amdgpu_device *adev, void *mqd,
869 			      struct amdgpu_mqd_prop *prop)
870 {
871 	struct v12_sdma_mqd *m = mqd;
872 	uint64_t wb_gpu_addr;
873 
874 	m->sdmax_rlcx_rb_cntl =
875 		order_base_2(prop->queue_size / 4) << SDMA0_SDMA_QUEUE0_RB_CNTL__RB_SIZE__SHIFT |
876 		1 << SDMA0_SDMA_QUEUE0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT |
877 		4 << SDMA0_SDMA_QUEUE0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT |
878 		1 << SDMA0_SDMA_QUEUE0_RB_CNTL__MCU_WPTR_POLL_ENABLE__SHIFT;
879 
880 	m->sdmax_rlcx_rb_base = lower_32_bits(prop->hqd_base_gpu_addr >> 8);
881 	m->sdmax_rlcx_rb_base_hi = upper_32_bits(prop->hqd_base_gpu_addr >> 8);
882 
883 	wb_gpu_addr = prop->wptr_gpu_addr;
884 	m->sdmax_rlcx_rb_wptr_poll_addr_lo = lower_32_bits(wb_gpu_addr);
885 	m->sdmax_rlcx_rb_wptr_poll_addr_hi = upper_32_bits(wb_gpu_addr);
886 
887 	wb_gpu_addr = prop->rptr_gpu_addr;
888 	m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits(wb_gpu_addr);
889 	m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits(wb_gpu_addr);
890 
891 	m->sdmax_rlcx_ib_cntl = RREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, 0,
892 							regSDMA0_SDMA_QUEUE0_IB_CNTL));
893 
894 	m->sdmax_rlcx_doorbell_offset =
895 		prop->doorbell_index << SDMA0_SDMA_QUEUE0_DOORBELL_OFFSET__OFFSET__SHIFT;
896 
897 	m->sdmax_rlcx_doorbell = REG_SET_FIELD(0, SDMA0_SDMA_QUEUE0_DOORBELL, ENABLE, 1);
898 
899 	m->sdmax_rlcx_doorbell_log = 0;
900 	m->sdmax_rlcx_rb_aql_cntl = 0x4000;	//regSDMA0_SDMA_QUEUE0_RB_AQL_CNTL_DEFAULT;
901 	m->sdmax_rlcx_dummy_reg = 0xf;	//regSDMA0_SDMA_QUEUE0_DUMMY_REG_DEFAULT;
902 
903 	m->sdmax_rlcx_csa_addr_lo = lower_32_bits(prop->csa_addr);
904 	m->sdmax_rlcx_csa_addr_hi = upper_32_bits(prop->csa_addr);
905 
906 	return 0;
907 }
908 
909 static void sdma_v7_1_set_mqd_funcs(struct amdgpu_device *adev)
910 {
911 	adev->mqds[AMDGPU_HW_IP_DMA].mqd_size = sizeof(struct v12_sdma_mqd);
912 	adev->mqds[AMDGPU_HW_IP_DMA].init_mqd = sdma_v7_1_mqd_init;
913 }
914 
915 /**
916  * sdma_v7_1_ring_test_ring - simple async dma engine test
917  *
918  * @ring: amdgpu_ring structure holding ring information
919  *
920  * Test the DMA engine by writing using it to write an
921  * value to memory.
922  * Returns 0 for success, error for failure.
923  */
924 static int sdma_v7_1_ring_test_ring(struct amdgpu_ring *ring)
925 {
926 	struct amdgpu_device *adev = ring->adev;
927 	unsigned i;
928 	unsigned index;
929 	int r;
930 	u32 tmp;
931 	u64 gpu_addr;
932 
933 	tmp = 0xCAFEDEAD;
934 
935 	r = amdgpu_device_wb_get(adev, &index);
936 	if (r) {
937 		dev_err(adev->dev, "(%d) failed to allocate wb slot\n", r);
938 		return r;
939 	}
940 
941 	gpu_addr = adev->wb.gpu_addr + (index * 4);
942 	adev->wb.wb[index] = cpu_to_le32(tmp);
943 
944 	r = amdgpu_ring_alloc(ring, 5);
945 	if (r) {
946 		DRM_ERROR("amdgpu: dma failed to lock ring %d (%d).\n", ring->idx, r);
947 		amdgpu_device_wb_free(adev, index);
948 		return r;
949 	}
950 
951 	amdgpu_ring_write(ring, SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_WRITE) |
952 			  SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(SDMA_SUBOP_WRITE_LINEAR));
953 	amdgpu_ring_write(ring, lower_32_bits(gpu_addr));
954 	amdgpu_ring_write(ring, upper_32_bits(gpu_addr));
955 	amdgpu_ring_write(ring, SDMA_PKT_WRITE_UNTILED_DW_3_COUNT(0));
956 	amdgpu_ring_write(ring, 0xDEADBEEF);
957 	amdgpu_ring_commit(ring);
958 
959 	for (i = 0; i < adev->usec_timeout; i++) {
960 		tmp = le32_to_cpu(adev->wb.wb[index]);
961 		if (tmp == 0xDEADBEEF)
962 			break;
963 		if (amdgpu_emu_mode == 1)
964 			msleep(1);
965 		else
966 			udelay(1);
967 	}
968 
969 	if (i >= adev->usec_timeout)
970 		r = -ETIMEDOUT;
971 
972 	amdgpu_device_wb_free(adev, index);
973 
974 	return r;
975 }
976 
977 /**
978  * sdma_v7_1_ring_test_ib - test an IB on the DMA engine
979  *
980  * @ring: amdgpu_ring structure holding ring information
981  * @timeout: timeout value in jiffies, or MAX_SCHEDULE_TIMEOUT
982  *
983  * Test a simple IB in the DMA ring.
984  * Returns 0 on success, error on failure.
985  */
986 static int sdma_v7_1_ring_test_ib(struct amdgpu_ring *ring, long timeout)
987 {
988 	struct amdgpu_device *adev = ring->adev;
989 	struct amdgpu_ib ib;
990 	struct dma_fence *f = NULL;
991 	unsigned index;
992 	long r;
993 	u32 tmp = 0;
994 	u64 gpu_addr;
995 
996 	tmp = 0xCAFEDEAD;
997 	memset(&ib, 0, sizeof(ib));
998 
999 	r = amdgpu_device_wb_get(adev, &index);
1000 	if (r) {
1001 		dev_err(adev->dev, "(%ld) failed to allocate wb slot\n", r);
1002 		return r;
1003 	}
1004 
1005 	gpu_addr = adev->wb.gpu_addr + (index * 4);
1006 	adev->wb.wb[index] = cpu_to_le32(tmp);
1007 
1008 	r = amdgpu_ib_get(adev, NULL, 256, AMDGPU_IB_POOL_DIRECT, &ib);
1009 	if (r) {
1010 		DRM_ERROR("amdgpu: failed to get ib (%ld).\n", r);
1011 		goto err0;
1012 	}
1013 
1014 	ib.ptr[0] = SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_WRITE) |
1015 		SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(SDMA_SUBOP_WRITE_LINEAR);
1016 	ib.ptr[1] = lower_32_bits(gpu_addr);
1017 	ib.ptr[2] = upper_32_bits(gpu_addr);
1018 	ib.ptr[3] = SDMA_PKT_WRITE_UNTILED_DW_3_COUNT(0);
1019 	ib.ptr[4] = 0xDEADBEEF;
1020 	ib.ptr[5] = SDMA_PKT_NOP_HEADER_OP(SDMA_OP_NOP);
1021 	ib.ptr[6] = SDMA_PKT_NOP_HEADER_OP(SDMA_OP_NOP);
1022 	ib.ptr[7] = SDMA_PKT_NOP_HEADER_OP(SDMA_OP_NOP);
1023 	ib.length_dw = 8;
1024 
1025 	r = amdgpu_ib_schedule(ring, 1, &ib, NULL, &f);
1026 	if (r)
1027 		goto err1;
1028 
1029 	r = dma_fence_wait_timeout(f, false, timeout);
1030 	if (r == 0) {
1031 		DRM_ERROR("amdgpu: IB test timed out\n");
1032 		r = -ETIMEDOUT;
1033 		goto err1;
1034 	} else if (r < 0) {
1035 		DRM_ERROR("amdgpu: fence wait failed (%ld).\n", r);
1036 		goto err1;
1037 	}
1038 
1039 	tmp = le32_to_cpu(adev->wb.wb[index]);
1040 
1041 	if (tmp == 0xDEADBEEF)
1042 		r = 0;
1043 	else
1044 		r = -EINVAL;
1045 
1046 err1:
1047 	amdgpu_ib_free(&ib, NULL);
1048 	dma_fence_put(f);
1049 err0:
1050 	amdgpu_device_wb_free(adev, index);
1051 	return r;
1052 }
1053 
1054 
1055 /**
1056  * sdma_v7_1_vm_copy_pte - update PTEs by copying them from the GART
1057  *
1058  * @ib: indirect buffer to fill with commands
1059  * @pe: addr of the page entry
1060  * @src: src addr to copy from
1061  * @count: number of page entries to update
1062  *
1063  * Update PTEs by copying them from the GART using sDMA.
1064  */
1065 static void sdma_v7_1_vm_copy_pte(struct amdgpu_ib *ib,
1066 				  uint64_t pe, uint64_t src,
1067 				  unsigned count)
1068 {
1069 	unsigned bytes = count * 8;
1070 
1071 	ib->ptr[ib->length_dw++] = SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_COPY) |
1072 		SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(SDMA_SUBOP_COPY_LINEAR);
1073 
1074 	ib->ptr[ib->length_dw++] = bytes - 1;
1075 	ib->ptr[ib->length_dw++] = 0; /* src/dst endian swap */
1076 	ib->ptr[ib->length_dw++] = lower_32_bits(src);
1077 	ib->ptr[ib->length_dw++] = upper_32_bits(src);
1078 	ib->ptr[ib->length_dw++] = lower_32_bits(pe);
1079 	ib->ptr[ib->length_dw++] = upper_32_bits(pe);
1080 
1081 }
1082 
1083 /**
1084  * sdma_v7_1_vm_write_pte - update PTEs by writing them manually
1085  *
1086  * @ib: indirect buffer to fill with commands
1087  * @pe: addr of the page entry
1088  * @value: dst addr to write into pe
1089  * @count: number of page entries to update
1090  * @incr: increase next addr by incr bytes
1091  *
1092  * Update PTEs by writing them manually using sDMA.
1093  */
1094 static void sdma_v7_1_vm_write_pte(struct amdgpu_ib *ib, uint64_t pe,
1095 				   uint64_t value, unsigned count,
1096 				   uint32_t incr)
1097 {
1098 	unsigned ndw = count * 2;
1099 
1100 	ib->ptr[ib->length_dw++] = SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_WRITE) |
1101 		SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(SDMA_SUBOP_WRITE_LINEAR);
1102 	ib->ptr[ib->length_dw++] = lower_32_bits(pe);
1103 	ib->ptr[ib->length_dw++] = upper_32_bits(pe);
1104 	ib->ptr[ib->length_dw++] = ndw - 1;
1105 	for (; ndw > 0; ndw -= 2) {
1106 		ib->ptr[ib->length_dw++] = lower_32_bits(value);
1107 		ib->ptr[ib->length_dw++] = upper_32_bits(value);
1108 		value += incr;
1109 	}
1110 }
1111 
1112 /**
1113  * sdma_v7_1_vm_set_pte_pde - update the page tables using sDMA
1114  *
1115  * @ib: indirect buffer to fill with commands
1116  * @pe: addr of the page entry
1117  * @addr: dst addr to write into pe
1118  * @count: number of page entries to update
1119  * @incr: increase next addr by incr bytes
1120  * @flags: access flags
1121  *
1122  * Update the page tables using sDMA.
1123  */
1124 static void sdma_v7_1_vm_set_pte_pde(struct amdgpu_ib *ib,
1125 				     uint64_t pe,
1126 				     uint64_t addr, unsigned count,
1127 				     uint32_t incr, uint64_t flags)
1128 {
1129 	/* for physically contiguous pages (vram) */
1130 	u32 header = SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_PTEPDE);
1131 
1132 	if (amdgpu_mtype_local)
1133 		header |= SDMA_PKT_PTEPDE_COPY_HEADER_MTYPE(0x3);
1134 	else
1135 		header |= (SDMA_PKT_PTEPDE_COPY_HEADER_MTYPE(0x2) |
1136 			   SDMA_PKT_PTEPDE_COPY_HEADER_SNOOP(0x1) |
1137 			   SDMA_PKT_PTEPDE_COPY_HEADER_SCOPE(0x3));
1138 
1139 	ib->ptr[ib->length_dw++] = header;
1140 	ib->ptr[ib->length_dw++] = lower_32_bits(pe); /* dst addr */
1141 	ib->ptr[ib->length_dw++] = upper_32_bits(pe);
1142 	ib->ptr[ib->length_dw++] = lower_32_bits(flags); /* mask */
1143 	ib->ptr[ib->length_dw++] = upper_32_bits(flags);
1144 	ib->ptr[ib->length_dw++] = lower_32_bits(addr); /* value */
1145 	ib->ptr[ib->length_dw++] = upper_32_bits(addr);
1146 	ib->ptr[ib->length_dw++] = incr; /* increment size */
1147 	ib->ptr[ib->length_dw++] = 0;
1148 	ib->ptr[ib->length_dw++] = count - 1; /* number of entries */
1149 }
1150 
1151 /**
1152  * sdma_v7_1_ring_pad_ib - pad the IB
1153  *
1154  * @ring: amdgpu ring pointer
1155  * @ib: indirect buffer to fill with padding
1156  *
1157  * Pad the IB with NOPs to a boundary multiple of 8.
1158  */
1159 static void sdma_v7_1_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
1160 {
1161 	struct amdgpu_sdma_instance *sdma = amdgpu_sdma_get_instance_from_ring(ring);
1162 	u32 pad_count;
1163 	int i;
1164 
1165 	pad_count = (-ib->length_dw) & 0x7;
1166 	for (i = 0; i < pad_count; i++)
1167 		if (sdma && sdma->burst_nop && (i == 0))
1168 			ib->ptr[ib->length_dw++] =
1169 				SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_NOP) |
1170 				SDMA_PKT_NOP_HEADER_COUNT(pad_count - 1);
1171 		else
1172 			ib->ptr[ib->length_dw++] =
1173 				SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_NOP);
1174 }
1175 
1176 /**
1177  * sdma_v7_1_ring_emit_pipeline_sync - sync the pipeline
1178  *
1179  * @ring: amdgpu_ring pointer
1180  *
1181  * Make sure all previous operations are completed (CIK).
1182  */
1183 static void sdma_v7_1_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
1184 {
1185 	uint32_t seq = ring->fence_drv.sync_seq;
1186 	uint64_t addr = ring->fence_drv.gpu_addr;
1187 
1188 	/* wait for idle */
1189 	amdgpu_ring_write(ring, SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_POLL_REGMEM) |
1190 			  SDMA_PKT_POLL_REGMEM_HEADER_FUNC(3) | /* equal */
1191 			  SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1));
1192 	amdgpu_ring_write(ring, addr & 0xfffffffc);
1193 	amdgpu_ring_write(ring, upper_32_bits(addr) & 0xffffffff);
1194 	amdgpu_ring_write(ring, seq); /* reference */
1195 	amdgpu_ring_write(ring, 0xffffffff); /* mask */
1196 	amdgpu_ring_write(ring, SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff) |
1197 			  SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(4)); /* retry count, poll interval */
1198 }
1199 
1200 /**
1201  * sdma_v7_1_ring_emit_vm_flush - vm flush using sDMA
1202  *
1203  * @ring: amdgpu_ring pointer
1204  * @vmid: vmid number to use
1205  * @pd_addr: address
1206  *
1207  * Update the page table base and flush the VM TLB
1208  * using sDMA.
1209  */
1210 static void sdma_v7_1_ring_emit_vm_flush(struct amdgpu_ring *ring,
1211 					 unsigned vmid, uint64_t pd_addr)
1212 {
1213 	amdgpu_gmc_emit_flush_gpu_tlb(ring, vmid, pd_addr);
1214 }
1215 
1216 static void sdma_v7_1_ring_emit_wreg(struct amdgpu_ring *ring,
1217 				     uint32_t reg, uint32_t val)
1218 {
1219 	/* SRBM WRITE command will not support on sdma v7.
1220 	 * Use Register WRITE command instead, which OPCODE is same as SRBM WRITE
1221 	 */
1222 	amdgpu_ring_write(ring, SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_SRBM_WRITE));
1223 	amdgpu_ring_write(ring, reg << 2);
1224 	amdgpu_ring_write(ring, val);
1225 }
1226 
1227 static void sdma_v7_1_ring_emit_reg_wait(struct amdgpu_ring *ring, uint32_t reg,
1228 					 uint32_t val, uint32_t mask)
1229 {
1230 	amdgpu_ring_write(ring, SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_POLL_REGMEM) |
1231 			  SDMA_PKT_POLL_REGMEM_HEADER_FUNC(3)); /* equal */
1232 	amdgpu_ring_write(ring, reg << 2);
1233 	amdgpu_ring_write(ring, 0);
1234 	amdgpu_ring_write(ring, val); /* reference */
1235 	amdgpu_ring_write(ring, mask); /* mask */
1236 	amdgpu_ring_write(ring, SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff) |
1237 			  SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(10));
1238 }
1239 
1240 static void sdma_v7_1_ring_emit_reg_write_reg_wait(struct amdgpu_ring *ring,
1241 						   uint32_t reg0, uint32_t reg1,
1242 						   uint32_t ref, uint32_t mask)
1243 {
1244 	amdgpu_ring_emit_wreg(ring, reg0, ref);
1245 	/* wait for a cycle to reset vm_inv_eng*_ack */
1246 	amdgpu_ring_emit_reg_wait(ring, reg0, 0, 0);
1247 	amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask);
1248 }
1249 
1250 static int sdma_v7_1_early_init(struct amdgpu_ip_block *ip_block)
1251 {
1252 	struct amdgpu_device *adev = ip_block->adev;
1253 	int r;
1254 
1255 	r = amdgpu_sdma_init_microcode(adev, 0, true);
1256 	if (r) {
1257 		DRM_ERROR("Failed to init sdma firmware!\n");
1258 		return r;
1259 	}
1260 
1261 	sdma_v7_1_set_ring_funcs(adev);
1262 	sdma_v7_1_set_buffer_funcs(adev);
1263 	sdma_v7_1_set_vm_pte_funcs(adev);
1264 	sdma_v7_1_set_irq_funcs(adev);
1265 	sdma_v7_1_set_mqd_funcs(adev);
1266 
1267 	return 0;
1268 }
1269 
1270 static int sdma_v7_1_sw_init(struct amdgpu_ip_block *ip_block)
1271 {
1272 	struct amdgpu_ring *ring;
1273 	int r, i;
1274 	struct amdgpu_device *adev = ip_block->adev;
1275 	uint32_t reg_count = ARRAY_SIZE(sdma_reg_list_7_1);
1276 	uint32_t *ptr;
1277 	u32 xcc_id;
1278 
1279 	/* SDMA trap event */
1280 	r = amdgpu_irq_add_id(adev, SOC_V1_0_IH_CLIENTID_GFX,
1281 			      GFX_12_1_0__SRCID__SDMA_TRAP,
1282 			      &adev->sdma.trap_irq);
1283 	if (r)
1284 		return r;
1285 
1286 	for (i = 0; i < adev->sdma.num_instances; i++) {
1287 		ring = &adev->sdma.instance[i].ring;
1288 		ring->ring_obj = NULL;
1289 		ring->use_doorbell = true;
1290 		ring->me = i;
1291 
1292 		for (xcc_id = 0; xcc_id < fls(adev->gfx.xcc_mask); xcc_id++) {
1293 			if (adev->sdma.instance[i].xcc_id == GET_INST(GC, xcc_id))
1294 				break;
1295 		}
1296 
1297 		DRM_DEBUG("SDMA%d.%d use_doorbell being set to: [%s]\n",
1298 				xcc_id, GET_INST(SDMA0, i) % adev->sdma.num_inst_per_xcc,
1299 				ring->use_doorbell?"true":"false");
1300 
1301 		ring->doorbell_index =
1302 			(adev->doorbell_index.sdma_engine[i] << 1); // get DWORD offset
1303 
1304 		ring->vm_hub = AMDGPU_GFXHUB(xcc_id);
1305 		sprintf(ring->name, "sdma%d.%d", xcc_id,
1306 				GET_INST(SDMA0, i) % adev->sdma.num_inst_per_xcc);
1307 		r = amdgpu_ring_init(adev, ring, 1024,
1308 				     &adev->sdma.trap_irq,
1309 				     AMDGPU_SDMA_IRQ_INSTANCE0 + i,
1310 				     AMDGPU_RING_PRIO_DEFAULT, NULL);
1311 		if (r)
1312 			return r;
1313 	}
1314 
1315 	adev->sdma.supported_reset =
1316 		amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
1317 	if (!amdgpu_sriov_vf(adev) &&
1318 	    !adev->debug_disable_gpu_ring_reset)
1319 		adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
1320 
1321 	r = amdgpu_sdma_sysfs_reset_mask_init(adev);
1322 	if (r)
1323 		return r;
1324 	/* Allocate memory for SDMA IP Dump buffer */
1325 	ptr = kcalloc(adev->sdma.num_instances * reg_count, sizeof(uint32_t), GFP_KERNEL);
1326 	if (ptr)
1327 		adev->sdma.ip_dump = ptr;
1328 	else
1329 		DRM_ERROR("Failed to allocated memory for SDMA IP Dump\n");
1330 
1331 #ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ
1332 	adev->userq_funcs[AMDGPU_HW_IP_DMA] = &userq_mes_funcs;
1333 #endif
1334 
1335 	return r;
1336 }
1337 
1338 static int sdma_v7_1_sw_fini(struct amdgpu_ip_block *ip_block)
1339 {
1340 	struct amdgpu_device *adev = ip_block->adev;
1341 	int i;
1342 
1343 	for (i = 0; i < adev->sdma.num_instances; i++)
1344 		amdgpu_ring_fini(&adev->sdma.instance[i].ring);
1345 
1346 	amdgpu_sdma_sysfs_reset_mask_fini(adev);
1347 	amdgpu_sdma_destroy_inst_ctx(adev, true);
1348 
1349 	if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT)
1350 		sdma_v7_1_inst_free_ucode_buffer(adev, adev->sdma.sdma_mask);
1351 
1352 	kfree(adev->sdma.ip_dump);
1353 
1354 	return 0;
1355 }
1356 
1357 static int sdma_v7_1_hw_init(struct amdgpu_ip_block *ip_block)
1358 {
1359 	struct amdgpu_device *adev = ip_block->adev;
1360 
1361 	return sdma_v7_1_inst_start(adev, adev->sdma.sdma_mask);
1362 }
1363 
1364 static int sdma_v7_1_hw_fini(struct amdgpu_ip_block *ip_block)
1365 {
1366 	struct amdgpu_device *adev = ip_block->adev;
1367 
1368 	if (amdgpu_sriov_vf(adev))
1369 		return 0;
1370 
1371 	sdma_v7_1_inst_ctx_switch_enable(adev, false, adev->sdma.sdma_mask);
1372 	sdma_v7_1_inst_enable(adev, false, adev->sdma.sdma_mask);
1373 
1374 	return 0;
1375 }
1376 
/* Suspend is the same operation as hw_fini for this IP block. */
static int sdma_v7_1_suspend(struct amdgpu_ip_block *ip_block)
{
	int ret = sdma_v7_1_hw_fini(ip_block);

	return ret;
}
1381 
/* Resume is the same operation as hw_init for this IP block. */
static int sdma_v7_1_resume(struct amdgpu_ip_block *ip_block)
{
	int ret = sdma_v7_1_hw_init(ip_block);

	return ret;
}
1386 
1387 static bool sdma_v7_1_is_idle(struct amdgpu_ip_block *ip_block)
1388 {
1389 	struct amdgpu_device *adev = ip_block->adev;
1390 	u32 i;
1391 
1392 	for (i = 0; i < adev->sdma.num_instances; i++) {
1393 		u32 tmp = RREG32(sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_STATUS_REG));
1394 
1395 		if (!(tmp & SDMA0_SDMA_STATUS_REG__IDLE_MASK))
1396 			return false;
1397 	}
1398 
1399 	return true;
1400 }
1401 
1402 static int sdma_v7_1_wait_for_idle(struct amdgpu_ip_block *ip_block)
1403 {
1404 	unsigned i, j;
1405 	u32 sdma[AMDGPU_MAX_SDMA_INSTANCES];
1406 	struct amdgpu_device *adev = ip_block->adev;
1407 
1408 	for (i = 0; i < adev->usec_timeout; i++) {
1409 		for (j = 0; j < adev->sdma.num_instances; j++) {
1410 			sdma[j] = RREG32(sdma_v7_1_get_reg_offset(adev,
1411 						j, regSDMA0_SDMA_STATUS_REG));
1412 			if (!(sdma[j] & SDMA0_SDMA_STATUS_REG__IDLE_MASK))
1413 				break;
1414 		}
1415 		if (j == adev->sdma.num_instances)
1416 			return 0;
1417 		udelay(1);
1418 	}
1419 	return -ETIMEDOUT;
1420 }
1421 
1422 static int sdma_v7_1_ring_preempt_ib(struct amdgpu_ring *ring)
1423 {
1424 	int i, r = 0;
1425 	struct amdgpu_device *adev = ring->adev;
1426 	u32 index = 0;
1427 	u64 sdma_gfx_preempt;
1428 
1429 	amdgpu_sdma_get_index_from_ring(ring, &index);
1430 	sdma_gfx_preempt =
1431 		sdma_v7_1_get_reg_offset(adev, index, regSDMA0_SDMA_QUEUE0_PREEMPT);
1432 
1433 	/* assert preemption condition */
1434 	amdgpu_ring_set_preempt_cond_exec(ring, false);
1435 
1436 	/* emit the trailing fence */
1437 	ring->trail_seq += 1;
1438 	r = amdgpu_ring_alloc(ring, 10);
1439 	if (r) {
1440 		DRM_ERROR("ring %d failed to be allocated \n", ring->idx);
1441 		return r;
1442 	}
1443 	sdma_v7_1_ring_emit_fence(ring, ring->trail_fence_gpu_addr,
1444 				  ring->trail_seq, 0);
1445 	amdgpu_ring_commit(ring);
1446 
1447 	/* assert IB preemption */
1448 	WREG32(sdma_gfx_preempt, 1);
1449 
1450 	/* poll the trailing fence */
1451 	for (i = 0; i < adev->usec_timeout; i++) {
1452 		if (ring->trail_seq ==
1453 		    le32_to_cpu(*(ring->trail_fence_cpu_addr)))
1454 			break;
1455 		udelay(1);
1456 	}
1457 
1458 	if (i >= adev->usec_timeout) {
1459 		r = -EINVAL;
1460 		DRM_ERROR("ring %d failed to be preempted\n", ring->idx);
1461 	}
1462 
1463 	/* deassert IB preemption */
1464 	WREG32(sdma_gfx_preempt, 0);
1465 
1466 	/* deassert the preemption condition */
1467 	amdgpu_ring_set_preempt_cond_exec(ring, true);
1468 	return r;
1469 }
1470 
1471 static int sdma_v7_1_set_trap_irq_state(struct amdgpu_device *adev,
1472 					struct amdgpu_irq_src *source,
1473 					unsigned type,
1474 					enum amdgpu_interrupt_state state)
1475 {
1476 	u32 sdma_cntl;
1477 
1478 	u32 reg_offset = sdma_v7_1_get_reg_offset(adev, type, regSDMA0_SDMA_CNTL);
1479 
1480 	sdma_cntl = RREG32(reg_offset);
1481 	sdma_cntl = REG_SET_FIELD(sdma_cntl, SDMA0_SDMA_CNTL, TRAP_ENABLE,
1482 		       state == AMDGPU_IRQ_STATE_ENABLE ? 1 : 0);
1483 	WREG32(reg_offset, sdma_cntl);
1484 
1485 	return 0;
1486 }
1487 
1488 static int sdma_v7_1_process_trap_irq(struct amdgpu_device *adev,
1489 				      struct amdgpu_irq_src *source,
1490 				      struct amdgpu_iv_entry *entry)
1491 {
1492 	int inst, instances, queue, xcc_id = 0;
1493 	uint32_t mes_queue_id = entry->src_data[0];
1494 
1495 	DRM_DEBUG("IH: SDMA trap\n");
1496 
1497 	if (adev->enable_mes && (mes_queue_id & AMDGPU_FENCE_MES_QUEUE_FLAG)) {
1498 		struct amdgpu_mes_queue *queue;
1499 
1500 		mes_queue_id &= AMDGPU_FENCE_MES_QUEUE_ID_MASK;
1501 
1502 		spin_lock(&adev->mes.queue_id_lock);
1503 		queue = idr_find(&adev->mes.queue_id_idr, mes_queue_id);
1504 		if (queue) {
1505 			DRM_DEBUG("process smda queue id = %d\n", mes_queue_id);
1506 			amdgpu_fence_process(queue->ring);
1507 		}
1508 		spin_unlock(&adev->mes.queue_id_lock);
1509 		return 0;
1510 	}
1511 
1512 	queue = entry->ring_id & 0xf;
1513 	if (adev->gfx.funcs && adev->gfx.funcs->ih_node_to_logical_xcc)
1514 		xcc_id = adev->gfx.funcs->ih_node_to_logical_xcc(adev, entry->node_id);
1515 	else
1516 		dev_warn(adev->dev, "IH: SDMA may get wrong xcc id as gfx function not available\n");
1517 	inst = ((entry->ring_id & 0xf0) >> 4) +
1518 		GET_INST(GC, xcc_id) * adev->sdma.num_inst_per_xcc;
1519 	for (instances = 0; instances < adev->sdma.num_instances; instances++) {
1520 		if (inst == GET_INST(SDMA0, instances))
1521 			break;
1522 	}
1523 	if (instances > adev->sdma.num_instances - 1) {
1524 		DRM_ERROR("IH: wrong ring_ID detected, as wrong sdma instance\n");
1525 		return -EINVAL;
1526 	}
1527 
1528 	switch (entry->client_id) {
1529 	case SOC_V1_0_IH_CLIENTID_GFX:
1530 		switch (queue) {
1531 		case 0:
1532 			amdgpu_fence_process(&adev->sdma.instance[instances].ring);
1533 			break;
1534 		default:
1535 			break;
1536 		}
1537 		break;
1538 	}
1539 	return 0;
1540 }
1541 
/* Illegal instruction interrupt handler: does nothing and returns 0. */
static int
sdma_v7_1_process_illegal_inst_irq(struct amdgpu_device *adev,
				   struct amdgpu_irq_src *source,
				   struct amdgpu_iv_entry *entry)
{
	return 0;
}
1548 
1549 static int sdma_v7_1_set_clockgating_state(struct amdgpu_ip_block *ip_block,
1550 					   enum amd_clockgating_state state)
1551 {
1552 	return 0;
1553 }
1554 
1555 static int sdma_v7_1_set_powergating_state(struct amdgpu_ip_block *ip_block,
1556 					  enum amd_powergating_state state)
1557 {
1558 	return 0;
1559 }
1560 
1561 static void sdma_v7_1_get_clockgating_state(struct amdgpu_ip_block *ip_block,
1562 					    u64 *flags)
1563 {
1564 }
1565 
1566 static void sdma_v7_1_print_ip_state(struct amdgpu_ip_block *ip_block, struct drm_printer *p)
1567 {
1568 	struct amdgpu_device *adev = ip_block->adev;
1569 	int i, j;
1570 	uint32_t reg_count = ARRAY_SIZE(sdma_reg_list_7_1);
1571 	uint32_t instance_offset;
1572 
1573 	if (!adev->sdma.ip_dump)
1574 		return;
1575 
1576 	drm_printf(p, "num_instances:%d\n", adev->sdma.num_instances);
1577 	for (i = 0; i < adev->sdma.num_instances; i++) {
1578 		instance_offset = i * reg_count;
1579 		drm_printf(p, "\nInstance:%d\n", i);
1580 
1581 		for (j = 0; j < reg_count; j++)
1582 			drm_printf(p, "%-50s \t 0x%08x\n", sdma_reg_list_7_1[j].reg_name,
1583 				   adev->sdma.ip_dump[instance_offset + j]);
1584 	}
1585 }
1586 
1587 static void sdma_v7_1_dump_ip_state(struct amdgpu_ip_block *ip_block)
1588 {
1589 	struct amdgpu_device *adev = ip_block->adev;
1590 	int i, j;
1591 	uint32_t instance_offset;
1592 	uint32_t reg_count = ARRAY_SIZE(sdma_reg_list_7_1);
1593 
1594 	if (!adev->sdma.ip_dump)
1595 		return;
1596 
1597 	amdgpu_gfx_off_ctrl(adev, false);
1598 	for (i = 0; i < adev->sdma.num_instances; i++) {
1599 		instance_offset = i * reg_count;
1600 		for (j = 0; j < reg_count; j++)
1601 			adev->sdma.ip_dump[instance_offset + j] =
1602 				RREG32(sdma_v7_1_get_reg_offset(adev, i,
1603 				       sdma_reg_list_7_1[j].reg_offset));
1604 	}
1605 	amdgpu_gfx_off_ctrl(adev, true);
1606 }
1607 
1608 const struct amd_ip_funcs sdma_v7_1_ip_funcs = {
1609 	.name = "sdma_v7_1",
1610 	.early_init = sdma_v7_1_early_init,
1611 	.late_init = NULL,
1612 	.sw_init = sdma_v7_1_sw_init,
1613 	.sw_fini = sdma_v7_1_sw_fini,
1614 	.hw_init = sdma_v7_1_hw_init,
1615 	.hw_fini = sdma_v7_1_hw_fini,
1616 	.suspend = sdma_v7_1_suspend,
1617 	.resume = sdma_v7_1_resume,
1618 	.is_idle = sdma_v7_1_is_idle,
1619 	.wait_for_idle = sdma_v7_1_wait_for_idle,
1620 	.soft_reset = sdma_v7_1_soft_reset,
1621 	.check_soft_reset = sdma_v7_1_check_soft_reset,
1622 	.set_clockgating_state = sdma_v7_1_set_clockgating_state,
1623 	.set_powergating_state = sdma_v7_1_set_powergating_state,
1624 	.get_clockgating_state = sdma_v7_1_get_clockgating_state,
1625 	.dump_ip_state = sdma_v7_1_dump_ip_state,
1626 	.print_ip_state = sdma_v7_1_print_ip_state,
1627 };
1628 
1629 static const struct amdgpu_ring_funcs sdma_v7_1_ring_funcs = {
1630 	.type = AMDGPU_RING_TYPE_SDMA,
1631 	.align_mask = 0xf,
1632 	.nop = SDMA_PKT_NOP_HEADER_OP(SDMA_OP_NOP),
1633 	.support_64bit_ptrs = true,
1634 	.secure_submission_supported = true,
1635 	.get_rptr = sdma_v7_1_ring_get_rptr,
1636 	.get_wptr = sdma_v7_1_ring_get_wptr,
1637 	.set_wptr = sdma_v7_1_ring_set_wptr,
1638 	.emit_frame_size =
1639 		5 + /* sdma_v7_1_ring_init_cond_exec */
1640 		6 + /* sdma_v7_1_ring_emit_pipeline_sync */
1641 		/* sdma_v7_1_ring_emit_vm_flush */
1642 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
1643 		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
1644 		10 + 10 + 10, /* sdma_v7_1_ring_emit_fence x3 for user fence, vm fence */
1645 	.emit_ib_size = 5 + 7 + 6, /* sdma_v7_1_ring_emit_ib */
1646 	.emit_ib = sdma_v7_1_ring_emit_ib,
1647 	.emit_mem_sync = sdma_v7_1_ring_emit_mem_sync,
1648 	.emit_fence = sdma_v7_1_ring_emit_fence,
1649 	.emit_pipeline_sync = sdma_v7_1_ring_emit_pipeline_sync,
1650 	.emit_vm_flush = sdma_v7_1_ring_emit_vm_flush,
1651 	.test_ring = sdma_v7_1_ring_test_ring,
1652 	.test_ib = sdma_v7_1_ring_test_ib,
1653 	.insert_nop = sdma_v7_1_ring_insert_nop,
1654 	.pad_ib = sdma_v7_1_ring_pad_ib,
1655 	.emit_wreg = sdma_v7_1_ring_emit_wreg,
1656 	.emit_reg_wait = sdma_v7_1_ring_emit_reg_wait,
1657 	.emit_reg_write_reg_wait = sdma_v7_1_ring_emit_reg_write_reg_wait,
1658 	.init_cond_exec = sdma_v7_1_ring_init_cond_exec,
1659 	.preempt_ib = sdma_v7_1_ring_preempt_ib,
1660 	.reset = sdma_v7_1_reset_queue,
1661 };
1662 
1663 static void sdma_v7_1_set_ring_funcs(struct amdgpu_device *adev)
1664 {
1665 	int i, dev_inst;
1666 
1667 	for (i = 0; i < adev->sdma.num_instances; i++) {
1668 		adev->sdma.instance[i].ring.funcs = &sdma_v7_1_ring_funcs;
1669 		adev->sdma.instance[i].ring.me = i;
1670 
1671 		dev_inst = GET_INST(SDMA0, i);
1672 		/* XCC to which SDMA belongs depends on physical instance */
1673 		adev->sdma.instance[i].xcc_id =
1674 			dev_inst / adev->sdma.num_inst_per_xcc;
1675 	}
1676 }
1677 
1678 static const struct amdgpu_irq_src_funcs sdma_v7_1_trap_irq_funcs = {
1679 	.set = sdma_v7_1_set_trap_irq_state,
1680 	.process = sdma_v7_1_process_trap_irq,
1681 };
1682 
1683 static const struct amdgpu_irq_src_funcs sdma_v7_1_illegal_inst_irq_funcs = {
1684 	.process = sdma_v7_1_process_illegal_inst_irq,
1685 };
1686 
1687 static void sdma_v7_1_set_irq_funcs(struct amdgpu_device *adev)
1688 {
1689 	adev->sdma.trap_irq.num_types = AMDGPU_SDMA_IRQ_INSTANCE0 +
1690 					adev->sdma.num_instances;
1691 	adev->sdma.trap_irq.funcs = &sdma_v7_1_trap_irq_funcs;
1692 	adev->sdma.illegal_inst_irq.funcs = &sdma_v7_1_illegal_inst_irq_funcs;
1693 }
1694 
1695 /**
1696  * sdma_v7_1_emit_copy_buffer - copy buffer using the sDMA engine
1697  *
1698  * @ib: indirect buffer to fill with commands
1699  * @src_offset: src GPU address
1700  * @dst_offset: dst GPU address
1701  * @byte_count: number of bytes to xfer
1702  * @copy_flags: copy flags for the buffers
1703  *
1704  * Copy GPU buffers using the DMA engine.
1705  * Used by the amdgpu ttm implementation to move pages if
1706  * registered as the asic copy callback.
1707  */
1708 static void sdma_v7_1_emit_copy_buffer(struct amdgpu_ib *ib,
1709 				       uint64_t src_offset,
1710 				       uint64_t dst_offset,
1711 				       uint32_t byte_count,
1712 				       uint32_t copy_flags)
1713 {
1714 	ib->ptr[ib->length_dw++] = SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_COPY) |
1715 		SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(SDMA_SUBOP_COPY_LINEAR) |
1716 		SDMA_PKT_COPY_LINEAR_HEADER_TMZ((copy_flags & AMDGPU_COPY_FLAGS_TMZ) ? 1 : 0);
1717 
1718 	ib->ptr[ib->length_dw++] = byte_count - 1;
1719 	ib->ptr[ib->length_dw++] = 0; /* src/dst endian swap */
1720 	ib->ptr[ib->length_dw++] = lower_32_bits(src_offset);
1721 	ib->ptr[ib->length_dw++] = upper_32_bits(src_offset);
1722 	ib->ptr[ib->length_dw++] = lower_32_bits(dst_offset);
1723 	ib->ptr[ib->length_dw++] = upper_32_bits(dst_offset);
1724 }
1725 
1726 /**
1727  * sdma_v7_1_emit_fill_buffer - fill buffer using the sDMA engine
1728  *
1729  * @ib: indirect buffer to fill
1730  * @src_data: value to write to buffer
1731  * @dst_offset: dst GPU address
1732  * @byte_count: number of bytes to xfer
1733  *
1734  * Fill GPU buffers using the DMA engine.
1735  */
1736 static void sdma_v7_1_emit_fill_buffer(struct amdgpu_ib *ib,
1737 				       uint32_t src_data,
1738 				       uint64_t dst_offset,
1739 				       uint32_t byte_count)
1740 {
1741 	ib->ptr[ib->length_dw++] = SDMA_PKT_CONSTANT_FILL_HEADER_OP(SDMA_OP_CONST_FILL);
1742 	ib->ptr[ib->length_dw++] = lower_32_bits(dst_offset);
1743 	ib->ptr[ib->length_dw++] = upper_32_bits(dst_offset);
1744 	ib->ptr[ib->length_dw++] = src_data;
1745 	ib->ptr[ib->length_dw++] = byte_count - 1;
1746 }
1747 
1748 static const struct amdgpu_buffer_funcs sdma_v7_1_buffer_funcs = {
1749 	.copy_max_bytes = 0x400000,
1750 	.copy_num_dw = 8,
1751 	.emit_copy_buffer = sdma_v7_1_emit_copy_buffer,
1752 	.fill_max_bytes = 0x400000,
1753 	.fill_num_dw = 5,
1754 	.emit_fill_buffer = sdma_v7_1_emit_fill_buffer,
1755 };
1756 
1757 static void sdma_v7_1_set_buffer_funcs(struct amdgpu_device *adev)
1758 {
1759 	adev->mman.buffer_funcs = &sdma_v7_1_buffer_funcs;
1760 	adev->mman.buffer_funcs_ring = &adev->sdma.instance[0].ring;
1761 }
1762 
1763 static const struct amdgpu_vm_pte_funcs sdma_v7_1_vm_pte_funcs = {
1764 	.copy_pte_num_dw = 8,
1765 	.copy_pte = sdma_v7_1_vm_copy_pte,
1766 	.write_pte = sdma_v7_1_vm_write_pte,
1767 	.set_pte_pde = sdma_v7_1_vm_set_pte_pde,
1768 };
1769 
1770 static void sdma_v7_1_set_vm_pte_funcs(struct amdgpu_device *adev)
1771 {
1772 	unsigned i;
1773 
1774 	adev->vm_manager.vm_pte_funcs = &sdma_v7_1_vm_pte_funcs;
1775 	for (i = 0; i < adev->sdma.num_instances; i++) {
1776 		adev->vm_manager.vm_pte_scheds[i] =
1777 			&adev->sdma.instance[i].ring.sched;
1778 	}
1779 	adev->vm_manager.vm_pte_num_scheds = adev->sdma.num_instances;
1780 }
1781 
1782 const struct amdgpu_ip_block_version sdma_v7_1_ip_block = {
1783 	.type = AMD_IP_BLOCK_TYPE_SDMA,
1784 	.major = 7,
1785 	.minor = 1,
1786 	.rev = 0,
1787 	.funcs = &sdma_v7_1_ip_funcs,
1788 };
1789 
1790 static int sdma_v7_1_xcp_resume(void *handle, uint32_t inst_mask)
1791 {
1792 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
1793 	int r;
1794 
1795 	r = sdma_v7_1_inst_start(adev, inst_mask);
1796 
1797 	return r;
1798 }
1799 
1800 static int sdma_v7_1_xcp_suspend(void *handle, uint32_t inst_mask)
1801 {
1802 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
1803 
1804 	sdma_v7_1_inst_ctx_switch_enable(adev, false, inst_mask);
1805 	sdma_v7_1_inst_enable(adev, false, inst_mask);
1806 
1807 	return 0;
1808 }
1809 
1810 struct amdgpu_xcp_ip_funcs sdma_v7_1_xcp_funcs = {
1811 	.suspend = &sdma_v7_1_xcp_suspend,
1812 	.resume = &sdma_v7_1_xcp_resume
1813 };
1814