xref: /linux/drivers/gpu/drm/amd/amdgpu/sdma_v7_1.c (revision 75372d75a4e23783583998ed99d5009d555850da)
1 /*
2  * Copyright 2025 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  *
22  */
23 
24 #include <linux/delay.h>
25 #include <linux/firmware.h>
26 #include <linux/module.h>
27 #include <linux/pci.h>
28 
29 #include "amdgpu.h"
30 #include "amdgpu_ucode.h"
31 #include "amdgpu_trace.h"
32 
33 #include "gc/gc_12_1_0_offset.h"
34 #include "gc/gc_12_1_0_sh_mask.h"
35 #include "ivsrcid/gfx/irqsrcs_gfx_12_1_0.h"
36 
37 #include "soc15_common.h"
38 #include "soc15.h"
39 #include "sdma_v7_1_0_pkt_open.h"
40 #include "nbio_v4_3.h"
41 #include "sdma_common.h"
42 #include "sdma_v7_1.h"
43 #include "v12_structs.h"
44 #include "mes_userqueue.h"
45 #include "soc_v1_0.h"
46 
47 MODULE_FIRMWARE("amdgpu/sdma_7_1_0.bin");
48 
49 #define SDMA1_REG_OFFSET 0x600
50 #define SDMA0_SDMA_IDX_0_END 0x450
51 #define SDMA1_HYP_DEC_REG_OFFSET 0x30
52 
53 static const struct amdgpu_hwip_reg_entry sdma_reg_list_7_1[] = {
54 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_STATUS_REG),
55 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_STATUS1_REG),
56 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_STATUS2_REG),
57 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_STATUS3_REG),
58 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_STATUS4_REG),
59 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_STATUS5_REG),
60 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_STATUS6_REG),
61 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_UCODE_REV),
62 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_RB_RPTR_FETCH_HI),
63 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_RB_RPTR_FETCH),
64 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_UTCL1_RD_STATUS),
65 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_UTCL1_WR_STATUS),
66 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_UTCL1_RD_XNACK0),
67 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_UTCL1_RD_XNACK1),
68 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_UTCL1_WR_XNACK0),
69 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_UTCL1_WR_XNACK1),
70 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE0_RB_CNTL),
71 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE0_RB_RPTR),
72 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE0_RB_RPTR_HI),
73 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE0_RB_WPTR),
74 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE0_RB_WPTR_HI),
75 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE0_IB_OFFSET),
76 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE0_IB_BASE_LO),
77 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE0_IB_BASE_HI),
78 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE0_IB_CNTL),
79 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE0_IB_RPTR),
80 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE0_IB_SUB_REMAIN),
81 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE0_DUMMY_REG),
82 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE_STATUS0),
83 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE1_RB_CNTL),
84 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE1_RB_RPTR),
85 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE1_RB_RPTR_HI),
86 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE1_RB_WPTR),
87 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE1_RB_WPTR_HI),
88 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE1_IB_OFFSET),
89 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE1_IB_BASE_LO),
90 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE1_IB_BASE_HI),
91 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE1_IB_RPTR),
92 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE1_IB_SUB_REMAIN),
93 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE1_DUMMY_REG),
94 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE2_RB_CNTL),
95 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE2_RB_RPTR),
96 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE2_RB_RPTR_HI),
97 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE2_RB_WPTR),
98 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE2_RB_WPTR_HI),
99 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE2_IB_OFFSET),
100 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE2_IB_BASE_LO),
101 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE2_IB_BASE_HI),
102 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE2_IB_RPTR),
103 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE2_IB_SUB_REMAIN),
104 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_QUEUE2_DUMMY_REG),
105 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_INT_STATUS),
106 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_VM_CNTL),
107 	SOC15_REG_ENTRY_STR(GC, 0, regGRBM_STATUS2),
108 	SOC15_REG_ENTRY_STR(GC, 0, regSDMA0_SDMA_CHICKEN_BITS),
109 };
110 
111 static void sdma_v7_1_set_ring_funcs(struct amdgpu_device *adev);
112 static void sdma_v7_1_set_buffer_funcs(struct amdgpu_device *adev);
113 static void sdma_v7_1_set_vm_pte_funcs(struct amdgpu_device *adev);
114 static void sdma_v7_1_set_irq_funcs(struct amdgpu_device *adev);
115 static int sdma_v7_1_inst_start(struct amdgpu_device *adev,
116 				uint32_t inst_mask);
117 
118 static u32 sdma_v7_1_get_reg_offset(struct amdgpu_device *adev, u32 instance, u32 internal_offset)
119 {
120 	u32 base;
121 	u32 dev_inst = GET_INST(SDMA0, instance);
122 	int xcc_id = adev->sdma.instance[instance].xcc_id;
123 	int xcc_inst = dev_inst % adev->sdma.num_inst_per_xcc;
124 
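	/*
	 * Registers at or above SDMA0_SDMA_IDX_0_END live in the second
	 * (hyp-dec) GC register aperture and are strided per in-XCC SDMA
	 * instance by SDMA1_HYP_DEC_REG_OFFSET; registers below it use the
	 * first aperture with an SDMA1_REG_OFFSET stride.
	 */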
125 	if (internal_offset >= SDMA0_SDMA_IDX_0_END) {
126 		base = adev->reg_offset[GC_HWIP][xcc_id][1];
127 		if (xcc_inst != 0)
128 			internal_offset += SDMA1_HYP_DEC_REG_OFFSET * xcc_inst;
129 	} else {
130 		base = adev->reg_offset[GC_HWIP][xcc_id][0];
131 		if (xcc_inst != 0)
132 			internal_offset += SDMA1_REG_OFFSET * xcc_inst;
133 	}
134 
135 	return base + internal_offset;
136 }
137 
138 static unsigned sdma_v7_1_ring_init_cond_exec(struct amdgpu_ring *ring,
139 					      uint64_t addr)
140 {
141 	unsigned ret;
142 
143 	amdgpu_ring_write(ring, SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_COND_EXE));
144 	amdgpu_ring_write(ring, lower_32_bits(addr));
145 	amdgpu_ring_write(ring, upper_32_bits(addr));
146 	amdgpu_ring_write(ring, 1);
147 	/* this is the offset we need to patch later */
148 	ret = ring->wptr & ring->buf_mask;
149 	/* insert dummy here and patch it later */
150 	amdgpu_ring_write(ring, 0);
151 
152 	return ret;
153 }
154 
155 /**
156  * sdma_v7_1_ring_get_rptr - get the current read pointer
157  *
158  * @ring: amdgpu ring pointer
159  *
160  * Get the current rptr from the hardware.
161  */
162 static uint64_t sdma_v7_1_ring_get_rptr(struct amdgpu_ring *ring)
163 {
164 	u64 *rptr;
165 
166 	/* XXX check if swapping is necessary on BE */
167 	rptr = (u64 *)ring->rptr_cpu_addr;
168 
169 	DRM_DEBUG("rptr before shift == 0x%016llx\n", *rptr);
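	/* the writeback rptr is a byte offset; the ring code tracks dwords, hence the >> 2 */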
170 	return ((*rptr) >> 2);
171 }
172 
173 /**
174  * sdma_v7_1_ring_get_wptr - get the current write pointer
175  *
176  * @ring: amdgpu ring pointer
177  *
178  * Get the current wptr from the hardware.
179  */
180 static uint64_t sdma_v7_1_ring_get_wptr(struct amdgpu_ring *ring)
181 {
182 	u64 wptr = 0;
183 
184 	if (ring->use_doorbell) {
185 		/* XXX check if swapping is necessary on BE */
186 		wptr = READ_ONCE(*((u64 *)ring->wptr_cpu_addr));
187 		DRM_DEBUG("wptr/doorbell before shift == 0x%016llx\n", wptr);
188 	}
189 
190 	return wptr >> 2;
191 }
192 
193 /**
194  * sdma_v7_1_ring_set_wptr - commit the write pointer
195  *
196  * @ring: amdgpu ring pointer
197  *
198  * Write the wptr back to the hardware.
199  */
200 static void sdma_v7_1_ring_set_wptr(struct amdgpu_ring *ring)
201 {
202 	struct amdgpu_device *adev = ring->adev;
203 
204 	DRM_DEBUG("Setting write pointer\n");
205 
206 	if (ring->use_doorbell) {
207 		DRM_DEBUG("Using doorbell -- "
208 			  "wptr_offs == 0x%08x "
209 			  "lower_32_bits(ring->wptr) << 2 == 0x%08x "
210 			  "upper_32_bits(ring->wptr) << 2 == 0x%08x\n",
211 			  ring->wptr_offs,
212 			  lower_32_bits(ring->wptr << 2),
213 			  upper_32_bits(ring->wptr << 2));
214 		/* XXX check if swapping is necessary on BE */
215 		atomic64_set((atomic64_t *)ring->wptr_cpu_addr,
216 			     ring->wptr << 2);
217 		DRM_DEBUG("calling WDOORBELL64(0x%08x, 0x%016llx)\n",
218 			  ring->doorbell_index, ring->wptr << 2);
219 		WDOORBELL64(ring->doorbell_index, ring->wptr << 2);
220 	} else {
221 		DRM_DEBUG("Not using doorbell -- "
222 			  "regSDMA%i_GFX_RB_WPTR == 0x%08x "
223 			  "regSDMA%i_GFX_RB_WPTR_HI == 0x%08x\n",
224 			  ring->me,
225 			  lower_32_bits(ring->wptr << 2),
226 			  ring->me,
227 			  upper_32_bits(ring->wptr << 2));
228 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev,
229 							     ring->me,
230 							     regSDMA0_SDMA_QUEUE0_RB_WPTR),
231 				lower_32_bits(ring->wptr << 2));
232 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev,
233 							     ring->me,
234 							     regSDMA0_SDMA_QUEUE0_RB_WPTR_HI),
235 				upper_32_bits(ring->wptr << 2));
236 	}
237 }
238 
239 static void sdma_v7_1_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
240 {
241 	struct amdgpu_sdma_instance *sdma = amdgpu_sdma_get_instance_from_ring(ring);
242 	int i;
243 
244 	for (i = 0; i < count; i++)
245 		if (sdma && sdma->burst_nop && (i == 0))
246 			amdgpu_ring_write(ring, ring->funcs->nop |
247 				SDMA_PKT_NOP_HEADER_COUNT(count - 1));
248 		else
249 			amdgpu_ring_write(ring, ring->funcs->nop);
250 }
251 
252 /**
253  * sdma_v7_1_ring_emit_ib - Schedule an IB on the DMA engine
254  *
255  * @ring: amdgpu ring pointer
256  * @job: job to retrieve vmid from
257  * @ib: IB object to schedule
258  * @flags: unused
259  *
260  * Schedule an IB in the DMA ring.
261  */
262 static void sdma_v7_1_ring_emit_ib(struct amdgpu_ring *ring,
263 				   struct amdgpu_job *job,
264 				   struct amdgpu_ib *ib,
265 				   uint32_t flags)
266 {
267 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
268 	uint64_t csa_mc_addr = amdgpu_sdma_get_csa_mc_addr(ring, vmid);
269 
270 	/* An IB packet must end on an 8-DW boundary--the next dword
271 	 * must be on an 8-dword boundary. Our IB packet below is 6
272 	 * dwords long, so add x NOPs such that, in modular
273 	 * arithmetic,
274 	 * wptr + 6 + x = 8k, k >= 0, which in C is
275 	 * (wptr + 6 + x) % 8 = 0.
276 	 * The expression below is a solution for x.
277 	 */
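	/* e.g. wptr % 8 == 5 gives x = (2 - 5) & 7 = 5, and 5 + 5 + 6 = 16, a multiple of 8 */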
278 	sdma_v7_1_ring_insert_nop(ring, (2 - lower_32_bits(ring->wptr)) & 7);
279 
280 	amdgpu_ring_write(ring, SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_INDIRECT) |
281 			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
282 	/* base must be 32-byte aligned */
283 	amdgpu_ring_write(ring, lower_32_bits(ib->gpu_addr) & 0xffffffe0);
284 	amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr));
285 	amdgpu_ring_write(ring, ib->length_dw);
286 	amdgpu_ring_write(ring, lower_32_bits(csa_mc_addr));
287 	amdgpu_ring_write(ring, upper_32_bits(csa_mc_addr));
288 }
289 
290 /**
291  * sdma_v7_1_ring_emit_mem_sync - flush the IB by graphics cache rinse
292  *
293  * @ring: amdgpu ring pointer
294  *
295  * flush the IB by graphics cache rinse.
296  */
297 static void sdma_v7_1_ring_emit_mem_sync(struct amdgpu_ring *ring)
298 {
299 	uint32_t gcr_cntl = SDMA_GCR_GL2_INV | SDMA_GCR_GL2_WB | SDMA_GCR_GLM_INV |
300 		SDMA_GCR_GL1_INV | SDMA_GCR_GLV_INV | SDMA_GCR_GLK_INV |
301 		SDMA_GCR_GLI_INV(1);
302 
303 	/* flush the entire L0/L1/L2 cache; this can be optimized based on performance requirements */
304 	amdgpu_ring_write(ring, SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_GCR_REQ));
305 	amdgpu_ring_write(ring, SDMA_PKT_GCR_REQ_PAYLOAD1_BASE_VA_31_7(0));
306 	amdgpu_ring_write(ring, SDMA_PKT_GCR_REQ_PAYLOAD2_BASE_VA_56_32(0));
307 	amdgpu_ring_write(ring, SDMA_PKT_GCR_REQ_PAYLOAD3_GCR_CONTROL_18_0(gcr_cntl) |
308 			  SDMA_PKT_GCR_REQ_PAYLOAD3_LIMIT_VA_15_7(0));
309 	amdgpu_ring_write(ring, SDMA_PKT_GCR_REQ_PAYLOAD4_LIMIT_VA_47_16(0));
310 	amdgpu_ring_write(ring, SDMA_PKT_GCR_REQ_PAYLOAD5_LIMIT_VA_56_48(0) |
311 			  SDMA_PKT_GCR_REQ_PAYLOAD5_VMID(0));
312 }
313 
314 
315 /**
316  * sdma_v7_1_ring_emit_fence - emit a fence on the DMA ring
317  *
318  * @ring: amdgpu ring pointer
319  * @addr: address
320  * @seq: fence seq number
321  * @flags: fence flags
322  *
323  * Add a DMA fence packet to the ring to write
324  * the fence seq number and a DMA trap packet to generate
325  * an interrupt if needed.
326  */
327 static void sdma_v7_1_ring_emit_fence(struct amdgpu_ring *ring, u64 addr, u64 seq,
328 				      unsigned flags)
329 {
330 	bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
331 	/* write the fence */
332 	amdgpu_ring_write(ring, SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_FENCE) |
333 			  SDMA_PKT_FENCE_HEADER_MTYPE(0x3)); /* Ucached(UC) */
334 	/* zero in first two bits */
335 	BUG_ON(addr & 0x3);
336 	amdgpu_ring_write(ring, lower_32_bits(addr));
337 	amdgpu_ring_write(ring, upper_32_bits(addr));
338 	amdgpu_ring_write(ring, lower_32_bits(seq));
339 
340 	/* optionally write high bits as well */
341 	if (write64bit) {
342 		addr += 4;
343 		amdgpu_ring_write(ring, SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_FENCE) |
344 				  SDMA_PKT_FENCE_HEADER_MTYPE(0x3));
345 		/* zero in first two bits */
346 		BUG_ON(addr & 0x3);
347 		amdgpu_ring_write(ring, lower_32_bits(addr));
348 		amdgpu_ring_write(ring, upper_32_bits(addr));
349 		amdgpu_ring_write(ring, upper_32_bits(seq));
350 	}
351 
352 	if (flags & AMDGPU_FENCE_FLAG_INT) {
353 		/* generate an interrupt */
354 		amdgpu_ring_write(ring, SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_TRAP));
355 		amdgpu_ring_write(ring, SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(0));
356 	}
357 }
358 
359 /**
360  * sdma_v7_1_inst_gfx_stop - stop the gfx async dma engines
361  *
362  * @adev: amdgpu_device pointer
363  * @inst_mask: mask of dma engine instances to be disabled
364  *
365  * Stop the gfx async dma ring buffers.
366  */
367 static void sdma_v7_1_inst_gfx_stop(struct amdgpu_device *adev,
368 				    uint32_t inst_mask)
369 {
370 	u32 rb_cntl, ib_cntl;
371 	int i;
372 
373 	for_each_inst(i, inst_mask) {
374 		rb_cntl = RREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_CNTL));
375 		rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_SDMA_QUEUE0_RB_CNTL, RB_ENABLE, 0);
376 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_CNTL), rb_cntl);
377 		ib_cntl = RREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_IB_CNTL));
378 		ib_cntl = REG_SET_FIELD(ib_cntl, SDMA0_SDMA_QUEUE0_IB_CNTL, IB_ENABLE, 0);
379 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_IB_CNTL), ib_cntl);
380 	}
381 }
382 
383 /**
384  * sdma_v7_1_inst_rlc_stop - stop the compute async dma engines
385  *
386  * @adev: amdgpu_device pointer
387  * @inst_mask: mask of dma engine instances to be disabled
388  *
389  * Stop the compute async dma queues.
390  */
391 static void sdma_v7_1_inst_rlc_stop(struct amdgpu_device *adev,
392 				    uint32_t inst_mask)
393 {
394 	/* XXX todo */
395 }
396 
397 /**
398  * sdma_v7_1_inst_ctx_switch_enable - enable/disable the async dma engines context switch
399  *
400  * @adev: amdgpu_device pointer
401  * @enable: enable/disable the DMA MEs context switch.
402  * @inst_mask: mask of dma engine instances to be enabled
403  *
404  * Halt or unhalt the async dma engines context switch.
405  */
406 static void sdma_v7_1_inst_ctx_switch_enable(struct amdgpu_device *adev,
407 					     bool enable, uint32_t inst_mask)
408 {
409 	int i;
410 
411 	for_each_inst(i, inst_mask) {
412 		WREG32_SOC15_IP(GC,
413 			sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_UTCL1_TIMEOUT), 0x80);
414 	}
415 }
416 
417 /**
418  * sdma_v7_1_inst_enable - halt or unhalt the async dma engines
419  *
420  * @adev: amdgpu_device pointer
421  * @enable: enable/disable the DMA MEs.
422  * @inst_mask: mask of dma engine instances to be enabled
423  *
424  * Halt or unhalt the async dma engines.
425  */
426 static void sdma_v7_1_inst_enable(struct amdgpu_device *adev,
427 				  bool enable, uint32_t inst_mask)
428 {
429 	u32 mcu_cntl;
430 	int i;
431 
432 	if (!enable) {
433 		sdma_v7_1_inst_gfx_stop(adev, inst_mask);
434 		sdma_v7_1_inst_rlc_stop(adev, inst_mask);
435 	}
436 
437 	if (amdgpu_sriov_vf(adev))
438 		return;
439 
440 	for_each_inst(i, inst_mask) {
441 		mcu_cntl = RREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_MCU_CNTL));
442 		mcu_cntl = REG_SET_FIELD(mcu_cntl, SDMA0_SDMA_MCU_CNTL, HALT, enable ? 0 : 1);
443 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_MCU_CNTL), mcu_cntl);
444 	}
445 }
446 
447 /**
448  * sdma_v7_1_gfx_resume_instance - start/restart a certain sdma engine
449  *
450  * @adev: amdgpu_device pointer
451  * @i: instance
452  * @restore: whether to restore the wptr/rptr on restart
453  *
454  * Set up the gfx DMA ring buffers and enable them. On restart, the wptr and rptr are restored.
455  * Return 0 for success.
456  */
457 static int sdma_v7_1_gfx_resume_instance(struct amdgpu_device *adev, int i, bool restore)
458 {
459 	struct amdgpu_ring *ring;
460 	u32 rb_cntl, ib_cntl;
461 	u32 rb_bufsz;
462 	u32 doorbell;
463 	u32 doorbell_offset;
464 	u32 temp;
465 	u64 wptr_gpu_addr;
466 	int r;
467 
468 	ring = &adev->sdma.instance[i].ring;
469 
470 	/* Set ring buffer size in dwords */
471 	rb_bufsz = order_base_2(ring->ring_size / 4);
472 	rb_cntl = RREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_CNTL));
473 	rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_SDMA_QUEUE0_RB_CNTL, RB_SIZE, rb_bufsz);
474 #ifdef __BIG_ENDIAN
475 	rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_SDMA_QUEUE0_RB_CNTL, RB_SWAP_ENABLE, 1);
476 	rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_SDMA_QUEUE0_RB_CNTL,
477 				RPTR_WRITEBACK_SWAP_ENABLE, 1);
478 #endif
479 	rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_SDMA_QUEUE0_RB_CNTL, RB_PRIV, 1);
480 	WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_CNTL), rb_cntl);
481 
482 	/* Initialize the ring buffer's read and write pointers */
483 	if (restore) {
484 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_RPTR), lower_32_bits(ring->wptr << 2));
485 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_RPTR_HI), upper_32_bits(ring->wptr << 2));
486 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_WPTR), lower_32_bits(ring->wptr << 2));
487 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_WPTR_HI), upper_32_bits(ring->wptr << 2));
488 	} else {
489 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_RPTR), 0);
490 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_RPTR_HI), 0);
491 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_WPTR), 0);
492 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_WPTR_HI), 0);
493 	}
494 	/* setup the wptr shadow polling */
495 	wptr_gpu_addr = ring->wptr_gpu_addr;
496 	WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_WPTR_POLL_ADDR_LO),
497 	       lower_32_bits(wptr_gpu_addr));
498 	WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_WPTR_POLL_ADDR_HI),
499 	       upper_32_bits(wptr_gpu_addr));
500 
501 	/* set the wb address whether it's enabled or not */
502 	WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_RPTR_ADDR_HI),
503 	       upper_32_bits(ring->rptr_gpu_addr) & 0xFFFFFFFF);
504 	WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_RPTR_ADDR_LO),
505 	       lower_32_bits(ring->rptr_gpu_addr) & 0xFFFFFFFC);
506 
507 	rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_SDMA_QUEUE0_RB_CNTL, RPTR_WRITEBACK_ENABLE, 1);
508 	if (amdgpu_sriov_vf(adev))
509 		rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_SDMA_QUEUE0_RB_CNTL, WPTR_POLL_ENABLE, 1);
510 	else
511 		rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_SDMA_QUEUE0_RB_CNTL, WPTR_POLL_ENABLE, 0);
512 
513 	rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_SDMA_QUEUE0_RB_CNTL, MCU_WPTR_POLL_ENABLE, 1);
514 
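	/* the ring base address is programmed in 256-byte units, hence the >> 8 */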
515 	WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_BASE), ring->gpu_addr >> 8);
516 	WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_BASE_HI), ring->gpu_addr >> 40);
517 
518 	if (!restore)
519 		ring->wptr = 0;
520 
521 	/* before programming wptr to a smaller value, minor_ptr_update must be set first */
522 	WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_MINOR_PTR_UPDATE), 1);
523 
524 	if (!amdgpu_sriov_vf(adev)) { /* only bare-metal uses register writes for wptr */
525 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_WPTR), lower_32_bits(ring->wptr) << 2);
526 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_WPTR_HI), upper_32_bits(ring->wptr) << 2);
527 	}
528 
529 	doorbell = RREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_DOORBELL));
530 	doorbell_offset = RREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_DOORBELL_OFFSET));
531 
532 	if (ring->use_doorbell) {
533 		doorbell = REG_SET_FIELD(doorbell, SDMA0_SDMA_QUEUE0_DOORBELL, ENABLE, 1);
534 		doorbell_offset = REG_SET_FIELD(doorbell_offset, SDMA0_SDMA_QUEUE0_DOORBELL_OFFSET,
535 				OFFSET, ring->doorbell_index);
536 	} else {
537 		doorbell = REG_SET_FIELD(doorbell, SDMA0_SDMA_QUEUE0_DOORBELL, ENABLE, 0);
538 	}
539 	WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_DOORBELL), doorbell);
540 	WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_DOORBELL_OFFSET), doorbell_offset);
541 
542 	if (i == 0)
543 		adev->nbio.funcs->sdma_doorbell_range(adev, i, ring->use_doorbell,
544 					      ring->doorbell_index,
545 					      adev->doorbell_index.sdma_doorbell_range * adev->sdma.num_instances);
546 
547 	if (amdgpu_sriov_vf(adev))
548 		sdma_v7_1_ring_set_wptr(ring);
549 
550 	/* set minor_ptr_update to 0 after wptr is programmed */
551 	WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_MINOR_PTR_UPDATE), 0);
552 
553 	/* Set up sdma hang watchdog */
554 	temp = RREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_WATCHDOG_CNTL));
555 	/* QUEUE_HANG_COUNT is in 100 ms units; usec_timeout is in microseconds */
556 	temp = REG_SET_FIELD(temp, SDMA0_SDMA_WATCHDOG_CNTL, QUEUE_HANG_COUNT,
557 			     max(adev->usec_timeout/100000, 1));
558 	WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_WATCHDOG_CNTL), temp);
559 
560 	/* Set up RESP_MODE to non-copy addresses */
561 	temp = RREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_UTCL1_CNTL));
562 	temp = REG_SET_FIELD(temp, SDMA0_SDMA_UTCL1_CNTL, RESP_MODE, 3);
563 	temp = REG_SET_FIELD(temp, SDMA0_SDMA_UTCL1_CNTL, REDO_DELAY, 9);
564 	WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_UTCL1_CNTL), temp);
565 
566 	/* program default cache read and write policy */
567 	temp = RREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_UTCL1_PAGE));
568 	/* clean read policy and write policy bits */
569 	temp &= 0xFF0FFF;
570 	temp |= ((CACHE_READ_POLICY_L2__DEFAULT << 12) |
571 		 (CACHE_WRITE_POLICY_L2__DEFAULT << 14));
572 	WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_UTCL1_PAGE), temp);
573 
574 	if (!amdgpu_sriov_vf(adev)) {
575 		/* unhalt engine */
576 		temp = RREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_MCU_CNTL));
577 		temp = REG_SET_FIELD(temp, SDMA0_SDMA_MCU_CNTL, HALT, 0);
578 		temp = REG_SET_FIELD(temp, SDMA0_SDMA_MCU_CNTL, RESET, 0);
579 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_MCU_CNTL), temp);
580 	}
581 
582 	/* enable DMA RB */
583 	rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_SDMA_QUEUE0_RB_CNTL, RB_ENABLE, 1);
584 	WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_RB_CNTL), rb_cntl);
585 
586 	ib_cntl = RREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_IB_CNTL));
587 	ib_cntl = REG_SET_FIELD(ib_cntl, SDMA0_SDMA_QUEUE0_IB_CNTL, IB_ENABLE, 1);
588 #ifdef __BIG_ENDIAN
589 	ib_cntl = REG_SET_FIELD(ib_cntl, SDMA0_SDMA_QUEUE0_IB_CNTL, IB_SWAP_ENABLE, 1);
590 #endif
591 	/* enable DMA IBs */
592 	WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_IB_CNTL), ib_cntl);
593 	ring->sched.ready = true;
594 
595 	if (amdgpu_sriov_vf(adev)) { /* the bare-metal sequence doesn't need the two lines below */
596 		sdma_v7_1_inst_ctx_switch_enable(adev, true, i);
597 		sdma_v7_1_inst_enable(adev, true, i);
598 	}
599 
600 	r = amdgpu_ring_test_helper(ring);
601 	if (r)
602 		ring->sched.ready = false;
603 
604 	return r;
605 }
606 
607 /**
608  * sdma_v7_1_inst_gfx_resume - setup and start the async dma engines
609  *
610  * @adev: amdgpu_device pointer
611  * @inst_mask: mask of dma engine instances to be enabled
612  *
613  * Set up the gfx DMA ring buffers and enable them.
614  * Returns 0 for success, error for failure.
615  */
616 static int sdma_v7_1_inst_gfx_resume(struct amdgpu_device *adev,
617 				     uint32_t inst_mask)
618 {
619 	int i, r;
620 
621 	for_each_inst(i, inst_mask) {
622 		r = sdma_v7_1_gfx_resume_instance(adev, i, false);
623 		if (r)
624 			return r;
625 	}
626 
627 	return 0;
628 
629 }
630 
631 /**
632  * sdma_v7_1_inst_rlc_resume - setup and start the async dma engines
633  *
634  * @adev: amdgpu_device pointer
635  * @inst_mask: mask of dma engine instances to be enabled
636  *
637  * Set up the compute DMA queues and enable them.
638  * Returns 0 for success, error for failure.
639  */
640 static int sdma_v7_1_inst_rlc_resume(struct amdgpu_device *adev,
641 				     uint32_t inst_mask)
642 {
643 	return 0;
644 }
645 
646 static void sdma_v7_1_inst_free_ucode_buffer(struct amdgpu_device *adev,
647 					     uint32_t inst_mask)
648 {
649 	int i;
650 
651 	for_each_inst(i, inst_mask) {
652 		amdgpu_bo_free_kernel(&adev->sdma.instance[i].sdma_fw_obj,
653 				      &adev->sdma.instance[i].sdma_fw_gpu_addr,
654 				      (void **)&adev->sdma.instance[i].sdma_fw_ptr);
655 	}
656 }
657 
658 /**
659  * sdma_v7_1_inst_load_microcode - load the sDMA ME ucode
660  *
661  * @adev: amdgpu_device pointer
662  * @inst_mask: mask of dma engine instances to be enabled
663  *
664  * Loads the sDMA0/1 ucode.
665  * Returns 0 for success, -EINVAL if the ucode is not available.
666  */
667 static int sdma_v7_1_inst_load_microcode(struct amdgpu_device *adev,
668 					 uint32_t inst_mask)
669 {
670 	const struct sdma_firmware_header_v3_0 *hdr;
671 	const __le32 *fw_data;
672 	u32 fw_size;
673 	uint32_t tmp, sdma_status, ic_op_cntl;
674 	int i, r, j;
675 
676 	/* halt the MEs */
677 	sdma_v7_1_inst_enable(adev, false, inst_mask);
678 
679 	if (!adev->sdma.instance[0].fw)
680 		return -EINVAL;
681 
682 	hdr = (const struct sdma_firmware_header_v3_0 *)
683 		adev->sdma.instance[0].fw->data;
684 	amdgpu_ucode_print_sdma_hdr(&hdr->header);
685 
686 	fw_data = (const __le32 *)(adev->sdma.instance[0].fw->data +
687 			le32_to_cpu(hdr->ucode_offset_bytes));
688 	fw_size = le32_to_cpu(hdr->ucode_size_bytes);
689 
690 	for_each_inst(i, inst_mask) {
691 		r = amdgpu_bo_create_reserved(adev, fw_size,
692 					      PAGE_SIZE,
693 					      AMDGPU_GEM_DOMAIN_VRAM,
694 					      &adev->sdma.instance[i].sdma_fw_obj,
695 					      &adev->sdma.instance[i].sdma_fw_gpu_addr,
696 					      (void **)&adev->sdma.instance[i].sdma_fw_ptr);
697 		if (r) {
698 			dev_err(adev->dev, "(%d) failed to create sdma ucode bo\n", r);
699 			return r;
700 		}
701 
702 		memcpy(adev->sdma.instance[i].sdma_fw_ptr, fw_data, fw_size);
703 
704 		amdgpu_bo_kunmap(adev->sdma.instance[i].sdma_fw_obj);
705 		amdgpu_bo_unreserve(adev->sdma.instance[i].sdma_fw_obj);
706 
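		/* point the SDMA instruction cache at the ucode BO and prime it */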
707 		tmp = RREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_IC_CNTL));
708 		tmp = REG_SET_FIELD(tmp, SDMA0_SDMA_IC_CNTL, GPA, 0);
709 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_IC_CNTL), tmp);
710 
711 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_IC_BASE_LO),
712 			lower_32_bits(adev->sdma.instance[i].sdma_fw_gpu_addr));
713 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_IC_BASE_HI),
714 			upper_32_bits(adev->sdma.instance[i].sdma_fw_gpu_addr));
715 
716 		tmp = RREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_IC_OP_CNTL));
717 		tmp = REG_SET_FIELD(tmp, SDMA0_SDMA_IC_OP_CNTL, PRIME_ICACHE, 1);
718 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_IC_OP_CNTL), tmp);
719 
720 		/* Wait for sdma ucode init complete */
721 		for (j = 0; j < adev->usec_timeout; j++) {
722 			ic_op_cntl = RREG32_SOC15_IP(GC,
723 					sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_IC_OP_CNTL));
724 			sdma_status = RREG32_SOC15_IP(GC,
725 					sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_STATUS_REG));
726 			if ((REG_GET_FIELD(ic_op_cntl, SDMA0_SDMA_IC_OP_CNTL, ICACHE_PRIMED) == 1) &&
727 			    (REG_GET_FIELD(sdma_status, SDMA0_SDMA_STATUS_REG, UCODE_INIT_DONE) == 1))
728 				break;
729 			udelay(1);
730 		}
731 
732 		if (j >= adev->usec_timeout) {
733 			dev_err(adev->dev, "failed to init sdma ucode\n");
734 			return -EINVAL;
735 		}
736 	}
737 
738 	return 0;
739 }
740 
741 static int sdma_v7_1_soft_reset(struct amdgpu_ip_block *ip_block)
742 {
743 	struct amdgpu_device *adev = ip_block->adev;
744 	uint32_t inst_mask;
745 	u32 tmp;
746 	int i;
747 
748 	inst_mask = GENMASK(NUM_XCC(adev->sdma.sdma_mask) - 1, 0);
749 	sdma_v7_1_inst_gfx_stop(adev, inst_mask);
750 
751 	for_each_inst(i, inst_mask) {
752 		//tmp = RREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_FREEZE));
753 		//tmp |= SDMA0_SDMA_FREEZE__FREEZE_MASK;
754 		//WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_FREEZE), tmp);
755 		tmp = RREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_MCU_CNTL));
756 		tmp |= SDMA0_SDMA_MCU_CNTL__HALT_MASK;
757 		tmp |= SDMA0_SDMA_MCU_CNTL__RESET_MASK;
758 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_MCU_CNTL), tmp);
759 
760 		WREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_QUEUE0_PREEMPT), 0);
761 
762 		udelay(100);
763 
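		/* pulse the per-instance SDMA soft-reset bit in GRBM_SOFT_RESET */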
764 		tmp = GRBM_SOFT_RESET__SOFT_RESET_SDMA0_MASK << i;
765 		WREG32_SOC15(GC, 0, regGRBM_SOFT_RESET, tmp);
766 		tmp = RREG32_SOC15(GC, 0, regGRBM_SOFT_RESET);
767 
768 		udelay(100);
769 
770 		WREG32_SOC15(GC, 0, regGRBM_SOFT_RESET, 0);
771 		tmp = RREG32_SOC15(GC, 0, regGRBM_SOFT_RESET);
772 
773 		udelay(100);
774 	}
775 
776 	return sdma_v7_1_inst_start(adev, inst_mask);
777 }
778 
779 static bool sdma_v7_1_check_soft_reset(struct amdgpu_ip_block *ip_block)
780 {
781 	struct amdgpu_device *adev = ip_block->adev;
782 	struct amdgpu_ring *ring;
783 	int i, r;
784 	long tmo = msecs_to_jiffies(1000);
785 
786 	for (i = 0; i < adev->sdma.num_instances; i++) {
787 		ring = &adev->sdma.instance[i].ring;
788 		r = amdgpu_ring_test_ib(ring, tmo);
789 		if (r)
790 			return true;
791 	}
792 
793 	return false;
794 }
795 
796 static int sdma_v7_1_reset_queue(struct amdgpu_ring *ring,
797 				 unsigned int vmid,
798 				 struct amdgpu_fence *timedout_fence)
799 {
800 	struct amdgpu_device *adev = ring->adev;
801 	int r;
802 
803 	if (ring->me >= adev->sdma.num_instances) {
804 		dev_err(adev->dev, "sdma instance not found\n");
805 		return -EINVAL;
806 	}
807 
808 	amdgpu_ring_reset_helper_begin(ring, timedout_fence);
809 
810 	r = amdgpu_mes_reset_legacy_queue(adev, ring, vmid, true, 0);
811 	if (r)
812 		return r;
813 
814 	r = sdma_v7_1_gfx_resume_instance(adev, ring->me, true);
815 	if (r)
816 		return r;
817 
818 	return amdgpu_ring_reset_helper_end(ring, timedout_fence);
819 }
820 
821 /**
822  * sdma_v7_1_inst_start - setup and start the async dma engines
823  *
824  * @adev: amdgpu_device pointer
825  * @inst_mask: mask of dma engine instances to be enabled
826  *
827  * Set up the DMA engines and enable them.
828  * Returns 0 for success, error for failure.
829  */
830 static int sdma_v7_1_inst_start(struct amdgpu_device *adev,
831 				uint32_t inst_mask)
832 {
833 	int r = 0;
834 
835 	if (amdgpu_sriov_vf(adev)) {
836 		sdma_v7_1_inst_ctx_switch_enable(adev, false, inst_mask);
837 		sdma_v7_1_inst_enable(adev, false, inst_mask);
838 
839 		/* set RB registers */
840 		r = sdma_v7_1_inst_gfx_resume(adev, inst_mask);
841 		return r;
842 	}
843 
844 	if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT) {
845 		r = sdma_v7_1_inst_load_microcode(adev, inst_mask);
846 		if (r) {
847 			sdma_v7_1_inst_free_ucode_buffer(adev, inst_mask);
848 			return r;
849 		}
850 
851 		if (amdgpu_emu_mode == 1)
852 			msleep(1000);
853 	}
854 
855 	/* unhalt the MEs */
856 	sdma_v7_1_inst_enable(adev, true, inst_mask);
857 	/* enable sdma ring preemption */
858 	sdma_v7_1_inst_ctx_switch_enable(adev, true, inst_mask);
859 
860 	/* start the gfx rings and rlc compute queues */
861 	r = sdma_v7_1_inst_gfx_resume(adev, inst_mask);
862 	if (r)
863 		return r;
864 	r = sdma_v7_1_inst_rlc_resume(adev, inst_mask);
865 
866 	return r;
867 }
868 
869 static int sdma_v7_1_mqd_init(struct amdgpu_device *adev, void *mqd,
870 			      struct amdgpu_mqd_prop *prop)
871 {
872 	struct v12_sdma_mqd *m = mqd;
873 	uint64_t wb_gpu_addr;
874 
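	/* rb_cntl uses the SDMA0_SDMA_QUEUE0_RB_CNTL register field layout */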
875 	m->sdmax_rlcx_rb_cntl =
876 		order_base_2(prop->queue_size / 4) << SDMA0_SDMA_QUEUE0_RB_CNTL__RB_SIZE__SHIFT |
877 		1 << SDMA0_SDMA_QUEUE0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT |
878 		4 << SDMA0_SDMA_QUEUE0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT |
879 		1 << SDMA0_SDMA_QUEUE0_RB_CNTL__MCU_WPTR_POLL_ENABLE__SHIFT;
880 
881 	m->sdmax_rlcx_rb_base = lower_32_bits(prop->hqd_base_gpu_addr >> 8);
882 	m->sdmax_rlcx_rb_base_hi = upper_32_bits(prop->hqd_base_gpu_addr >> 8);
883 
884 	wb_gpu_addr = prop->wptr_gpu_addr;
885 	m->sdmax_rlcx_rb_wptr_poll_addr_lo = lower_32_bits(wb_gpu_addr);
886 	m->sdmax_rlcx_rb_wptr_poll_addr_hi = upper_32_bits(wb_gpu_addr);
887 
888 	wb_gpu_addr = prop->rptr_gpu_addr;
889 	m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits(wb_gpu_addr);
890 	m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits(wb_gpu_addr);
891 
892 	m->sdmax_rlcx_ib_cntl = RREG32_SOC15_IP(GC, sdma_v7_1_get_reg_offset(adev, 0,
893 							regSDMA0_SDMA_QUEUE0_IB_CNTL));
894 
895 	m->sdmax_rlcx_doorbell_offset =
896 		prop->doorbell_index << SDMA0_SDMA_QUEUE0_DOORBELL_OFFSET__OFFSET__SHIFT;
897 
898 	m->sdmax_rlcx_doorbell = REG_SET_FIELD(0, SDMA0_SDMA_QUEUE0_DOORBELL, ENABLE, 1);
899 
900 	m->sdmax_rlcx_doorbell_log = 0;
901 	m->sdmax_rlcx_rb_aql_cntl = 0x4000;	//regSDMA0_SDMA_QUEUE0_RB_AQL_CNTL_DEFAULT;
902 	m->sdmax_rlcx_dummy_reg = 0xf;	//regSDMA0_SDMA_QUEUE0_DUMMY_REG_DEFAULT;
903 
904 	m->sdmax_rlcx_csa_addr_lo = lower_32_bits(prop->csa_addr);
905 	m->sdmax_rlcx_csa_addr_hi = upper_32_bits(prop->csa_addr);
906 
907 	return 0;
908 }
909 
910 static void sdma_v7_1_set_mqd_funcs(struct amdgpu_device *adev)
911 {
912 	adev->mqds[AMDGPU_HW_IP_DMA].mqd_size = sizeof(struct v12_sdma_mqd);
913 	adev->mqds[AMDGPU_HW_IP_DMA].init_mqd = sdma_v7_1_mqd_init;
914 }
915 
916 /**
917  * sdma_v7_1_ring_test_ring - simple async dma engine test
918  *
919  * @ring: amdgpu_ring structure holding ring information
920  *
921  * Test the DMA engine by using it to write a
922  * value to memory.
923  * Returns 0 for success, error for failure.
924  */
925 static int sdma_v7_1_ring_test_ring(struct amdgpu_ring *ring)
926 {
927 	struct amdgpu_device *adev = ring->adev;
928 	unsigned i;
929 	unsigned index;
930 	int r;
931 	u32 tmp;
932 	u64 gpu_addr;
933 
934 	tmp = 0xCAFEDEAD;
935 
936 	r = amdgpu_device_wb_get(adev, &index);
937 	if (r) {
938 		dev_err(adev->dev, "(%d) failed to allocate wb slot\n", r);
939 		return r;
940 	}
941 
942 	gpu_addr = adev->wb.gpu_addr + (index * 4);
943 	adev->wb.wb[index] = cpu_to_le32(tmp);
944 
945 	r = amdgpu_ring_alloc(ring, 5);
946 	if (r) {
947 		DRM_ERROR("amdgpu: dma failed to lock ring %d (%d).\n", ring->idx, r);
948 		amdgpu_device_wb_free(adev, index);
949 		return r;
950 	}
951 
952 	amdgpu_ring_write(ring, SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_WRITE) |
953 			  SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(SDMA_SUBOP_WRITE_LINEAR));
954 	amdgpu_ring_write(ring, lower_32_bits(gpu_addr));
955 	amdgpu_ring_write(ring, upper_32_bits(gpu_addr));
956 	amdgpu_ring_write(ring, SDMA_PKT_WRITE_UNTILED_DW_3_COUNT(0));
957 	amdgpu_ring_write(ring, 0xDEADBEEF);
958 	amdgpu_ring_commit(ring);
959 
960 	for (i = 0; i < adev->usec_timeout; i++) {
961 		tmp = le32_to_cpu(adev->wb.wb[index]);
962 		if (tmp == 0xDEADBEEF)
963 			break;
964 		if (amdgpu_emu_mode == 1)
965 			msleep(1);
966 		else
967 			udelay(1);
968 	}
969 
970 	if (i >= adev->usec_timeout)
971 		r = -ETIMEDOUT;
972 
973 	amdgpu_device_wb_free(adev, index);
974 
975 	return r;
976 }
977 
978 /**
979  * sdma_v7_1_ring_test_ib - test an IB on the DMA engine
980  *
981  * @ring: amdgpu_ring structure holding ring information
982  * @timeout: timeout value in jiffies, or MAX_SCHEDULE_TIMEOUT
983  *
984  * Test a simple IB in the DMA ring.
985  * Returns 0 on success, error on failure.
986  */
987 static int sdma_v7_1_ring_test_ib(struct amdgpu_ring *ring, long timeout)
988 {
989 	struct amdgpu_device *adev = ring->adev;
990 	struct amdgpu_ib ib;
991 	struct dma_fence *f = NULL;
992 	unsigned index;
993 	long r;
994 	u32 tmp = 0;
995 	u64 gpu_addr;
996 
997 	tmp = 0xCAFEDEAD;
998 	memset(&ib, 0, sizeof(ib));
999 
1000 	r = amdgpu_device_wb_get(adev, &index);
1001 	if (r) {
1002 		dev_err(adev->dev, "(%ld) failed to allocate wb slot\n", r);
1003 		return r;
1004 	}
1005 
1006 	gpu_addr = adev->wb.gpu_addr + (index * 4);
1007 	adev->wb.wb[index] = cpu_to_le32(tmp);
1008 
1009 	r = amdgpu_ib_get(adev, NULL, 256, AMDGPU_IB_POOL_DIRECT, &ib);
1010 	if (r) {
1011 		DRM_ERROR("amdgpu: failed to get ib (%ld).\n", r);
1012 		goto err0;
1013 	}
1014 
1015 	ib.ptr[0] = SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_WRITE) |
1016 		SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(SDMA_SUBOP_WRITE_LINEAR);
1017 	ib.ptr[1] = lower_32_bits(gpu_addr);
1018 	ib.ptr[2] = upper_32_bits(gpu_addr);
1019 	ib.ptr[3] = SDMA_PKT_WRITE_UNTILED_DW_3_COUNT(0);
1020 	ib.ptr[4] = 0xDEADBEEF;
1021 	ib.ptr[5] = SDMA_PKT_NOP_HEADER_OP(SDMA_OP_NOP);
1022 	ib.ptr[6] = SDMA_PKT_NOP_HEADER_OP(SDMA_OP_NOP);
1023 	ib.ptr[7] = SDMA_PKT_NOP_HEADER_OP(SDMA_OP_NOP);
1024 	ib.length_dw = 8;
1025 
1026 	r = amdgpu_ib_schedule(ring, 1, &ib, NULL, &f);
1027 	if (r)
1028 		goto err1;
1029 
1030 	r = dma_fence_wait_timeout(f, false, timeout);
1031 	if (r == 0) {
1032 		DRM_ERROR("amdgpu: IB test timed out\n");
1033 		r = -ETIMEDOUT;
1034 		goto err1;
1035 	} else if (r < 0) {
1036 		DRM_ERROR("amdgpu: fence wait failed (%ld).\n", r);
1037 		goto err1;
1038 	}
1039 
1040 	tmp = le32_to_cpu(adev->wb.wb[index]);
1041 
1042 	if (tmp == 0xDEADBEEF)
1043 		r = 0;
1044 	else
1045 		r = -EINVAL;
1046 
1047 err1:
1048 	amdgpu_ib_free(&ib, NULL);
1049 	dma_fence_put(f);
1050 err0:
1051 	amdgpu_device_wb_free(adev, index);
1052 	return r;
1053 }
1054 
1055 
1056 /**
1057  * sdma_v7_1_vm_copy_pte - update PTEs by copying them from the GART
1058  *
1059  * @ib: indirect buffer to fill with commands
1060  * @pe: addr of the page entry
1061  * @src: src addr to copy from
1062  * @count: number of page entries to update
1063  *
1064  * Update PTEs by copying them from the GART using sDMA.
1065  */
1066 static void sdma_v7_1_vm_copy_pte(struct amdgpu_ib *ib,
1067 				  uint64_t pe, uint64_t src,
1068 				  unsigned count)
1069 {
1070 	unsigned bytes = count * 8;
1071 
1072 	ib->ptr[ib->length_dw++] = SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_COPY) |
1073 		SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(SDMA_SUBOP_COPY_LINEAR);
1074 
1075 	ib->ptr[ib->length_dw++] = bytes - 1;
1076 	ib->ptr[ib->length_dw++] = 0; /* src/dst endian swap */
1077 	ib->ptr[ib->length_dw++] = lower_32_bits(src);
1078 	ib->ptr[ib->length_dw++] = upper_32_bits(src);
1079 	ib->ptr[ib->length_dw++] = lower_32_bits(pe);
1080 	ib->ptr[ib->length_dw++] = upper_32_bits(pe);
1081 
1082 }
1083 
1084 /**
1085  * sdma_v7_1_vm_write_pte - update PTEs by writing them manually
1086  *
1087  * @ib: indirect buffer to fill with commands
1088  * @pe: addr of the page entry
1089  * @value: dst addr to write into pe
1090  * @count: number of page entries to update
1091  * @incr: increase next addr by incr bytes
1092  *
1093  * Update PTEs by writing them manually using sDMA.
1094  */
1095 static void sdma_v7_1_vm_write_pte(struct amdgpu_ib *ib, uint64_t pe,
1096 				   uint64_t value, unsigned count,
1097 				   uint32_t incr)
1098 {
1099 	unsigned ndw = count * 2;
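	/* each 64-bit PTE takes two dwords in the write-linear payload */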
1100 
1101 	ib->ptr[ib->length_dw++] = SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_WRITE) |
1102 		SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(SDMA_SUBOP_WRITE_LINEAR);
1103 	ib->ptr[ib->length_dw++] = lower_32_bits(pe);
1104 	ib->ptr[ib->length_dw++] = upper_32_bits(pe);
1105 	ib->ptr[ib->length_dw++] = ndw - 1;
1106 	for (; ndw > 0; ndw -= 2) {
1107 		ib->ptr[ib->length_dw++] = lower_32_bits(value);
1108 		ib->ptr[ib->length_dw++] = upper_32_bits(value);
1109 		value += incr;
1110 	}
1111 }
1112 
1113 /**
1114  * sdma_v7_1_vm_set_pte_pde - update the page tables using sDMA
1115  *
1116  * @ib: indirect buffer to fill with commands
1117  * @pe: addr of the page entry
1118  * @addr: dst addr to write into pe
1119  * @count: number of page entries to update
1120  * @incr: increase next addr by incr bytes
1121  * @flags: access flags
1122  *
1123  * Update the page tables using sDMA.
1124  */
1125 static void sdma_v7_1_vm_set_pte_pde(struct amdgpu_ib *ib,
1126 				     uint64_t pe,
1127 				     uint64_t addr, unsigned count,
1128 				     uint32_t incr, uint64_t flags)
1129 {
1130 	/* for physically contiguous pages (vram) */
1131 	u32 header = SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_PTEPDE);
1132 
1133 	if (amdgpu_mtype_local)
1134 		header |= SDMA_PKT_PTEPDE_COPY_HEADER_MTYPE(0x3);
1135 	else
1136 		header |= (SDMA_PKT_PTEPDE_COPY_HEADER_MTYPE(0x2) |
1137 			   SDMA_PKT_PTEPDE_COPY_HEADER_SNOOP(0x1) |
1138 			   SDMA_PKT_PTEPDE_COPY_HEADER_SCOPE(0x3));
1139 
1140 	ib->ptr[ib->length_dw++] = header;
1141 	ib->ptr[ib->length_dw++] = lower_32_bits(pe); /* dst addr */
1142 	ib->ptr[ib->length_dw++] = upper_32_bits(pe);
1143 	ib->ptr[ib->length_dw++] = lower_32_bits(flags); /* mask */
1144 	ib->ptr[ib->length_dw++] = upper_32_bits(flags);
1145 	ib->ptr[ib->length_dw++] = lower_32_bits(addr); /* value */
1146 	ib->ptr[ib->length_dw++] = upper_32_bits(addr);
1147 	ib->ptr[ib->length_dw++] = incr; /* increment size */
1148 	ib->ptr[ib->length_dw++] = 0;
1149 	ib->ptr[ib->length_dw++] = count - 1; /* number of entries */
1150 }
1151 
1152 /**
1153  * sdma_v7_1_ring_pad_ib - pad the IB
1154  *
1155  * @ring: amdgpu ring pointer
1156  * @ib: indirect buffer to fill with padding
1157  *
1158  * Pad the IB with NOPs to a boundary multiple of 8.
1159  */
1160 static void sdma_v7_1_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
1161 {
1162 	struct amdgpu_sdma_instance *sdma = amdgpu_sdma_get_instance_from_ring(ring);
1163 	u32 pad_count;
1164 	int i;
1165 
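	/* dwords needed to round length_dw up to the next multiple of 8 */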
1166 	pad_count = (-ib->length_dw) & 0x7;
1167 	for (i = 0; i < pad_count; i++)
1168 		if (sdma && sdma->burst_nop && (i == 0))
1169 			ib->ptr[ib->length_dw++] =
1170 				SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_NOP) |
1171 				SDMA_PKT_NOP_HEADER_COUNT(pad_count - 1);
1172 		else
1173 			ib->ptr[ib->length_dw++] =
1174 				SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_NOP);
1175 }
1176 
1177 /**
1178  * sdma_v7_1_ring_emit_pipeline_sync - sync the pipeline
1179  *
1180  * @ring: amdgpu_ring pointer
1181  *
1182  * Make sure all previous operations are completed.
1183  */
1184 static void sdma_v7_1_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
1185 {
1186 	uint32_t seq = ring->fence_drv.sync_seq;
1187 	uint64_t addr = ring->fence_drv.gpu_addr;
1188 
1189 	/* wait for idle */
1190 	amdgpu_ring_write(ring, SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_POLL_REGMEM) |
1191 			  SDMA_PKT_POLL_REGMEM_HEADER_FUNC(3) | /* equal */
1192 			  SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1));
1193 	amdgpu_ring_write(ring, addr & 0xfffffffc);
1194 	amdgpu_ring_write(ring, upper_32_bits(addr) & 0xffffffff);
1195 	amdgpu_ring_write(ring, seq); /* reference */
1196 	amdgpu_ring_write(ring, 0xffffffff); /* mask */
1197 	amdgpu_ring_write(ring, SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff) |
1198 			  SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(4)); /* retry count, poll interval */
1199 }
1200 
1201 /**
1202  * sdma_v7_1_ring_emit_vm_flush - vm flush using sDMA
1203  *
1204  * @ring: amdgpu_ring pointer
1205  * @vmid: vmid number to use
1206  * @pd_addr: address
1207  *
1208  * Update the page table base and flush the VM TLB
1209  * using sDMA.
1210  */
1211 static void sdma_v7_1_ring_emit_vm_flush(struct amdgpu_ring *ring,
1212 					 unsigned vmid, uint64_t pd_addr)
1213 {
1214 	amdgpu_gmc_emit_flush_gpu_tlb(ring, vmid, pd_addr);
1215 }
1216 
1217 static void sdma_v7_1_ring_emit_wreg(struct amdgpu_ring *ring,
1218 				     uint32_t reg, uint32_t val)
1219 {
1220 	/* The SRBM WRITE command is not supported on SDMA v7.
1221 	 * Use the Register WRITE command instead, whose opcode is the same as SRBM WRITE's.
1222 	 */
1223 	amdgpu_ring_write(ring, SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_SRBM_WRITE));
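	/* the write packet takes a byte address, so convert the dword register offset */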
1224 	amdgpu_ring_write(ring, soc_v1_0_normalize_xcc_reg_offset(reg) << 2);
1225 	amdgpu_ring_write(ring, val);
1226 }
1227 
1228 static void sdma_v7_1_ring_emit_reg_wait(struct amdgpu_ring *ring, uint32_t reg,
1229 					 uint32_t val, uint32_t mask)
1230 {
1231 	amdgpu_ring_write(ring, SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_POLL_REGMEM) |
1232 			  SDMA_PKT_POLL_REGMEM_HEADER_FUNC(3)); /* equal */
1233 	amdgpu_ring_write(ring, soc_v1_0_normalize_xcc_reg_offset(reg) << 2);
1234 	amdgpu_ring_write(ring, 0);
1235 	amdgpu_ring_write(ring, val); /* reference */
1236 	amdgpu_ring_write(ring, mask); /* mask */
1237 	amdgpu_ring_write(ring, SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff) |
1238 			  SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(10));
1239 }
1240 
1241 static void sdma_v7_1_ring_emit_reg_write_reg_wait(struct amdgpu_ring *ring,
1242 						   uint32_t reg0, uint32_t reg1,
1243 						   uint32_t ref, uint32_t mask)
1244 {
1245 	amdgpu_ring_emit_wreg(ring, reg0, ref);
1246 	/* wait for a cycle to reset vm_inv_eng*_ack */
1247 	amdgpu_ring_emit_reg_wait(ring, reg0, 0, 0);
1248 	amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask);
1249 }
1250 
1251 static int sdma_v7_1_early_init(struct amdgpu_ip_block *ip_block)
1252 {
1253 	struct amdgpu_device *adev = ip_block->adev;
1254 	int r;
1255 
1256 	r = amdgpu_sdma_init_microcode(adev, 0, true);
1257 	if (r) {
1258 		DRM_ERROR("Failed to init sdma firmware!\n");
1259 		return r;
1260 	}
1261 
1262 	sdma_v7_1_set_ring_funcs(adev);
1263 	sdma_v7_1_set_buffer_funcs(adev);
1264 	sdma_v7_1_set_vm_pte_funcs(adev);
1265 	sdma_v7_1_set_irq_funcs(adev);
1266 	sdma_v7_1_set_mqd_funcs(adev);
1267 
1268 	return 0;
1269 }
1270 
1271 static int sdma_v7_1_sw_init(struct amdgpu_ip_block *ip_block)
1272 {
1273 	struct amdgpu_ring *ring;
1274 	int r, i;
1275 	struct amdgpu_device *adev = ip_block->adev;
1276 	uint32_t reg_count = ARRAY_SIZE(sdma_reg_list_7_1);
1277 	uint32_t *ptr;
1278 	u32 xcc_id;
1279 
1280 	/* SDMA trap event */
1281 	r = amdgpu_irq_add_id(adev, SOC_V1_0_IH_CLIENTID_GFX,
1282 			      GFX_12_1_0__SRCID__SDMA_TRAP,
1283 			      &adev->sdma.trap_irq);
1284 	if (r)
1285 		return r;
1286 
1287 	for (i = 0; i < adev->sdma.num_instances; i++) {
1288 		ring = &adev->sdma.instance[i].ring;
1289 		ring->ring_obj = NULL;
1290 		ring->use_doorbell = true;
1291 		ring->me = i;
1292 
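		/* find the logical XCC that this SDMA instance belongs to */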
1293 		for (xcc_id = 0; xcc_id < fls(adev->gfx.xcc_mask); xcc_id++) {
1294 			if (adev->sdma.instance[i].xcc_id == GET_INST(GC, xcc_id))
1295 				break;
1296 		}
1297 
1298 		DRM_DEBUG("SDMA%d.%d use_doorbell being set to: [%s]\n",
1299 				xcc_id, GET_INST(SDMA0, i) % adev->sdma.num_inst_per_xcc,
1300 				ring->use_doorbell?"true":"false");
1301 
1302 		ring->doorbell_index =
1303 			(adev->doorbell_index.sdma_engine[i] << 1); // get DWORD offset
1304 
1305 		ring->vm_hub = AMDGPU_GFXHUB(xcc_id);
1306 		sprintf(ring->name, "sdma%d.%d", xcc_id,
1307 				GET_INST(SDMA0, i) % adev->sdma.num_inst_per_xcc);
1308 		r = amdgpu_ring_init(adev, ring, 1024,
1309 				     &adev->sdma.trap_irq,
1310 				     AMDGPU_SDMA_IRQ_INSTANCE0 + i,
1311 				     AMDGPU_RING_PRIO_DEFAULT, NULL);
1312 		if (r)
1313 			return r;
1314 	}
1315 
1316 	adev->sdma.supported_reset =
1317 		amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
1318 	if (!amdgpu_sriov_vf(adev) &&
1319 	    !adev->debug_disable_gpu_ring_reset)
1320 		adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
1321 
1322 	r = amdgpu_sdma_sysfs_reset_mask_init(adev);
1323 	if (r)
1324 		return r;
1325 	/* Allocate memory for SDMA IP Dump buffer */
1326 	ptr = kcalloc(adev->sdma.num_instances * reg_count, sizeof(uint32_t), GFP_KERNEL);
1327 	if (ptr)
1328 		adev->sdma.ip_dump = ptr;
1329 	else
1330 		DRM_ERROR("Failed to allocate memory for SDMA IP Dump\n");
1331 
1332 #ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ
1333 	adev->userq_funcs[AMDGPU_HW_IP_DMA] = &userq_mes_funcs;
1334 #endif
1335 
1336 	return r;
1337 }
1338 
1339 static int sdma_v7_1_sw_fini(struct amdgpu_ip_block *ip_block)
1340 {
1341 	struct amdgpu_device *adev = ip_block->adev;
1342 	int i;
1343 
1344 	for (i = 0; i < adev->sdma.num_instances; i++)
1345 		amdgpu_ring_fini(&adev->sdma.instance[i].ring);
1346 
1347 	amdgpu_sdma_sysfs_reset_mask_fini(adev);
1348 	amdgpu_sdma_destroy_inst_ctx(adev, true);
1349 
1350 	if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT)
1351 		sdma_v7_1_inst_free_ucode_buffer(adev, adev->sdma.sdma_mask);
1352 
1353 	kfree(adev->sdma.ip_dump);
1354 
1355 	return 0;
1356 }
1357 
1358 static int sdma_v7_1_hw_init(struct amdgpu_ip_block *ip_block)
1359 {
1360 	struct amdgpu_device *adev = ip_block->adev;
1361 	uint32_t inst_mask;
1362 
1363 	inst_mask = GENMASK(adev->sdma.num_instances - 1, 0);
1364 
1365 	return sdma_v7_1_inst_start(adev, inst_mask);
1366 }
1367 
1368 static int sdma_v7_1_hw_fini(struct amdgpu_ip_block *ip_block)
1369 {
1370 	struct amdgpu_device *adev = ip_block->adev;
1371 
1372 	if (amdgpu_sriov_vf(adev))
1373 		return 0;
1374 
1375 	sdma_v7_1_inst_ctx_switch_enable(adev, false, adev->sdma.sdma_mask);
1376 	sdma_v7_1_inst_enable(adev, false, adev->sdma.sdma_mask);
1377 
1378 	return 0;
1379 }
1380 
1381 static int sdma_v7_1_suspend(struct amdgpu_ip_block *ip_block)
1382 {
1383 	return sdma_v7_1_hw_fini(ip_block);
1384 }
1385 
1386 static int sdma_v7_1_resume(struct amdgpu_ip_block *ip_block)
1387 {
1388 	return sdma_v7_1_hw_init(ip_block);
1389 }
1390 
1391 static bool sdma_v7_1_is_idle(struct amdgpu_ip_block *ip_block)
1392 {
1393 	struct amdgpu_device *adev = ip_block->adev;
1394 	u32 i;
1395 
1396 	for (i = 0; i < adev->sdma.num_instances; i++) {
1397 		u32 tmp = RREG32(sdma_v7_1_get_reg_offset(adev, i, regSDMA0_SDMA_STATUS_REG));
1398 
1399 		if (!(tmp & SDMA0_SDMA_STATUS_REG__IDLE_MASK))
1400 			return false;
1401 	}
1402 
1403 	return true;
1404 }
1405 
1406 static int sdma_v7_1_wait_for_idle(struct amdgpu_ip_block *ip_block)
1407 {
1408 	unsigned i, j;
1409 	u32 sdma[AMDGPU_MAX_SDMA_INSTANCES];
1410 	struct amdgpu_device *adev = ip_block->adev;
1411 
1412 	for (i = 0; i < adev->usec_timeout; i++) {
1413 		for (j = 0; j < adev->sdma.num_instances; j++) {
1414 			sdma[j] = RREG32(sdma_v7_1_get_reg_offset(adev,
1415 						j, regSDMA0_SDMA_STATUS_REG));
1416 			if (!(sdma[j] & SDMA0_SDMA_STATUS_REG__IDLE_MASK))
1417 				break;
1418 		}
1419 		if (j == adev->sdma.num_instances)
1420 			return 0;
1421 		udelay(1);
1422 	}
1423 	return -ETIMEDOUT;
1424 }
1425 
1426 static int sdma_v7_1_ring_preempt_ib(struct amdgpu_ring *ring)
1427 {
1428 	int i, r = 0;
1429 	struct amdgpu_device *adev = ring->adev;
1430 	u32 index = 0;
1431 	u64 sdma_gfx_preempt;
1432 
1433 	amdgpu_sdma_get_index_from_ring(ring, &index);
1434 	sdma_gfx_preempt =
1435 		sdma_v7_1_get_reg_offset(adev, index, regSDMA0_SDMA_QUEUE0_PREEMPT);
1436 
1437 	/* assert preemption condition */
1438 	amdgpu_ring_set_preempt_cond_exec(ring, false);
1439 
1440 	/* emit the trailing fence */
1441 	ring->trail_seq += 1;
1442 	r = amdgpu_ring_alloc(ring, 10);
1443 	if (r) {
1444 		DRM_ERROR("ring %d failed to be allocated\n", ring->idx);
1445 		return r;
1446 	}
1447 	sdma_v7_1_ring_emit_fence(ring, ring->trail_fence_gpu_addr,
1448 				  ring->trail_seq, 0);
1449 	amdgpu_ring_commit(ring);
1450 
1451 	/* assert IB preemption */
1452 	WREG32(sdma_gfx_preempt, 1);
1453 
1454 	/* poll the trailing fence */
1455 	for (i = 0; i < adev->usec_timeout; i++) {
1456 		if (ring->trail_seq ==
1457 		    le32_to_cpu(*(ring->trail_fence_cpu_addr)))
1458 			break;
1459 		udelay(1);
1460 	}
1461 
1462 	if (i >= adev->usec_timeout) {
1463 		r = -EINVAL;
1464 		DRM_ERROR("ring %d failed to be preempted\n", ring->idx);
1465 	}
1466 
1467 	/* deassert IB preemption */
1468 	WREG32(sdma_gfx_preempt, 0);
1469 
1470 	/* deassert the preemption condition */
1471 	amdgpu_ring_set_preempt_cond_exec(ring, true);
1472 	return r;
1473 }
1474 
1475 static int sdma_v7_1_set_trap_irq_state(struct amdgpu_device *adev,
1476 					struct amdgpu_irq_src *source,
1477 					unsigned type,
1478 					enum amdgpu_interrupt_state state)
1479 {
1480 	u32 sdma_cntl;
1481 
1482 	u32 reg_offset = sdma_v7_1_get_reg_offset(adev, type, regSDMA0_SDMA_CNTL);
1483 
1484 	sdma_cntl = RREG32(reg_offset);
1485 	sdma_cntl = REG_SET_FIELD(sdma_cntl, SDMA0_SDMA_CNTL, TRAP_ENABLE,
1486 		       state == AMDGPU_IRQ_STATE_ENABLE ? 1 : 0);
1487 	WREG32(reg_offset, sdma_cntl);
1488 
1489 	return 0;
1490 }
1491 
1492 static int sdma_v7_1_process_trap_irq(struct amdgpu_device *adev,
1493 				      struct amdgpu_irq_src *source,
1494 				      struct amdgpu_iv_entry *entry)
1495 {
1496 	int inst, instances, queue, xcc_id = 0;
1497 	uint32_t mes_queue_id = entry->src_data[0];
1498 
1499 	DRM_DEBUG("IH: SDMA trap\n");
1500 
1501 	if (adev->enable_mes && (mes_queue_id & AMDGPU_FENCE_MES_QUEUE_FLAG)) {
1502 		struct amdgpu_mes_queue *queue;
1503 
1504 		mes_queue_id &= AMDGPU_FENCE_MES_QUEUE_ID_MASK;
1505 
1506 		spin_lock(&adev->mes.queue_id_lock);
1507 		queue = idr_find(&adev->mes.queue_id_idr, mes_queue_id);
1508 		if (queue) {
1509 			DRM_DEBUG("process sdma queue id = %d\n", mes_queue_id);
1510 			amdgpu_fence_process(queue->ring);
1511 		}
1512 		spin_unlock(&adev->mes.queue_id_lock);
1513 		return 0;
1514 	}
1515 
1516 	queue = entry->ring_id & 0xf;
1517 	if (adev->gfx.funcs && adev->gfx.funcs->ih_node_to_logical_xcc)
1518 		xcc_id = adev->gfx.funcs->ih_node_to_logical_xcc(adev, entry->node_id);
1519 	else
1520 		dev_warn(adev->dev, "IH: SDMA may get a wrong xcc id as the gfx function is not available\n");
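	/* ring_id[7:4] is the SDMA instance within the XCC; map it to a device-wide instance */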
1521 	inst = ((entry->ring_id & 0xf0) >> 4) +
1522 		GET_INST(GC, xcc_id) * adev->sdma.num_inst_per_xcc;
1523 	for (instances = 0; instances < adev->sdma.num_instances; instances++) {
1524 		if (inst == GET_INST(SDMA0, instances))
1525 			break;
1526 	}
1527 	if (instances > adev->sdma.num_instances - 1) {
1528 		DRM_ERROR("IH: wrong ring_id detected, no matching sdma instance\n");
1529 		return -EINVAL;
1530 	}
1531 
1532 	switch (entry->client_id) {
1533 	case SOC_V1_0_IH_CLIENTID_GFX:
1534 		switch (queue) {
1535 		case 0:
1536 			amdgpu_fence_process(&adev->sdma.instance[instances].ring);
1537 			break;
1538 		default:
1539 			break;
1540 		}
1541 		break;
1542 	}
1543 	return 0;
1544 }
1545 
1546 static int sdma_v7_1_process_illegal_inst_irq(struct amdgpu_device *adev,
1547 					      struct amdgpu_irq_src *source,
1548 					      struct amdgpu_iv_entry *entry)
1549 {
1550 	return 0;
1551 }
1552 
1553 static int sdma_v7_1_set_clockgating_state(struct amdgpu_ip_block *ip_block,
1554 					   enum amd_clockgating_state state)
1555 {
1556 	return 0;
1557 }
1558 
1559 static int sdma_v7_1_set_powergating_state(struct amdgpu_ip_block *ip_block,
1560 					  enum amd_powergating_state state)
1561 {
1562 	return 0;
1563 }
1564 
1565 static void sdma_v7_1_get_clockgating_state(struct amdgpu_ip_block *ip_block,
1566 					    u64 *flags)
1567 {
1568 }
1569 
1570 static void sdma_v7_1_print_ip_state(struct amdgpu_ip_block *ip_block, struct drm_printer *p)
1571 {
1572 	struct amdgpu_device *adev = ip_block->adev;
1573 	int i, j;
1574 	uint32_t reg_count = ARRAY_SIZE(sdma_reg_list_7_1);
1575 	uint32_t instance_offset;
1576 
1577 	if (!adev->sdma.ip_dump)
1578 		return;
1579 
1580 	drm_printf(p, "num_instances:%d\n", adev->sdma.num_instances);
1581 	for (i = 0; i < adev->sdma.num_instances; i++) {
1582 		instance_offset = i * reg_count;
1583 		drm_printf(p, "\nInstance:%d\n", i);
1584 
1585 		for (j = 0; j < reg_count; j++)
1586 			drm_printf(p, "%-50s \t 0x%08x\n", sdma_reg_list_7_1[j].reg_name,
1587 				   adev->sdma.ip_dump[instance_offset + j]);
1588 	}
1589 }
1590 
1591 static void sdma_v7_1_dump_ip_state(struct amdgpu_ip_block *ip_block)
1592 {
1593 	struct amdgpu_device *adev = ip_block->adev;
1594 	int i, j;
1595 	uint32_t instance_offset;
1596 	uint32_t reg_count = ARRAY_SIZE(sdma_reg_list_7_1);
1597 
1598 	if (!adev->sdma.ip_dump)
1599 		return;
1600 
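	/* keep GFXOFF disabled while the SDMA registers are read */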
1601 	amdgpu_gfx_off_ctrl(adev, false);
1602 	for (i = 0; i < adev->sdma.num_instances; i++) {
1603 		instance_offset = i * reg_count;
1604 		for (j = 0; j < reg_count; j++)
1605 			adev->sdma.ip_dump[instance_offset + j] =
1606 				RREG32(sdma_v7_1_get_reg_offset(adev, i,
1607 				       sdma_reg_list_7_1[j].reg_offset));
1608 	}
1609 	amdgpu_gfx_off_ctrl(adev, true);
1610 }
1611 
1612 const struct amd_ip_funcs sdma_v7_1_ip_funcs = {
1613 	.name = "sdma_v7_1",
1614 	.early_init = sdma_v7_1_early_init,
1615 	.late_init = NULL,
1616 	.sw_init = sdma_v7_1_sw_init,
1617 	.sw_fini = sdma_v7_1_sw_fini,
1618 	.hw_init = sdma_v7_1_hw_init,
1619 	.hw_fini = sdma_v7_1_hw_fini,
1620 	.suspend = sdma_v7_1_suspend,
1621 	.resume = sdma_v7_1_resume,
1622 	.is_idle = sdma_v7_1_is_idle,
1623 	.wait_for_idle = sdma_v7_1_wait_for_idle,
1624 	.soft_reset = sdma_v7_1_soft_reset,
1625 	.check_soft_reset = sdma_v7_1_check_soft_reset,
1626 	.set_clockgating_state = sdma_v7_1_set_clockgating_state,
1627 	.set_powergating_state = sdma_v7_1_set_powergating_state,
1628 	.get_clockgating_state = sdma_v7_1_get_clockgating_state,
1629 	.dump_ip_state = sdma_v7_1_dump_ip_state,
1630 	.print_ip_state = sdma_v7_1_print_ip_state,
1631 };
1632 
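/* Callbacks for the kernel SDMA rings (one per instance) */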
1633 static const struct amdgpu_ring_funcs sdma_v7_1_ring_funcs = {
1634 	.type = AMDGPU_RING_TYPE_SDMA,
1635 	.align_mask = 0xf,
1636 	.nop = SDMA_PKT_NOP_HEADER_OP(SDMA_OP_NOP),
1637 	.support_64bit_ptrs = true,
1638 	.secure_submission_supported = true,
1639 	.get_rptr = sdma_v7_1_ring_get_rptr,
1640 	.get_wptr = sdma_v7_1_ring_get_wptr,
1641 	.set_wptr = sdma_v7_1_ring_set_wptr,
1642 	.emit_frame_size =
1643 		5 + /* sdma_v7_1_ring_init_cond_exec */
1644 		6 + /* sdma_v7_1_ring_emit_pipeline_sync */
1645 		/* sdma_v7_1_ring_emit_vm_flush */
1646 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
1647 		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
1648 		10 + 10 + 10, /* sdma_v7_1_ring_emit_fence x3 for user fence, vm fence */
1649 	.emit_ib_size = 5 + 7 + 6, /* sdma_v7_1_ring_emit_ib */
1650 	.emit_ib = sdma_v7_1_ring_emit_ib,
1651 	.emit_mem_sync = sdma_v7_1_ring_emit_mem_sync,
1652 	.emit_fence = sdma_v7_1_ring_emit_fence,
1653 	.emit_pipeline_sync = sdma_v7_1_ring_emit_pipeline_sync,
1654 	.emit_vm_flush = sdma_v7_1_ring_emit_vm_flush,
1655 	.test_ring = sdma_v7_1_ring_test_ring,
1656 	.test_ib = sdma_v7_1_ring_test_ib,
1657 	.insert_nop = sdma_v7_1_ring_insert_nop,
1658 	.pad_ib = sdma_v7_1_ring_pad_ib,
1659 	.emit_wreg = sdma_v7_1_ring_emit_wreg,
1660 	.emit_reg_wait = sdma_v7_1_ring_emit_reg_wait,
1661 	.emit_reg_write_reg_wait = sdma_v7_1_ring_emit_reg_write_reg_wait,
1662 	.init_cond_exec = sdma_v7_1_ring_init_cond_exec,
1663 	.preempt_ib = sdma_v7_1_ring_preempt_ib,
1664 	.reset = sdma_v7_1_reset_queue,
1665 };
1666 
1667 static void sdma_v7_1_set_ring_funcs(struct amdgpu_device *adev)
1668 {
1669 	int i, dev_inst;
1670 
1671 	for (i = 0; i < adev->sdma.num_instances; i++) {
1672 		adev->sdma.instance[i].ring.funcs = &sdma_v7_1_ring_funcs;
1673 		adev->sdma.instance[i].ring.me = i;
1674 
1675 		dev_inst = GET_INST(SDMA0, i);
1676 		/* XCC to which SDMA belongs depends on physical instance */
1677 		adev->sdma.instance[i].xcc_id =
1678 			dev_inst / adev->sdma.num_inst_per_xcc;
1679 	}
1680 }
1681 
1682 static const struct amdgpu_irq_src_funcs sdma_v7_1_trap_irq_funcs = {
1683 	.set = sdma_v7_1_set_trap_irq_state,
1684 	.process = sdma_v7_1_process_trap_irq,
1685 };
1686 
1687 static const struct amdgpu_irq_src_funcs sdma_v7_1_illegal_inst_irq_funcs = {
1688 	.process = sdma_v7_1_process_illegal_inst_irq,
1689 };
1690 
1691 static void sdma_v7_1_set_irq_funcs(struct amdgpu_device *adev)
1692 {
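	/* one trap interrupt type per SDMA instance */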
1693 	adev->sdma.trap_irq.num_types = AMDGPU_SDMA_IRQ_INSTANCE0 +
1694 					adev->sdma.num_instances;
1695 	adev->sdma.trap_irq.funcs = &sdma_v7_1_trap_irq_funcs;
1696 	adev->sdma.illegal_inst_irq.funcs = &sdma_v7_1_illegal_inst_irq_funcs;
1697 }
1698 
1699 /**
1700  * sdma_v7_1_emit_copy_buffer - copy buffer using the sDMA engine
1701  *
1702  * @ib: indirect buffer to fill with commands
1703  * @src_offset: src GPU address
1704  * @dst_offset: dst GPU address
1705  * @byte_count: number of bytes to xfer
1706  * @copy_flags: copy flags for the buffers
1707  *
1708  * Copy GPU buffers using the DMA engine.
1709  * Used by the amdgpu ttm implementation to move pages if
1710  * registered as the asic copy callback.
1711  */
1712 static void sdma_v7_1_emit_copy_buffer(struct amdgpu_ib *ib,
1713 				       uint64_t src_offset,
1714 				       uint64_t dst_offset,
1715 				       uint32_t byte_count,
1716 				       uint32_t copy_flags)
1717 {
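	/*
	 * COPY_LINEAR packet: header (op/sub-op/TMZ), byte count - 1,
	 * parameter dword, then 64-bit source and destination addresses.
	 */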
1718 	ib->ptr[ib->length_dw++] = SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_COPY) |
1719 		SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(SDMA_SUBOP_COPY_LINEAR) |
1720 		SDMA_PKT_COPY_LINEAR_HEADER_TMZ((copy_flags & AMDGPU_COPY_FLAGS_TMZ) ? 1 : 0);
1721 
1722 	ib->ptr[ib->length_dw++] = byte_count - 1;
1723 	ib->ptr[ib->length_dw++] = 0; /* src/dst endian swap */
1724 	ib->ptr[ib->length_dw++] = lower_32_bits(src_offset);
1725 	ib->ptr[ib->length_dw++] = upper_32_bits(src_offset);
1726 	ib->ptr[ib->length_dw++] = lower_32_bits(dst_offset);
1727 	ib->ptr[ib->length_dw++] = upper_32_bits(dst_offset);
1728 }
1729 
1730 /**
1731  * sdma_v7_1_emit_fill_buffer - fill buffer using the sDMA engine
1732  *
1733  * @ib: indirect buffer to fill
1734  * @src_data: value to write to buffer
1735  * @dst_offset: dst GPU address
1736  * @byte_count: number of bytes to xfer
1737  *
1738  * Fill GPU buffers using the DMA engine.
1739  */
1740 static void sdma_v7_1_emit_fill_buffer(struct amdgpu_ib *ib,
1741 				       uint32_t src_data,
1742 				       uint64_t dst_offset,
1743 				       uint32_t byte_count)
1744 {
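	/*
	 * CONSTANT_FILL packet: header, 64-bit destination address,
	 * 32-bit fill value, byte count - 1.
	 */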
1745 	ib->ptr[ib->length_dw++] = SDMA_PKT_CONSTANT_FILL_HEADER_OP(SDMA_OP_CONST_FILL);
1746 	ib->ptr[ib->length_dw++] = lower_32_bits(dst_offset);
1747 	ib->ptr[ib->length_dw++] = upper_32_bits(dst_offset);
1748 	ib->ptr[ib->length_dw++] = src_data;
1749 	ib->ptr[ib->length_dw++] = byte_count - 1;
1750 }
1751 
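/*
 * Buffer-move callbacks used by the amdgpu ttm implementation; both the
 * linear copy and the constant fill packets handle up to 4 MB (0x400000
 * bytes) per packet.
 */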
1752 static const struct amdgpu_buffer_funcs sdma_v7_1_buffer_funcs = {
1753 	.copy_max_bytes = 0x400000,
1754 	.copy_num_dw = 8,
1755 	.emit_copy_buffer = sdma_v7_1_emit_copy_buffer,
1756 	.fill_max_bytes = 0x400000,
1757 	.fill_num_dw = 5,
1758 	.emit_fill_buffer = sdma_v7_1_emit_fill_buffer,
1759 };
1760 
1761 static void sdma_v7_1_set_buffer_funcs(struct amdgpu_device *adev)
1762 {
1763 	adev->mman.buffer_funcs = &sdma_v7_1_buffer_funcs;
1764 	adev->mman.buffer_funcs_ring = &adev->sdma.instance[0].ring;
1765 }
1766 
1767 static const struct amdgpu_vm_pte_funcs sdma_v7_1_vm_pte_funcs = {
1768 	.copy_pte_num_dw = 8,
1769 	.copy_pte = sdma_v7_1_vm_copy_pte,
1770 	.write_pte = sdma_v7_1_vm_write_pte,
1771 	.set_pte_pde = sdma_v7_1_vm_set_pte_pde,
1772 };
1773 
1774 static void sdma_v7_1_set_vm_pte_funcs(struct amdgpu_device *adev)
1775 {
1776 	unsigned i;
1777 
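	/* page-table updates may be scheduled on any SDMA instance */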
1778 	adev->vm_manager.vm_pte_funcs = &sdma_v7_1_vm_pte_funcs;
1779 	for (i = 0; i < adev->sdma.num_instances; i++) {
1780 		adev->vm_manager.vm_pte_scheds[i] =
1781 			&adev->sdma.instance[i].ring.sched;
1782 	}
1783 	adev->vm_manager.vm_pte_num_scheds = adev->sdma.num_instances;
1784 }
1785 
1786 const struct amdgpu_ip_block_version sdma_v7_1_ip_block = {
1787 	.type = AMD_IP_BLOCK_TYPE_SDMA,
1788 	.major = 7,
1789 	.minor = 1,
1790 	.rev = 0,
1791 	.funcs = &sdma_v7_1_ip_funcs,
1792 };
1793 
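/* XCP (partition) hooks: start or stop only the SDMA instances in inst_mask */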
1794 static int sdma_v7_1_xcp_resume(void *handle, uint32_t inst_mask)
1795 {
1796 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
1797 
1798 	return sdma_v7_1_inst_start(adev, inst_mask);
1799 }
1803 
1804 static int sdma_v7_1_xcp_suspend(void *handle, uint32_t inst_mask)
1805 {
1806 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
1807 
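	/* disable context switching, then halt the selected engines */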
1808 	sdma_v7_1_inst_ctx_switch_enable(adev, false, inst_mask);
1809 	sdma_v7_1_inst_enable(adev, false, inst_mask);
1810 
1811 	return 0;
1812 }
1813 
1814 struct amdgpu_xcp_ip_funcs sdma_v7_1_xcp_funcs = {
1815 	.suspend = &sdma_v7_1_xcp_suspend,
1816 	.resume = &sdma_v7_1_xcp_resume
1817 };
1818