/*
 * Copyright 2025 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include <linux/delay.h>
#include <linux/kernel.h>
#include <linux/firmware.h>
#include <linux/module.h>
#include <linux/pci.h>
#include "amdgpu.h"
#include "amdgpu_gfx.h"
#include "amdgpu_psp.h"
#include "amdgpu_smu.h"
#include "amdgpu_atomfirmware.h"
#include "imu_v12_1.h"
#include "soc_v1_0.h"
#include "gfx_v12_1_pkt.h"

#include "gc/gc_12_1_0_offset.h"
#include "gc/gc_12_1_0_sh_mask.h"
#include "soc24_enum.h"
#include "ivsrcid/gfx/irqsrcs_gfx_12_1_0.h"

#include "soc15.h"
#include "clearstate_gfx12.h"
#include "v12_structs.h"
#include "gfx_v12_1.h"
#include "mes_v12_1.h"

#define GFX12_MEC_HPD_SIZE	2048
#define NUM_SIMD_PER_CU_GFX12_1	4

#define RLCG_UCODE_LOADING_START_ADDRESS	0x00002000L

#define regCP_HQD_EOP_CONTROL_DEFAULT                                             0x00000000
#define regCP_HQD_PQ_DOORBELL_CONTROL_DEFAULT                                     0x00000000
#define regCP_MQD_CONTROL_DEFAULT                                                 0x00000100
#define regCP_HQD_PQ_CONTROL_DEFAULT                                              0x00308509
#define regCP_HQD_PQ_RPTR_DEFAULT                                                 0x00000000
#define regCP_HQD_PERSISTENT_STATE_DEFAULT                                        0x0ae06301
#define regCP_HQD_IB_CONTROL_DEFAULT                                              0x00100000
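
/*
 * The *_DEFAULT values above are the hardware reset defaults for the
 * compute HQD/MQD registers; MQD initialization presumably starts from
 * these and then fills in the per-queue fields (ring base, doorbell,
 * rptr/wptr addresses, ...).
 */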

MODULE_FIRMWARE("amdgpu/gc_12_1_0_mec.bin");
MODULE_FIRMWARE("amdgpu/gc_12_1_0_rlc.bin");

#define SH_MEM_ALIGNMENT_MODE_UNALIGNED_GFX12_1_0	0x00000001
#define DEFAULT_SH_MEM_CONFIG \
	((SH_MEM_ADDRESS_MODE_64 << SH_MEM_CONFIG__ADDRESS_MODE__SHIFT) | \
	 (SH_MEM_ALIGNMENT_MODE_UNALIGNED_GFX12_1_0 << SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT) | \
	 (3 << SH_MEM_CONFIG__INITIAL_INST_PREFETCH__SHIFT))
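
/*
 * DEFAULT_SH_MEM_CONFIG selects 64-bit addressing, unaligned memory
 * access and an initial instruction prefetch of 3 cache lines; it is
 * programmed into SH_MEM_CONFIG for every VMID in the constants-init
 * paths below.
 */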

static void gfx_v12_1_xcc_disable_gpa_mode(struct amdgpu_device *adev, int xcc_id);
static void gfx_v12_1_set_ring_funcs(struct amdgpu_device *adev);
static void gfx_v12_1_set_irq_funcs(struct amdgpu_device *adev);
static void gfx_v12_1_set_rlc_funcs(struct amdgpu_device *adev);
static void gfx_v12_1_set_mqd_funcs(struct amdgpu_device *adev);
static void gfx_v12_1_set_imu_funcs(struct amdgpu_device *adev);
static int gfx_v12_1_get_cu_info(struct amdgpu_device *adev,
				 struct amdgpu_cu_info *cu_info);
static uint64_t gfx_v12_1_get_gpu_clock_counter(struct amdgpu_device *adev);
static void gfx_v12_1_xcc_select_se_sh(struct amdgpu_device *adev, u32 se_num,
				       u32 sh_num, u32 instance, int xcc_id);
static void gfx_v12_1_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg,
				     uint32_t val);
static int gfx_v12_1_wait_for_rlc_autoload_complete(struct amdgpu_device *adev);
static void gfx_v12_1_ring_invalidate_tlbs(struct amdgpu_ring *ring,
					   uint16_t pasid, uint32_t flush_type,
					   bool all_hub, uint8_t dst_sel);
static void gfx_v12_1_xcc_set_safe_mode(struct amdgpu_device *adev, int xcc_id);
static void gfx_v12_1_xcc_unset_safe_mode(struct amdgpu_device *adev, int xcc_id);
static void gfx_v12_1_update_perf_clk(struct amdgpu_device *adev,
				      bool enable);
static void gfx_v12_1_xcc_update_perf_clk(struct amdgpu_device *adev,
					 bool enable, int xcc_id);
static int gfx_v12_1_init_cp_compute_microcode_bo(struct amdgpu_device *adev);

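/*
 * KIQ (kernel interface queue) packet helpers.  The KIQ is a
 * privileged compute queue the driver uses to hand queues over to the
 * CP hardware scheduler: SET_RESOURCES describes which queue slots the
 * scheduler owns, MAP_QUEUES/UNMAP_QUEUES activate and deactivate
 * individual queues from their MQDs, and QUERY_STATUS fences those
 * operations.  One KIQ is instantiated per XCC.
 */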
static void gfx_v12_1_kiq_set_resources(struct amdgpu_ring *kiq_ring,
					uint64_t queue_mask)
{
	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_SET_RESOURCES, 6));
	amdgpu_ring_write(kiq_ring, PACKET3_SET_RESOURCES_VMID_MASK(0) |
			  PACKET3_SET_RESOURCES_QUEUE_TYPE(0));	/* vmid_mask:0 queue_type:0 (KIQ) */
	amdgpu_ring_write(kiq_ring, lower_32_bits(queue_mask));	/* queue mask lo */
	amdgpu_ring_write(kiq_ring, upper_32_bits(queue_mask));	/* queue mask hi */
	amdgpu_ring_write(kiq_ring, 0);	/* gws mask lo */
	amdgpu_ring_write(kiq_ring, 0);	/* gws mask hi */
	amdgpu_ring_write(kiq_ring, 0);	/* oac mask */
	amdgpu_ring_write(kiq_ring, 0);
}

static void gfx_v12_1_kiq_map_queues(struct amdgpu_ring *kiq_ring,
				     struct amdgpu_ring *ring)
{
	uint64_t mqd_addr = amdgpu_bo_gpu_offset(ring->mqd_obj);
	uint64_t wptr_addr = ring->wptr_gpu_addr;
	uint32_t me = 0, eng_sel = 0;

	switch (ring->funcs->type) {
	case AMDGPU_RING_TYPE_COMPUTE:
		me = 1;
		eng_sel = 0;
		break;
	case AMDGPU_RING_TYPE_MES:
		me = 2;
		eng_sel = 5;
		break;
	default:
		WARN_ON(1);
	}

	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
	amdgpu_ring_write(kiq_ring, /* Q_sel: 0, vmid: 0, engine: 0, num_Q: 1 */
			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */
			  PACKET3_MAP_QUEUES_VMID(0) | /* VMID */
			  PACKET3_MAP_QUEUES_QUEUE(ring->queue) |
			  PACKET3_MAP_QUEUES_PIPE(ring->pipe) |
			  PACKET3_MAP_QUEUES_ME(me) |
			  PACKET3_MAP_QUEUES_QUEUE_TYPE(0) | /* queue_type: normal compute queue */
			  PACKET3_MAP_QUEUES_ALLOC_FORMAT(0) | /* alloc format: all_on_one_pipe */
			  PACKET3_MAP_QUEUES_ENGINE_SEL(eng_sel) |
			  PACKET3_MAP_QUEUES_NUM_QUEUES(1)); /* num_queues: must be 1 */
	amdgpu_ring_write(kiq_ring, PACKET3_MAP_QUEUES_DOORBELL_OFFSET(ring->doorbell_index));
	amdgpu_ring_write(kiq_ring, lower_32_bits(mqd_addr));
	amdgpu_ring_write(kiq_ring, upper_32_bits(mqd_addr));
	amdgpu_ring_write(kiq_ring, lower_32_bits(wptr_addr));
	amdgpu_ring_write(kiq_ring, upper_32_bits(wptr_addr));
}

static void gfx_v12_1_kiq_unmap_queues(struct amdgpu_ring *kiq_ring,
				       struct amdgpu_ring *ring,
				       enum amdgpu_unmap_queues_action action,
				       u64 gpu_addr, u64 seq)
{
	struct amdgpu_device *adev = kiq_ring->adev;
	uint32_t eng_sel = ring->funcs->type == AMDGPU_RING_TYPE_GFX ? 4 : 0;

	if (adev->enable_mes && !adev->gfx.kiq[0].ring.sched.ready) {
		amdgpu_mes_unmap_legacy_queue(adev, ring, action, gpu_addr,
					      seq, kiq_ring->xcc_id);
		return;
	}

	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_UNMAP_QUEUES, 4));
	amdgpu_ring_write(kiq_ring, /* Q_sel: 0, vmid: 0, engine: 0, num_Q: 1 */
			  PACKET3_UNMAP_QUEUES_ACTION(action) |
			  PACKET3_UNMAP_QUEUES_QUEUE_SEL(0) |
			  PACKET3_UNMAP_QUEUES_ENGINE_SEL(eng_sel) |
			  PACKET3_UNMAP_QUEUES_NUM_QUEUES(1));
	amdgpu_ring_write(kiq_ring,
		  PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(ring->doorbell_index));

	if (action == PREEMPT_QUEUES_NO_UNMAP) {
		amdgpu_ring_write(kiq_ring, lower_32_bits(gpu_addr));
		amdgpu_ring_write(kiq_ring, upper_32_bits(gpu_addr));
		amdgpu_ring_write(kiq_ring, seq);
	} else {
		amdgpu_ring_write(kiq_ring, 0);
		amdgpu_ring_write(kiq_ring, 0);
		amdgpu_ring_write(kiq_ring, 0);
	}
}

static void gfx_v12_1_kiq_query_status(struct amdgpu_ring *kiq_ring,
				       struct amdgpu_ring *ring,
				       u64 addr, u64 seq)
{
	uint32_t eng_sel = ring->funcs->type == AMDGPU_RING_TYPE_GFX ? 4 : 0;

	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_QUERY_STATUS, 5));
	amdgpu_ring_write(kiq_ring,
			  PACKET3_QUERY_STATUS_CONTEXT_ID(0) |
			  PACKET3_QUERY_STATUS_INTERRUPT_SEL(0) |
			  PACKET3_QUERY_STATUS_COMMAND(2));
	amdgpu_ring_write(kiq_ring, /* Q_sel: 0, vmid: 0, engine: 0, num_Q: 1 */
			  PACKET3_QUERY_STATUS_DOORBELL_OFFSET(ring->doorbell_index) |
			  PACKET3_QUERY_STATUS_ENG_SEL(eng_sel));
	amdgpu_ring_write(kiq_ring, lower_32_bits(addr));
	amdgpu_ring_write(kiq_ring, upper_32_bits(addr));
	amdgpu_ring_write(kiq_ring, lower_32_bits(seq));
	amdgpu_ring_write(kiq_ring, upper_32_bits(seq));
}

static void gfx_v12_1_kiq_invalidate_tlbs(struct amdgpu_ring *kiq_ring,
					  uint16_t pasid,
					  uint32_t flush_type,
					  bool all_hub)
{
	gfx_v12_1_ring_invalidate_tlbs(kiq_ring, pasid, flush_type, all_hub, 1);
}

static const struct kiq_pm4_funcs gfx_v12_1_kiq_pm4_funcs = {
	.kiq_set_resources = gfx_v12_1_kiq_set_resources,
	.kiq_map_queues = gfx_v12_1_kiq_map_queues,
	.kiq_unmap_queues = gfx_v12_1_kiq_unmap_queues,
	.kiq_query_status = gfx_v12_1_kiq_query_status,
	.kiq_invalidate_tlbs = gfx_v12_1_kiq_invalidate_tlbs,
	.set_resources_size = 8,
	.map_queues_size = 7,
	.unmap_queues_size = 6,
	.query_status_size = 7,
	.invalidate_tlbs_size = 2,
};
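
/*
 * The *_size fields are the lengths, in dwords, of the packets the
 * helpers above emit (PACKET3 header included).  Callers use them to
 * reserve ring space before emitting, so they must stay in sync with
 * the number of amdgpu_ring_write() calls in each helper.
 */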

static void gfx_v12_1_set_kiq_pm4_funcs(struct amdgpu_device *adev)
{
	int i, num_xcc;

	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
	for (i = 0; i < num_xcc; i++)
		adev->gfx.kiq[i].pmf = &gfx_v12_1_kiq_pm4_funcs;
}

static void gfx_v12_1_wait_reg_mem(struct amdgpu_ring *ring, int eng_sel,
				   int mem_space, int opt, uint32_t addr0,
				   uint32_t addr1, uint32_t ref,
				   uint32_t mask, uint32_t inv)
{
	if (mem_space == 0) {
		addr0 = soc_v1_0_normalize_xcc_reg_offset(addr0);
		addr1 = soc_v1_0_normalize_xcc_reg_offset(addr1);
	}

	amdgpu_ring_write(ring, PACKET3(PACKET3_WAIT_REG_MEM, 5));
	amdgpu_ring_write(ring,
			  /* memory (1) or register (0) */
			  (WAIT_REG_MEM_MEM_SPACE(mem_space) |
			   WAIT_REG_MEM_OPERATION(opt) | /* wait */
			   WAIT_REG_MEM_FUNCTION(3) |  /* equal */
			   WAIT_REG_MEM_ENGINE(eng_sel)));

	if (mem_space)
		BUG_ON(addr0 & 0x3); /* Dword align */
	amdgpu_ring_write(ring, addr0);
	amdgpu_ring_write(ring, addr1);
	amdgpu_ring_write(ring, ref);
	amdgpu_ring_write(ring, mask);
	amdgpu_ring_write(ring, inv); /* poll interval */
}

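/*
 * Basic ring test: seed SCRATCH_REG0 with 0xCAFEDEAD, emit a packet
 * that writes 0xDEADBEEF to it, and poll until the value lands or the
 * timeout expires.  The KIQ path uses the generic wreg packet rather
 * than SET_UCONFIG_REG.
 */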
static int gfx_v12_1_ring_test_ring(struct amdgpu_ring *ring)
{
	struct amdgpu_device *adev = ring->adev;
	uint32_t scratch_reg0_offset, xcc_offset;
	uint32_t tmp = 0;
	unsigned i;
	int r;

	/* Use register offset which is local to XCC in the packet */
	xcc_offset = SOC15_REG_OFFSET(GC, 0, regSCRATCH_REG0);
	scratch_reg0_offset = SOC15_REG_OFFSET(GC, GET_INST(GC, ring->xcc_id), regSCRATCH_REG0);
	WREG32(scratch_reg0_offset, 0xCAFEDEAD);
	tmp = RREG32(scratch_reg0_offset);

	r = amdgpu_ring_alloc(ring, 5);
	if (r) {
		dev_err(adev->dev,
			"amdgpu: cp failed to lock ring %d (%d).\n",
			ring->idx, r);
		return r;
	}

	if (ring->funcs->type == AMDGPU_RING_TYPE_KIQ) {
		gfx_v12_1_ring_emit_wreg(ring, xcc_offset, 0xDEADBEEF);
	} else {
		amdgpu_ring_write(ring, PACKET3(PACKET3_SET_UCONFIG_REG, 1));
		amdgpu_ring_write(ring, xcc_offset -
				  PACKET3_SET_UCONFIG_REG_START);
		amdgpu_ring_write(ring, 0xDEADBEEF);
	}
	amdgpu_ring_commit(ring);

	for (i = 0; i < adev->usec_timeout; i++) {
		tmp = RREG32(scratch_reg0_offset);
		if (tmp == 0xDEADBEEF)
			break;
		if (amdgpu_emu_mode == 1)
			msleep(1);
		else
			udelay(1);
	}

	if (i >= adev->usec_timeout)
		r = -ETIMEDOUT;
	return r;
}

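/*
 * IB test: build a tiny indirect buffer holding a WRITE_DATA packet
 * (dst_sel 5 = memory) that stores 0xDEADBEEF into a writeback slot,
 * schedule it, and verify the slot after the fence signals.
 */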
static int gfx_v12_1_ring_test_ib(struct amdgpu_ring *ring, long timeout)
{
	struct amdgpu_device *adev = ring->adev;
	struct amdgpu_ib ib;
	struct dma_fence *f = NULL;
	unsigned index;
	uint64_t gpu_addr;
	volatile uint32_t *cpu_ptr;
	long r;

	/* MES KIQ firmware doesn't support indirect buffers yet */
	if (adev->enable_mes_kiq &&
	    ring->funcs->type == AMDGPU_RING_TYPE_KIQ)
		return 0;

	memset(&ib, 0, sizeof(ib));

	r = amdgpu_device_wb_get(adev, &index);
	if (r)
		return r;

	gpu_addr = adev->wb.gpu_addr + (index * 4);
	adev->wb.wb[index] = cpu_to_le32(0xCAFEDEAD);
	cpu_ptr = &adev->wb.wb[index];

	r = amdgpu_ib_get(adev, NULL, 16, AMDGPU_IB_POOL_DIRECT, &ib);
	if (r) {
		dev_err(adev->dev, "amdgpu: failed to get ib (%ld).\n", r);
		goto err1;
	}

	ib.ptr[0] = PACKET3(PACKET3_WRITE_DATA, 3);
	ib.ptr[1] = WRITE_DATA_DST_SEL(5) | WR_CONFIRM;
	ib.ptr[2] = lower_32_bits(gpu_addr);
	ib.ptr[3] = upper_32_bits(gpu_addr);
	ib.ptr[4] = 0xDEADBEEF;
	ib.length_dw = 5;

	r = amdgpu_ib_schedule(ring, 1, &ib, NULL, &f);
	if (r)
		goto err2;

	r = dma_fence_wait_timeout(f, false, timeout);
	if (r == 0) {
		r = -ETIMEDOUT;
		goto err2;
	} else if (r < 0) {
		goto err2;
	}

	if (le32_to_cpu(*cpu_ptr) == 0xDEADBEEF)
		r = 0;
	else
		r = -EINVAL;
err2:
	amdgpu_ib_free(&ib, NULL);
	dma_fence_put(f);
err1:
	amdgpu_device_wb_free(adev, index);
	return r;
}

static void gfx_v12_1_free_microcode(struct amdgpu_device *adev)
{
	amdgpu_ucode_release(&adev->gfx.rlc_fw);
	amdgpu_ucode_release(&adev->gfx.mec_fw);

	kfree(adev->gfx.rlc.register_list_format);
}

static int gfx_v12_1_init_toc_microcode(struct amdgpu_device *adev, const char *ucode_prefix)
{
	const struct psp_firmware_header_v1_0 *toc_hdr;
	int err = 0;

	err = amdgpu_ucode_request(adev, &adev->psp.toc_fw,
				   AMDGPU_UCODE_REQUIRED,
				   "amdgpu/%s_toc.bin", ucode_prefix);
	if (err)
		goto out;

	toc_hdr = (const struct psp_firmware_header_v1_0 *)adev->psp.toc_fw->data;
	adev->psp.toc.fw_version = le32_to_cpu(toc_hdr->header.ucode_version);
	adev->psp.toc.feature_version = le32_to_cpu(toc_hdr->sos.fw_version);
	adev->psp.toc.size_bytes = le32_to_cpu(toc_hdr->header.ucode_size_bytes);
	adev->psp.toc.start_addr = (uint8_t *)toc_hdr +
			le32_to_cpu(toc_hdr->header.ucode_array_offset_bytes);
	return 0;
out:
	amdgpu_ucode_release(&adev->psp.toc_fw);
	return err;
}

static int gfx_v12_1_init_microcode(struct amdgpu_device *adev)
{
	char ucode_prefix[15];
	int err;
	const struct rlc_firmware_header_v2_0 *rlc_hdr;
	uint16_t version_major;
	uint16_t version_minor;

	DRM_DEBUG("\n");

	amdgpu_ucode_ip_version_decode(adev, GC_HWIP, ucode_prefix, sizeof(ucode_prefix));

	if (!amdgpu_sriov_vf(adev)) {
		err = amdgpu_ucode_request(adev, &adev->gfx.rlc_fw,
					   AMDGPU_UCODE_REQUIRED,
					   "amdgpu/%s_rlc.bin", ucode_prefix);
		if (err)
			goto out;
		rlc_hdr = (const struct rlc_firmware_header_v2_0 *)adev->gfx.rlc_fw->data;
		version_major = le16_to_cpu(rlc_hdr->header.header_version_major);
		version_minor = le16_to_cpu(rlc_hdr->header.header_version_minor);
		err = amdgpu_gfx_rlc_init_microcode(adev, version_major, version_minor);
		if (err)
			goto out;
	}

	err = amdgpu_ucode_request(adev, &adev->gfx.mec_fw,
				   AMDGPU_UCODE_REQUIRED,
				   "amdgpu/%s_mec.bin", ucode_prefix);
	if (err)
		goto out;
	amdgpu_gfx_cp_init_microcode(adev, AMDGPU_UCODE_ID_CP_RS64_MEC);
	amdgpu_gfx_cp_init_microcode(adev, AMDGPU_UCODE_ID_CP_RS64_MEC_P0_STACK);
	amdgpu_gfx_cp_init_microcode(adev, AMDGPU_UCODE_ID_CP_RS64_MEC_P1_STACK);
	amdgpu_gfx_cp_init_microcode(adev, AMDGPU_UCODE_ID_CP_RS64_MEC_P2_STACK);
	amdgpu_gfx_cp_init_microcode(adev, AMDGPU_UCODE_ID_CP_RS64_MEC_P3_STACK);

	if (adev->firmware.load_type == AMDGPU_FW_LOAD_RLC_BACKDOOR_AUTO)
		err = gfx_v12_1_init_toc_microcode(adev, ucode_prefix);

	/* only one MEC for gfx 12 */
	adev->gfx.mec2_fw = NULL;

	if (adev->gfx.imu.funcs) {
		if (adev->gfx.imu.funcs->init_microcode) {
			err = adev->gfx.imu.funcs->init_microcode(adev);
			if (err)
				dev_err(adev->dev, "Failed to load imu firmware!\n");
		}
	}

out:
	if (err) {
		amdgpu_ucode_release(&adev->gfx.rlc_fw);
		amdgpu_ucode_release(&adev->gfx.mec_fw);
	}

	return err;
}

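/*
 * Clear-state buffer layout, as produced by gfx_v12_1_get_csb_buffer()
 * and sized by gfx_v12_1_get_csb_size(): one leading dword holding the
 * extent (cluster) count, then for every SECT_CONTEXT extent a
 * [reg_count, reg_index] pair followed by reg_count register values.
 */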
static u32 gfx_v12_1_get_csb_size(struct amdgpu_device *adev)
{
	u32 count = 0;
	const struct cs_section_def *sect = NULL;
	const struct cs_extent_def *ext = NULL;

	count += 1;

	for (sect = gfx12_cs_data; sect->section != NULL; ++sect) {
		if (sect->id == SECT_CONTEXT) {
			for (ext = sect->section; ext->extent != NULL; ++ext)
				count += 2 + ext->reg_count;
		} else
			return 0;
	}

	return count;
}

static void gfx_v12_1_get_csb_buffer(struct amdgpu_device *adev, u32 *buffer)
{
	u32 count = 0, clustercount = 0, i;
	const struct cs_section_def *sect = NULL;
	const struct cs_extent_def *ext = NULL;

	if (adev->gfx.rlc.cs_data == NULL)
		return;
	if (buffer == NULL)
		return;

	count += 1;

	for (sect = adev->gfx.rlc.cs_data; sect->section != NULL; ++sect) {
		if (sect->id == SECT_CONTEXT) {
			for (ext = sect->section; ext->extent != NULL; ++ext) {
				clustercount++;
				buffer[count++] = ext->reg_count;
				buffer[count++] = ext->reg_index;

				for (i = 0; i < ext->reg_count; i++)
					buffer[count++] = cpu_to_le32(ext->extent[i]);
			}
		} else
			return;
	}

	buffer[0] = clustercount;
}

static void gfx_v12_1_rlc_fini(struct amdgpu_device *adev)
{
	/* clear state block */
	amdgpu_bo_free_kernel(&adev->gfx.rlc.clear_state_obj,
			&adev->gfx.rlc.clear_state_gpu_addr,
			(void **)&adev->gfx.rlc.cs_ptr);

	/* jump table block */
	amdgpu_bo_free_kernel(&adev->gfx.rlc.cp_table_obj,
			&adev->gfx.rlc.cp_table_gpu_addr,
			(void **)&adev->gfx.rlc.cp_table_ptr);
}

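/*
 * Register offsets used by the common RLCG register-access helpers to
 * write GC registers indirectly through the RLC scratch registers,
 * e.g. under SR-IOV where direct MMIO access to some registers is
 * blocked for the guest.
 */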
static void gfx_v12_1_init_rlcg_reg_access_ctrl(struct amdgpu_device *adev)
{
	int xcc_id, num_xcc;
	struct amdgpu_rlcg_reg_access_ctrl *reg_access_ctrl;

	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
	for (xcc_id = 0; xcc_id < num_xcc; xcc_id++) {
		reg_access_ctrl = &adev->gfx.rlc.reg_access_ctrl[GET_INST(GC, xcc_id)];
		reg_access_ctrl->scratch_reg0 =
			SOC15_REG_OFFSET(GC, GET_INST(GC, xcc_id), regSCRATCH_REG0);
		reg_access_ctrl->scratch_reg1 =
			SOC15_REG_OFFSET(GC, GET_INST(GC, xcc_id), regSCRATCH_REG1);
		reg_access_ctrl->scratch_reg2 =
			SOC15_REG_OFFSET(GC, GET_INST(GC, xcc_id), regSCRATCH_REG2);
		reg_access_ctrl->scratch_reg3 =
			SOC15_REG_OFFSET(GC, GET_INST(GC, xcc_id), regSCRATCH_REG3);
		reg_access_ctrl->grbm_cntl =
			SOC15_REG_OFFSET(GC, GET_INST(GC, xcc_id), regGRBM_GFX_CNTL);
		reg_access_ctrl->grbm_idx =
			SOC15_REG_OFFSET(GC, GET_INST(GC, xcc_id), regGRBM_GFX_INDEX);
		reg_access_ctrl->spare_int =
			SOC15_REG_OFFSET(GC, GET_INST(GC, xcc_id), regRLC_SPARE_INT_0);
	}
	adev->gfx.rlc.rlcg_reg_access_supported = true;
}

static int gfx_v12_1_rlc_init(struct amdgpu_device *adev)
{
	const struct cs_section_def *cs_data;
	int r, i, num_xcc;

	adev->gfx.rlc.cs_data = gfx12_cs_data;

	cs_data = adev->gfx.rlc.cs_data;

	if (cs_data) {
		/* init clear state block */
		r = amdgpu_gfx_rlc_init_csb(adev);
		if (r)
			return r;
	}

	/* init spm vmid with 0xf */
	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
	for (i = 0; i < num_xcc; i++) {
		if (adev->gfx.rlc.funcs->update_spm_vmid)
			adev->gfx.rlc.funcs->update_spm_vmid(adev, i, NULL, 0xf);
	}

	return 0;
}

static void gfx_v12_1_mec_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->gfx.mec.hpd_eop_obj, NULL, NULL);
	amdgpu_bo_free_kernel(&adev->gfx.mec.mec_fw_obj, NULL, NULL);
	amdgpu_bo_free_kernel(&adev->gfx.mec.mec_fw_data_obj, NULL, NULL);
}

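/*
 * Allocate the HPD EOP buffer: one GFX12_MEC_HPD_SIZE byte region per
 * acquired compute ring on every XCC, used by the MEC as end-of-pipe
 * buffer space for its hardware queues.
 */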
static int gfx_v12_1_mec_init(struct amdgpu_device *adev)
{
	int r, i, num_xcc;
	u32 *hpd;
	size_t mec_hpd_size;

	bitmap_zero(adev->gfx.mec_bitmap[0].queue_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);

	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
	for (i = 0; i < num_xcc; i++)
		bitmap_zero(adev->gfx.mec_bitmap[i].queue_bitmap,
			    AMDGPU_MAX_COMPUTE_QUEUES);

	/* take ownership of the relevant compute queues */
	amdgpu_gfx_compute_queue_acquire(adev);
	mec_hpd_size = adev->gfx.num_compute_rings *
		       GFX12_MEC_HPD_SIZE * num_xcc;

	if (mec_hpd_size) {
		r = amdgpu_bo_create_reserved(adev, mec_hpd_size, PAGE_SIZE,
					      AMDGPU_GEM_DOMAIN_GTT,
					      &adev->gfx.mec.hpd_eop_obj,
					      &adev->gfx.mec.hpd_eop_gpu_addr,
					      (void **)&hpd);
		if (r) {
			dev_warn(adev->dev, "(%d) create HPD EOP bo failed\n", r);
			gfx_v12_1_mec_fini(adev);
			return r;
		}

		memset(hpd, 0, mec_hpd_size);

		amdgpu_bo_kunmap(adev->gfx.mec.hpd_eop_obj);
		amdgpu_bo_unreserve(adev->gfx.mec.hpd_eop_obj);
	}

	return 0;
}

static uint32_t wave_read_ind(struct amdgpu_device *adev,
			      uint32_t xcc_id, uint32_t wave,
			      uint32_t address)
{
	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regSQ_IND_INDEX,
		(wave << SQ_IND_INDEX__WAVE_ID__SHIFT) |
		(address << SQ_IND_INDEX__INDEX__SHIFT));
	return RREG32_SOC15(GC, GET_INST(GC, xcc_id), regSQ_IND_DATA);
}

static void wave_read_regs(struct amdgpu_device *adev,
			   uint32_t xcc_id, uint32_t wave,
			   uint32_t thread, uint32_t regno,
			   uint32_t num, uint32_t *out)
{
	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regSQ_IND_INDEX,
		(wave << SQ_IND_INDEX__WAVE_ID__SHIFT) |
		(regno << SQ_IND_INDEX__INDEX__SHIFT) |
		(thread << SQ_IND_INDEX__WORKITEM_ID__SHIFT) |
		(SQ_IND_INDEX__AUTO_INCR_MASK));
	while (num--)
		*(out++) = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regSQ_IND_DATA);
}

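/*
 * Wave debug readout, built on the SQ_IND_INDEX/SQ_IND_DATA pair
 * above.  The leading "4" emitted by gfx_v12_1_read_wave_data() is a
 * layout version tag; debugfs consumers (e.g. umr) presumably key the
 * field order that follows off of it.
 */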
static void gfx_v12_1_read_wave_data(struct amdgpu_device *adev,
				     uint32_t xcc_id,
				     uint32_t simd, uint32_t wave,
				     uint32_t *dst, int *no_fields)
{
	/* in gfx12 the SIMD_ID is specified as part of the INSTANCE
	 * field when performing a select_se_sh so it should be
	 * zero here */
	WARN_ON(simd != 0);

	/* type 4 wave data */
	dst[(*no_fields)++] = 4;
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_STATUS);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_PC_LO);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_PC_HI);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_EXEC_LO);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_EXEC_HI);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_HW_ID1);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_HW_ID2);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_GPR_ALLOC);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_LDS_ALLOC);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_IB_STS);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_IB_STS2);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_IB_DBG1);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_M0);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_MODE);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_STATE_PRIV);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_EXCP_FLAG_PRIV);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_EXCP_FLAG_USER);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_TRAP_CTRL);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_ACTIVE);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_VALID_AND_IDLE);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_DVGPR_ALLOC_LO);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_DVGPR_ALLOC_HI);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_SCHED_MODE);
}

static void gfx_v12_1_read_wave_sgprs(struct amdgpu_device *adev,
				      uint32_t xcc_id, uint32_t simd,
				      uint32_t wave, uint32_t start,
				      uint32_t size, uint32_t *dst)
{
	WARN_ON(simd != 0);

	wave_read_regs(adev, xcc_id, wave, 0,
		       start + SQIND_WAVE_SGPRS_OFFSET,
		       size, dst);
}

static void gfx_v12_1_read_wave_vgprs(struct amdgpu_device *adev,
				      uint32_t xcc_id, uint32_t simd,
				      uint32_t wave, uint32_t thread,
				      uint32_t start, uint32_t size,
				      uint32_t *dst)
{
	wave_read_regs(adev, xcc_id, wave, thread,
		       start + SQIND_WAVE_VGPRS_OFFSET,
		       size, dst);
}

static void gfx_v12_1_select_me_pipe_q(struct amdgpu_device *adev,
				       u32 me, u32 pipe, u32 q, u32 vm, u32 xcc_id)
{
	soc_v1_0_grbm_select(adev, me, pipe, q, vm, GET_INST(GC, xcc_id));
}

static int gfx_v12_1_get_xccs_per_xcp(struct amdgpu_device *adev)
{
	/* Fill this in when the interface is ready */
	return 1;
}

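/*
 * Map an IH node id to a logical XCC.  The decode below assumes the
 * GC clients start at 2 within each AID (hence the "- 2") and that
 * each AID contributes four XCC instances (hence the "* 4"); the
 * resulting physical instance is then matched back to a logical id
 * through GET_INST().
 */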
static int gfx_v12_1_ih_to_xcc_inst(struct amdgpu_device *adev, int ih_node)
{
	int logic_xcc;
	int xcc = (ih_node & 0x7) - 2 + (ih_node >> 3) * 4;

	for (logic_xcc = 0; logic_xcc < NUM_XCC(adev->gfx.xcc_mask); logic_xcc++) {
		if (xcc == GET_INST(GC, logic_xcc))
			return logic_xcc;
	}

	dev_err(adev->dev, "Couldn't find xcc mapping from IH node\n");
	return -EINVAL;
}

static const struct amdgpu_gfx_funcs gfx_v12_1_gfx_funcs = {
	.get_gpu_clock_counter = &gfx_v12_1_get_gpu_clock_counter,
	.select_se_sh = &gfx_v12_1_xcc_select_se_sh,
	.read_wave_data = &gfx_v12_1_read_wave_data,
	.read_wave_sgprs = &gfx_v12_1_read_wave_sgprs,
	.read_wave_vgprs = &gfx_v12_1_read_wave_vgprs,
	.select_me_pipe_q = &gfx_v12_1_select_me_pipe_q,
	.update_perfmon_mgcg = &gfx_v12_1_update_perf_clk,
	.get_xccs_per_xcp = &gfx_v12_1_get_xccs_per_xcp,
	.ih_node_to_logical_xcc = &gfx_v12_1_ih_to_xcc_inst,
};

static int gfx_v12_1_gpu_early_init(struct amdgpu_device *adev)
{
	switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
	case IP_VERSION(12, 1, 0):
		adev->gfx.config.max_hw_contexts = 8;
		adev->gfx.config.sc_prim_fifo_size_frontend = 0x20;
		adev->gfx.config.sc_prim_fifo_size_backend = 0x100;
		adev->gfx.config.sc_hiz_tile_fifo_size = 0;
		adev->gfx.config.sc_earlyz_tile_fifo_size = 0x4C0;
		break;
	default:
		BUG();
		break;
	}

	return 0;
}

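/*
 * Each XCC gets its own doorbell range (xcc_doorbell_range) starting
 * at mec_ring0.  The computed index is shifted left by one because
 * the doorbell index space is in 32-bit units while these queues use
 * 64-bit doorbells.
 */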
static int gfx_v12_1_compute_ring_init(struct amdgpu_device *adev, int ring_id,
				       int xcc_id, int mec, int pipe, int queue)
{
	int r;
	unsigned irq_type;
	struct amdgpu_ring *ring;
	unsigned int hw_prio;
	uint32_t xcc_doorbell_start;

	ring = &adev->gfx.compute_ring[xcc_id * adev->gfx.num_compute_rings +
				       ring_id];

	/* mec0 is me1 */
	ring->xcc_id = xcc_id;
	ring->me = mec + 1;
	ring->pipe = pipe;
	ring->queue = queue;

	ring->ring_obj = NULL;
	ring->use_doorbell = true;
	xcc_doorbell_start = adev->doorbell_index.mec_ring0 +
			     xcc_id * adev->doorbell_index.xcc_doorbell_range;
	ring->doorbell_index = (xcc_doorbell_start + ring_id) << 1;
	ring->eop_gpu_addr = adev->gfx.mec.hpd_eop_gpu_addr +
			     (ring_id + xcc_id * adev->gfx.num_compute_rings) *
			     GFX12_MEC_HPD_SIZE;
	ring->vm_hub = AMDGPU_GFXHUB(xcc_id);
	sprintf(ring->name, "comp_%d.%d.%d.%d",
			ring->xcc_id, ring->me, ring->pipe, ring->queue);

	irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP
		+ ((ring->me - 1) * adev->gfx.mec.num_pipe_per_mec)
		+ ring->pipe;
	hw_prio = amdgpu_gfx_is_high_priority_compute_queue(adev, ring) ?
			AMDGPU_GFX_PIPE_PRIO_HIGH : AMDGPU_GFX_PIPE_PRIO_NORMAL;
	/* type-2 packets are deprecated on MEC, use type-3 instead */
	r = amdgpu_ring_init(adev, ring, 1024, &adev->gfx.eop_irq, irq_type,
			     hw_prio, NULL);
	if (r)
		return r;

	return 0;
}

static struct {
	SOC24_FIRMWARE_ID	id;
	unsigned int		offset;
	unsigned int		size;
	unsigned int		size_x16;
	unsigned int		num_inst;
} rlc_autoload_info[SOC24_FIRMWARE_ID_MAX];

#define RLC_TOC_OFFSET_DWUNIT   8
#define RLC_SIZE_MULTIPLE       1024
#define RLC_TOC_UMF_SIZE_inM	23ULL
#define RLC_TOC_FORMAT_API	165ULL

#define RLC_NUM_INS_CODE0   1
#define RLC_NUM_INS_CODE1   8
#define RLC_NUM_INS_CODE2   2
#define RLC_NUM_INS_CODE3   16

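/*
 * Decode the RLC table of contents: offsets are stored in 8-dword
 * units and sizes in dwords, unless size_x16 is set, in which case the
 * size is in RLC_SIZE_MULTIPLE-dword blocks.  vfflr_image_code selects
 * how many instances of the image live in the autoload buffer (see
 * RLC_NUM_INS_CODE*); the copy routine below decides which instances
 * are actually populated.
 */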
static void gfx_v12_1_parse_rlc_toc(struct amdgpu_device *adev, void *rlc_toc)
{
	RLC_TABLE_OF_CONTENT_V2 *ucode = rlc_toc;

	while (ucode && (ucode->id > SOC24_FIRMWARE_ID_INVALID)) {
		rlc_autoload_info[ucode->id].id = ucode->id;
		rlc_autoload_info[ucode->id].offset =
			ucode->offset * RLC_TOC_OFFSET_DWUNIT * 4;
		rlc_autoload_info[ucode->id].size =
			ucode->size_x16 ? ucode->size * RLC_SIZE_MULTIPLE * 4 :
					  ucode->size * 4;
		switch (ucode->vfflr_image_code) {
		case 0:
			rlc_autoload_info[ucode->id].num_inst =
				RLC_NUM_INS_CODE0;
			break;
		case 1:
			rlc_autoload_info[ucode->id].num_inst =
				RLC_NUM_INS_CODE1;
			break;
		case 2:
			rlc_autoload_info[ucode->id].num_inst =
				RLC_NUM_INS_CODE2;
			break;
		case 3:
			rlc_autoload_info[ucode->id].num_inst =
				RLC_NUM_INS_CODE3;
			break;
		default:
			dev_err(adev->dev,
				"Invalid Instance number detected\n");
			break;
		}
		ucode++;
	}
}

static uint32_t gfx_v12_1_calc_toc_total_size(struct amdgpu_device *adev)
{
	uint32_t total_size = 0;
	SOC24_FIRMWARE_ID id;

	gfx_v12_1_parse_rlc_toc(adev, adev->psp.toc.start_addr);

	for (id = SOC24_FIRMWARE_ID_RLC_G_UCODE; id < SOC24_FIRMWARE_ID_MAX; id++)
		total_size += rlc_autoload_info[id].size;

	/* The offsets in the RLC TOC may carry alignment padding, so
	 * make sure total_size still covers the last entry */
	if (total_size < rlc_autoload_info[SOC24_FIRMWARE_ID_MAX-1].offset)
		total_size = rlc_autoload_info[SOC24_FIRMWARE_ID_MAX-1].offset +
			rlc_autoload_info[SOC24_FIRMWARE_ID_MAX-1].size;
	if (total_size < (RLC_TOC_UMF_SIZE_inM << 20))
		total_size = RLC_TOC_UMF_SIZE_inM << 20;

	return total_size;
}

static int gfx_v12_1_rlc_autoload_buffer_init(struct amdgpu_device *adev)
{
	int r;
	uint32_t total_size;

	total_size = gfx_v12_1_calc_toc_total_size(adev);

	r = amdgpu_bo_create_reserved(adev, total_size, 64 * 1024,
				      AMDGPU_GEM_DOMAIN_VRAM,
				      &adev->gfx.rlc.rlc_autoload_bo,
				      &adev->gfx.rlc.rlc_autoload_gpu_addr,
				      (void **)&adev->gfx.rlc.rlc_autoload_ptr);

	if (r) {
		dev_err(adev->dev, "(%d) failed to create fw autoload bo\n", r);
		return r;
	}

	return 0;
}

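/*
 * Copy one firmware image into the autoload buffer, replicated
 * num_inst times.  The (1 << (i / 2)) test against xcc_mask appears to
 * assume two instance slots per XCC, so slots belonging to absent XCCs
 * are skipped; images smaller than their TOC slot are zero-padded.
 */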
static void gfx_v12_1_rlc_backdoor_autoload_copy_ucode(struct amdgpu_device *adev,
						       SOC24_FIRMWARE_ID id,
						       const void *fw_data,
						       uint32_t fw_size)
{
	uint32_t toc_offset;
	uint32_t toc_fw_size, toc_fw_inst_size;
	char *ptr = adev->gfx.rlc.rlc_autoload_ptr;
	int i, num_inst;

	if (id <= SOC24_FIRMWARE_ID_INVALID || id >= SOC24_FIRMWARE_ID_MAX)
		return;

	toc_offset = rlc_autoload_info[id].offset;
	toc_fw_size = rlc_autoload_info[id].size;
	num_inst = rlc_autoload_info[id].num_inst;
	toc_fw_inst_size = toc_fw_size / num_inst;

	if (fw_size == 0)
		fw_size = toc_fw_inst_size;

	if (fw_size > toc_fw_inst_size)
		fw_size = toc_fw_inst_size;

	for (i = 0; i < num_inst; i++) {
		if ((num_inst == RLC_NUM_INS_CODE0) ||
		    ((1 << (i / 2)) & adev->gfx.xcc_mask)) {
			memcpy(ptr + toc_offset + i * toc_fw_inst_size, fw_data, fw_size);

			if (fw_size < toc_fw_inst_size)
				memset(ptr + toc_offset + fw_size + i * toc_fw_inst_size,
				       0, toc_fw_inst_size - fw_size);
		}
	}
}

static void
gfx_v12_1_rlc_backdoor_autoload_copy_toc_ucode(struct amdgpu_device *adev)
{
	void *data;
	uint32_t size;
	uint32_t *toc_ptr;

	data = adev->psp.toc.start_addr;
	size = rlc_autoload_info[SOC24_FIRMWARE_ID_RLC_TOC].size;

	toc_ptr = (uint32_t *)data + size / 4 - 2;
	*toc_ptr = (RLC_TOC_FORMAT_API << 24) | 0x1;

	gfx_v12_1_rlc_backdoor_autoload_copy_ucode(adev, SOC24_FIRMWARE_ID_RLC_TOC,
						   data, size);
}

static void
gfx_v12_1_rlc_backdoor_autoload_copy_gfx_ucode(struct amdgpu_device *adev)
{
	const __le32 *fw_data;
	uint32_t fw_size;
	const struct gfx_firmware_header_v2_0 *cpv2_hdr;
	const struct rlc_firmware_header_v2_0 *rlc_hdr;
	const struct rlc_firmware_header_v2_1 *rlcv21_hdr;
	const struct rlc_firmware_header_v2_2 *rlcv22_hdr;
	uint16_t version_major, version_minor;

	/* mec ucode */
	cpv2_hdr = (const struct gfx_firmware_header_v2_0 *)
		adev->gfx.mec_fw->data;
	/* instruction */
	fw_data = (const __le32 *) (adev->gfx.mec_fw->data +
		le32_to_cpu(cpv2_hdr->ucode_offset_bytes));
	fw_size = le32_to_cpu(cpv2_hdr->ucode_size_bytes);
	gfx_v12_1_rlc_backdoor_autoload_copy_ucode(adev, SOC24_FIRMWARE_ID_RS64_MEC,
						   fw_data, fw_size);
	/* data */
	fw_data = (const __le32 *) (adev->gfx.mec_fw->data +
		le32_to_cpu(cpv2_hdr->data_offset_bytes));
	fw_size = le32_to_cpu(cpv2_hdr->data_size_bytes);
	gfx_v12_1_rlc_backdoor_autoload_copy_ucode(adev, SOC24_FIRMWARE_ID_RS64_MEC_P0_STACK,
						   fw_data, fw_size);
	gfx_v12_1_rlc_backdoor_autoload_copy_ucode(adev, SOC24_FIRMWARE_ID_RS64_MEC_P1_STACK,
						   fw_data, fw_size);
	gfx_v12_1_rlc_backdoor_autoload_copy_ucode(adev, SOC24_FIRMWARE_ID_RS64_MEC_P2_STACK,
						   fw_data, fw_size);
	gfx_v12_1_rlc_backdoor_autoload_copy_ucode(adev, SOC24_FIRMWARE_ID_RS64_MEC_P3_STACK,
						   fw_data, fw_size);

	/* rlc ucode */
	rlc_hdr = (const struct rlc_firmware_header_v2_0 *)
		adev->gfx.rlc_fw->data;
	fw_data = (const __le32 *)(adev->gfx.rlc_fw->data +
			le32_to_cpu(rlc_hdr->header.ucode_array_offset_bytes));
	fw_size = le32_to_cpu(rlc_hdr->header.ucode_size_bytes);
	gfx_v12_1_rlc_backdoor_autoload_copy_ucode(adev, SOC24_FIRMWARE_ID_RLC_G_UCODE,
						   fw_data, fw_size);

	version_major = le16_to_cpu(rlc_hdr->header.header_version_major);
	version_minor = le16_to_cpu(rlc_hdr->header.header_version_minor);
	if (version_major == 2) {
		if (version_minor >= 1) {
			rlcv21_hdr = (const struct rlc_firmware_header_v2_1 *)adev->gfx.rlc_fw->data;

			fw_data = (const __le32 *)(adev->gfx.rlc_fw->data +
					le32_to_cpu(rlcv21_hdr->save_restore_list_gpm_offset_bytes));
			fw_size = le32_to_cpu(rlcv21_hdr->save_restore_list_gpm_size_bytes);
			gfx_v12_1_rlc_backdoor_autoload_copy_ucode(adev, SOC24_FIRMWARE_ID_RLCG_SCRATCH,
						   fw_data, fw_size);

			fw_data = (const __le32 *)(adev->gfx.rlc_fw->data +
					le32_to_cpu(rlcv21_hdr->save_restore_list_srm_offset_bytes));
			fw_size = le32_to_cpu(rlcv21_hdr->save_restore_list_srm_size_bytes);
			gfx_v12_1_rlc_backdoor_autoload_copy_ucode(adev, SOC24_FIRMWARE_ID_RLC_SRM_ARAM,
						   fw_data, fw_size);
		}
		if (version_minor >= 2) {
			rlcv22_hdr = (const struct rlc_firmware_header_v2_2 *)adev->gfx.rlc_fw->data;

			fw_data = (const __le32 *)(adev->gfx.rlc_fw->data +
					le32_to_cpu(rlcv22_hdr->rlc_iram_ucode_offset_bytes));
			fw_size = le32_to_cpu(rlcv22_hdr->rlc_iram_ucode_size_bytes);
			gfx_v12_1_rlc_backdoor_autoload_copy_ucode(adev, SOC24_FIRMWARE_ID_RLX6_UCODE,
						   fw_data, fw_size);

			fw_data = (const __le32 *)(adev->gfx.rlc_fw->data +
					le32_to_cpu(rlcv22_hdr->rlc_dram_ucode_offset_bytes));
			fw_size = le32_to_cpu(rlcv22_hdr->rlc_dram_ucode_size_bytes);
			gfx_v12_1_rlc_backdoor_autoload_copy_ucode(adev, SOC24_FIRMWARE_ID_RLX6_DRAM_BOOT,
						   fw_data, fw_size);
		}
	}
}

static void
gfx_v12_1_rlc_backdoor_autoload_copy_sdma_ucode(struct amdgpu_device *adev)
{
	const __le32 *fw_data;
	uint32_t fw_size;
	const struct sdma_firmware_header_v3_0 *sdma_hdr;

	if (adev->sdma.instance[0].fw) {
		sdma_hdr = (const struct sdma_firmware_header_v3_0 *)
			adev->sdma.instance[0].fw->data;
		fw_data = (const __le32 *) (adev->sdma.instance[0].fw->data +
				le32_to_cpu(sdma_hdr->ucode_offset_bytes));
		fw_size = le32_to_cpu(sdma_hdr->ucode_size_bytes);

		gfx_v12_1_rlc_backdoor_autoload_copy_ucode(adev, SOC24_FIRMWARE_ID_SDMA_UCODE_TH0,
							   fw_data, fw_size);
	}
}

static void
gfx_v12_1_rlc_backdoor_autoload_copy_mes_ucode(struct amdgpu_device *adev)
{
	const __le32 *fw_data;
	unsigned fw_size;
	const struct mes_firmware_header_v1_0 *mes_hdr;
	int pipe, ucode_id, data_id;

	for (pipe = 0; pipe < 2; pipe++) {
		if (pipe == 0) {
			ucode_id = SOC24_FIRMWARE_ID_RS64_MES_P0;
			data_id  = SOC24_FIRMWARE_ID_RS64_MES_P0_STACK;
		} else {
			ucode_id = SOC24_FIRMWARE_ID_RS64_MES_P1;
			data_id  = SOC24_FIRMWARE_ID_RS64_MES_P1_STACK;
		}

		mes_hdr = (const struct mes_firmware_header_v1_0 *)
			adev->mes.fw[pipe]->data;

		fw_data = (const __le32 *)(adev->mes.fw[pipe]->data +
				le32_to_cpu(mes_hdr->mes_ucode_offset_bytes));
		fw_size = le32_to_cpu(mes_hdr->mes_ucode_size_bytes);

		gfx_v12_1_rlc_backdoor_autoload_copy_ucode(adev, ucode_id, fw_data, fw_size);

		fw_data = (const __le32 *)(adev->mes.fw[pipe]->data +
				le32_to_cpu(mes_hdr->mes_ucode_data_offset_bytes));
		fw_size = le32_to_cpu(mes_hdr->mes_ucode_data_size_bytes);

		gfx_v12_1_rlc_backdoor_autoload_copy_ucode(adev, data_id, fw_data, fw_size);
	}
}

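/*
 * RLC backdoor autoload: once the TOC-described images have been
 * copied into the autoload buffer (sequence 2), each XCC's IMU is
 * pointed at the RLC_G image through the BOOTLOADER_ADDR/SIZE
 * registers, the IMU firmware is loaded (sequence 3), and the RLC GPM
 * threads are finally unhalted so the RLC can pull in the remaining
 * images on its own.
 */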
static int gfx_v12_1_rlc_backdoor_autoload_enable(struct amdgpu_device *adev)
{
	uint32_t rlc_g_offset, rlc_g_size;
	uint64_t gpu_addr;
	uint32_t data;
	int i, num_xcc;

	/* RLC autoload sequence 2: copy ucode */
	gfx_v12_1_rlc_backdoor_autoload_copy_sdma_ucode(adev);
	gfx_v12_1_rlc_backdoor_autoload_copy_gfx_ucode(adev);
	gfx_v12_1_rlc_backdoor_autoload_copy_mes_ucode(adev);
	gfx_v12_1_rlc_backdoor_autoload_copy_toc_ucode(adev);

	rlc_g_offset = rlc_autoload_info[SOC24_FIRMWARE_ID_RLC_G_UCODE].offset;
	rlc_g_size = rlc_autoload_info[SOC24_FIRMWARE_ID_RLC_G_UCODE].size;
	gpu_addr = adev->gfx.rlc.rlc_autoload_gpu_addr + rlc_g_offset - adev->gmc.vram_start;

	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
	for (i = 0; i < num_xcc; i++) {
		WREG32_SOC15(GC, GET_INST(GC, i),
			     regGFX_IMU_RLC_BOOTLOADER_ADDR_HI,
			     upper_32_bits(gpu_addr));
		WREG32_SOC15(GC, GET_INST(GC, i),
			     regGFX_IMU_RLC_BOOTLOADER_ADDR_LO,
			     lower_32_bits(gpu_addr));
		WREG32_SOC15(GC, GET_INST(GC, i),
			     regGFX_IMU_RLC_BOOTLOADER_SIZE,
			     rlc_g_size);
	}

	if (adev->gfx.imu.funcs) {
		/* RLC autoload sequence 3: load IMU fw */
		if (adev->gfx.imu.funcs->load_microcode)
			adev->gfx.imu.funcs->load_microcode(adev);
	}

	/* unhalt rlc to start autoload */
	for (i = 0; i < num_xcc; i++) {
		data = RREG32_SOC15(GC, GET_INST(GC, i), regRLC_GPM_THREAD_ENABLE);
		data = REG_SET_FIELD(data, RLC_GPM_THREAD_ENABLE, THREAD0_ENABLE, 1);
		data = REG_SET_FIELD(data, RLC_GPM_THREAD_ENABLE, THREAD1_ENABLE, 1);
		WREG32_SOC15(GC, GET_INST(GC, i), regRLC_GPM_THREAD_ENABLE, data);
		WREG32_SOC15(GC, GET_INST(GC, i), regRLC_CNTL, RLC_CNTL__RLC_ENABLE_F32_MASK);
	}

	return 0;
}

static int gfx_v12_1_sw_init(struct amdgpu_ip_block *ip_block)
{
	int i, j, k, r, ring_id = 0;
	unsigned num_compute_rings;
	int xcc_id, num_xcc;
	struct amdgpu_device *adev = ip_block->adev;

	switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
	case IP_VERSION(12, 1, 0):
		adev->gfx.mec.num_mec = 1;
		adev->gfx.mec.num_pipe_per_mec = 4;
		adev->gfx.mec.num_queue_per_pipe = 8;
		break;
	default:
		adev->gfx.mec.num_mec = 2;
		adev->gfx.mec.num_pipe_per_mec = 2;
		adev->gfx.mec.num_queue_per_pipe = 4;
		break;
	}

	/* recalculate compute rings to use based on hardware configuration */
	num_compute_rings = (adev->gfx.mec.num_pipe_per_mec *
			     adev->gfx.mec.num_queue_per_pipe) / 2;
	adev->gfx.num_compute_rings = min(adev->gfx.num_compute_rings,
					  num_compute_rings);

	num_xcc = NUM_XCC(adev->gfx.xcc_mask);

	/* EOP Event */
	r = amdgpu_irq_add_id(adev, SOC_V1_0_IH_CLIENTID_GRBM_CP,
			      GFX_12_1_0__SRCID__CP_EOP_INTERRUPT,
			      &adev->gfx.eop_irq);
	if (r)
		return r;

	/* Privileged reg */
	r = amdgpu_irq_add_id(adev, SOC_V1_0_IH_CLIENTID_GRBM_CP,
			      GFX_12_1_0__SRCID__CP_PRIV_REG_FAULT,
			      &adev->gfx.priv_reg_irq);
	if (r)
		return r;

	/* Privileged inst */
	r = amdgpu_irq_add_id(adev, SOC_V1_0_IH_CLIENTID_GRBM_CP,
			      GFX_12_1_0__SRCID__CP_PRIV_INSTR_FAULT,
			      &adev->gfx.priv_inst_irq);
	if (r)
		return r;

	adev->gfx.gfx_current_status = AMDGPU_GFX_NORMAL_MODE;

	r = gfx_v12_1_rlc_init(adev);
	if (r) {
		dev_err(adev->dev, "Failed to init rlc BOs!\n");
		return r;
	}

	r = gfx_v12_1_mec_init(adev);
	if (r) {
		dev_err(adev->dev, "Failed to init MEC BOs!\n");
		return r;
	}

	/* set up the compute queues - allocate horizontally across pipes */
	for (xcc_id = 0; xcc_id < num_xcc; xcc_id++) {
		ring_id = 0;
		for (i = 0; i < adev->gfx.mec.num_mec; ++i) {
			for (j = 0; j < adev->gfx.mec.num_queue_per_pipe; j++) {
				for (k = 0; k < adev->gfx.mec.num_pipe_per_mec; k++) {
					if (!amdgpu_gfx_is_mec_queue_enabled(adev,
								xcc_id, i, k, j))
						continue;

					r = gfx_v12_1_compute_ring_init(adev, ring_id,
								xcc_id, i, k, j);
					if (r)
						return r;

					ring_id++;
				}
			}
		}

		if (!adev->enable_mes_kiq) {
			r = amdgpu_gfx_kiq_init(adev, GFX12_MEC_HPD_SIZE, xcc_id);
			if (r) {
				dev_err(adev->dev, "Failed to init KIQ BOs!\n");
				return r;
			}

			r = amdgpu_gfx_kiq_init_ring(adev, xcc_id);
			if (r)
				return r;
		}

		r = amdgpu_gfx_mqd_sw_init(adev, sizeof(struct v12_1_compute_mqd), xcc_id);
		if (r)
			return r;
	}

	/* allocate visible FB for rlc auto-loading fw */
	if (adev->firmware.load_type == AMDGPU_FW_LOAD_RLC_BACKDOOR_AUTO) {
		r = gfx_v12_1_rlc_autoload_buffer_init(adev);
		if (r)
			return r;
	} else if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT) {
		r = gfx_v12_1_init_cp_compute_microcode_bo(adev);
		if (r)
			return r;
	}

	r = gfx_v12_1_gpu_early_init(adev);
	if (r)
		return r;

	r = amdgpu_gfx_sysfs_init(adev);
	if (r)
		return r;

	return 0;
}

static void gfx_v12_1_rlc_autoload_buffer_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->gfx.rlc.rlc_autoload_bo,
			&adev->gfx.rlc.rlc_autoload_gpu_addr,
			(void **)&adev->gfx.rlc.rlc_autoload_ptr);
}

static int gfx_v12_1_sw_fini(struct amdgpu_ip_block *ip_block)
{
	int i, num_xcc;
	struct amdgpu_device *adev = ip_block->adev;

	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
	for (i = 0; i < adev->gfx.num_compute_rings * num_xcc; i++)
		amdgpu_ring_fini(&adev->gfx.compute_ring[i]);

	for (i = 0; i < num_xcc; i++) {
		amdgpu_gfx_mqd_sw_fini(adev, i);

		if (!adev->enable_mes_kiq) {
			amdgpu_gfx_kiq_free_ring(&adev->gfx.kiq[i].ring);
			amdgpu_gfx_kiq_fini(adev, i);
		}
	}

	gfx_v12_1_rlc_fini(adev);
	gfx_v12_1_mec_fini(adev);

	if (adev->firmware.load_type == AMDGPU_FW_LOAD_RLC_BACKDOOR_AUTO)
		gfx_v12_1_rlc_autoload_buffer_fini(adev);

	gfx_v12_1_free_microcode(adev);

	return 0;
}

static void gfx_v12_1_xcc_select_se_sh(struct amdgpu_device *adev, u32 se_num,
				       u32 sh_num, u32 instance, int xcc_id)
{
	u32 data;

	if (instance == 0xffffffff)
		data = REG_SET_FIELD(0, GRBM_GFX_INDEX,
				     INSTANCE_BROADCAST_WRITES, 1);
	else
		data = REG_SET_FIELD(0, GRBM_GFX_INDEX, INSTANCE_INDEX,
				     instance);

	if (se_num == 0xffffffff)
		data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_BROADCAST_WRITES,
				     1);
	else
		data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_INDEX, se_num);

	if (sh_num == 0xffffffff)
		data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SA_BROADCAST_WRITES,
				     1);
	else
		data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SA_INDEX, sh_num);

	WREG32_SOC15_RLC_SHADOW_EX(reg, GC, GET_INST(GC, xcc_id), regGRBM_GFX_INDEX, data);
}

static u32 gfx_v12_1_get_sa_active_bitmap(struct amdgpu_device *adev,
					  int xcc_id)
{
	u32 gc_disabled_sa_mask, gc_user_disabled_sa_mask, sa_mask;

	gc_disabled_sa_mask = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCC_GC_SA_UNIT_DISABLE);
	gc_disabled_sa_mask = REG_GET_FIELD(gc_disabled_sa_mask,
					    CC_GC_SA_UNIT_DISABLE,
					    SA_DISABLE);
	gc_user_disabled_sa_mask = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regGC_USER_SA_UNIT_DISABLE);
	gc_user_disabled_sa_mask = REG_GET_FIELD(gc_user_disabled_sa_mask,
						 GC_USER_SA_UNIT_DISABLE,
						 SA_DISABLE);
	sa_mask = amdgpu_gfx_create_bitmask(adev->gfx.config.max_sh_per_se *
					    adev->gfx.config.max_shader_engines);

	return sa_mask & (~(gc_disabled_sa_mask | gc_user_disabled_sa_mask));
}

static u32 gfx_v12_1_get_rb_active_bitmap(struct amdgpu_device *adev,
					  int xcc_id)
{
	u32 gc_disabled_rb_mask, gc_user_disabled_rb_mask;
	u32 rb_mask;

	gc_disabled_rb_mask = RREG32_SOC15(GC, GET_INST(GC, xcc_id),
					   regCC_RB_BACKEND_DISABLE);
	gc_disabled_rb_mask = REG_GET_FIELD(gc_disabled_rb_mask,
					    CC_RB_BACKEND_DISABLE,
					    BACKEND_DISABLE);
	gc_user_disabled_rb_mask = RREG32_SOC15(GC, GET_INST(GC, xcc_id),
						regGC_USER_RB_BACKEND_DISABLE);
	gc_user_disabled_rb_mask = REG_GET_FIELD(gc_user_disabled_rb_mask,
						 GC_USER_RB_BACKEND_DISABLE,
						 BACKEND_DISABLE);
	rb_mask = amdgpu_gfx_create_bitmask(adev->gfx.config.max_backends_per_se *
					    adev->gfx.config.max_shader_engines);

	return rb_mask & (~(gc_disabled_rb_mask | gc_user_disabled_rb_mask));
}

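/*
 * Derive the active RB mask from the active SA mask.  The hard-coded
 * 0x3 per-SA pattern below assumes two render backends per shader
 * array; the result is OR'ed with the mask read back from the
 * RB_BACKEND_DISABLE registers.
 */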
static void gfx_v12_1_setup_rb(struct amdgpu_device *adev)
{
	u32 rb_bitmap_width_per_sa;
	u32 max_sa;
	u32 active_sa_bitmap;
	u32 global_active_rb_bitmap;
	u32 active_rb_bitmap = 0;
	u32 i;
	int xcc_id;

	for (xcc_id = 0; xcc_id < NUM_XCC(adev->gfx.xcc_mask); xcc_id++) {
		/* query sa bitmap from SA_UNIT_DISABLE registers */
		active_sa_bitmap = gfx_v12_1_get_sa_active_bitmap(adev, xcc_id);
		/* query rb bitmap from RB_BACKEND_DISABLE registers */
		global_active_rb_bitmap = gfx_v12_1_get_rb_active_bitmap(adev, xcc_id);

		/* generate active rb bitmap according to active sa bitmap */
		max_sa = adev->gfx.config.max_shader_engines *
			 adev->gfx.config.max_sh_per_se;
		rb_bitmap_width_per_sa = adev->gfx.config.max_backends_per_se /
					 adev->gfx.config.max_sh_per_se;
		for (i = 0; i < max_sa; i++) {
			if (active_sa_bitmap & (1 << i))
				active_rb_bitmap |= (0x3 << (i * rb_bitmap_width_per_sa));
		}

		active_rb_bitmap |= global_active_rb_bitmap;
	}

	adev->gfx.config.backend_enable_mask = active_rb_bitmap;
	adev->gfx.config.num_rbs = hweight32(active_rb_bitmap);
}

static void gfx_v12_1_xcc_init_compute_vmid(struct amdgpu_device *adev,
					    int xcc_id)
{
	int i;
	uint32_t sh_mem_bases;
	uint32_t data;

	/*
	 * Configure apertures:
	 * LDS:         0x20000000'00000000 - 0x20000001'00000000 (4GB)
	 * Scratch:     0x10000000'00000000 - 0x10000001'00000000 (4GB)
	 */
	sh_mem_bases = REG_SET_FIELD(0, SH_MEM_BASES, PRIVATE_BASE,
				     (adev->gmc.private_aperture_start >> 58));
	sh_mem_bases = REG_SET_FIELD(sh_mem_bases, SH_MEM_BASES, SHARED_BASE,
				     (adev->gmc.shared_aperture_start >> 48));

	mutex_lock(&adev->srbm_mutex);
	for (i = adev->vm_manager.first_kfd_vmid; i < AMDGPU_NUM_VMID; i++) {
		soc_v1_0_grbm_select(adev, 0, 0, 0, i, GET_INST(GC, xcc_id));
		/* CP and shaders */
		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regSH_MEM_CONFIG, DEFAULT_SH_MEM_CONFIG);
		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regSH_MEM_BASES, sh_mem_bases);

		/* Enable trap for each kfd vmid. */
		data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regSPI_GDBG_PER_VMID_CNTL);
		data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 1);
		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regSPI_GDBG_PER_VMID_CNTL, data);

		/* Disable VGPR deallocation instruction for each KFD vmid. */
		data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regSQ_DEBUG);
		data = REG_SET_FIELD(data, SQ_DEBUG, DISABLE_VGPR_DEALLOC, 1);
		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regSQ_DEBUG, data);
	}
	soc_v1_0_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
	mutex_unlock(&adev->srbm_mutex);
}

static void gfx_v12_1_tcp_harvest(struct amdgpu_device *adev)
{
	/* TODO: harvest feature to be added later. */
}

static void gfx_v12_1_get_tcc_info(struct amdgpu_device *adev)
{
}

static void gfx_v12_1_xcc_constants_init(struct amdgpu_device *adev,
					 int xcc_id)
{
	u32 tmp;
	int i;

	/* XXX SH_MEM regs */
	/* where to put LDS, scratch, GPUVM in FSA64 space */
	mutex_lock(&adev->srbm_mutex);
	for (i = 0; i < adev->vm_manager.id_mgr[AMDGPU_GFXHUB(0)].num_ids; i++) {
		soc_v1_0_grbm_select(adev, 0, 0, 0, i, GET_INST(GC, xcc_id));
		/* CP and shaders */
		WREG32_SOC15(GC, GET_INST(GC, xcc_id),
			     regSH_MEM_CONFIG, DEFAULT_SH_MEM_CONFIG);
		if (i != 0) {
			tmp = REG_SET_FIELD(0, SH_MEM_BASES, PRIVATE_BASE,
				(adev->gmc.private_aperture_start >> 58));
			tmp = REG_SET_FIELD(tmp, SH_MEM_BASES, SHARED_BASE,
				(adev->gmc.shared_aperture_start >> 48));
			WREG32_SOC15(GC, GET_INST(GC, xcc_id), regSH_MEM_BASES, tmp);
		}
	}
	soc_v1_0_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));

	mutex_unlock(&adev->srbm_mutex);

	gfx_v12_1_xcc_init_compute_vmid(adev, xcc_id);
}

static void gfx_v12_1_constants_init(struct amdgpu_device *adev)
{
	int i, num_xcc;

	num_xcc = NUM_XCC(adev->gfx.xcc_mask);

	gfx_v12_1_setup_rb(adev);
	gfx_v12_1_get_cu_info(adev, &adev->gfx.cu_info);
	gfx_v12_1_get_tcc_info(adev);
	adev->gfx.config.pa_sc_tile_steering_override = 0;

	for (i = 0; i < num_xcc; i++)
		gfx_v12_1_xcc_constants_init(adev, i);
}

static void gfx_v12_1_xcc_enable_gui_idle_interrupt(struct amdgpu_device *adev,
						    bool enable, int xcc_id)
{
	u32 tmp;

	if (amdgpu_sriov_vf(adev))
		return;

	tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_INT_CNTL_RING0);

	tmp = REG_SET_FIELD(tmp, CP_INT_CNTL_RING0, CNTX_BUSY_INT_ENABLE,
			    enable ? 1 : 0);
	tmp = REG_SET_FIELD(tmp, CP_INT_CNTL_RING0, CNTX_EMPTY_INT_ENABLE,
			    enable ? 1 : 0);
	tmp = REG_SET_FIELD(tmp, CP_INT_CNTL_RING0, CMP_BUSY_INT_ENABLE,
			    enable ? 1 : 0);
	tmp = REG_SET_FIELD(tmp, CP_INT_CNTL_RING0, GFX_IDLE_INT_ENABLE,
			    enable ? 1 : 0);

	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_INT_CNTL_RING0, tmp);
}

static int gfx_v12_1_xcc_init_csb(struct amdgpu_device *adev,
				  int xcc_id)
{
	adev->gfx.rlc.funcs->get_csb_buffer(adev, adev->gfx.rlc.cs_ptr);

	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CSIB_ADDR_HI,
			adev->gfx.rlc.clear_state_gpu_addr >> 32);
	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CSIB_ADDR_LO,
			adev->gfx.rlc.clear_state_gpu_addr & 0xfffffffc);
	WREG32_SOC15(GC, GET_INST(GC, xcc_id),
		     regRLC_CSIB_LENGTH, adev->gfx.rlc.clear_state_size);

	return 0;
}
1517 
1518 static void gfx_v12_1_xcc_rlc_stop(struct amdgpu_device *adev,
1519 				   int xcc_id)
1520 {
1521 	u32 tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CNTL);
1522 
1523 	tmp = REG_SET_FIELD(tmp, RLC_CNTL, RLC_ENABLE_F32, 0);
1524 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CNTL, tmp);
1525 }
1526 
1527 static void gfx_v12_1_rlc_stop(struct amdgpu_device *adev)
1528 {
1529 	int i, num_xcc;
1530 
1531 	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
1532 	for (i = 0; i < num_xcc; i++)
1533 		gfx_v12_1_xcc_rlc_stop(adev, i);
1534 }
1535 
1536 static void gfx_v12_1_xcc_rlc_reset(struct amdgpu_device *adev,
1537 				    int xcc_id)
1538 {
1539 	WREG32_FIELD15_PREREG(GC, GET_INST(GC, xcc_id),
1540 			      GRBM_SOFT_RESET, SOFT_RESET_RLC, 1);
1541 	udelay(50);
1542 	WREG32_FIELD15_PREREG(GC, GET_INST(GC, xcc_id),
1543 			      GRBM_SOFT_RESET, SOFT_RESET_RLC, 0);
1544 	udelay(50);
1545 }
1546 
1547 static void gfx_v12_1_rlc_reset(struct amdgpu_device *adev)
1548 {
1549 	int i, num_xcc;
1550 
1551 	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
1552 	for (i = 0; i < num_xcc; i++)
1553 		gfx_v12_1_xcc_rlc_reset(adev, i);
1554 }
1555 
1556 static void gfx_v12_1_xcc_rlc_smu_handshake_cntl(struct amdgpu_device *adev,
1557 						 bool enable, int xcc_id)
1558 {
1559 	uint32_t rlc_pg_cntl;
1560 
1561 	rlc_pg_cntl = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_PG_CNTL);
1562 
1563 	if (!enable) {
1564 		/* RLC_PG_CNTL[23] = 0 (default)
1565 		 * RLC will wait for handshake acks with SMU
1566 		 * GFXOFF will be enabled
1567 		 * RLC_PG_CNTL[23] = 1
1568 		 * RLC will not issue any message to SMU
1569 		 * hence no handshake between SMU & RLC
1570 		 * GFXOFF will be disabled
1571 		 */
1572 		rlc_pg_cntl |= RLC_PG_CNTL__SMU_HANDSHAKE_DISABLE_MASK;
	} else {
		rlc_pg_cntl &= ~RLC_PG_CNTL__SMU_HANDSHAKE_DISABLE_MASK;
	}
1575 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_PG_CNTL, rlc_pg_cntl);
1576 }
1577 
1578 static void gfx_v12_1_xcc_rlc_start(struct amdgpu_device *adev,
1579 				    int xcc_id)
1580 {
	/* TODO: keep the RLC <-> SMU handshake disabled until SMU
	 * and the gfxoff feature work as expected
	 */
1583 	if (!(amdgpu_pp_feature_mask & PP_GFXOFF_MASK))
1584 		gfx_v12_1_xcc_rlc_smu_handshake_cntl(adev, false, xcc_id);
1585 
1586 	WREG32_FIELD15_PREREG(GC, GET_INST(GC, xcc_id), RLC_CNTL, RLC_ENABLE_F32, 1);
1587 	udelay(50);
1588 }
1589 
1590 static void gfx_v12_1_rlc_start(struct amdgpu_device *adev)
1591 {
1592 	int i, num_xcc;
1593 
1594 	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
	for (i = 0; i < num_xcc; i++)
		gfx_v12_1_xcc_rlc_start(adev, i);
1598 }
1599 
1600 static void gfx_v12_1_xcc_rlc_enable_srm(struct amdgpu_device *adev,
1601 					 int xcc_id)
1602 {
1603 	uint32_t tmp;
1604 
1605 	/* enable Save Restore Machine */
1606 	tmp = RREG32(SOC15_REG_OFFSET(GC, GET_INST(GC, xcc_id), regRLC_SRM_CNTL));
1607 	tmp |= RLC_SRM_CNTL__AUTO_INCR_ADDR_MASK;
1608 	tmp |= RLC_SRM_CNTL__SRM_ENABLE_MASK;
1609 	WREG32(SOC15_REG_OFFSET(GC, GET_INST(GC, xcc_id), regRLC_SRM_CNTL), tmp);
1610 }
1611 
1612 static void gfx_v12_1_xcc_load_rlcg_microcode(struct amdgpu_device *adev,
1613 					      int xcc_id)
1614 {
1615 	const struct rlc_firmware_header_v2_0 *hdr;
1616 	const __le32 *fw_data;
1617 	unsigned i, fw_size;
1618 
1619 	hdr = (const struct rlc_firmware_header_v2_0 *)adev->gfx.rlc_fw->data;
1620 	fw_data = (const __le32 *)(adev->gfx.rlc_fw->data +
1621 			   le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1622 	fw_size = le32_to_cpu(hdr->header.ucode_size_bytes) / 4;
1623 
1624 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_GPM_UCODE_ADDR,
1625 		     RLCG_UCODE_LOADING_START_ADDRESS);
1626 
1627 	for (i = 0; i < fw_size; i++)
1628 		WREG32_SOC15(GC, GET_INST(GC, xcc_id),
1629 			     regRLC_GPM_UCODE_DATA,
1630 			     le32_to_cpup(fw_data++));
1631 
1632 	WREG32_SOC15(GC, GET_INST(GC, xcc_id),
1633 		     regRLC_GPM_UCODE_ADDR,
1634 		     adev->gfx.rlc_fw_version);
1635 }
1636 
1637 static void gfx_v12_1_xcc_load_rlc_iram_dram_microcode(struct amdgpu_device *adev,
1638 						       int xcc_id)
1639 {
1640 	const struct rlc_firmware_header_v2_2 *hdr;
1641 	const __le32 *fw_data;
1642 	unsigned i, fw_size;
1643 	u32 tmp;
1644 
1645 	hdr = (const struct rlc_firmware_header_v2_2 *)adev->gfx.rlc_fw->data;
1646 
1647 	fw_data = (const __le32 *)(adev->gfx.rlc_fw->data +
1648 			le32_to_cpu(hdr->rlc_iram_ucode_offset_bytes));
1649 	fw_size = le32_to_cpu(hdr->rlc_iram_ucode_size_bytes) / 4;
1650 
1651 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_LX6_IRAM_ADDR, 0);
1652 
1653 	for (i = 0; i < fw_size; i++) {
1654 		if ((amdgpu_emu_mode == 1) && (i % 100 == 99))
1655 			msleep(1);
1656 		WREG32_SOC15(GC, GET_INST(GC, xcc_id),
1657 			     regRLC_LX6_IRAM_DATA,
1658 			     le32_to_cpup(fw_data++));
1659 	}
1660 
1661 	WREG32_SOC15(GC, GET_INST(GC, xcc_id),
1662 		     regRLC_LX6_IRAM_ADDR, adev->gfx.rlc_fw_version);
1663 
1664 	fw_data = (const __le32 *)(adev->gfx.rlc_fw->data +
1665 			le32_to_cpu(hdr->rlc_dram_ucode_offset_bytes));
1666 	fw_size = le32_to_cpu(hdr->rlc_dram_ucode_size_bytes) / 4;
1667 
1668 	WREG32_SOC15(GC, GET_INST(GC, xcc_id),
1669 		     regRLC_LX6_DRAM_ADDR, 0);
1670 	for (i = 0; i < fw_size; i++) {
1671 		if ((amdgpu_emu_mode == 1) && (i % 100 == 99))
1672 			msleep(1);
1673 		WREG32_SOC15(GC, GET_INST(GC, xcc_id),
1674 			     regRLC_LX6_DRAM_DATA,
1675 			     le32_to_cpup(fw_data++));
1676 	}
1677 
1678 	WREG32_SOC15(GC, GET_INST(GC, xcc_id),
1679 		     regRLC_LX6_IRAM_ADDR, adev->gfx.rlc_fw_version);
1680 
1681 	tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_LX6_CNTL);
1682 	tmp = REG_SET_FIELD(tmp, RLC_LX6_CNTL, PDEBUG_ENABLE, 1);
1683 	tmp = REG_SET_FIELD(tmp, RLC_LX6_CNTL, BRESET, 0);
1684 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_LX6_CNTL, tmp);
1685 }
1686 
1687 static int gfx_v12_1_xcc_rlc_load_microcode(struct amdgpu_device *adev,
1688 					    int xcc_id)
1689 {
1690 	const struct rlc_firmware_header_v2_0 *hdr;
1691 	uint16_t version_major;
1692 	uint16_t version_minor;
1693 
1694 	if (!adev->gfx.rlc_fw)
1695 		return -EINVAL;
1696 
1697 	hdr = (const struct rlc_firmware_header_v2_0 *)adev->gfx.rlc_fw->data;
1698 	amdgpu_ucode_print_rlc_hdr(&hdr->header);
1699 
1700 	version_major = le16_to_cpu(hdr->header.header_version_major);
1701 	version_minor = le16_to_cpu(hdr->header.header_version_minor);
1702 
1703 	if (version_major == 2) {
1704 		gfx_v12_1_xcc_load_rlcg_microcode(adev, xcc_id);
1705 		if (amdgpu_dpm == 1) {
1706 			if (version_minor >= 2)
1707 				gfx_v12_1_xcc_load_rlc_iram_dram_microcode(adev, xcc_id);
1708 		}
1709 
1710 		return 0;
1711 	}
1712 
1713 	return -EINVAL;
1714 }
1715 
1716 static int gfx_v12_1_xcc_rlc_resume(struct amdgpu_device *adev,
1717 				    int xcc_id)
1718 {
1719 	int r;
1720 
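	/* with PSP (or SR-IOV) the RLC is already loaded and running, so only
	 * the CSB (and SRM on bare metal) needs to be set up; the legacy path
	 * halts the RLC, optionally loads it by hand and restarts it
	 */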
1721 	if (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP) {
1722 		gfx_v12_1_xcc_init_csb(adev, xcc_id);
1723 
1724 		if (!amdgpu_sriov_vf(adev)) /* enable RLC SRM */
1725 			gfx_v12_1_xcc_rlc_enable_srm(adev, xcc_id);
1726 	} else {
1727 		if (amdgpu_sriov_vf(adev)) {
1728 			gfx_v12_1_xcc_init_csb(adev, xcc_id);
1729 			return 0;
1730 		}
1731 
1732 		gfx_v12_1_xcc_rlc_stop(adev, xcc_id);
1733 
1734 		/* disable CG */
1735 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGCG_CGLS_CTRL, 0);
1736 
1737 		/* disable PG */
1738 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_PG_CNTL, 0);
1739 
1740 		if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT) {
1741 			/* legacy rlc firmware loading */
1742 			r = gfx_v12_1_xcc_rlc_load_microcode(adev, xcc_id);
1743 			if (r)
1744 				return r;
1745 		}
1746 
1747 		gfx_v12_1_xcc_init_csb(adev, xcc_id);
1748 
1749 		gfx_v12_1_xcc_rlc_start(adev, xcc_id);
1750 	}
1751 
1752 	return 0;
1753 }
1754 
1755 static int gfx_v12_1_rlc_resume(struct amdgpu_device *adev)
1756 {
1757 	int r, i, num_xcc;
1758 
1759 	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
1760 	for (i = 0; i < num_xcc; i++) {
1761 		r = gfx_v12_1_xcc_rlc_resume(adev, i);
1762 		if (r)
1763 			return r;
1764 	}
1765 
1766 	return 0;
1767 }
1768 
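/*
 * Program each MEC pipe's RS64 entry address, then pulse the per-pipe
 * resets so the cores restart from the new start address.
 */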
1769 static void gfx_v12_1_xcc_config_gfx_rs64(struct amdgpu_device *adev,
1770 					  int xcc_id)
1771 {
1772 	const struct gfx_firmware_header_v2_0 *mec_hdr;
1773 	uint32_t pipe_id, tmp;
1774 
1775 	mec_hdr = (const struct gfx_firmware_header_v2_0 *)
1776 		adev->gfx.mec_fw->data;
1777 
1778 	/* config mec program start addr */
1779 	for (pipe_id = 0; pipe_id < 4; pipe_id++) {
1780 		soc_v1_0_grbm_select(adev, 1, pipe_id, 0, 0, GET_INST(GC, xcc_id));
1781 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_RS64_PRGRM_CNTR_START,
1782 					mec_hdr->ucode_start_addr_lo >> 2 |
1783 					mec_hdr->ucode_start_addr_hi << 30);
1784 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_RS64_PRGRM_CNTR_START_HI,
1785 					mec_hdr->ucode_start_addr_hi >> 2);
1786 	}
1787 	soc_v1_0_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
1788 
1789 	/* reset mec pipe */
1790 	tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_RS64_CNTL);
1791 	tmp = REG_SET_FIELD(tmp, CP_MEC_RS64_CNTL, MEC_PIPE0_RESET, 1);
1792 	tmp = REG_SET_FIELD(tmp, CP_MEC_RS64_CNTL, MEC_PIPE1_RESET, 1);
1793 	tmp = REG_SET_FIELD(tmp, CP_MEC_RS64_CNTL, MEC_PIPE2_RESET, 1);
1794 	tmp = REG_SET_FIELD(tmp, CP_MEC_RS64_CNTL, MEC_PIPE3_RESET, 1);
1795 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_RS64_CNTL, tmp);
1796 
1797 	/* clear mec pipe reset */
1798 	tmp = REG_SET_FIELD(tmp, CP_MEC_RS64_CNTL, MEC_PIPE0_RESET, 0);
1799 	tmp = REG_SET_FIELD(tmp, CP_MEC_RS64_CNTL, MEC_PIPE1_RESET, 0);
1800 	tmp = REG_SET_FIELD(tmp, CP_MEC_RS64_CNTL, MEC_PIPE2_RESET, 0);
1801 	tmp = REG_SET_FIELD(tmp, CP_MEC_RS64_CNTL, MEC_PIPE3_RESET, 0);
1802 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_RS64_CNTL, tmp);
1803 }
1804 
1805 static void gfx_v12_1_config_gfx_rs64(struct amdgpu_device *adev)
1806 {
1807 	int i, num_xcc;
1808 
1809 	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
1810 
1811 	for (i = 0; i < num_xcc; i++)
1812 		gfx_v12_1_xcc_config_gfx_rs64(adev, i);
1813 }
1814 
1815 static void gfx_v12_1_xcc_set_mec_ucode_start_addr(struct amdgpu_device *adev,
1816 						   int xcc_id)
1817 {
1818 	const struct gfx_firmware_header_v2_0 *cp_hdr;
1819 	unsigned pipe_id;
1820 
1821 	cp_hdr = (const struct gfx_firmware_header_v2_0 *)
1822 		adev->gfx.mec_fw->data;
1823 	mutex_lock(&adev->srbm_mutex);
1824 	for (pipe_id = 0; pipe_id < adev->gfx.mec.num_pipe_per_mec; pipe_id++) {
1825 		soc_v1_0_grbm_select(adev, 1, pipe_id, 0, 0, GET_INST(GC, xcc_id));
1826 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_RS64_PRGRM_CNTR_START,
1827 			     cp_hdr->ucode_start_addr_lo >> 2 |
1828 			     cp_hdr->ucode_start_addr_hi << 30);
1829 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_RS64_PRGRM_CNTR_START_HI,
1830 			     cp_hdr->ucode_start_addr_hi >> 2);
1831 	}
1832 	soc_v1_0_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
1833 	mutex_unlock(&adev->srbm_mutex);
1834 }
1835 
1836 static int gfx_v12_1_xcc_wait_for_rlc_autoload_complete(struct amdgpu_device *adev,
1837 							int xcc_id)
1838 {
1839 	uint32_t cp_status;
1840 	uint32_t bootload_status;
1841 	int i;
1842 
1843 	for (i = 0; i < adev->usec_timeout; i++) {
1844 		cp_status = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_STAT);
1845 		bootload_status = RREG32_SOC15(GC, GET_INST(GC, xcc_id),
1846 					       regRLC_RLCS_BOOTLOAD_STATUS);
1847 
1848 		if ((cp_status == 0) &&
1849 		    (REG_GET_FIELD(bootload_status,
1850 			RLC_RLCS_BOOTLOAD_STATUS, BOOTLOAD_COMPLETE) == 1)) {
1851 			break;
1852 		}
1853 		udelay(1);
1854 		if (amdgpu_emu_mode)
1855 			msleep(10);
1856 	}
1857 
1858 	if (i >= adev->usec_timeout) {
1859 		dev_err(adev->dev,
1860 			"rlc autoload: xcc%d gc ucode autoload timeout\n", xcc_id);
1861 		return -ETIMEDOUT;
1862 	}
1863 
	if (adev->firmware.load_type == AMDGPU_FW_LOAD_RLC_BACKDOOR_AUTO)
		gfx_v12_1_xcc_set_mec_ucode_start_addr(adev, xcc_id);
1867 
1868 	return 0;
1869 }
1870 
static int gfx_v12_1_wait_for_rlc_autoload_complete(struct amdgpu_device *adev)
{
	int r, xcc_id;

	for (xcc_id = 0; xcc_id < NUM_XCC(adev->gfx.xcc_mask); xcc_id++) {
		r = gfx_v12_1_xcc_wait_for_rlc_autoload_complete(adev, xcc_id);
		if (r)
			return r;
	}

	return 0;
}
1880 
1881 static void gfx_v12_1_xcc_cp_compute_enable(struct amdgpu_device *adev,
1882 					    bool enable, int xcc_id)
1883 {
1884 	u32 data;
1885 
1886 	data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_RS64_CNTL);
1887 	data = REG_SET_FIELD(data, CP_MEC_RS64_CNTL, MEC_INVALIDATE_ICACHE,
1888 						 enable ? 0 : 1);
1889 	data = REG_SET_FIELD(data, CP_MEC_RS64_CNTL, MEC_PIPE0_RESET,
1890 						 enable ? 0 : 1);
1891 	data = REG_SET_FIELD(data, CP_MEC_RS64_CNTL, MEC_PIPE1_RESET,
1892 						 enable ? 0 : 1);
1893 	data = REG_SET_FIELD(data, CP_MEC_RS64_CNTL, MEC_PIPE2_RESET,
1894 						 enable ? 0 : 1);
1895 	data = REG_SET_FIELD(data, CP_MEC_RS64_CNTL, MEC_PIPE3_RESET,
1896 						 enable ? 0 : 1);
1897 	data = REG_SET_FIELD(data, CP_MEC_RS64_CNTL, MEC_PIPE0_ACTIVE,
1898 						 enable ? 1 : 0);
1899 	data = REG_SET_FIELD(data, CP_MEC_RS64_CNTL, MEC_PIPE1_ACTIVE,
						 enable ? 1 : 0);
1901 	data = REG_SET_FIELD(data, CP_MEC_RS64_CNTL, MEC_PIPE2_ACTIVE,
1902 						 enable ? 1 : 0);
1903 	data = REG_SET_FIELD(data, CP_MEC_RS64_CNTL, MEC_PIPE3_ACTIVE,
1904 						 enable ? 1 : 0);
1905 	data = REG_SET_FIELD(data, CP_MEC_RS64_CNTL, MEC_HALT,
1906 						 enable ? 0 : 1);
1907 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_RS64_CNTL, data);
1908 
1909 	adev->gfx.kiq[xcc_id].ring.sched.ready = enable;
1910 
1911 	udelay(50);
1912 }
1913 
1914 static int gfx_v12_1_init_cp_compute_microcode_bo(struct amdgpu_device *adev)
1915 {
1916 	const struct gfx_firmware_header_v2_0 *mec_hdr;
1917 	const __le32 *fw_ucode, *fw_data;
1918 	u32 fw_ucode_size, fw_data_size;
1919 	u32 *fw_ucode_ptr, *fw_data_ptr;
1920 	int i, r, xcc_id;
1921 
1922 	if (!adev->gfx.mec_fw)
1923 		return -EINVAL;
1924 
1925 	mec_hdr = (const struct gfx_firmware_header_v2_0 *)adev->gfx.mec_fw->data;
1926 	amdgpu_ucode_print_gfx_hdr(&mec_hdr->header);
1927 
1928 	fw_ucode = (const __le32 *) (adev->gfx.mec_fw->data +
1929 				le32_to_cpu(mec_hdr->ucode_offset_bytes));
1930 	fw_ucode_size = le32_to_cpu(mec_hdr->ucode_size_bytes);
1931 
1932 	fw_data = (const __le32 *) (adev->gfx.mec_fw->data +
1933 				le32_to_cpu(mec_hdr->data_offset_bytes));
1934 	fw_data_size = le32_to_cpu(mec_hdr->data_size_bytes);
1935 
	if (!adev->gfx.mec.mec_fw_obj) {
1937 		r = amdgpu_bo_create_reserved(adev, fw_ucode_size,
1938 					      64 * 1024, AMDGPU_GEM_DOMAIN_VRAM,
1939 					      &adev->gfx.mec.mec_fw_obj,
1940 					      &adev->gfx.mec.mec_fw_gpu_addr,
1941 					      (void **)&fw_ucode_ptr);
1942 		if (r) {
1943 			dev_err(adev->dev, "(%d) failed to create mec fw ucode bo\n", r);
1944 			gfx_v12_1_mec_fini(adev);
1945 			return r;
1946 		}
1947 
1948 		memcpy(fw_ucode_ptr, fw_ucode, fw_ucode_size);
1949 
1950 		amdgpu_bo_kunmap(adev->gfx.mec.mec_fw_obj);
1951 		amdgpu_bo_unreserve(adev->gfx.mec.mec_fw_obj);
1952 	}
1953 
	if (!adev->gfx.mec.mec_fw_data_obj) {
1955 		r = amdgpu_bo_create_reserved(adev,
1956 					      ALIGN(fw_data_size, 64 * 1024) *
1957 					      adev->gfx.mec.num_pipe_per_mec * NUM_XCC(adev->gfx.xcc_mask),
1958 					      64 * 1024, AMDGPU_GEM_DOMAIN_VRAM,
1959 					      &adev->gfx.mec.mec_fw_data_obj,
1960 					      &adev->gfx.mec.mec_fw_data_gpu_addr,
1961 					      (void **)&fw_data_ptr);
1962 		if (r) {
1963 			dev_err(adev->dev, "(%d) failed to create mec fw data bo\n", r);
1964 			gfx_v12_1_mec_fini(adev);
1965 			return r;
1966 		}
1967 
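		/* each MEC pipe on each XCC gets its own 64K-aligned copy of the data segment */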
1968 		for (xcc_id = 0; xcc_id < NUM_XCC(adev->gfx.xcc_mask); xcc_id++) {
1969 			for (i = 0; i < adev->gfx.mec.num_pipe_per_mec; i++) {
1970 				u32 offset = (xcc_id * adev->gfx.mec.num_pipe_per_mec + i) *
1971 					     ALIGN(fw_data_size, 64 * 1024) / 4;
1972 				memcpy(fw_data_ptr + offset, fw_data, fw_data_size);
1973 			}
1974 		}
1975 
1976 		amdgpu_bo_kunmap(adev->gfx.mec.mec_fw_data_obj);
1977 		amdgpu_bo_unreserve(adev->gfx.mec.mec_fw_data_obj);
1978 	}
1979 
1980 	return 0;
1981 }
1982 
1983 static int gfx_v12_1_xcc_cp_compute_load_microcode_rs64(struct amdgpu_device *adev,
1984 							int xcc_id)
1985 {
1986 	const struct gfx_firmware_header_v2_0 *mec_hdr;
1987 	u32 fw_data_size;
1988 	u32 tmp, i, usec_timeout = 50000; /* Wait for 50 ms */
1989 
1990 	if (!adev->gfx.mec_fw)
1991 		return -EINVAL;
1992 
1993 	mec_hdr = (const struct gfx_firmware_header_v2_0 *)adev->gfx.mec_fw->data;
1994 	fw_data_size = le32_to_cpu(mec_hdr->data_size_bytes);
1995 
1996 	gfx_v12_1_xcc_cp_compute_enable(adev, false, xcc_id);
1997 
1998 	tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_CPC_IC_BASE_CNTL);
1999 	tmp = REG_SET_FIELD(tmp, CP_CPC_IC_BASE_CNTL, VMID, 0);
2000 	tmp = REG_SET_FIELD(tmp, CP_CPC_IC_BASE_CNTL, EXE_DISABLE, 0);
2001 	tmp = REG_SET_FIELD(tmp, CP_CPC_IC_BASE_CNTL, CACHE_POLICY, 0);
2002 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_CPC_IC_BASE_CNTL, tmp);
2003 
2004 	tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_DC_BASE_CNTL);
2005 	tmp = REG_SET_FIELD(tmp, CP_MEC_DC_BASE_CNTL, VMID, 0);
2006 	tmp = REG_SET_FIELD(tmp, CP_MEC_DC_BASE_CNTL, CACHE_POLICY, 0);
2007 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_DC_BASE_CNTL, tmp);
2008 
2009 	mutex_lock(&adev->srbm_mutex);
2010 	for (i = 0; i < adev->gfx.mec.num_pipe_per_mec; i++) {
2011 		soc_v1_0_grbm_select(adev, 1, i, 0, 0, GET_INST(GC, xcc_id));
2012 
2013 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_MDBASE_LO,
2014 			     lower_32_bits(adev->gfx.mec.mec_fw_data_gpu_addr +
2015 					   (xcc_id * adev->gfx.mec.num_pipe_per_mec + i) *
2016 					   ALIGN(fw_data_size, 64 * 1024)));
2017 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_MDBASE_HI,
2018 			     upper_32_bits(adev->gfx.mec.mec_fw_data_gpu_addr +
2019 					   (xcc_id * adev->gfx.mec.num_pipe_per_mec + i) *
2020 					   ALIGN(fw_data_size, 64 * 1024)));
2021 
2022 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_CPC_IC_BASE_LO,
2023 				lower_32_bits(adev->gfx.mec.mec_fw_gpu_addr));
2024 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_CPC_IC_BASE_HI,
2025 				upper_32_bits(adev->gfx.mec.mec_fw_gpu_addr));
2026 	}
	soc_v1_0_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
	mutex_unlock(&adev->srbm_mutex);
2029 
	/* Trigger an invalidation of the MEC L1 data cache */
2031 	tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_DC_OP_CNTL);
2032 	tmp = REG_SET_FIELD(tmp, CP_MEC_DC_OP_CNTL, INVALIDATE_DCACHE, 1);
2033 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_DC_OP_CNTL, tmp);
2034 
2035 	/* Wait for invalidation complete */
2036 	for (i = 0; i < usec_timeout; i++) {
2037 		tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_DC_OP_CNTL);
		if (REG_GET_FIELD(tmp, CP_MEC_DC_OP_CNTL,
				  INVALIDATE_DCACHE_COMPLETE) == 1)
2040 			break;
2041 		udelay(1);
2042 	}
2043 
2044 	if (i >= usec_timeout) {
2045 		dev_err(adev->dev, "failed to invalidate data cache\n");
2046 		return -EINVAL;
2047 	}
2048 
2049 	/* Trigger an invalidation of the L1 instruction caches */
2050 	tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_CPC_IC_OP_CNTL);
2051 	tmp = REG_SET_FIELD(tmp, CP_CPC_IC_OP_CNTL, INVALIDATE_CACHE, 1);
2052 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_CPC_IC_OP_CNTL, tmp);
2053 
2054 	/* Wait for invalidation complete */
2055 	for (i = 0; i < usec_timeout; i++) {
2056 		tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_CPC_IC_OP_CNTL);
		if (REG_GET_FIELD(tmp, CP_CPC_IC_OP_CNTL,
				  INVALIDATE_CACHE_COMPLETE) == 1)
2059 			break;
2060 		udelay(1);
2061 	}
2062 
2063 	if (i >= usec_timeout) {
2064 		dev_err(adev->dev, "failed to invalidate instruction cache\n");
2065 		return -EINVAL;
2066 	}
2067 
2068 	gfx_v12_1_xcc_set_mec_ucode_start_addr(adev, xcc_id);
2069 
2070 	return 0;
2071 }
2072 
2073 static void gfx_v12_1_xcc_kiq_setting(struct amdgpu_ring *ring,
2074 				      int xcc_id)
2075 {
2076 	uint32_t tmp;
2077 	struct amdgpu_device *adev = ring->adev;
2078 
2079 	/* tell RLC which is KIQ queue */
2080 	tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CP_SCHEDULERS);
2081 	tmp &= 0xffffff00;
2082 	tmp |= (ring->me << 5) | (ring->pipe << 3) | (ring->queue);
2083 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CP_SCHEDULERS, tmp);
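	/* second write additionally sets bit 7, which is assumed to mark the
	 * scheduler entry as active once the queue selection is latched
	 */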
2084 	tmp |= 0x80;
2085 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CP_SCHEDULERS, tmp);
2086 }
2087 
2088 static void gfx_v12_1_xcc_cp_set_doorbell_range(struct amdgpu_device *adev,
2089 						int xcc_id)
2090 {
2091 	/* disable gfx engine doorbell range */
2092 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_RB_DOORBELL_RANGE_LOWER, 0);
2093 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_RB_DOORBELL_RANGE_UPPER, 0);
2094 
2095 	/* set compute engine doorbell range */
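	/* doorbell indices count 64-bit doorbells: *2 converts to dwords, <<2 to bytes */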
2096 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_DOORBELL_RANGE_LOWER,
2097 		     ((adev->doorbell_index.kiq +
2098 		       xcc_id * adev->doorbell_index.xcc_doorbell_range) *
2099 		      2) << 2);
2100 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_DOORBELL_RANGE_UPPER,
2101 		     ((adev->doorbell_index.userqueue_end +
2102 		       xcc_id * adev->doorbell_index.xcc_doorbell_range) *
2103 		      2) << 2);
2104 }
2105 
2106 static int gfx_v12_1_compute_mqd_init(struct amdgpu_device *adev, void *m,
2107 				      struct amdgpu_mqd_prop *prop)
2108 {
2109 	struct v12_1_compute_mqd *mqd = m;
2110 	uint64_t hqd_gpu_addr, wb_gpu_addr, eop_base_addr;
2111 	uint32_t tmp;
2112 
2113 	mqd->header = 0xC0310800;
2114 	mqd->compute_pipelinestat_enable = 0x00000001;
2115 	mqd->compute_static_thread_mgmt_se0 = 0xffffffff;
2116 	mqd->compute_static_thread_mgmt_se1 = 0xffffffff;
2117 	mqd->compute_static_thread_mgmt_se2 = 0xffffffff;
2118 	mqd->compute_static_thread_mgmt_se3 = 0xffffffff;
2119 	mqd->compute_misc_reserved = 0x00000007;
2120 
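	/* EOP base address is programmed in 256-byte units, hence the >> 8 */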
2121 	eop_base_addr = prop->eop_gpu_addr >> 8;
2122 	mqd->cp_hqd_eop_base_addr_lo = eop_base_addr;
2123 	mqd->cp_hqd_eop_base_addr_hi = upper_32_bits(eop_base_addr);
2124 
2125 	/* set the EOP size, register value is 2^(EOP_SIZE+1) dwords */
2126 	tmp = regCP_HQD_EOP_CONTROL_DEFAULT;
2127 	tmp = REG_SET_FIELD(tmp, CP_HQD_EOP_CONTROL, EOP_SIZE,
2128 			(order_base_2(GFX12_MEC_HPD_SIZE / 4) - 1));
2129 
2130 	mqd->cp_hqd_eop_control = tmp;
2131 
2132 	/* enable doorbell? */
2133 	tmp = regCP_HQD_PQ_DOORBELL_CONTROL_DEFAULT;
2134 
2135 	if (prop->use_doorbell) {
2136 		tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_DOORBELL_CONTROL,
2137 				    DOORBELL_OFFSET, prop->doorbell_index);
2138 		tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_DOORBELL_CONTROL,
2139 				    DOORBELL_EN, 1);
2140 		tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_DOORBELL_CONTROL,
2141 				    DOORBELL_SOURCE, 0);
2142 		tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_DOORBELL_CONTROL,
2143 				    DOORBELL_HIT, 0);
2144 	} else {
2145 		tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_DOORBELL_CONTROL,
2146 				    DOORBELL_EN, 0);
2147 	}
2148 
2149 	mqd->cp_hqd_pq_doorbell_control = tmp;
2150 
2151 	/* disable the queue if it's active */
2152 	mqd->cp_hqd_dequeue_request = 0;
2153 	mqd->cp_hqd_pq_rptr = 0;
2154 	mqd->cp_hqd_pq_wptr_lo = 0;
2155 	mqd->cp_hqd_pq_wptr_hi = 0;
2156 
2157 	/* set the pointer to the MQD */
2158 	mqd->cp_mqd_base_addr_lo = prop->mqd_gpu_addr & 0xfffffffc;
2159 	mqd->cp_mqd_base_addr_hi = upper_32_bits(prop->mqd_gpu_addr);
2160 
2161 	/* set MQD vmid to 0 */
2162 	tmp = regCP_MQD_CONTROL_DEFAULT;
2163 	tmp = REG_SET_FIELD(tmp, CP_MQD_CONTROL, VMID, 0);
2164 	mqd->cp_mqd_control = tmp;
2165 
	/* set the pointer to the HQD; this is similar to CP_RB0_BASE/_HI */
2167 	hqd_gpu_addr = prop->hqd_base_gpu_addr >> 8;
2168 	mqd->cp_hqd_pq_base_lo = hqd_gpu_addr;
2169 	mqd->cp_hqd_pq_base_hi = upper_32_bits(hqd_gpu_addr);
2170 
2171 	/* set up the HQD, this is similar to CP_RB0_CNTL */
2172 	tmp = regCP_HQD_PQ_CONTROL_DEFAULT;
2173 	tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, QUEUE_SIZE,
2174 			    (order_base_2(prop->queue_size / 4) - 1));
2175 	tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, RPTR_BLOCK_SIZE,
2176 			    (order_base_2(AMDGPU_GPU_PAGE_SIZE / 4) - 1));
2177 	tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, UNORD_DISPATCH, 0);
2178 	tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, TUNNEL_DISPATCH, 0);
2179 	tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, PRIV_STATE, 1);
2180 	tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, KMD_QUEUE, 1);
2181 	mqd->cp_hqd_pq_control = tmp;
2182 
2183 	/* set the wb address whether it's enabled or not */
2184 	wb_gpu_addr = prop->rptr_gpu_addr;
2185 	mqd->cp_hqd_pq_rptr_report_addr_lo = wb_gpu_addr & 0xfffffffc;
2186 	mqd->cp_hqd_pq_rptr_report_addr_hi =
2187 		upper_32_bits(wb_gpu_addr) & 0xffff;
2188 
2189 	/* only used if CP_PQ_WPTR_POLL_CNTL.CP_PQ_WPTR_POLL_CNTL__EN_MASK=1 */
2190 	wb_gpu_addr = prop->wptr_gpu_addr;
2191 	mqd->cp_hqd_pq_wptr_poll_addr_lo = wb_gpu_addr & 0xfffffffc;
2192 	mqd->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits(wb_gpu_addr) & 0xffff;
2193 
2194 	tmp = 0;
2195 	/* enable the doorbell if requested */
2196 	if (prop->use_doorbell) {
2197 		tmp = regCP_HQD_PQ_DOORBELL_CONTROL_DEFAULT;
2198 		tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_DOORBELL_CONTROL,
2199 				DOORBELL_OFFSET, prop->doorbell_index);
2200 
2201 		tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_DOORBELL_CONTROL,
2202 				    DOORBELL_EN, 1);
2203 		tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_DOORBELL_CONTROL,
2204 				    DOORBELL_SOURCE, 0);
2205 		tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_DOORBELL_CONTROL,
2206 				    DOORBELL_HIT, 0);
2207 	}
2208 
2209 	mqd->cp_hqd_pq_doorbell_control = tmp;
2210 
2211 	/* reset read and write pointers, similar to CP_RB0_WPTR/_RPTR */
2212 	mqd->cp_hqd_pq_rptr = regCP_HQD_PQ_RPTR_DEFAULT;
2213 
2214 	/* set the vmid for the queue */
2215 	mqd->cp_hqd_vmid = 0;
2216 
2217 	tmp = regCP_HQD_PERSISTENT_STATE_DEFAULT;
2218 	tmp = REG_SET_FIELD(tmp, CP_HQD_PERSISTENT_STATE, PRELOAD_SIZE, 0x63);
2219 	mqd->cp_hqd_persistent_state = tmp;
2220 
2221 	/* set MIN_IB_AVAIL_SIZE */
2222 	tmp = regCP_HQD_IB_CONTROL_DEFAULT;
2223 	tmp = REG_SET_FIELD(tmp, CP_HQD_IB_CONTROL, MIN_IB_AVAIL_SIZE, 1);
2224 	mqd->cp_hqd_ib_control = tmp;
2225 
2226 	/* set static priority for a compute queue/ring */
2227 	mqd->cp_hqd_pipe_priority = prop->hqd_pipe_priority;
2228 	mqd->cp_hqd_queue_priority = prop->hqd_queue_priority;
2229 
2230 	mqd->cp_mqd_stride_size = prop->mqd_stride_size ? prop->mqd_stride_size :
2231 		sizeof(struct v12_1_compute_mqd);
2232 
2233 	mqd->cp_hqd_active = prop->hqd_active;
2234 
2235 	return 0;
2236 }
2237 
2238 static int gfx_v12_1_xcc_kiq_init_register(struct amdgpu_ring *ring,
2239 					   int xcc_id)
2240 {
2241 	struct amdgpu_device *adev = ring->adev;
2242 	struct v12_1_compute_mqd *mqd = ring->mqd_ptr;
2243 	int j;
2244 
2245 	/* inactivate the queue */
2246 	if (amdgpu_sriov_vf(adev))
2247 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_ACTIVE, 0);
2248 
2249 	/* disable wptr polling */
2250 	WREG32_FIELD15_PREREG(GC, GET_INST(GC, xcc_id), CP_PQ_WPTR_POLL_CNTL, EN, 0);
2251 
2252 	/* write the EOP addr */
2253 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_EOP_BASE_ADDR,
2254 	       mqd->cp_hqd_eop_base_addr_lo);
2255 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_EOP_BASE_ADDR_HI,
2256 	       mqd->cp_hqd_eop_base_addr_hi);
2257 
2258 	/* set the EOP size, register value is 2^(EOP_SIZE+1) dwords */
2259 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_EOP_CONTROL,
2260 	       mqd->cp_hqd_eop_control);
2261 
2262 	/* enable doorbell? */
2263 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_PQ_DOORBELL_CONTROL,
2264 	       mqd->cp_hqd_pq_doorbell_control);
2265 
2266 	/* disable the queue if it's active */
2267 	if (RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_ACTIVE) & 1) {
2268 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_DEQUEUE_REQUEST, 1);
2269 		for (j = 0; j < adev->usec_timeout; j++) {
2270 			if (!(RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_ACTIVE) & 1))
2271 				break;
2272 			udelay(1);
2273 		}
2274 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_DEQUEUE_REQUEST,
2275 		       mqd->cp_hqd_dequeue_request);
2276 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_PQ_RPTR,
2277 		       mqd->cp_hqd_pq_rptr);
2278 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_PQ_WPTR_LO,
2279 		       mqd->cp_hqd_pq_wptr_lo);
2280 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_PQ_WPTR_HI,
2281 		       mqd->cp_hqd_pq_wptr_hi);
2282 	}
2283 
2284 	/* set the pointer to the MQD */
2285 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MQD_BASE_ADDR,
2286 	       mqd->cp_mqd_base_addr_lo);
2287 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MQD_BASE_ADDR_HI,
2288 	       mqd->cp_mqd_base_addr_hi);
2289 
2290 	/* set MQD vmid to 0 */
2291 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MQD_CONTROL,
2292 	       mqd->cp_mqd_control);
2293 
	/* set the pointer to the HQD; this is similar to CP_RB0_BASE/_HI */
2295 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_PQ_BASE,
2296 	       mqd->cp_hqd_pq_base_lo);
2297 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_PQ_BASE_HI,
2298 	       mqd->cp_hqd_pq_base_hi);
2299 
2300 	/* set up the HQD, this is similar to CP_RB0_CNTL */
2301 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_PQ_CONTROL,
2302 	       mqd->cp_hqd_pq_control);
2303 
2304 	/* set the wb address whether it's enabled or not */
2305 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_PQ_RPTR_REPORT_ADDR,
2306 		mqd->cp_hqd_pq_rptr_report_addr_lo);
2307 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_PQ_RPTR_REPORT_ADDR_HI,
2308 		mqd->cp_hqd_pq_rptr_report_addr_hi);
2309 
2310 	/* only used if CP_PQ_WPTR_POLL_CNTL.CP_PQ_WPTR_POLL_CNTL__EN_MASK=1 */
2311 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_PQ_WPTR_POLL_ADDR,
2312 	       mqd->cp_hqd_pq_wptr_poll_addr_lo);
2313 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_PQ_WPTR_POLL_ADDR_HI,
2314 	       mqd->cp_hqd_pq_wptr_poll_addr_hi);
2315 
2316 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_PQ_DOORBELL_CONTROL,
2317 	       mqd->cp_hqd_pq_doorbell_control);
2318 
2319 	/* reset read and write pointers, similar to CP_RB0_WPTR/_RPTR */
2320 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_PQ_WPTR_LO,
2321 	       mqd->cp_hqd_pq_wptr_lo);
2322 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_PQ_WPTR_HI,
2323 	       mqd->cp_hqd_pq_wptr_hi);
2324 
2325 	/* set the vmid for the queue */
2326 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_VMID, mqd->cp_hqd_vmid);
2327 
2328 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_PERSISTENT_STATE,
2329 	       mqd->cp_hqd_persistent_state);
2330 
2331 	/* activate the queue */
2332 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_ACTIVE,
2333 	       mqd->cp_hqd_active);
2334 
2335 	if (ring->use_doorbell)
2336 		WREG32_FIELD15_PREREG(GC, GET_INST(GC, xcc_id), CP_PQ_STATUS, DOORBELL_ENABLE, 1);
2337 
2338 	return 0;
2339 }
2340 
2341 static int gfx_v12_1_xcc_kiq_init_queue(struct amdgpu_ring *ring,
2342 					int xcc_id)
2343 {
2344 	struct amdgpu_device *adev = ring->adev;
2345 	struct v12_1_compute_mqd *mqd = ring->mqd_ptr;
2346 
2347 	gfx_v12_1_xcc_kiq_setting(ring, xcc_id);
2348 
2349 	if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */
2350 		/* reset MQD to a clean status */
2351 		if (adev->gfx.kiq[xcc_id].mqd_backup)
2352 			memcpy(mqd, adev->gfx.kiq[xcc_id].mqd_backup, sizeof(*mqd));
2353 
2354 		/* reset ring buffer */
2355 		ring->wptr = 0;
2356 		amdgpu_ring_clear_ring(ring);
2357 
2358 		mutex_lock(&adev->srbm_mutex);
2359 		soc_v1_0_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0, GET_INST(GC, xcc_id));
2360 		gfx_v12_1_xcc_kiq_init_register(ring, xcc_id);
2361 		soc_v1_0_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
2362 		mutex_unlock(&adev->srbm_mutex);
2363 	} else {
2364 		memset((void *)mqd, 0, sizeof(*mqd));
2365 		if (amdgpu_sriov_vf(adev) && adev->in_suspend)
2366 			amdgpu_ring_clear_ring(ring);
2367 		mutex_lock(&adev->srbm_mutex);
2368 		soc_v1_0_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0, GET_INST(GC, xcc_id));
2369 		amdgpu_ring_init_mqd(ring);
2370 		gfx_v12_1_xcc_kiq_init_register(ring, xcc_id);
2371 		soc_v1_0_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
2372 		mutex_unlock(&adev->srbm_mutex);
2373 
2374 		if (adev->gfx.kiq[xcc_id].mqd_backup)
2375 			memcpy(adev->gfx.kiq[xcc_id].mqd_backup, mqd, sizeof(*mqd));
2376 	}
2377 
2378 	return 0;
2379 }
2380 
2381 static int gfx_v12_1_xcc_kcq_init_queue(struct amdgpu_ring *ring,
2382 					int xcc_id)
2383 {
2384 	struct amdgpu_device *adev = ring->adev;
2385 	struct v12_1_compute_mqd *mqd = ring->mqd_ptr;
2386 	int mqd_idx = ring - &adev->gfx.compute_ring[0];
2387 
2388 	if (!amdgpu_in_reset(adev) && !adev->in_suspend) {
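	/* first bring-up builds the MQD from scratch; reset/resume restores the backup */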
2389 		memset((void *)mqd, 0, sizeof(*mqd));
2390 		mutex_lock(&adev->srbm_mutex);
2391 		soc_v1_0_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0, GET_INST(GC, xcc_id));
2392 		amdgpu_ring_init_mqd(ring);
2393 		soc_v1_0_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
2394 		mutex_unlock(&adev->srbm_mutex);
2395 
2396 		if (adev->gfx.mec.mqd_backup[mqd_idx])
2397 			memcpy_fromio(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(*mqd));
2398 	} else {
2399 		/* restore MQD to a clean status */
2400 		if (adev->gfx.mec.mqd_backup[mqd_idx])
2401 			memcpy_toio(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(*mqd));
2402 		/* reset ring buffer */
2403 		ring->wptr = 0;
2404 		atomic64_set((atomic64_t *)ring->wptr_cpu_addr, 0);
2405 		amdgpu_ring_clear_ring(ring);
2406 	}
2407 
2408 	return 0;
2409 }
2410 
2411 static int gfx_v12_1_xcc_kiq_resume(struct amdgpu_device *adev,
2412 				    int xcc_id)
2413 {
2414 	struct amdgpu_ring *ring;
2415 	int r;
2416 
2417 	ring = &adev->gfx.kiq[xcc_id].ring;
2418 
2419 	r = amdgpu_bo_reserve(ring->mqd_obj, false);
2420 	if (unlikely(r != 0))
2421 		return r;
2422 
2423 	r = amdgpu_bo_kmap(ring->mqd_obj, (void **)&ring->mqd_ptr);
2424 	if (unlikely(r != 0)) {
2425 		amdgpu_bo_unreserve(ring->mqd_obj);
2426 		return r;
2427 	}
2428 
2429 	gfx_v12_1_xcc_kiq_init_queue(ring, xcc_id);
2430 	amdgpu_bo_kunmap(ring->mqd_obj);
2431 	ring->mqd_ptr = NULL;
2432 	amdgpu_bo_unreserve(ring->mqd_obj);
2433 	ring->sched.ready = true;
2434 	return 0;
2435 }
2436 
2437 static int gfx_v12_1_xcc_kcq_resume(struct amdgpu_device *adev,
2438 				    int xcc_id)
2439 {
2440 	struct amdgpu_ring *ring = NULL;
2441 	int r = 0, i;
2442 
2443 	if (!amdgpu_async_gfx_ring)
2444 		gfx_v12_1_xcc_cp_compute_enable(adev, true, xcc_id);
2445 
2446 	for (i = 0; i < adev->gfx.num_compute_rings; i++) {
2447 		ring = &adev->gfx.compute_ring[i + xcc_id * adev->gfx.num_compute_rings];
2448 
2449 		r = amdgpu_bo_reserve(ring->mqd_obj, false);
2450 		if (unlikely(r != 0))
2451 			goto done;
2452 		r = amdgpu_bo_kmap(ring->mqd_obj, (void **)&ring->mqd_ptr);
2453 		if (!r) {
2454 			r = gfx_v12_1_xcc_kcq_init_queue(ring, xcc_id);
2455 			amdgpu_bo_kunmap(ring->mqd_obj);
2456 			ring->mqd_ptr = NULL;
2457 		}
2458 		amdgpu_bo_unreserve(ring->mqd_obj);
2459 		if (r)
2460 			goto done;
2461 	}
2462 
2463 	r = amdgpu_gfx_enable_kcq(adev, xcc_id);
2464 done:
2465 	return r;
2466 }
2467 
2468 static int gfx_v12_1_xcc_cp_resume(struct amdgpu_device *adev, uint16_t xcc_mask)
2469 {
2470 	int r, i, xcc_id;
2471 	struct amdgpu_ring *ring;
2472 
2473 	for_each_inst(xcc_id, xcc_mask) {
2474 		if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT) {
2475 			/* legacy firmware loading */
2476 			r = gfx_v12_1_xcc_cp_compute_load_microcode_rs64(adev, xcc_id);
2477 			if (r)
2478 				return r;
2479 		}
2480 
2481 		if (!(adev->flags & AMD_IS_APU))
2482 			gfx_v12_1_xcc_enable_gui_idle_interrupt(adev, false, xcc_id);
2483 
2484 		gfx_v12_1_xcc_cp_set_doorbell_range(adev, xcc_id);
2485 
2486 		gfx_v12_1_xcc_cp_compute_enable(adev, true, xcc_id);
2487 
2488 		if (adev->enable_mes_kiq && adev->mes.kiq_hw_init)
2489 			r = amdgpu_mes_kiq_hw_init(adev, xcc_id);
2490 		else
2491 			r = gfx_v12_1_xcc_kiq_resume(adev, xcc_id);
2492 		if (r)
2493 			return r;
2494 
2495 		r = gfx_v12_1_xcc_kcq_resume(adev, xcc_id);
2496 		if (r)
2497 			return r;
2498 
2499 		for (i = 0; i < adev->gfx.num_compute_rings; i++) {
2500 			ring = &adev->gfx.compute_ring[i + xcc_id * adev->gfx.num_compute_rings];
2501 			r = amdgpu_ring_test_helper(ring);
2502 			if (r)
2503 				return r;
2504 		}
2505 	}
2506 
2507 	return 0;
2508 }
2509 
2510 static int gfx_v12_1_cp_resume(struct amdgpu_device *adev)
2511 {
2512 	int num_xcc, num_xcp, num_xcc_per_xcp;
2513 	uint16_t xcc_mask;
2514 	int r = 0;
2515 
2516 	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
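	/* a VF inherits the partition mode selected by the host, while bare
	 * metal may still have to program the requested mode itself
	 */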
2517 	if (amdgpu_sriov_vf(adev)) {
2518 		enum amdgpu_gfx_partition mode;
2519 
2520 		mode = amdgpu_xcp_query_partition_mode(adev->xcp_mgr,
2521 						       AMDGPU_XCP_FL_NONE);
2522 		if (mode == AMDGPU_UNKNOWN_COMPUTE_PARTITION_MODE)
2523 			return -EINVAL;
2524 		if (adev->gfx.funcs &&
2525 		    adev->gfx.funcs->get_xccs_per_xcp) {
2526 			num_xcc_per_xcp = adev->gfx.funcs->get_xccs_per_xcp(adev);
2527 			adev->gfx.num_xcc_per_xcp = num_xcc_per_xcp;
2528 			num_xcp = num_xcc / num_xcc_per_xcp;
2529 		} else {
2530 			return -EINVAL;
2531 		}
2532 		r = amdgpu_xcp_init(adev->xcp_mgr, num_xcp, mode);
2533 
2534 	} else {
2535 		if (amdgpu_xcp_query_partition_mode(adev->xcp_mgr,
2536 						    AMDGPU_XCP_FL_NONE) ==
2537 		    AMDGPU_UNKNOWN_COMPUTE_PARTITION_MODE)
2538 			r = amdgpu_xcp_switch_partition_mode(adev->xcp_mgr,
2539 							     amdgpu_user_partt_mode);
2540 	}
2541 
2542 	if (r)
2543 		return r;
2544 
2545 	xcc_mask = GENMASK(NUM_XCC(adev->gfx.xcc_mask) - 1, 0);
2546 
2547 	return gfx_v12_1_xcc_cp_resume(adev, xcc_mask);
2548 }
2549 
2550 static int gfx_v12_1_gfxhub_enable(struct amdgpu_device *adev)
2551 {
2552 	int r, i;
2553 	bool value;
2554 
2555 	r = adev->gfxhub.funcs->gart_enable(adev);
2556 	if (r)
2557 		return r;
2558 
	value = amdgpu_vm_fault_stop != AMDGPU_VM_FAULT_STOP_ALWAYS;
2561 
2562 	adev->gfxhub.funcs->set_fault_enable_default(adev, value);
	/* TODO: investigate why this TLB flush is needed;
	 * are we missing a flush somewhere else?
	 */
2565 	for_each_set_bit(i, adev->vmhubs_mask, AMDGPU_MAX_VMHUBS) {
2566 		if (AMDGPU_IS_GFXHUB(i))
2567 			adev->gmc.gmc_funcs->flush_gpu_tlb(adev, 0, AMDGPU_GFXHUB(i), 0);
2568 	}
2569 
2570 	return 0;
2571 }
2572 
2573 static int get_gb_addr_config(struct amdgpu_device *adev)
2574 {
2575 	u32 gb_addr_config;
2576 
2577 	gb_addr_config = RREG32_SOC15(GC, GET_INST(GC, 0), regGB_ADDR_CONFIG_READ);
2578 	if (gb_addr_config == 0)
2579 		return -EINVAL;
2580 
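	/* the remaining GB_ADDR_CONFIG fields are log2-encoded, hence the 1 << decodes */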
2581 	adev->gfx.config.gb_addr_config_fields.num_pkrs =
2582 		1 << REG_GET_FIELD(gb_addr_config, GB_ADDR_CONFIG_READ, NUM_PKRS);
2583 
2584 	adev->gfx.config.gb_addr_config = gb_addr_config;
2585 
2586 	adev->gfx.config.gb_addr_config_fields.num_pipes = 1 <<
2587 			REG_GET_FIELD(adev->gfx.config.gb_addr_config,
2588 				      GB_ADDR_CONFIG_READ, NUM_PIPES);
2589 
2590 	adev->gfx.config.max_tile_pipes =
2591 		adev->gfx.config.gb_addr_config_fields.num_pipes;
2592 
2593 	adev->gfx.config.gb_addr_config_fields.max_compress_frags = 1 <<
2594 			REG_GET_FIELD(adev->gfx.config.gb_addr_config,
2595 				      GB_ADDR_CONFIG_READ, MAX_COMPRESSED_FRAGS);
2596 	adev->gfx.config.gb_addr_config_fields.num_rb_per_se = 1 <<
2597 			REG_GET_FIELD(adev->gfx.config.gb_addr_config,
2598 				      GB_ADDR_CONFIG_READ, NUM_RB_PER_SE);
2599 	adev->gfx.config.gb_addr_config_fields.num_se = 1 <<
2600 			REG_GET_FIELD(adev->gfx.config.gb_addr_config,
2601 				      GB_ADDR_CONFIG_READ, NUM_SHADER_ENGINES);
2602 	adev->gfx.config.gb_addr_config_fields.pipe_interleave_size = 1 << (8 +
2603 			REG_GET_FIELD(adev->gfx.config.gb_addr_config,
2604 				      GB_ADDR_CONFIG_READ, PIPE_INTERLEAVE_SIZE));
2605 
2606 	return 0;
2607 }
2608 
2609 static void gfx_v12_1_xcc_disable_gpa_mode(struct amdgpu_device *adev,
2610 					   int xcc_id)
2611 {
2612 	uint32_t data;
2613 
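	/* setting GPA_OVERRIDE is assumed to force CPC/CPG fetches to bypass
	 * guest-physical-address (GPA) translation
	 */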
2614 	data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCPC_PSP_DEBUG);
2615 	data |= CPC_PSP_DEBUG__GPA_OVERRIDE_MASK;
2616 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCPC_PSP_DEBUG, data);
2617 
2618 	data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCPG_PSP_DEBUG);
2619 	data |= CPG_PSP_DEBUG__GPA_OVERRIDE_MASK;
2620 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCPG_PSP_DEBUG, data);
2621 }
2622 
2623 static void gfx_v12_1_xcc_setup_tcp_thrashing_ctrl(struct amdgpu_device *adev,
2624 					 int xcc_id)
2625 {
2626 	uint32_t val;
2627 
	/* Configure TCP UTCL0 thrashing detection and retry-fragment thresholds */
2629 	val = RREG32_SOC15(GC, GET_INST(GC, xcc_id),
2630 					regTCP_UTCL0_THRASHING_CTRL);
2631 	val = REG_SET_FIELD(val, TCP_UTCL0_THRASHING_CTRL, THRASHING_EN, 0x2);
2632 	val = REG_SET_FIELD(val, TCP_UTCL0_THRASHING_CTRL,
2633 					RETRY_FRAGMENT_THRESHOLD_UP_EN, 0x1);
2634 	val = REG_SET_FIELD(val, TCP_UTCL0_THRASHING_CTRL,
2635 					RETRY_FRAGMENT_THRESHOLD_DOWN_EN, 0x1);
2636 
2637 	WREG32_SOC15(GC, GET_INST(GC, xcc_id),
2638 					regTCP_UTCL0_THRASHING_CTRL, val);
2639 }
2640 
2641 static void gfx_v12_1_xcc_enable_atomics(struct amdgpu_device *adev,
2642 					 int xcc_id)
2643 {
2644 	uint32_t data;
2645 
2646 	/* Set the TCP UTCL0 register to enable atomics */
2647 	data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regTCP_UTCL0_CNTL1);
2648 	data = REG_SET_FIELD(data, TCP_UTCL0_CNTL1, ATOMIC_REQUESTER_EN, 0x1);
2649 
2650 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regTCP_UTCL0_CNTL1, data);
2651 }
2652 
2653 static void gfx_v12_1_xcc_disable_burst(struct amdgpu_device *adev,
2654 					int xcc_id)
2655 {
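	/* writing 0xf is assumed to disable DRAM burst combining in GL1 and GLARB */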
2656 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regGL1_DRAM_BURST_CTRL, 0xf);
2657 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regGLARB_DRAM_BURST_CTRL, 0xf);
2658 }
2659 
2660 static void gfx_v12_1_init_golden_registers(struct amdgpu_device *adev)
2661 {
2662 	int i;
2663 
2664 	for (i = 0; i < NUM_XCC(adev->gfx.xcc_mask); i++) {
2665 		gfx_v12_1_xcc_disable_burst(adev, i);
2666 		gfx_v12_1_xcc_enable_atomics(adev, i);
2667 		gfx_v12_1_xcc_setup_tcp_thrashing_ctrl(adev, i);
2668 	}
2669 }
2670 
2671 static int gfx_v12_1_hw_init(struct amdgpu_ip_block *ip_block)
2672 {
2673 	int r, i, num_xcc;
2674 	struct amdgpu_device *adev = ip_block->adev;
2675 
2676 	if (adev->firmware.load_type == AMDGPU_FW_LOAD_RLC_BACKDOOR_AUTO) {
2677 		/* rlc autoload firmware */
2678 		r = gfx_v12_1_rlc_backdoor_autoload_enable(adev);
2679 		if (r)
2680 			return r;
2681 	} else {
2682 		if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT) {
2683 			num_xcc = NUM_XCC(adev->gfx.xcc_mask);
2684 
2685 			if (adev->gfx.imu.funcs) {
2686 				if (adev->gfx.imu.funcs->load_microcode)
2687 					adev->gfx.imu.funcs->load_microcode(adev);
2688 			}
2689 
2690 			for (i = 0; i < num_xcc; i++) {
2691 				/* disable gpa mode in backdoor loading */
2692 				gfx_v12_1_xcc_disable_gpa_mode(adev, i);
2693 			}
2694 		}
2695 	}
2696 
2697 	if ((adev->firmware.load_type == AMDGPU_FW_LOAD_RLC_BACKDOOR_AUTO) ||
2698 	    (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP)) {
2699 		r = gfx_v12_1_wait_for_rlc_autoload_complete(adev);
2700 		if (r) {
2701 			dev_err(adev->dev, "(%d) failed to wait rlc autoload complete\n", r);
2702 			return r;
2703 		}
2704 	}
2705 
2706 	adev->gfx.is_poweron = true;
2707 
2708 	if (get_gb_addr_config(adev))
		DRM_WARN("Invalid gb_addr_config!\n");
2710 
2711 	if (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP)
2712 		gfx_v12_1_config_gfx_rs64(adev);
2713 
2714 	r = gfx_v12_1_gfxhub_enable(adev);
2715 	if (r)
2716 		return r;
2717 
2718 	gfx_v12_1_init_golden_registers(adev);
2719 
2720 	gfx_v12_1_constants_init(adev);
2721 
2722 	if (adev->nbio.funcs->gc_doorbell_init)
2723 		adev->nbio.funcs->gc_doorbell_init(adev);
2724 
2725 	r = gfx_v12_1_rlc_resume(adev);
2726 	if (r)
2727 		return r;
2728 
2729 	/*
2730 	 * init golden registers and rlc resume may override some registers,
2731 	 * reconfig them here
2732 	 */
2733 	gfx_v12_1_tcp_harvest(adev);
2734 
	return gfx_v12_1_cp_resume(adev);
2740 }
2741 
2742 static void gfx_v12_1_xcc_fini(struct amdgpu_device *adev,
2743 			      int xcc_id)
2744 {
2745 	uint32_t tmp;
2746 
2747 	if (!adev->no_hw_access) {
2748 		if (amdgpu_gfx_disable_kcq(adev, xcc_id))
2749 			DRM_ERROR("KCQ disable failed\n");
2750 
2751 		amdgpu_mes_kiq_hw_fini(adev, xcc_id);
2752 	}
2753 
2754 	if (amdgpu_sriov_vf(adev)) {
2755 		/* Program KIQ position of RLC_CP_SCHEDULERS during destroy */
2756 		tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CP_SCHEDULERS);
2757 		tmp &= 0xffffff00;
2758 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CP_SCHEDULERS, tmp);
2759 	}
2760 	gfx_v12_1_xcc_cp_compute_enable(adev, false, xcc_id);
2761 	gfx_v12_1_xcc_enable_gui_idle_interrupt(adev, false, xcc_id);
2762 }
2763 
2764 static int gfx_v12_1_hw_fini(struct amdgpu_ip_block *ip_block)
2765 {
2766 	struct amdgpu_device *adev = ip_block->adev;
2767 	int i, num_xcc;
2768 
2769 	amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
2770 	amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
2771 
2772 	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
	for (i = 0; i < num_xcc; i++)
		gfx_v12_1_xcc_fini(adev, i);
2776 
2777 	adev->gfxhub.funcs->gart_disable(adev);
2778 
2779 	adev->gfx.is_poweron = false;
2780 
2781 	return 0;
2782 }
2783 
2784 static int gfx_v12_1_suspend(struct amdgpu_ip_block *ip_block)
2785 {
2786 	return gfx_v12_1_hw_fini(ip_block);
2787 }
2788 
2789 static int gfx_v12_1_resume(struct amdgpu_ip_block *ip_block)
2790 {
2791 	return gfx_v12_1_hw_init(ip_block);
2792 }
2793 
2794 static bool gfx_v12_1_is_idle(struct amdgpu_ip_block *ip_block)
2795 {
2796 	struct amdgpu_device *adev = ip_block->adev;
2797 	int i, num_xcc;
2798 
2799 	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
2800 	for (i = 0; i < num_xcc; i++) {
2801 		if (REG_GET_FIELD(RREG32_SOC15(GC, GET_INST(GC, i),
2802 				regGRBM_STATUS), GRBM_STATUS, GUI_ACTIVE))
2803 			return false;
2804 	}
2805 	return true;
2806 }
2807 
2808 static int gfx_v12_1_wait_for_idle(struct amdgpu_ip_block *ip_block)
2809 {
2810 	unsigned i;
2811 	struct amdgpu_device *adev = ip_block->adev;
2812 
2813 	for (i = 0; i < adev->usec_timeout; i++) {
2814 		if (gfx_v12_1_is_idle(ip_block))
2815 			return 0;
2816 		udelay(1);
2817 	}
2818 	return -ETIMEDOUT;
2819 }
2820 
2821 static uint64_t gfx_v12_1_get_gpu_clock_counter(struct amdgpu_device *adev)
2822 {
2823 	uint64_t clock = 0;
2824 
2825 	if (adev->smuio.funcs &&
2826 	    adev->smuio.funcs->get_gpu_clock_counter)
2827 		clock = adev->smuio.funcs->get_gpu_clock_counter(adev);
2828 	else
		dev_warn(adev->dev, "querying the gpu clock counter is not supported\n");
2830 
2831 	return clock;
2832 }
2833 
2834 static int gfx_v12_1_early_init(struct amdgpu_ip_block *ip_block)
2835 {
2836 	struct amdgpu_device *adev = ip_block->adev;
2837 
2838 	adev->gfx.funcs = &gfx_v12_1_gfx_funcs;
2839 
2840 	adev->gfx.num_compute_rings = min(amdgpu_gfx_get_num_kcq(adev),
2841 					  AMDGPU_MAX_COMPUTE_RINGS);
2842 
2843 	gfx_v12_1_set_kiq_pm4_funcs(adev);
2844 	gfx_v12_1_set_ring_funcs(adev);
2845 	gfx_v12_1_set_irq_funcs(adev);
2846 	gfx_v12_1_set_rlc_funcs(adev);
2847 	gfx_v12_1_set_mqd_funcs(adev);
2848 	gfx_v12_1_set_imu_funcs(adev);
2849 
2850 	gfx_v12_1_init_rlcg_reg_access_ctrl(adev);
2851 
2852 	return gfx_v12_1_init_microcode(adev);
2853 }
2854 
2855 static int gfx_v12_1_late_init(struct amdgpu_ip_block *ip_block)
2856 {
2857 	struct amdgpu_device *adev = ip_block->adev;
2858 	int r;
2859 
2860 	r = amdgpu_irq_get(adev, &adev->gfx.priv_reg_irq, 0);
2861 	if (r)
2862 		return r;
2863 
2864 	r = amdgpu_irq_get(adev, &adev->gfx.priv_inst_irq, 0);
2865 	if (r)
2866 		return r;
2867 
2868 	return 0;
2869 }
2870 
2871 static bool gfx_v12_1_is_rlc_enabled(struct amdgpu_device *adev)
2872 {
2873 	uint32_t rlc_cntl;
2874 
2875 	/* if RLC is not enabled, do nothing */
2876 	rlc_cntl = RREG32_SOC15(GC, GET_INST(GC, 0), regRLC_CNTL);
2877 	return (REG_GET_FIELD(rlc_cntl, RLC_CNTL, RLC_ENABLE_F32)) ? true : false;
2878 }
2879 
2880 static void gfx_v12_1_xcc_set_safe_mode(struct amdgpu_device *adev,
2881 					int xcc_id)
2882 {
2883 	uint32_t data;
2884 	unsigned i;
2885 
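	/* request safe-mode entry (MESSAGE = 1); the RLC clears CMD on completion */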
2886 	data = RLC_SAFE_MODE__CMD_MASK;
2887 	data |= (1 << RLC_SAFE_MODE__MESSAGE__SHIFT);
2888 
2889 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_SAFE_MODE, data);
2890 
2891 	/* wait for RLC_SAFE_MODE */
2892 	for (i = 0; i < adev->usec_timeout; i++) {
2893 		if (!REG_GET_FIELD(RREG32_SOC15(GC, GET_INST(GC, xcc_id),
2894 						regRLC_SAFE_MODE), RLC_SAFE_MODE, CMD))
2895 			break;
2896 		udelay(1);
2897 	}
2898 }
2899 
2900 static void gfx_v12_1_xcc_unset_safe_mode(struct amdgpu_device *adev,
2901 					  int xcc_id)
2902 {
2903 	WREG32_SOC15(GC, GET_INST(GC, xcc_id),
2904 		     regRLC_SAFE_MODE, RLC_SAFE_MODE__CMD_MASK);
2905 }
2906 
2907 static void gfx_v12_1_update_perf_clk(struct amdgpu_device *adev,
2908 				      bool enable)
2909 {
2910 	int i, num_xcc;
2911 
2912 	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
2913 	for (i = 0; i < num_xcc; i++)
2914 		gfx_v12_1_xcc_update_perf_clk(adev, enable, i);
2915 }
2916 
2917 static void gfx_v12_1_update_spm_vmid(struct amdgpu_device *adev,
2918 				      int xcc_id,
2919 				      struct amdgpu_ring *ring,
2920 				      unsigned vmid)
2921 {
2922 	u32 reg, data;
2923 
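	/* with a single VF the register is accessed directly rather than through the KIQ */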
2924 	reg = SOC15_REG_OFFSET(GC, GET_INST(GC, xcc_id), regRLC_SPM_MC_CNTL);
2925 	if (amdgpu_sriov_is_pp_one_vf(adev))
2926 		data = RREG32_NO_KIQ(reg);
2927 	else
2928 		data = RREG32(reg);
2929 
2930 	data &= ~RLC_SPM_MC_CNTL__RLC_SPM_VMID_MASK;
2931 	data |= (vmid & RLC_SPM_MC_CNTL__RLC_SPM_VMID_MASK) << RLC_SPM_MC_CNTL__RLC_SPM_VMID__SHIFT;
2932 
2933 	if (amdgpu_sriov_is_pp_one_vf(adev))
2934 		WREG32_SOC15_NO_KIQ(GC, GET_INST(GC, xcc_id), regRLC_SPM_MC_CNTL, data);
2935 	else
2936 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_SPM_MC_CNTL, data);
2937 
	/* reg still holds the RLC_SPM_MC_CNTL offset computed above */
	if (ring &&
	    amdgpu_sriov_is_pp_one_vf(adev) &&
	    (ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
	     ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE))
		amdgpu_ring_emit_wreg(ring, reg, data);
2945 }
2946 
2947 static const struct amdgpu_rlc_funcs gfx_v12_1_rlc_funcs = {
2948 	.is_rlc_enabled = gfx_v12_1_is_rlc_enabled,
2949 	.set_safe_mode = gfx_v12_1_xcc_set_safe_mode,
2950 	.unset_safe_mode = gfx_v12_1_xcc_unset_safe_mode,
2951 	.init = gfx_v12_1_rlc_init,
2952 	.get_csb_size = gfx_v12_1_get_csb_size,
2953 	.get_csb_buffer = gfx_v12_1_get_csb_buffer,
2954 	.resume = gfx_v12_1_rlc_resume,
2955 	.stop = gfx_v12_1_rlc_stop,
2956 	.reset = gfx_v12_1_rlc_reset,
2957 	.start = gfx_v12_1_rlc_start,
2958 	.update_spm_vmid = gfx_v12_1_update_spm_vmid,
2959 };
2960 
2961 #if 0
2962 static void gfx_v12_cntl_power_gating(struct amdgpu_device *adev, bool enable)
2963 {
2964 	/* TODO */
2965 }
2966 
2967 static void gfx_v12_cntl_pg(struct amdgpu_device *adev, bool enable)
2968 {
2969 	/* TODO */
2970 }
2971 #endif
2972 
2973 static int gfx_v12_1_set_powergating_state(struct amdgpu_ip_block *ip_block,
2974 					   enum amd_powergating_state state)
2975 {
2976 	struct amdgpu_device *adev = ip_block->adev;
2977 	bool enable = (state == AMD_PG_STATE_GATE);
2978 
2979 	if (amdgpu_sriov_vf(adev))
2980 		return 0;
2981 
2982 	switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
2983 	case IP_VERSION(12, 1, 0):
2984 		amdgpu_gfx_off_ctrl(adev, enable);
2985 		break;
2986 	default:
2987 		break;
2988 	}
2989 
2990 	return 0;
2991 }
2992 
2993 static void gfx_v12_1_xcc_update_coarse_grain_clock_gating(struct amdgpu_device *adev,
2994 							   bool enable, int xcc_id)
2995 {
2996 	uint32_t def, data;
2997 
2998 	if (!(adev->cg_flags &
2999 	      (AMD_CG_SUPPORT_GFX_CGCG |
3000 	      AMD_CG_SUPPORT_GFX_CGLS |
3001 	      AMD_CG_SUPPORT_GFX_3D_CGCG |
3002 	      AMD_CG_SUPPORT_GFX_3D_CGLS)))
3003 		return;
3004 
3005 	if (enable) {
3006 		def = data = RREG32_SOC15(GC, GET_INST(GC, xcc_id),
3007 					  regRLC_CGTT_MGCG_OVERRIDE);
3008 
3009 		/* unset CGCG override */
3010 		if (adev->cg_flags & AMD_CG_SUPPORT_GFX_CGCG)
3011 			data &= ~RLC_CGTT_MGCG_OVERRIDE__GFXIP_CGCG_OVERRIDE_MASK;
3012 		if (adev->cg_flags & AMD_CG_SUPPORT_GFX_CGLS)
3013 			data &= ~RLC_CGTT_MGCG_OVERRIDE__GFXIP_CGLS_OVERRIDE_MASK;
3014 		if (adev->cg_flags & AMD_CG_SUPPORT_GFX_3D_CGCG ||
3015 		    adev->cg_flags & AMD_CG_SUPPORT_GFX_3D_CGLS)
3016 			data &= ~RLC_CGTT_MGCG_OVERRIDE__GFXIP_GFX3D_CG_OVERRIDE_MASK;
3017 
3018 		/* update CGCG override bits */
3019 		if (def != data)
3020 			WREG32_SOC15(GC, GET_INST(GC, xcc_id),
3021 				     regRLC_CGTT_MGCG_OVERRIDE, data);
3022 
3023 		/* enable cgcg FSM(0x0000363F) */
3024 		def = data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGCG_CGLS_CTRL);
3025 
3026 		if (adev->cg_flags & AMD_CG_SUPPORT_GFX_CGCG) {
3027 			data &= ~RLC_CGCG_CGLS_CTRL__CGCG_GFX_IDLE_THRESHOLD_MASK;
3028 			data |= (0x36 << RLC_CGCG_CGLS_CTRL__CGCG_GFX_IDLE_THRESHOLD__SHIFT) |
3029 				 RLC_CGCG_CGLS_CTRL__CGCG_EN_MASK;
3030 		}
3031 
3032 		if (adev->cg_flags & AMD_CG_SUPPORT_GFX_CGLS) {
3033 			data &= ~RLC_CGCG_CGLS_CTRL__CGLS_REP_COMPANSAT_DELAY_MASK;
3034 			data |= (0x000F << RLC_CGCG_CGLS_CTRL__CGLS_REP_COMPANSAT_DELAY__SHIFT) |
3035 				 RLC_CGCG_CGLS_CTRL__CGLS_EN_MASK;
3036 		}
3037 
3038 		if (def != data)
3039 			WREG32_SOC15(GC, GET_INST(GC, xcc_id),
3040 				     regRLC_CGCG_CGLS_CTRL, data);
3041 
3042 		/* set IDLE_POLL_COUNT(0x00900100) */
3043 		def = data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_RB_WPTR_POLL_CNTL);
3044 
3045 		data &= ~CP_RB_WPTR_POLL_CNTL__POLL_FREQUENCY_MASK;
3046 		data &= ~CP_RB_WPTR_POLL_CNTL__IDLE_POLL_COUNT_MASK;
3047 		data |= (0x0100 << CP_RB_WPTR_POLL_CNTL__POLL_FREQUENCY__SHIFT) |
3048 			(0x0090 << CP_RB_WPTR_POLL_CNTL__IDLE_POLL_COUNT__SHIFT);
3049 
3050 		if (def != data)
3051 			WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_RB_WPTR_POLL_CNTL, data);
3052 
3053 		data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_INT_CNTL);
3054 		data = REG_SET_FIELD(data, CP_INT_CNTL, CNTX_BUSY_INT_ENABLE, 1);
3055 		data = REG_SET_FIELD(data, CP_INT_CNTL, CNTX_EMPTY_INT_ENABLE, 1);
3056 		data = REG_SET_FIELD(data, CP_INT_CNTL, CMP_BUSY_INT_ENABLE, 1);
3057 		data = REG_SET_FIELD(data, CP_INT_CNTL, GFX_IDLE_INT_ENABLE, 1);
3058 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_INT_CNTL, data);
3059 	} else {
3060 		/* Program RLC_CGCG_CGLS_CTRL */
3061 		def = data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGCG_CGLS_CTRL);
3062 
3063 		if (adev->cg_flags & AMD_CG_SUPPORT_GFX_CGCG)
3064 			data &= ~RLC_CGCG_CGLS_CTRL__CGCG_EN_MASK;
3065 
3066 		if (adev->cg_flags & AMD_CG_SUPPORT_GFX_CGLS)
3067 			data &= ~RLC_CGCG_CGLS_CTRL__CGLS_EN_MASK;
3068 
3069 		if (def != data)
3070 			WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGCG_CGLS_CTRL, data);
3071 	}
3072 }
3073 
3074 static void gfx_v12_1_xcc_update_medium_grain_clock_gating(struct amdgpu_device *adev,
3075 							   bool enable, int xcc_id)
3076 {
	uint32_t data, def;

	if (!(adev->cg_flags & (AMD_CG_SUPPORT_GFX_MGCG | AMD_CG_SUPPORT_GFX_MGLS)))
3079 		return;
3080 
3081 	/* It is disabled by HW by default */
3082 	if (enable) {
3083 		if (adev->cg_flags & AMD_CG_SUPPORT_GFX_MGCG) {
3084 			/* 1 - RLC_CGTT_MGCG_OVERRIDE */
3085 			def = data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGTT_MGCG_OVERRIDE);
3086 
3087 			data &= ~(RLC_CGTT_MGCG_OVERRIDE__GRBM_CGTT_SCLK_OVERRIDE_MASK |
3088 				  RLC_CGTT_MGCG_OVERRIDE__RLC_CGTT_SCLK_OVERRIDE_MASK |
3089 				  RLC_CGTT_MGCG_OVERRIDE__GFXIP_MGCG_OVERRIDE_MASK);
3090 
3091 			if (def != data)
3092 				WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGTT_MGCG_OVERRIDE, data);
3093 		}
3094 	} else {
3095 		if (adev->cg_flags & AMD_CG_SUPPORT_GFX_MGCG) {
3096 			def = data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGTT_MGCG_OVERRIDE);
3097 
3098 			data |= (RLC_CGTT_MGCG_OVERRIDE__RLC_CGTT_SCLK_OVERRIDE_MASK |
3099 				 RLC_CGTT_MGCG_OVERRIDE__GRBM_CGTT_SCLK_OVERRIDE_MASK |
3100 				 RLC_CGTT_MGCG_OVERRIDE__GFXIP_MGCG_OVERRIDE_MASK);
3101 
3102 			if (def != data)
3103 				WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGTT_MGCG_OVERRIDE, data);
3104 		}
3105 	}
3106 }
3107 
3108 static void gfx_v12_1_xcc_update_repeater_fgcg(struct amdgpu_device *adev,
3109 					       bool enable, int xcc_id)
3110 {
3111 	uint32_t def, data;
3112 
3113 	if (!(adev->cg_flags & AMD_CG_SUPPORT_REPEATER_FGCG))
3114 		return;
3115 
3116 	def = data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGTT_MGCG_OVERRIDE);
3117 
3118 	if (enable)
3119 		data &= ~(RLC_CGTT_MGCG_OVERRIDE__GFXIP_REPEATER_FGCG_OVERRIDE_MASK |
3120 				  RLC_CGTT_MGCG_OVERRIDE__RLC_REPEATER_FGCG_OVERRIDE_MASK);
3121 	else
3122 		data |= RLC_CGTT_MGCG_OVERRIDE__GFXIP_REPEATER_FGCG_OVERRIDE_MASK |
3123 				RLC_CGTT_MGCG_OVERRIDE__RLC_REPEATER_FGCG_OVERRIDE_MASK;
3124 
3125 	if (def != data)
3126 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGTT_MGCG_OVERRIDE, data);
3127 }
3128 
3129 static void gfx_v12_1_xcc_update_sram_fgcg(struct amdgpu_device *adev,
3130 					   bool enable, int xcc_id)
3131 {
3132 	uint32_t def, data;
3133 
3134 	if (!(adev->cg_flags & AMD_CG_SUPPORT_GFX_FGCG))
3135 		return;
3136 
3137 	def = data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGTT_MGCG_OVERRIDE);
3138 
3139 	if (enable)
3140 		data &= ~RLC_CGTT_MGCG_OVERRIDE__GFXIP_FGCG_OVERRIDE_MASK;
3141 	else
3142 		data |= RLC_CGTT_MGCG_OVERRIDE__GFXIP_FGCG_OVERRIDE_MASK;
3143 
3144 	if (def != data)
3145 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGTT_MGCG_OVERRIDE, data);
3146 }
3147 
3148 static void gfx_v12_1_xcc_update_perf_clk(struct amdgpu_device *adev,
3149 					  bool enable, int xcc_id)
3150 {
3151 	uint32_t def, data;
3152 
3153 	if (!(adev->cg_flags & AMD_CG_SUPPORT_GFX_PERF_CLK))
3154 		return;
3155 
3156 	def = data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGTT_MGCG_OVERRIDE);
3157 
3158 	if (enable)
3159 		data &= ~RLC_CGTT_MGCG_OVERRIDE__PERFMON_CLOCK_STATE_MASK;
3160 	else
3161 		data |= RLC_CGTT_MGCG_OVERRIDE__PERFMON_CLOCK_STATE_MASK;
3162 
3163 	if (def != data)
3164 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGTT_MGCG_OVERRIDE, data);
3165 }
3166 
3167 static int gfx_v12_1_xcc_update_gfx_clock_gating(struct amdgpu_device *adev,
3168 					     bool enable, int xcc_id)
3169 {
3170 	amdgpu_gfx_rlc_enter_safe_mode(adev, xcc_id);
3171 
3172 	gfx_v12_1_xcc_update_coarse_grain_clock_gating(adev, enable, xcc_id);
3173 
3174 	gfx_v12_1_xcc_update_medium_grain_clock_gating(adev, enable, xcc_id);
3175 
3176 	gfx_v12_1_xcc_update_repeater_fgcg(adev, enable, xcc_id);
3177 
3178 	gfx_v12_1_xcc_update_sram_fgcg(adev, enable, xcc_id);
3179 
3180 	gfx_v12_1_xcc_update_perf_clk(adev, enable, xcc_id);
3181 
3182 	if (adev->cg_flags &
3183 	    (AMD_CG_SUPPORT_GFX_MGCG |
3184 	     AMD_CG_SUPPORT_GFX_CGLS |
3185 	     AMD_CG_SUPPORT_GFX_CGCG |
3186 	     AMD_CG_SUPPORT_GFX_3D_CGCG |
3187 	     AMD_CG_SUPPORT_GFX_3D_CGLS))
3188 		gfx_v12_1_xcc_enable_gui_idle_interrupt(adev, enable, xcc_id);
3189 
3190 	amdgpu_gfx_rlc_exit_safe_mode(adev, xcc_id);
3191 
3192 	return 0;
3193 }
3194 
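/* CG is programmed per XCC; SR-IOV guests leave clock gating to the host. */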
3195 static int gfx_v12_1_set_clockgating_state(struct amdgpu_ip_block *ip_block,
3196 					   enum amd_clockgating_state state)
3197 {
3198 	struct amdgpu_device *adev = ip_block->adev;
3199 	int i, num_xcc;
3200 
3201 	if (amdgpu_sriov_vf(adev))
3202 		return 0;
3203 
3204 	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
3205 	switch (adev->ip_versions[GC_HWIP][0]) {
3206 	case IP_VERSION(12, 1, 0):
3207 		for (i = 0; i < num_xcc; i++)
3208 			gfx_v12_1_xcc_update_gfx_clock_gating(adev,
3209 				  state == AMD_CG_STATE_GATE, i);
3210 		break;
3211 	default:
3212 		break;
3213 	}
3214 
3215 	return 0;
3216 }
3217 
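/* Report the currently active CG features by inspecting the hardware state. */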
3218 static void gfx_v12_1_get_clockgating_state(struct amdgpu_ip_block *ip_block, u64 *flags)
3219 {
3220 	struct amdgpu_device *adev = ip_block->adev;
3221 	int data;
3222 
3223 	/* AMD_CG_SUPPORT_GFX_MGCG */
3224 	data = RREG32_SOC15(GC, GET_INST(GC, 0), regRLC_CGTT_MGCG_OVERRIDE);
3225 	if (!(data & RLC_CGTT_MGCG_OVERRIDE__GFXIP_MGCG_OVERRIDE_MASK))
3226 		*flags |= AMD_CG_SUPPORT_GFX_MGCG;
3227 
3228 	/* AMD_CG_SUPPORT_REPEATER_FGCG */
3229 	if (!(data & RLC_CGTT_MGCG_OVERRIDE__GFXIP_REPEATER_FGCG_OVERRIDE_MASK))
3230 		*flags |= AMD_CG_SUPPORT_REPEATER_FGCG;
3231 
3232 	/* AMD_CG_SUPPORT_GFX_FGCG */
3233 	if (!(data & RLC_CGTT_MGCG_OVERRIDE__GFXIP_FGCG_OVERRIDE_MASK))
3234 		*flags |= AMD_CG_SUPPORT_GFX_FGCG;
3235 
3236 	/* AMD_CG_SUPPORT_GFX_PERF_CLK */
3237 	if (!(data & RLC_CGTT_MGCG_OVERRIDE__PERFMON_CLOCK_STATE_MASK))
3238 		*flags |= AMD_CG_SUPPORT_GFX_PERF_CLK;
3239 
3240 	/* AMD_CG_SUPPORT_GFX_CGCG */
3241 	data = RREG32_SOC15(GC, GET_INST(GC, 0), regRLC_CGCG_CGLS_CTRL);
3242 	if (data & RLC_CGCG_CGLS_CTRL__CGCG_EN_MASK)
3243 		*flags |= AMD_CG_SUPPORT_GFX_CGCG;
3244 
3245 	/* AMD_CG_SUPPORT_GFX_CGLS */
3246 	if (data & RLC_CGCG_CGLS_CTRL__CGLS_EN_MASK)
3247 		*flags |= AMD_CG_SUPPORT_GFX_CGLS;
3248 }
3249 
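/* The CP publishes the compute ring rptr through write-back memory. */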
3250 static u64 gfx_v12_1_ring_get_rptr_compute(struct amdgpu_ring *ring)
3251 {
3252 	/* gfx12 hardware uses a 32-bit rptr */
3253 	return *(uint32_t *)ring->rptr_cpu_addr;
3254 }
3255 
3256 static u64 gfx_v12_1_ring_get_wptr_compute(struct amdgpu_ring *ring)
3257 {
3258 	u64 wptr;
3259 
3260 	/* XXX check if swapping is necessary on BE */
3261 	if (ring->use_doorbell)
3262 		wptr = atomic64_read((atomic64_t *)ring->wptr_cpu_addr);
3263 	else
3264 		BUG();
3265 	return wptr;
3266 }
3267 
3268 static void gfx_v12_1_ring_set_wptr_compute(struct amdgpu_ring *ring)
3269 {
3270 	struct amdgpu_device *adev = ring->adev;
3271 
3272 	/* XXX check if swapping is necessary on BE */
3273 	if (ring->use_doorbell) {
3274 		atomic64_set((atomic64_t *)ring->wptr_cpu_addr,
3275 			     ring->wptr);
3276 		WDOORBELL64(ring->doorbell_index, ring->wptr);
3277 	} else {
3278 		BUG(); /* only DOORBELL method supported on gfx12 now */
3279 	}
3280 }
3281 
3282 static void gfx_v12_1_ring_emit_ib_compute(struct amdgpu_ring *ring,
3283 					   struct amdgpu_job *job,
3284 					   struct amdgpu_ib *ib,
3285 					   uint32_t flags)
3286 {
3287 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
3288 	u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);
3289 
3290 	/* Currently, there is a high probability of a wave ID mismatch
3291 	 * between ME and GDS, leading to a HW deadlock, because ME generates
3292 	 * different wave IDs than the GDS expects. This happens randomly
3293 	 * when at least 5 compute pipes use GDS ordered append.
3294 	 * The wave IDs generated by ME are also wrong after suspend/resume.
3295 	 * Those are probably bugs somewhere else in the kernel driver.
3296 	 *
3297 	 * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and
3298 	 * GDS to 0 for this ring (me/pipe).
3299 	 */
3300 	if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
3301 		amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
3302 		amdgpu_ring_write(ring, regGDS_COMPUTE_MAX_WAVE_ID);
		/* count 1 implies two payload dwords: the reg offset and its value */
		amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
3303 	}
3304 
3305 	amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
3306 	BUG_ON(ib->gpu_addr & 0x3); /* Dword align */
3307 	amdgpu_ring_write(ring,
3308 #ifdef __BIG_ENDIAN
3309 				(2 << 0) |
3310 #endif
3311 				lower_32_bits(ib->gpu_addr));
3312 	amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr));
3313 	amdgpu_ring_write(ring, control);
3314 }
3315 
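/*
 * Emit a fence with RELEASE_MEM: flush/invalidate the caches selected by
 * the GCR field, then write the seq to memory and optionally raise an
 * interrupt once the write has landed.
 */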
3316 static void gfx_v12_1_ring_emit_fence(struct amdgpu_ring *ring, u64 addr,
3317 				     u64 seq, unsigned flags)
3318 {
3319 	bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
3320 	bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
3321 
3322 	/* RELEASE_MEM - flush caches, send int */
3323 	amdgpu_ring_write(ring, PACKET3(PACKET3_RELEASE_MEM, 6));
3324 	amdgpu_ring_write(ring, (PACKET3_RELEASE_MEM_GCR_SEQ(1) |
3325 				 PACKET3_RELEASE_MEM_GCR_GLV_WB |
3326 				 PACKET3_RELEASE_MEM_GCR_GL2_WB |
3327 				 PACKET3_RELEASE_MEM_GCR_GL2_SCOPE(2) |
3328 				 PACKET3_RELEASE_MEM_TEMPORAL(3) |
3329 				 PACKET3_RELEASE_MEM_EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) |
3330 				 PACKET3_RELEASE_MEM_EVENT_INDEX(5)));
3331 	amdgpu_ring_write(ring, (PACKET3_RELEASE_MEM_DATA_SEL(write64bit ? 2 : 1) |
3332 				 PACKET3_RELEASE_MEM_INT_SEL(int_sel ? 2 : 0)));
3333 
3334 	/*
3335 	 * The address must be Qword aligned for a 64-bit write, and Dword
3336 	 * aligned when only the 32-bit data low is sent (data high discarded).
3337 	 */
3338 	if (write64bit)
3339 		BUG_ON(addr & 0x7);
3340 	else
3341 		BUG_ON(addr & 0x3);
3342 	amdgpu_ring_write(ring, lower_32_bits(addr));
3343 	amdgpu_ring_write(ring, upper_32_bits(addr));
3344 	amdgpu_ring_write(ring, lower_32_bits(seq));
3345 	amdgpu_ring_write(ring, upper_32_bits(seq));
3346 	amdgpu_ring_write(ring, 0);
3347 }
3348 
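/*
 * Pipeline sync: stall the engine until the fence location in memory
 * reaches the most recently emitted sync seq.
 */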
3349 static void gfx_v12_1_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
3350 {
3351 	int usepfp = (ring->funcs->type == AMDGPU_RING_TYPE_GFX);
3352 	uint32_t seq = ring->fence_drv.sync_seq;
3353 	uint64_t addr = ring->fence_drv.gpu_addr;
3354 
3355 	gfx_v12_1_wait_reg_mem(ring, usepfp, 1, 0, lower_32_bits(addr),
3356 			       upper_32_bits(addr), seq, 0xffffffff, 4);
3357 }
3358 
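/* Emit an INVALIDATE_TLBS packet for the given PASID and flush type. */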
3359 static void gfx_v12_1_ring_invalidate_tlbs(struct amdgpu_ring *ring,
3360 				   uint16_t pasid, uint32_t flush_type,
3361 				   bool all_hub, uint8_t dst_sel)
3362 {
3363 	amdgpu_ring_write(ring, PACKET3(PACKET3_INVALIDATE_TLBS, 0));
3364 	amdgpu_ring_write(ring,
3365 			  PACKET3_INVALIDATE_TLBS_DST_SEL(dst_sel) |
3366 			  PACKET3_INVALIDATE_TLBS_ALL_HUB(all_hub) |
3367 			  PACKET3_INVALIDATE_TLBS_PASID(pasid) |
3368 			  PACKET3_INVALIDATE_TLBS_FLUSH_TYPE(flush_type));
3369 }
3370 
3371 static void gfx_v12_1_ring_emit_vm_flush(struct amdgpu_ring *ring,
3372 					 unsigned vmid, uint64_t pd_addr)
3373 {
3374 	amdgpu_gmc_emit_flush_gpu_tlb(ring, vmid, pd_addr);
3375 
3376 	/* compute doesn't have PFP */
3377 	if (ring->funcs->type == AMDGPU_RING_TYPE_GFX) {
3378 		/* sync PFP to ME, otherwise we might get invalid PFP reads */
3379 		amdgpu_ring_write(ring, PACKET3(PACKET3_PFP_SYNC_ME, 0));
3380 		amdgpu_ring_write(ring, 0x0);
3381 	}
3382 }
3383 
3384 static void gfx_v12_1_ring_emit_fence_kiq(struct amdgpu_ring *ring, u64 addr,
3385 					  u64 seq, unsigned int flags)
3386 {
3387 	struct amdgpu_device *adev = ring->adev;
3388 
3389 	/* only 32 bits are allocated for each fence seq writeback slot */
3390 	BUG_ON(flags & AMDGPU_FENCE_FLAG_64BIT);
3391 
3392 	/* write fence seq to the "addr" */
3393 	amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
3394 	amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
3395 				 WRITE_DATA_DST_SEL(5) | WR_CONFIRM));
3396 	amdgpu_ring_write(ring, lower_32_bits(addr));
3397 	amdgpu_ring_write(ring, upper_32_bits(addr));
3398 	amdgpu_ring_write(ring, lower_32_bits(seq));
3399 
3400 	if (flags & AMDGPU_FENCE_FLAG_INT) {
3401 		/* set register to trigger INT */
3402 		amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
3403 		amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
3404 					 WRITE_DATA_DST_SEL(0) | WR_CONFIRM));
3405 		amdgpu_ring_write(ring, SOC15_REG_OFFSET(GC, GET_INST(GC, 0), regCPC_INT_STATUS));
3406 		amdgpu_ring_write(ring, 0);
3407 		amdgpu_ring_write(ring, 0x20000000); /* src_id is 178 */
3408 	}
3409 }
3410 
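/* Read a register with COPY_DATA into this ring's write-back slot. */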
3411 static void gfx_v12_1_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
3412 				     uint32_t reg_val_offs)
3413 {
3414 	struct amdgpu_device *adev = ring->adev;
3415 
3416 	reg = soc_v1_0_normalize_xcc_reg_offset(reg);
3417 
3418 	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
3419 	amdgpu_ring_write(ring, 0 |	/* src: register*/
3420 				(5 << 8) |	/* dst: memory */
3421 				(1 << 20));	/* write confirm */
3422 	amdgpu_ring_write(ring, reg);
3423 	amdgpu_ring_write(ring, 0);
3424 	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
3425 				reg_val_offs * 4));
3426 	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
3427 				reg_val_offs * 4));
3428 }
3429 
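/*
 * Register write on the ring: KIQ uses the no-increment-address
 * encoding, all other rings request a write confirmation.
 */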
3430 static void gfx_v12_1_ring_emit_wreg(struct amdgpu_ring *ring,
3431 				     uint32_t reg,
3432 				     uint32_t val)
3433 {
3434 	uint32_t cmd = 0;
3435 
3436 	reg = soc_v1_0_normalize_xcc_reg_offset(reg);
3437 
3438 	switch (ring->funcs->type) {
3439 	case AMDGPU_RING_TYPE_KIQ:
3440 		cmd = (1 << 16); /* no inc addr */
3441 		break;
3442 	default:
3443 		cmd = WR_CONFIRM;
3444 		break;
3445 	}
3446 	amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
3447 	amdgpu_ring_write(ring, cmd);
3448 	amdgpu_ring_write(ring, reg);
3449 	amdgpu_ring_write(ring, 0);
3450 	amdgpu_ring_write(ring, val);
3451 }
3452 
3453 static void gfx_v12_1_ring_emit_reg_wait(struct amdgpu_ring *ring, uint32_t reg,
3454 					uint32_t val, uint32_t mask)
3455 {
3456 	gfx_v12_1_wait_reg_mem(ring, 0, 0, 0, reg, 0, val, mask, 0x20);
3457 }
3458 
3459 static void gfx_v12_1_ring_emit_reg_write_reg_wait(struct amdgpu_ring *ring,
3460 						   uint32_t reg0, uint32_t reg1,
3461 						   uint32_t ref, uint32_t mask)
3462 {
3463 	int usepfp = (ring->funcs->type == AMDGPU_RING_TYPE_GFX);
3464 
3465 	gfx_v12_1_wait_reg_mem(ring, usepfp, 0, 1, reg0, reg1,
3466 			       ref, mask, 0x20);
3467 }
3468 
3469 static void gfx_v12_1_xcc_set_compute_eop_interrupt_state(struct amdgpu_device *adev,
3470 							int me, int pipe,
3471 							enum amdgpu_interrupt_state state,
3472 							int xcc_id)
3473 {
3474 	u32 mec_int_cntl, mec_int_cntl_reg;
3475 
3476 	/*
3477 	 * amdgpu controls only the first MEC. That's why this function only
3478 	 * handles the setting of interrupts for this specific MEC. All other
3479 	 * pipes' interrupts are set by amdkfd.
3480 	 */
3481 
3482 	if (me == 1) {
3483 		switch (pipe) {
3484 		case 0:
3485 			mec_int_cntl_reg = SOC15_REG_OFFSET(
3486 					GC, GET_INST(GC, xcc_id),
3487 					regCP_ME1_PIPE0_INT_CNTL);
3488 			break;
3489 		case 1:
3490 			mec_int_cntl_reg = SOC15_REG_OFFSET(
3491 					GC, GET_INST(GC, xcc_id),
3492 					regCP_ME1_PIPE1_INT_CNTL);
3493 			break;
3494 		case 2:
3495 			mec_int_cntl_reg = SOC15_REG_OFFSET(
3496 					GC, GET_INST(GC, xcc_id),
3497 					regCP_ME1_PIPE2_INT_CNTL);
3498 			break;
3499 		case 3:
3500 			mec_int_cntl_reg = SOC15_REG_OFFSET(
3501 					GC, GET_INST(GC, xcc_id),
3502 					regCP_ME1_PIPE3_INT_CNTL);
3503 			break;
3504 		default:
3505 			DRM_DEBUG("invalid pipe %d\n", pipe);
3506 			return;
3507 		}
3508 	} else {
3509 		DRM_DEBUG("invalid me %d\n", me);
3510 		return;
3511 	}
3512 
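	/* all CP_ME1_PIPEn_INT_CNTL registers share the PIPE0 field layout */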
3513 	switch (state) {
3514 	case AMDGPU_IRQ_STATE_DISABLE:
3515 		mec_int_cntl = RREG32_XCC(mec_int_cntl_reg, xcc_id);
3516 		mec_int_cntl = REG_SET_FIELD(mec_int_cntl, CP_ME1_PIPE0_INT_CNTL,
3517 					     TIME_STAMP_INT_ENABLE, 0);
3518 		mec_int_cntl = REG_SET_FIELD(mec_int_cntl, CP_ME1_PIPE0_INT_CNTL,
3519 					     GENERIC0_INT_ENABLE, 0);
3520 		WREG32_XCC(mec_int_cntl_reg, mec_int_cntl, xcc_id);
3521 		break;
3522 	case AMDGPU_IRQ_STATE_ENABLE:
3523 		mec_int_cntl = RREG32_XCC(mec_int_cntl_reg, xcc_id);
3524 		mec_int_cntl = REG_SET_FIELD(mec_int_cntl, CP_ME1_PIPE0_INT_CNTL,
3525 					     TIME_STAMP_INT_ENABLE, 1);
3526 		mec_int_cntl = REG_SET_FIELD(mec_int_cntl, CP_ME1_PIPE0_INT_CNTL,
3527 					     GENERIC0_INT_ENABLE, 1);
3528 		WREG32_XCC(mec_int_cntl_reg, mec_int_cntl, xcc_id);
3529 		break;
3530 	default:
3531 		break;
3532 	}
3533 }
3534 
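/*
 * Map the EOP interrupt type to the matching MEC1 pipe and apply the
 * state on every XCC.
 */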
3535 static int gfx_v12_1_set_eop_interrupt_state(struct amdgpu_device *adev,
3536 					    struct amdgpu_irq_src *src,
3537 					    unsigned type,
3538 					    enum amdgpu_interrupt_state state)
3539 {
3540 	int i, num_xcc;
3541 
3542 	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
3543 	for (i = 0; i < num_xcc; i++) {
3544 		switch (type) {
3545 		case AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP:
3546 			gfx_v12_1_xcc_set_compute_eop_interrupt_state(
3547 					adev, 1, 0, state, i);
3548 			break;
3549 		case AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE1_EOP:
3550 			gfx_v12_1_xcc_set_compute_eop_interrupt_state(
3551 					adev, 1, 1, state, i);
3552 			break;
3553 		case AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE2_EOP:
3554 			gfx_v12_1_xcc_set_compute_eop_interrupt_state(
3555 					adev, 1, 2, state, i);
3556 			break;
3557 		case AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE3_EOP:
3558 			gfx_v12_1_xcc_set_compute_eop_interrupt_state(
3559 					adev, 1, 3, state, i);
3560 			break;
3561 		default:
3562 			break;
3563 		}
3564 	}
3565 
3566 	return 0;
3567 }
3568 
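/*
 * CP EOP interrupt handler: MES-managed queues are looked up by the
 * queue id carried in src_data, while kernel queues decode me/pipe/queue
 * from ring_id and have the matching ring's fences processed.
 */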
3569 static int gfx_v12_1_eop_irq(struct amdgpu_device *adev,
3570 			     struct amdgpu_irq_src *source,
3571 			     struct amdgpu_iv_entry *entry)
3572 {
3573 	int i, xcc_id;
3574 	u8 me_id, pipe_id, queue_id;
3575 	struct amdgpu_ring *ring;
3576 	uint32_t mes_queue_id = entry->src_data[0];
3577 
3578 	DRM_DEBUG("IH: CP EOP\n");
3579 
3580 	if (adev->enable_mes && (mes_queue_id & AMDGPU_FENCE_MES_QUEUE_FLAG)) {
3581 		struct amdgpu_mes_queue *queue;
3582 
3583 		mes_queue_id &= AMDGPU_FENCE_MES_QUEUE_ID_MASK;
3584 
3585 		spin_lock(&adev->mes.queue_id_lock);
3586 		queue = idr_find(&adev->mes.queue_id_idr, mes_queue_id);
3587 		if (queue) {
3588 			DRM_DEBUG("process mes queue id = %d\n", mes_queue_id);
3589 			amdgpu_fence_process(queue->ring);
3590 		}
3591 		spin_unlock(&adev->mes.queue_id_lock);
3592 	} else {
3593 		me_id = (entry->ring_id & 0x0c) >> 2;
3594 		pipe_id = (entry->ring_id & 0x03) >> 0;
3595 		queue_id = (entry->ring_id & 0x70) >> 4;
3596 		xcc_id = gfx_v12_1_ih_to_xcc_inst(adev, entry->node_id);
3597 
3598 		if (xcc_id == -EINVAL)
3599 			return -EINVAL;
3600 
3601 		switch (me_id) {
3602 		case 0:
3603 			if (pipe_id == 0)
3604 				amdgpu_fence_process(&adev->gfx.gfx_ring[0]);
3605 			else
3606 				amdgpu_fence_process(&adev->gfx.gfx_ring[1]);
3607 			break;
3608 		case 1:
3609 		case 2:
3610 			for (i = 0; i < adev->gfx.num_compute_rings; i++) {
3611 				ring = &adev->gfx.compute_ring
3612 						[i +
3613 						 xcc_id * adev->gfx.num_compute_rings];
3614 				/* Per-queue interrupt is supported for MEC starting from VI.
3615 				 * The interrupt can only be enabled/disabled per pipe instead
3616 				 * of per queue.
3617 				 */
3618 				if ((ring->me == me_id) &&
3619 				    (ring->pipe == pipe_id) &&
3620 				    (ring->queue == queue_id))
3621 					amdgpu_fence_process(ring);
3622 			}
3623 			break;
3624 		}
3625 	}
3626 
3627 	return 0;
3628 }
3629 
3630 static int gfx_v12_1_set_priv_reg_fault_state(struct amdgpu_device *adev,
3631 					      struct amdgpu_irq_src *source,
3632 					      unsigned type,
3633 					      enum amdgpu_interrupt_state state)
3634 {
3635 	int i, num_xcc;
3636 
3637 	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
3638 	switch (state) {
3639 	case AMDGPU_IRQ_STATE_DISABLE:
3640 	case AMDGPU_IRQ_STATE_ENABLE:
3641 		for (i = 0; i < num_xcc; i++)
3642 			WREG32_FIELD15_PREREG(GC, GET_INST(GC, i), CP_INT_CNTL_RING0,
3643 					      PRIV_REG_INT_ENABLE,
3644 					      state == AMDGPU_IRQ_STATE_ENABLE ? 1 : 0);
3645 		break;
3646 	default:
3647 		break;
3648 	}
3649 
3650 	return 0;
3651 }
3652 
3653 static int gfx_v12_1_set_priv_inst_fault_state(struct amdgpu_device *adev,
3654 					       struct amdgpu_irq_src *source,
3655 					       unsigned type,
3656 					       enum amdgpu_interrupt_state state)
3657 {
3658 	int i, num_xcc;
3659 
3660 	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
3661 	switch (state) {
3662 	case AMDGPU_IRQ_STATE_DISABLE:
3663 	case AMDGPU_IRQ_STATE_ENABLE:
3664 		for (i = 0; i < num_xcc; i++)
3665 			WREG32_FIELD15_PREREG(GC, GET_INST(GC, i), CP_INT_CNTL_RING0,
3666 				       PRIV_INSTR_INT_ENABLE,
3667 				       state == AMDGPU_IRQ_STATE_ENABLE ? 1 : 0);
3668 		break;
3669 	default:
3670 		break;
3671 	}
3672 
3673 	return 0;
3674 }
3675 
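/*
 * Report a privileged register/instruction fault to the scheduler of
 * every ring that matches the faulting me/pipe/queue.
 */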
3676 static void gfx_v12_1_handle_priv_fault(struct amdgpu_device *adev,
3677 					struct amdgpu_iv_entry *entry)
3678 {
3679 	u8 me_id, pipe_id, queue_id;
3680 	struct amdgpu_ring *ring;
3681 	int i, xcc_id;
3682 
3683 	me_id = (entry->ring_id & 0x0c) >> 2;
3684 	pipe_id = (entry->ring_id & 0x03) >> 0;
3685 	queue_id = (entry->ring_id & 0x70) >> 4;
3686 	xcc_id = gfx_v12_1_ih_to_xcc_inst(adev, entry->node_id);
3687 
3688 	if (xcc_id == -EINVAL)
3689 		return;
3690 
3691 	switch (me_id) {
3692 	case 0:
3693 		for (i = 0; i < adev->gfx.num_gfx_rings; i++) {
3694 			ring = &adev->gfx.gfx_ring[i];
3695 			/* only 1 gfx queue per pipe is enabled for now */
3696 			if (ring->me == me_id && ring->pipe == pipe_id)
3697 				drm_sched_fault(&ring->sched);
3698 		}
3699 		break;
3700 	case 1:
3701 	case 2:
3702 		for (i = 0; i < adev->gfx.num_compute_rings; i++) {
3703 			ring = &adev->gfx.compute_ring
3704 					[i +
3705 					 xcc_id * adev->gfx.num_compute_rings];
3706 			if (ring->me == me_id && ring->pipe == pipe_id &&
3707 			    ring->queue == queue_id)
3708 				drm_sched_fault(&ring->sched);
3709 		}
3710 		break;
3711 	default:
3712 		BUG();
3713 		break;
3714 	}
3715 }
3716 
3717 static int gfx_v12_1_priv_reg_irq(struct amdgpu_device *adev,
3718 				  struct amdgpu_irq_src *source,
3719 				  struct amdgpu_iv_entry *entry)
3720 {
3721 	DRM_ERROR("Illegal register access in command stream\n");
3722 	gfx_v12_1_handle_priv_fault(adev, entry);
3723 	return 0;
3724 }
3725 
3726 static int gfx_v12_1_priv_inst_irq(struct amdgpu_device *adev,
3727 				   struct amdgpu_irq_src *source,
3728 				   struct amdgpu_iv_entry *entry)
3729 {
3730 	DRM_ERROR("Illegal instruction in command stream\n");
3731 	gfx_v12_1_handle_priv_fault(adev, entry);
3732 	return 0;
3733 }
3734 
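/*
 * Full memory sync: ACQUIRE_MEM with a GCR request that writes back and
 * invalidates GL2 and invalidates the GLV/GLK/GLI caches.
 */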
3735 static void gfx_v12_1_emit_mem_sync(struct amdgpu_ring *ring)
3736 {
3737 	const unsigned int gcr_cntl =
3738 			PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(1) |
3739 			PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(1) |
3740 			PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(1) |
3741 			PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(1) |
3742 			PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(1) |
3743 			PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_SCOPE(2);
3744 
3745 	/* ACQUIRE_MEM - make one or more surfaces valid for use by the subsequent operations */
3746 	amdgpu_ring_write(ring, PACKET3(PACKET3_ACQUIRE_MEM, 6));
3747 	amdgpu_ring_write(ring, 0); /* CP_COHER_CNTL */
3748 	amdgpu_ring_write(ring, 0xffffffff);  /* CP_COHER_SIZE */
3749 	amdgpu_ring_write(ring, 0xffffff);  /* CP_COHER_SIZE_HI */
3750 	amdgpu_ring_write(ring, 0); /* CP_COHER_BASE */
3751 	amdgpu_ring_write(ring, 0);  /* CP_COHER_BASE_HI */
3752 	amdgpu_ring_write(ring, 0x0000000A); /* POLL_INTERVAL */
3753 	amdgpu_ring_write(ring, gcr_cntl); /* GCR_CNTL */
3754 }
3755 
3756 static const struct amd_ip_funcs gfx_v12_1_ip_funcs = {
3757 	.name = "gfx_v12_1",
3758 	.early_init = gfx_v12_1_early_init,
3759 	.late_init = gfx_v12_1_late_init,
3760 	.sw_init = gfx_v12_1_sw_init,
3761 	.sw_fini = gfx_v12_1_sw_fini,
3762 	.hw_init = gfx_v12_1_hw_init,
3763 	.hw_fini = gfx_v12_1_hw_fini,
3764 	.suspend = gfx_v12_1_suspend,
3765 	.resume = gfx_v12_1_resume,
3766 	.is_idle = gfx_v12_1_is_idle,
3767 	.wait_for_idle = gfx_v12_1_wait_for_idle,
3768 	.set_clockgating_state = gfx_v12_1_set_clockgating_state,
3769 	.set_powergating_state = gfx_v12_1_set_powergating_state,
3770 	.get_clockgating_state = gfx_v12_1_get_clockgating_state,
3771 };
3772 
3773 static const struct amdgpu_ring_funcs gfx_v12_1_ring_funcs_compute = {
3774 	.type = AMDGPU_RING_TYPE_COMPUTE,
3775 	.align_mask = 0xff,
3776 	.nop = PACKET3(PACKET3_NOP, 0x3FFF),
3777 	.support_64bit_ptrs = true,
3778 	.get_rptr = gfx_v12_1_ring_get_rptr_compute,
3779 	.get_wptr = gfx_v12_1_ring_get_wptr_compute,
3780 	.set_wptr = gfx_v12_1_ring_set_wptr_compute,
3781 	.emit_frame_size =
3782 		7 + /* gfx_v12_1_ring_emit_pipeline_sync */
3783 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
3784 		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
3785 		2 + /* gfx_v12_1_ring_emit_vm_flush */
3786 		8 + 8 + 8 + /* gfx_v12_1_ring_emit_fence x3 for user fence, vm fence */
3787 		8, /* gfx_v12_1_emit_mem_sync */
3788 	.emit_ib_size =	7, /* gfx_v12_1_ring_emit_ib_compute */
3789 	.emit_ib = gfx_v12_1_ring_emit_ib_compute,
3790 	.emit_fence = gfx_v12_1_ring_emit_fence,
3791 	.emit_pipeline_sync = gfx_v12_1_ring_emit_pipeline_sync,
3792 	.emit_vm_flush = gfx_v12_1_ring_emit_vm_flush,
3793 	.test_ring = gfx_v12_1_ring_test_ring,
3794 	.test_ib = gfx_v12_1_ring_test_ib,
3795 	.insert_nop = amdgpu_ring_insert_nop,
3796 	.pad_ib = amdgpu_ring_generic_pad_ib,
3797 	.emit_wreg = gfx_v12_1_ring_emit_wreg,
3798 	.emit_reg_wait = gfx_v12_1_ring_emit_reg_wait,
3799 	.emit_reg_write_reg_wait = gfx_v12_1_ring_emit_reg_write_reg_wait,
3800 	.emit_mem_sync = gfx_v12_1_emit_mem_sync,
3801 };
3802 
3803 static const struct amdgpu_ring_funcs gfx_v12_1_ring_funcs_kiq = {
3804 	.type = AMDGPU_RING_TYPE_KIQ,
3805 	.align_mask = 0xff,
3806 	.nop = PACKET3(PACKET3_NOP, 0x3FFF),
3807 	.support_64bit_ptrs = true,
3808 	.get_rptr = gfx_v12_1_ring_get_rptr_compute,
3809 	.get_wptr = gfx_v12_1_ring_get_wptr_compute,
3810 	.set_wptr = gfx_v12_1_ring_set_wptr_compute,
3811 	.emit_frame_size =
3812 		7 + /* gfx_v12_1_ring_emit_pipeline_sync */
3813 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
3814 		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
3815 		2 + /* gfx_v12_1_ring_emit_vm_flush */
3816 		8 + 8 + 8, /* gfx_v12_1_ring_emit_fence_kiq x3 for user fence, vm fence */
3817 	.emit_ib_size =	7, /* gfx_v12_1_ring_emit_ib_compute */
3818 	.emit_ib = gfx_v12_1_ring_emit_ib_compute,
3819 	.emit_fence = gfx_v12_1_ring_emit_fence_kiq,
3820 	.test_ring = gfx_v12_1_ring_test_ring,
3821 	.test_ib = gfx_v12_1_ring_test_ib,
3822 	.insert_nop = amdgpu_ring_insert_nop,
3823 	.pad_ib = amdgpu_ring_generic_pad_ib,
3824 	.emit_rreg = gfx_v12_1_ring_emit_rreg,
3825 	.emit_wreg = gfx_v12_1_ring_emit_wreg,
3826 	.emit_reg_wait = gfx_v12_1_ring_emit_reg_wait,
3827 	.emit_reg_write_reg_wait = gfx_v12_1_ring_emit_reg_write_reg_wait,
3828 };
3829 
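/* Hook up ring callbacks: one KIQ ring plus the compute rings on each XCC. */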
3830 static void gfx_v12_1_set_ring_funcs(struct amdgpu_device *adev)
3831 {
3832 	int i, j, num_xcc;
3833 
3834 	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
3835 	for (i = 0; i < num_xcc; i++) {
3836 		adev->gfx.kiq[i].ring.funcs = &gfx_v12_1_ring_funcs_kiq;
3837 
3838 		for (j = 0; j < adev->gfx.num_compute_rings; j++)
3839 			adev->gfx.compute_ring[j + i * adev->gfx.num_compute_rings].funcs =
3840 						&gfx_v12_1_ring_funcs_compute;
3841 	}
3842 }
3843 
3844 static const struct amdgpu_irq_src_funcs gfx_v12_1_eop_irq_funcs = {
3845 	.set = gfx_v12_1_set_eop_interrupt_state,
3846 	.process = gfx_v12_1_eop_irq,
3847 };
3848 
3849 static const struct amdgpu_irq_src_funcs gfx_v12_1_priv_reg_irq_funcs = {
3850 	.set = gfx_v12_1_set_priv_reg_fault_state,
3851 	.process = gfx_v12_1_priv_reg_irq,
3852 };
3853 
3854 static const struct amdgpu_irq_src_funcs gfx_v12_1_priv_inst_irq_funcs = {
3855 	.set = gfx_v12_1_set_priv_inst_fault_state,
3856 	.process = gfx_v12_1_priv_inst_irq,
3857 };
3858 
3859 static void gfx_v12_1_set_irq_funcs(struct amdgpu_device *adev)
3860 {
3861 	adev->gfx.eop_irq.num_types = AMDGPU_CP_IRQ_LAST;
3862 	adev->gfx.eop_irq.funcs = &gfx_v12_1_eop_irq_funcs;
3863 
3864 	adev->gfx.priv_reg_irq.num_types = 1;
3865 	adev->gfx.priv_reg_irq.funcs = &gfx_v12_1_priv_reg_irq_funcs;
3866 
3867 	adev->gfx.priv_inst_irq.num_types = 1;
3868 	adev->gfx.priv_inst_irq.funcs = &gfx_v12_1_priv_inst_irq_funcs;
3869 }
3870 
3871 static void gfx_v12_1_set_imu_funcs(struct amdgpu_device *adev)
3872 {
3873 	if (adev->flags & AMD_IS_APU)
3874 		adev->gfx.imu.mode = MISSION_MODE;
3875 	else
3876 		adev->gfx.imu.mode = DEBUG_MODE;
3877 	if (!amdgpu_sriov_vf(adev))
3878 		adev->gfx.imu.funcs = &gfx_v12_1_imu_funcs;
3879 }
3880 
3881 static void gfx_v12_1_set_rlc_funcs(struct amdgpu_device *adev)
3882 {
3883 	adev->gfx.rlc.funcs = &gfx_v12_1_rlc_funcs;
3884 }
3885 
3886 static void gfx_v12_1_set_mqd_funcs(struct amdgpu_device *adev)
3887 {
3888 	/* set compute eng mqd */
3889 	adev->mqds[AMDGPU_HW_IP_COMPUTE].mqd_size =
3890 		sizeof(struct v12_1_compute_mqd);
3891 	adev->mqds[AMDGPU_HW_IP_COMPUTE].init_mqd =
3892 		gfx_v12_1_compute_mqd_init;
3893 }
3894 
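/*
 * Program the user-requested inactive WGP bitmap for the SE/SA the
 * caller has selected via GRBM.
 */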
3895 static void gfx_v12_1_set_user_cu_inactive_bitmap_per_sh(struct amdgpu_device *adev,
3896 							  u32 bitmap, int xcc_id)
3897 {
3898 	u32 data;
3899 
3900 	if (!bitmap)
3901 		return;
3902 
3903 	data = bitmap << GC_USER_SHADER_ARRAY_CONFIG__INACTIVE_WGPS__SHIFT;
3904 	data &= GC_USER_SHADER_ARRAY_CONFIG__INACTIVE_WGPS_MASK;
3905 
3906 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regGC_USER_SHADER_ARRAY_CONFIG, data);
3907 }
3908 
3909 static u32 gfx_v12_1_get_cu_active_bitmap_per_sh(struct amdgpu_device *adev,
3910 						 int xcc_id)
3911 {
3912 	u32 data, mask;
3913 
3914 	data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCC_GC_SHADER_ARRAY_CONFIG);
3915 	data |= RREG32_SOC15(GC, GET_INST(GC, xcc_id), regGC_USER_SHADER_ARRAY_CONFIG);
3916 
3917 	data &= CC_GC_SHADER_ARRAY_CONFIG__INACTIVE_WGPS_MASK;
3918 	data >>= CC_GC_SHADER_ARRAY_CONFIG__INACTIVE_WGPS__SHIFT;
3919 
3920 	mask = amdgpu_gfx_create_bitmask(adev->gfx.config.max_cu_per_sh);
3921 
3922 	return (~data) & mask;
3923 }
3924 
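/*
 * Build the active CU bitmaps and the total CU count across all XCCs,
 * honoring any user-provided disable masks.
 */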
3925 static int gfx_v12_1_get_cu_info(struct amdgpu_device *adev,
3926 				 struct amdgpu_cu_info *cu_info)
3927 {
3928 	int i, j, k, counter, xcc_id, active_cu_number = 0;
3929 	u32 mask, bitmap;
3930 	unsigned int disable_masks[2 * 2];
3931 
3932 	if (!adev || !cu_info)
3933 		return -EINVAL;
3934 
3935 	if (adev->gfx.config.max_shader_engines > 2 ||
3936 	    adev->gfx.config.max_sh_per_se > 2) {
3937 		dev_err(adev->dev,
3938 			"Max SE (%d) and Max SA per SE (%d) is greater than expected\n",
3939 			adev->gfx.config.max_shader_engines,
3940 			adev->gfx.config.max_sh_per_se);
3941 		return -EINVAL;
3942 	}
3943 
3944 	amdgpu_gfx_parse_disable_cu(adev, disable_masks,
3945 				    adev->gfx.config.max_shader_engines,
3946 				    adev->gfx.config.max_sh_per_se);
3947 
3948 	mutex_lock(&adev->grbm_idx_mutex);
3949 	for (xcc_id = 0; xcc_id < NUM_XCC(adev->gfx.xcc_mask); xcc_id++) {
3950 		for (i = 0; i < adev->gfx.config.max_shader_engines; i++) {
3951 			for (j = 0; j < adev->gfx.config.max_sh_per_se; j++) {
3952 				bitmap = i * adev->gfx.config.max_sh_per_se + j;
3953 				if (!((gfx_v12_1_get_sa_active_bitmap(adev, xcc_id) >> bitmap) & 1))
3954 					continue;
3955 				mask = 1;
3956 				counter = 0;
3957 				gfx_v12_1_xcc_select_se_sh(adev, i, j, 0xffffffff, xcc_id);
3958 				gfx_v12_1_set_user_cu_inactive_bitmap_per_sh(
3959 					adev,
3960 					disable_masks[i * adev->gfx.config.max_sh_per_se + j],
3961 					xcc_id);
3962 				bitmap = gfx_v12_1_get_cu_active_bitmap_per_sh(adev, xcc_id);
3963 
3964 				cu_info->bitmap[xcc_id][i][j] = bitmap;
3965 
3966 				for (k = 0; k < adev->gfx.config.max_cu_per_sh; k++) {
3967 					if (bitmap & mask)
3968 						counter++;
3969 
3970 					mask <<= 1;
3971 				}
3972 				active_cu_number += counter;
3973 			}
3974 		}
3975 		gfx_v12_1_xcc_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff, xcc_id);
3976 	}
3977 	mutex_unlock(&adev->grbm_idx_mutex);
3978 
3979 	cu_info->number = active_cu_number;
3980 	cu_info->simd_per_cu = NUM_SIMD_PER_CU_GFX12_1;
3981 	cu_info->lds_size = 320;
3982 
3983 	return 0;
3984 }
3985 
3986 const struct amdgpu_ip_block_version gfx_v12_1_ip_block = {
3987 	.type = AMD_IP_BLOCK_TYPE_GFX,
3988 	.major = 12,
3989 	.minor = 1,
3990 	.rev = 0,
3991 	.funcs = &gfx_v12_1_ip_funcs,
3992 };
3993 
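/*
 * XCP (partition) suspend/resume entry points operate only on the XCC
 * instances named in inst_mask.
 */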
3994 static int gfx_v12_1_xcp_resume(void *handle, uint32_t inst_mask)
3995 {
3996 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
3997 	uint32_t tmp_mask;
3998 	int i, r;
3999 
4000 	/* TODO: initialize golden registers */
4001 	/* gfx_v12_1_init_golden_registers(adev); */
4002 
4003 	tmp_mask = inst_mask;
4004 	for_each_inst(i, tmp_mask)
4005 		gfx_v12_1_xcc_constants_init(adev, i);
4006 
4007 	if (!amdgpu_sriov_vf(adev)) {
4008 		tmp_mask = inst_mask;
4009 		for_each_inst(i, tmp_mask) {
4010 			r = gfx_v12_1_xcc_rlc_resume(adev, i);
4011 			if (r)
4012 				return r;
4013 		}
4014 	}
4015 
4016 	r = gfx_v12_1_xcc_cp_resume(adev, inst_mask);
4017 
4018 	return r;
4019 }
4020 
4021 static int gfx_v12_1_xcp_suspend(void *handle, uint32_t inst_mask)
4022 {
4023 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
4024 	int i;
4025 
4026 	for_each_inst(i, inst_mask)
4027 		gfx_v12_1_xcc_fini(adev, i);
4028 
4029 	return 0;
4030 }
4031 
4032 struct amdgpu_xcp_ip_funcs gfx_v12_1_xcp_funcs = {
4033 	.suspend = &gfx_v12_1_xcp_suspend,
4034 	.resume = &gfx_v12_1_xcp_resume
4035 };
4036