/*
 * Copyright 2025 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include <linux/delay.h>
#include <linux/kernel.h>
#include <linux/firmware.h>
#include <linux/module.h>
#include <linux/pci.h>
#include "amdgpu.h"
#include "amdgpu_gfx.h"
#include "amdgpu_psp.h"
#include "amdgpu_smu.h"
#include "amdgpu_atomfirmware.h"
#include "imu_v12_1.h"
#include "soc_v1_0.h"
#include "gfx_v12_1_pkt.h"

#include "gc/gc_12_1_0_offset.h"
#include "gc/gc_12_1_0_sh_mask.h"
#include "soc24_enum.h"
#include "ivsrcid/gfx/irqsrcs_gfx_12_1_0.h"

#include "soc15.h"
#include "clearstate_gfx12.h"
#include "v12_structs.h"
#include "gfx_v12_1.h"
#include "mes_v12_1.h"

#define GFX12_MEC_HPD_SIZE	2048
#define NUM_SIMD_PER_CU_GFX12_1	4

#define RLCG_UCODE_LOADING_START_ADDRESS	0x00002000L

MODULE_FIRMWARE("amdgpu/gc_12_1_0_mec.bin");
MODULE_FIRMWARE("amdgpu/gc_12_1_0_rlc.bin");
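
/*
 * Default SH_MEM_CONFIG: 64-bit address mode, unaligned memory access
 * allowed, and an initial instruction prefetch depth of 3.
 */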
#define SH_MEM_ALIGNMENT_MODE_UNALIGNED_GFX12_1_0	0x00000001
#define DEFAULT_SH_MEM_CONFIG \
	((SH_MEM_ADDRESS_MODE_64 << SH_MEM_CONFIG__ADDRESS_MODE__SHIFT) | \
	 (SH_MEM_ALIGNMENT_MODE_UNALIGNED_GFX12_1_0 << SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT) | \
	 (3 << SH_MEM_CONFIG__INITIAL_INST_PREFETCH__SHIFT))

static void gfx_v12_1_xcc_disable_gpa_mode(struct amdgpu_device *adev, int xcc_id);
static void gfx_v12_1_set_ring_funcs(struct amdgpu_device *adev);
static void gfx_v12_1_set_irq_funcs(struct amdgpu_device *adev);
static void gfx_v12_1_set_rlc_funcs(struct amdgpu_device *adev);
static void gfx_v12_1_set_mqd_funcs(struct amdgpu_device *adev);
static void gfx_v12_1_set_imu_funcs(struct amdgpu_device *adev);
static int gfx_v12_1_get_cu_info(struct amdgpu_device *adev,
				 struct amdgpu_cu_info *cu_info);
static uint64_t gfx_v12_1_get_gpu_clock_counter(struct amdgpu_device *adev);
static void gfx_v12_1_xcc_select_se_sh(struct amdgpu_device *adev, u32 se_num,
				       u32 sh_num, u32 instance, int xcc_id);
static void gfx_v12_1_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg,
				     uint32_t val);
static int gfx_v12_1_wait_for_rlc_autoload_complete(struct amdgpu_device *adev);
static void gfx_v12_1_ring_invalidate_tlbs(struct amdgpu_ring *ring,
					   uint16_t pasid, uint32_t flush_type,
					   bool all_hub, uint8_t dst_sel);
static void gfx_v12_1_xcc_set_safe_mode(struct amdgpu_device *adev, int xcc_id);
static void gfx_v12_1_xcc_unset_safe_mode(struct amdgpu_device *adev, int xcc_id);
static void gfx_v12_1_update_perf_clk(struct amdgpu_device *adev,
				      bool enable);
static void gfx_v12_1_xcc_update_perf_clk(struct amdgpu_device *adev,
					  bool enable, int xcc_id);
static int gfx_v12_1_init_cp_compute_microcode_bo(struct amdgpu_device *adev);
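
/*
 * KIQ (Kernel Interface Queue) PM4 helpers.  The KIQ is a privileged
 * compute queue through which the driver asks the CP firmware to map,
 * unmap and query other queues, instead of programming them directly
 * via MMIO.
 */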
static void gfx_v12_1_kiq_set_resources(struct amdgpu_ring *kiq_ring,
					uint64_t queue_mask)
{
	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_SET_RESOURCES, 6));
	amdgpu_ring_write(kiq_ring, PACKET3_SET_RESOURCES_VMID_MASK(0) |
			  PACKET3_SET_RESOURCES_QUEUE_TYPE(0));	/* vmid_mask:0 queue_type:0 (KIQ) */
	amdgpu_ring_write(kiq_ring, lower_32_bits(queue_mask));	/* queue mask lo */
	amdgpu_ring_write(kiq_ring, upper_32_bits(queue_mask));	/* queue mask hi */
	amdgpu_ring_write(kiq_ring, 0);	/* gws mask lo */
	amdgpu_ring_write(kiq_ring, 0);	/* gws mask hi */
	amdgpu_ring_write(kiq_ring, 0);	/* oac mask */
	amdgpu_ring_write(kiq_ring, 0);	/* gds heap base:0, gds heap size:0 */
}

static void gfx_v12_1_kiq_map_queues(struct amdgpu_ring *kiq_ring,
				     struct amdgpu_ring *ring)
{
	uint64_t mqd_addr = amdgpu_bo_gpu_offset(ring->mqd_obj);
	uint64_t wptr_addr = ring->wptr_gpu_addr;
	uint32_t me = 0, eng_sel = 0;

	switch (ring->funcs->type) {
	case AMDGPU_RING_TYPE_COMPUTE:
		me = 1;
		eng_sel = 0;
		break;
	case AMDGPU_RING_TYPE_MES:
		me = 2;
		eng_sel = 5;
		break;
	default:
		WARN_ON(1);
	}

	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
	amdgpu_ring_write(kiq_ring, /* Q_sel: 0, vmid: 0, engine: 0, num_Q: 1 */
			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */
			  PACKET3_MAP_QUEUES_VMID(0) | /* VMID */
			  PACKET3_MAP_QUEUES_QUEUE(ring->queue) |
			  PACKET3_MAP_QUEUES_PIPE(ring->pipe) |
			  PACKET3_MAP_QUEUES_ME(me) |
			  PACKET3_MAP_QUEUES_QUEUE_TYPE(0) | /* queue_type: normal compute queue */
			  PACKET3_MAP_QUEUES_ALLOC_FORMAT(0) | /* alloc format: all_on_one_pipe */
			  PACKET3_MAP_QUEUES_ENGINE_SEL(eng_sel) |
			  PACKET3_MAP_QUEUES_NUM_QUEUES(1)); /* num_queues: must be 1 */
	amdgpu_ring_write(kiq_ring, PACKET3_MAP_QUEUES_DOORBELL_OFFSET(ring->doorbell_index));
	amdgpu_ring_write(kiq_ring, lower_32_bits(mqd_addr));
	amdgpu_ring_write(kiq_ring, upper_32_bits(mqd_addr));
	amdgpu_ring_write(kiq_ring, lower_32_bits(wptr_addr));
	amdgpu_ring_write(kiq_ring, upper_32_bits(wptr_addr));
}

static void gfx_v12_1_kiq_unmap_queues(struct amdgpu_ring *kiq_ring,
				       struct amdgpu_ring *ring,
				       enum amdgpu_unmap_queues_action action,
				       u64 gpu_addr, u64 seq)
{
	struct amdgpu_device *adev = kiq_ring->adev;
	uint32_t eng_sel = ring->funcs->type == AMDGPU_RING_TYPE_GFX ? 4 : 0;

	if (adev->enable_mes && !adev->gfx.kiq[0].ring.sched.ready) {
		amdgpu_mes_unmap_legacy_queue(adev, ring, action, gpu_addr,
					      seq, kiq_ring->xcc_id);
		return;
	}

	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_UNMAP_QUEUES, 4));
	amdgpu_ring_write(kiq_ring, /* action, queue_sel, eng_sel, num_queues */
			  PACKET3_UNMAP_QUEUES_ACTION(action) |
			  PACKET3_UNMAP_QUEUES_QUEUE_SEL(0) |
			  PACKET3_UNMAP_QUEUES_ENGINE_SEL(eng_sel) |
			  PACKET3_UNMAP_QUEUES_NUM_QUEUES(1));
	amdgpu_ring_write(kiq_ring,
		  PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(ring->doorbell_index));

	if (action == PREEMPT_QUEUES_NO_UNMAP) {
		amdgpu_ring_write(kiq_ring, lower_32_bits(gpu_addr));
		amdgpu_ring_write(kiq_ring, upper_32_bits(gpu_addr));
		amdgpu_ring_write(kiq_ring, seq);
	} else {
		amdgpu_ring_write(kiq_ring, 0);
		amdgpu_ring_write(kiq_ring, 0);
		amdgpu_ring_write(kiq_ring, 0);
	}
}

static void gfx_v12_1_kiq_query_status(struct amdgpu_ring *kiq_ring,
				       struct amdgpu_ring *ring,
				       u64 addr, u64 seq)
{
	uint32_t eng_sel = ring->funcs->type == AMDGPU_RING_TYPE_GFX ? 4 : 0;

	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_QUERY_STATUS, 5));
	amdgpu_ring_write(kiq_ring,
			  PACKET3_QUERY_STATUS_CONTEXT_ID(0) |
			  PACKET3_QUERY_STATUS_INTERRUPT_SEL(0) |
			  PACKET3_QUERY_STATUS_COMMAND(2));
	amdgpu_ring_write(kiq_ring, /* doorbell offset and engine select */
			  PACKET3_QUERY_STATUS_DOORBELL_OFFSET(ring->doorbell_index) |
			  PACKET3_QUERY_STATUS_ENG_SEL(eng_sel));
	amdgpu_ring_write(kiq_ring, lower_32_bits(addr));
	amdgpu_ring_write(kiq_ring, upper_32_bits(addr));
	amdgpu_ring_write(kiq_ring, lower_32_bits(seq));
	amdgpu_ring_write(kiq_ring, upper_32_bits(seq));
}

static void gfx_v12_1_kiq_invalidate_tlbs(struct amdgpu_ring *kiq_ring,
					  uint16_t pasid,
					  uint32_t flush_type,
					  bool all_hub)
{
	gfx_v12_1_ring_invalidate_tlbs(kiq_ring, pasid, flush_type, all_hub, 1);
}

static const struct kiq_pm4_funcs gfx_v12_1_kiq_pm4_funcs = {
	.kiq_set_resources = gfx_v12_1_kiq_set_resources,
	.kiq_map_queues = gfx_v12_1_kiq_map_queues,
	.kiq_unmap_queues = gfx_v12_1_kiq_unmap_queues,
	.kiq_query_status = gfx_v12_1_kiq_query_status,
	.kiq_invalidate_tlbs = gfx_v12_1_kiq_invalidate_tlbs,
	.set_resources_size = 8,
	.map_queues_size = 7,
	.unmap_queues_size = 6,
	.query_status_size = 7,
	.invalidate_tlbs_size = 2,
};

static void gfx_v12_1_set_kiq_pm4_funcs(struct amdgpu_device *adev)
{
	int i, num_xcc;

	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
	for (i = 0; i < num_xcc; i++)
		adev->gfx.kiq[i].pmf = &gfx_v12_1_kiq_pm4_funcs;
}
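
/*
 * Emit a WAIT_REG_MEM packet that polls either a register (mem_space == 0)
 * or a memory location (mem_space == 1) until (value & mask) == ref
 * (function 3, "equal").  Register offsets are normalized to be local to
 * the XCC before they are emitted.
 */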
static void gfx_v12_1_wait_reg_mem(struct amdgpu_ring *ring, int eng_sel,
				   int mem_space, int opt, uint32_t addr0,
				   uint32_t addr1, uint32_t ref,
				   uint32_t mask, uint32_t inv)
{
	if (mem_space == 0) {
		addr0 = soc_v1_0_normalize_xcc_reg_offset(addr0);
		addr1 = soc_v1_0_normalize_xcc_reg_offset(addr1);
	}

	amdgpu_ring_write(ring, PACKET3(PACKET3_WAIT_REG_MEM, 5));
	amdgpu_ring_write(ring,
			  /* memory (1) or register (0) */
			  (WAIT_REG_MEM_MEM_SPACE(mem_space) |
			   WAIT_REG_MEM_OPERATION(opt) | /* wait */
			   WAIT_REG_MEM_FUNCTION(3) |  /* equal */
			   WAIT_REG_MEM_ENGINE(eng_sel)));

	if (mem_space)
		BUG_ON(addr0 & 0x3); /* Dword align */
	amdgpu_ring_write(ring, addr0);
	amdgpu_ring_write(ring, addr1);
	amdgpu_ring_write(ring, ref);
	amdgpu_ring_write(ring, mask);
	amdgpu_ring_write(ring, inv); /* poll interval */
}
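
/*
 * Basic ring liveness test: seed SCRATCH_REG0 with 0xCAFEDEAD, then ask
 * the ring to rewrite it to 0xDEADBEEF (via emit_wreg on the KIQ, or a
 * SET_UCONFIG_REG packet on other ring types) and poll until the new
 * value shows up or the timeout expires.
 */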
static int gfx_v12_1_ring_test_ring(struct amdgpu_ring *ring)
{
	struct amdgpu_device *adev = ring->adev;
	uint32_t scratch_reg0_offset, xcc_offset;
	uint32_t tmp = 0;
	unsigned int i;
	int r;

	/* Use register offset which is local to XCC in the packet */
	xcc_offset = SOC15_REG_OFFSET(GC, 0, regSCRATCH_REG0);
	scratch_reg0_offset = SOC15_REG_OFFSET(GC, GET_INST(GC, ring->xcc_id), regSCRATCH_REG0);
	WREG32(scratch_reg0_offset, 0xCAFEDEAD);
	tmp = RREG32(scratch_reg0_offset);

	r = amdgpu_ring_alloc(ring, 5);
	if (r) {
		dev_err(adev->dev,
			"amdgpu: cp failed to lock ring %d (%d).\n",
			ring->idx, r);
		return r;
	}

	if (ring->funcs->type == AMDGPU_RING_TYPE_KIQ) {
		gfx_v12_1_ring_emit_wreg(ring, xcc_offset, 0xDEADBEEF);
	} else {
		amdgpu_ring_write(ring, PACKET3(PACKET3_SET_UCONFIG_REG, 1));
		amdgpu_ring_write(ring, xcc_offset -
				  PACKET3_SET_UCONFIG_REG_START);
		amdgpu_ring_write(ring, 0xDEADBEEF);
	}
	amdgpu_ring_commit(ring);

	for (i = 0; i < adev->usec_timeout; i++) {
		tmp = RREG32(scratch_reg0_offset);
		if (tmp == 0xDEADBEEF)
			break;
		if (amdgpu_emu_mode == 1)
			msleep(1);
		else
			udelay(1);
	}

	if (i >= adev->usec_timeout)
		r = -ETIMEDOUT;
	return r;
}
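
/*
 * IB liveness test: schedule a small indirect buffer whose single
 * WRITE_DATA packet flips a writeback slot from 0xCAFEDEAD to 0xDEADBEEF,
 * then wait on the fence and verify the value landed in memory.
 */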
static int gfx_v12_1_ring_test_ib(struct amdgpu_ring *ring, long timeout)
{
	struct amdgpu_device *adev = ring->adev;
	struct amdgpu_ib ib;
	struct dma_fence *f = NULL;
	unsigned int index;
	uint64_t gpu_addr;
	volatile uint32_t *cpu_ptr;
	long r;

	/* The MES KIQ firmware doesn't support indirect buffers yet */
	if (adev->enable_mes_kiq &&
	    ring->funcs->type == AMDGPU_RING_TYPE_KIQ)
		return 0;

	memset(&ib, 0, sizeof(ib));

	r = amdgpu_device_wb_get(adev, &index);
	if (r)
		return r;

	gpu_addr = adev->wb.gpu_addr + (index * 4);
	adev->wb.wb[index] = cpu_to_le32(0xCAFEDEAD);
	cpu_ptr = &adev->wb.wb[index];

	r = amdgpu_ib_get(adev, NULL, 16, AMDGPU_IB_POOL_DIRECT, &ib);
	if (r) {
		dev_err(adev->dev, "amdgpu: failed to get ib (%ld).\n", r);
		goto err1;
	}

	ib.ptr[0] = PACKET3(PACKET3_WRITE_DATA, 3);
	ib.ptr[1] = WRITE_DATA_DST_SEL(5) | WR_CONFIRM;
	ib.ptr[2] = lower_32_bits(gpu_addr);
	ib.ptr[3] = upper_32_bits(gpu_addr);
	ib.ptr[4] = 0xDEADBEEF;
	ib.length_dw = 5;

	r = amdgpu_ib_schedule(ring, 1, &ib, NULL, &f);
	if (r)
		goto err2;

	r = dma_fence_wait_timeout(f, false, timeout);
	if (r == 0) {
		r = -ETIMEDOUT;
		goto err2;
	} else if (r < 0) {
		goto err2;
	}

	if (le32_to_cpu(*cpu_ptr) == 0xDEADBEEF)
		r = 0;
	else
		r = -EINVAL;
err2:
	amdgpu_ib_free(&ib, NULL);
	dma_fence_put(f);
err1:
	amdgpu_device_wb_free(adev, index);
	return r;
}

static void gfx_v12_1_free_microcode(struct amdgpu_device *adev)
{
	amdgpu_ucode_release(&adev->gfx.rlc_fw);
	amdgpu_ucode_release(&adev->gfx.mec_fw);

	kfree(adev->gfx.rlc.register_list_format);
}

static int gfx_v12_1_init_toc_microcode(struct amdgpu_device *adev, const char *ucode_prefix)
{
	const struct psp_firmware_header_v1_0 *toc_hdr;
	int err = 0;

	err = amdgpu_ucode_request(adev, &adev->psp.toc_fw,
				   AMDGPU_UCODE_REQUIRED,
				   "amdgpu/%s_toc.bin", ucode_prefix);
	if (err)
		goto out;

	toc_hdr = (const struct psp_firmware_header_v1_0 *)adev->psp.toc_fw->data;
	adev->psp.toc.fw_version = le32_to_cpu(toc_hdr->header.ucode_version);
	adev->psp.toc.feature_version = le32_to_cpu(toc_hdr->sos.fw_version);
	adev->psp.toc.size_bytes = le32_to_cpu(toc_hdr->header.ucode_size_bytes);
	adev->psp.toc.start_addr = (uint8_t *)toc_hdr +
			le32_to_cpu(toc_hdr->header.ucode_array_offset_bytes);
	return 0;
out:
	amdgpu_ucode_release(&adev->psp.toc_fw);
	return err;
}

static int gfx_v12_1_init_microcode(struct amdgpu_device *adev)
{
	char ucode_prefix[15];
	int err;
	const struct rlc_firmware_header_v2_0 *rlc_hdr;
	uint16_t version_major;
	uint16_t version_minor;

	DRM_DEBUG("\n");

	amdgpu_ucode_ip_version_decode(adev, GC_HWIP, ucode_prefix, sizeof(ucode_prefix));

	if (!amdgpu_sriov_vf(adev)) {
		err = amdgpu_ucode_request(adev, &adev->gfx.rlc_fw,
					   AMDGPU_UCODE_REQUIRED,
					   "amdgpu/%s_rlc.bin", ucode_prefix);
		if (err)
			goto out;
		rlc_hdr = (const struct rlc_firmware_header_v2_0 *)adev->gfx.rlc_fw->data;
		version_major = le16_to_cpu(rlc_hdr->header.header_version_major);
		version_minor = le16_to_cpu(rlc_hdr->header.header_version_minor);
		err = amdgpu_gfx_rlc_init_microcode(adev, version_major, version_minor);
		if (err)
			goto out;
	}

	err = amdgpu_ucode_request(adev, &adev->gfx.mec_fw,
				   AMDGPU_UCODE_REQUIRED,
				   "amdgpu/%s_mec.bin", ucode_prefix);
	if (err)
		goto out;
	amdgpu_gfx_cp_init_microcode(adev, AMDGPU_UCODE_ID_CP_RS64_MEC);
	amdgpu_gfx_cp_init_microcode(adev, AMDGPU_UCODE_ID_CP_RS64_MEC_P0_STACK);
	amdgpu_gfx_cp_init_microcode(adev, AMDGPU_UCODE_ID_CP_RS64_MEC_P1_STACK);
	amdgpu_gfx_cp_init_microcode(adev, AMDGPU_UCODE_ID_CP_RS64_MEC_P2_STACK);
	amdgpu_gfx_cp_init_microcode(adev, AMDGPU_UCODE_ID_CP_RS64_MEC_P3_STACK);

	if (adev->firmware.load_type == AMDGPU_FW_LOAD_RLC_BACKDOOR_AUTO) {
		err = gfx_v12_1_init_toc_microcode(adev, ucode_prefix);
		if (err)
			goto out;
	}

	/* only one MEC for gfx 12 */
	adev->gfx.mec2_fw = NULL;

	if (adev->gfx.imu.funcs) {
		if (adev->gfx.imu.funcs->init_microcode) {
			err = adev->gfx.imu.funcs->init_microcode(adev);
			if (err)
				dev_err(adev->dev, "Failed to load imu firmware!\n");
		}
	}

out:
	if (err) {
		amdgpu_ucode_release(&adev->gfx.rlc_fw);
		amdgpu_ucode_release(&adev->gfx.mec_fw);
	}

	return err;
}

static u32 gfx_v12_1_get_csb_size(struct amdgpu_device *adev)
{
	u32 count = 0;
	const struct cs_section_def *sect = NULL;
	const struct cs_extent_def *ext = NULL;

	/* one dword for the cluster count written at buffer[0] */
	count += 1;

	for (sect = gfx12_cs_data; sect->section != NULL; ++sect) {
		if (sect->id == SECT_CONTEXT) {
			for (ext = sect->section; ext->extent != NULL; ++ext)
				count += 2 + ext->reg_count;
		} else {
			return 0;
		}
	}

	return count;
}

static void gfx_v12_1_get_csb_buffer(struct amdgpu_device *adev, u32 *buffer)
{
	u32 count = 0, clustercount = 0, i;
	const struct cs_section_def *sect = NULL;
	const struct cs_extent_def *ext = NULL;

	if (adev->gfx.rlc.cs_data == NULL || buffer == NULL)
		return;

	/* reserve buffer[0] for the cluster count */
	count += 1;

	for (sect = adev->gfx.rlc.cs_data; sect->section != NULL; ++sect) {
		if (sect->id == SECT_CONTEXT) {
			for (ext = sect->section; ext->extent != NULL; ++ext) {
				clustercount++;
				buffer[count++] = ext->reg_count;
				buffer[count++] = ext->reg_index;

				for (i = 0; i < ext->reg_count; i++)
					buffer[count++] = cpu_to_le32(ext->extent[i]);
			}
		} else {
			return;
		}
	}

	buffer[0] = clustercount;
}

static void gfx_v12_1_rlc_fini(struct amdgpu_device *adev)
{
	/* clear state block */
	amdgpu_bo_free_kernel(&adev->gfx.rlc.clear_state_obj,
			&adev->gfx.rlc.clear_state_gpu_addr,
			(void **)&adev->gfx.rlc.cs_ptr);

	/* jump table block */
	amdgpu_bo_free_kernel(&adev->gfx.rlc.cp_table_obj,
			&adev->gfx.rlc.cp_table_gpu_addr,
			(void **)&adev->gfx.rlc.cp_table_ptr);
}
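
/*
 * Record, per XCC, the scratch/GRBM/spare-interrupt register offsets used
 * by the RLCG indirect register access path, for cases where MMIO writes
 * must be proxied through the RLC (e.g. under SR-IOV).
 */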
static void gfx_v12_1_init_rlcg_reg_access_ctrl(struct amdgpu_device *adev)
{
	int xcc_id, num_xcc;
	struct amdgpu_rlcg_reg_access_ctrl *reg_access_ctrl;

	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
	for (xcc_id = 0; xcc_id < num_xcc; xcc_id++) {
		reg_access_ctrl = &adev->gfx.rlc.reg_access_ctrl[GET_INST(GC, xcc_id)];
		reg_access_ctrl->scratch_reg0 =
			SOC15_REG_OFFSET(GC, GET_INST(GC, xcc_id), regSCRATCH_REG0);
		reg_access_ctrl->scratch_reg1 =
			SOC15_REG_OFFSET(GC, GET_INST(GC, xcc_id), regSCRATCH_REG1);
		reg_access_ctrl->scratch_reg2 =
			SOC15_REG_OFFSET(GC, GET_INST(GC, xcc_id), regSCRATCH_REG2);
		reg_access_ctrl->scratch_reg3 =
			SOC15_REG_OFFSET(GC, GET_INST(GC, xcc_id), regSCRATCH_REG3);
		reg_access_ctrl->grbm_cntl =
			SOC15_REG_OFFSET(GC, GET_INST(GC, xcc_id), regGRBM_GFX_CNTL);
		reg_access_ctrl->grbm_idx =
			SOC15_REG_OFFSET(GC, GET_INST(GC, xcc_id), regGRBM_GFX_INDEX);
		reg_access_ctrl->spare_int =
			SOC15_REG_OFFSET(GC, GET_INST(GC, xcc_id), regRLC_SPARE_INT_0);
	}
	adev->gfx.rlc.rlcg_reg_access_supported = true;
}

static int gfx_v12_1_rlc_init(struct amdgpu_device *adev)
{
	const struct cs_section_def *cs_data;
	int r, i, num_xcc;

	adev->gfx.rlc.cs_data = gfx12_cs_data;

	cs_data = adev->gfx.rlc.cs_data;

	if (cs_data) {
		/* init clear state block */
		r = amdgpu_gfx_rlc_init_csb(adev);
		if (r)
			return r;
	}

	/* init spm vmid with 0xf */
	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
	for (i = 0; i < num_xcc; i++) {
		if (adev->gfx.rlc.funcs->update_spm_vmid)
			adev->gfx.rlc.funcs->update_spm_vmid(adev, i, NULL, 0xf);
	}

	return 0;
}

static void gfx_v12_1_mec_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->gfx.mec.hpd_eop_obj, NULL, NULL);
	amdgpu_bo_free_kernel(&adev->gfx.mec.mec_fw_obj, NULL, NULL);
	amdgpu_bo_free_kernel(&adev->gfx.mec.mec_fw_data_obj, NULL, NULL);
}

static int gfx_v12_1_mec_init(struct amdgpu_device *adev)
{
	int r, i, num_xcc;
	u32 *hpd;
	size_t mec_hpd_size;

	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
	for (i = 0; i < num_xcc; i++)
		bitmap_zero(adev->gfx.mec_bitmap[i].queue_bitmap,
			    AMDGPU_MAX_COMPUTE_QUEUES);

	/* take ownership of the relevant compute queues */
	amdgpu_gfx_compute_queue_acquire(adev);
	mec_hpd_size = adev->gfx.num_compute_rings *
		       GFX12_MEC_HPD_SIZE * num_xcc;

	if (mec_hpd_size) {
		r = amdgpu_bo_create_reserved(adev, mec_hpd_size, PAGE_SIZE,
					      AMDGPU_GEM_DOMAIN_GTT,
					      &adev->gfx.mec.hpd_eop_obj,
					      &adev->gfx.mec.hpd_eop_gpu_addr,
					      (void **)&hpd);
		if (r) {
			dev_warn(adev->dev, "(%d) create HPD EOP bo failed\n", r);
			gfx_v12_1_mec_fini(adev);
			return r;
		}

		memset(hpd, 0, mec_hpd_size);

		amdgpu_bo_kunmap(adev->gfx.mec.hpd_eop_obj);
		amdgpu_bo_unreserve(adev->gfx.mec.hpd_eop_obj);
	}

	return 0;
}
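
/*
 * Wave-state readback goes through the SQ indirect register pair:
 * program SQ_IND_INDEX with the wave (and optionally thread) ID plus a
 * starting index, then read SQ_IND_DATA, which auto-increments through
 * consecutive entries when AUTO_INCR is set.
 */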
static uint32_t wave_read_ind(struct amdgpu_device *adev,
			      uint32_t xcc_id, uint32_t wave,
			      uint32_t address)
{
	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regSQ_IND_INDEX,
		(wave << SQ_IND_INDEX__WAVE_ID__SHIFT) |
		(address << SQ_IND_INDEX__INDEX__SHIFT));
	return RREG32_SOC15(GC, GET_INST(GC, xcc_id), regSQ_IND_DATA);
}

static void wave_read_regs(struct amdgpu_device *adev,
			   uint32_t xcc_id, uint32_t wave,
			   uint32_t thread, uint32_t regno,
			   uint32_t num, uint32_t *out)
{
	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regSQ_IND_INDEX,
		(wave << SQ_IND_INDEX__WAVE_ID__SHIFT) |
		(regno << SQ_IND_INDEX__INDEX__SHIFT) |
		(thread << SQ_IND_INDEX__WORKITEM_ID__SHIFT) |
		(SQ_IND_INDEX__AUTO_INCR_MASK));
	while (num--)
		*(out++) = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regSQ_IND_DATA);
}

static void gfx_v12_1_read_wave_data(struct amdgpu_device *adev,
				     uint32_t xcc_id,
				     uint32_t simd, uint32_t wave,
				     uint32_t *dst, int *no_fields)
{
	/* in gfx12 the SIMD_ID is specified as part of the INSTANCE
	 * field when performing a select_se_sh so it should be
	 * zero here */
	WARN_ON(simd != 0);

	/* type 4 wave data */
	dst[(*no_fields)++] = 4;
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_STATUS);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_PC_LO);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_PC_HI);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_EXEC_LO);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_EXEC_HI);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_HW_ID1);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_HW_ID2);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_GPR_ALLOC);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_LDS_ALLOC);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_IB_STS);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_IB_STS2);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_IB_DBG1);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_M0);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_MODE);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_STATE_PRIV);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_EXCP_FLAG_PRIV);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_EXCP_FLAG_USER);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_TRAP_CTRL);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_ACTIVE);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_VALID_AND_IDLE);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_DVGPR_ALLOC_LO);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_DVGPR_ALLOC_HI);
	dst[(*no_fields)++] = wave_read_ind(adev, xcc_id, wave, ixSQ_WAVE_SCHED_MODE);
}

static void gfx_v12_1_read_wave_sgprs(struct amdgpu_device *adev,
				      uint32_t xcc_id, uint32_t simd,
				      uint32_t wave, uint32_t start,
				      uint32_t size, uint32_t *dst)
{
	WARN_ON(simd != 0);

	wave_read_regs(adev, xcc_id, wave, 0,
		       start + SQIND_WAVE_SGPRS_OFFSET,
		       size, dst);
}

static void gfx_v12_1_read_wave_vgprs(struct amdgpu_device *adev,
				      uint32_t xcc_id, uint32_t simd,
				      uint32_t wave, uint32_t thread,
				      uint32_t start, uint32_t size,
				      uint32_t *dst)
{
	wave_read_regs(adev, xcc_id, wave, thread,
		       start + SQIND_WAVE_VGPRS_OFFSET,
		       size, dst);
}

static void gfx_v12_1_select_me_pipe_q(struct amdgpu_device *adev,
				       u32 me, u32 pipe, u32 q, u32 vm, u32 xcc_id)
{
	soc_v1_0_grbm_select(adev, me, pipe, q, vm, GET_INST(GC, xcc_id));
}

static int gfx_v12_1_get_xccs_per_xcp(struct amdgpu_device *adev)
{
	/* Fill this in when the interface is ready */
	return 1;
}
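
/*
 * Map an IH node id back to a logical XCC.  The low three bits encode the
 * node within a group of four starting at 2 and the upper bits select the
 * group, so e.g. ih_node 2..5 -> physical xcc 0..3 and ih_node 10..13 ->
 * physical xcc 4..7; the loop then converts the physical id back to a
 * logical instance.
 */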
static int gfx_v12_1_ih_to_xcc_inst(struct amdgpu_device *adev, int ih_node)
{
	int logic_xcc;
	int xcc = (ih_node & 0x7) - 2 + (ih_node >> 3) * 4;

	for (logic_xcc = 0; logic_xcc < NUM_XCC(adev->gfx.xcc_mask); logic_xcc++) {
		if (xcc == GET_INST(GC, logic_xcc))
			return logic_xcc;
	}

	dev_err(adev->dev, "Couldn't find xcc mapping from IH node\n");
	return -EINVAL;
}

static const struct amdgpu_gfx_funcs gfx_v12_1_gfx_funcs = {
	.get_gpu_clock_counter = &gfx_v12_1_get_gpu_clock_counter,
	.select_se_sh = &gfx_v12_1_xcc_select_se_sh,
	.read_wave_data = &gfx_v12_1_read_wave_data,
	.read_wave_sgprs = &gfx_v12_1_read_wave_sgprs,
	.read_wave_vgprs = &gfx_v12_1_read_wave_vgprs,
	.select_me_pipe_q = &gfx_v12_1_select_me_pipe_q,
	.update_perfmon_mgcg = &gfx_v12_1_update_perf_clk,
	.get_xccs_per_xcp = &gfx_v12_1_get_xccs_per_xcp,
	.ih_node_to_logical_xcc = &gfx_v12_1_ih_to_xcc_inst,
};

static int gfx_v12_1_gpu_early_init(struct amdgpu_device *adev)
{
	switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
	case IP_VERSION(12, 1, 0):
		adev->gfx.config.max_hw_contexts = 8;
		adev->gfx.config.sc_prim_fifo_size_frontend = 0x20;
		adev->gfx.config.sc_prim_fifo_size_backend = 0x100;
		adev->gfx.config.sc_hiz_tile_fifo_size = 0;
		adev->gfx.config.sc_earlyz_tile_fifo_size = 0x4C0;
		break;
	default:
		BUG();
		break;
	}

	return 0;
}

static int gfx_v12_1_compute_ring_init(struct amdgpu_device *adev, int ring_id,
				       int xcc_id, int mec, int pipe, int queue)
{
	int r;
	unsigned int irq_type;
	struct amdgpu_ring *ring;
	unsigned int hw_prio;
	uint32_t xcc_doorbell_start;

	ring = &adev->gfx.compute_ring[xcc_id * adev->gfx.num_compute_rings +
				       ring_id];

	/* mec0 is me1 */
	ring->xcc_id = xcc_id;
	ring->me = mec + 1;
	ring->pipe = pipe;
	ring->queue = queue;

	ring->ring_obj = NULL;
	ring->use_doorbell = true;
	xcc_doorbell_start = adev->doorbell_index.mec_ring0 +
			     xcc_id * adev->doorbell_index.xcc_doorbell_range;
	ring->doorbell_index = (xcc_doorbell_start + ring_id) << 1;
	ring->eop_gpu_addr = adev->gfx.mec.hpd_eop_gpu_addr +
			     (ring_id + xcc_id * adev->gfx.num_compute_rings) *
			     GFX12_MEC_HPD_SIZE;
	ring->vm_hub = AMDGPU_GFXHUB(xcc_id);
	sprintf(ring->name, "comp_%d.%d.%d.%d",
			ring->xcc_id, ring->me, ring->pipe, ring->queue);

	irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP
		+ ((ring->me - 1) * adev->gfx.mec.num_pipe_per_mec)
		+ ring->pipe;
	hw_prio = amdgpu_gfx_is_high_priority_compute_queue(adev, ring) ?
			AMDGPU_GFX_PIPE_PRIO_HIGH : AMDGPU_GFX_PIPE_PRIO_NORMAL;
	/* type-2 packets are deprecated on MEC, use type-3 instead */
	r = amdgpu_ring_init(adev, ring, 1024, &adev->gfx.eop_irq, irq_type,
			     hw_prio, NULL);
	if (r)
		return r;

	return 0;
}

static struct {
	SOC24_FIRMWARE_ID	id;
	unsigned int		offset;
	unsigned int		size;
	unsigned int		size_x16;
	unsigned int		num_inst;
} rlc_autoload_info[SOC24_FIRMWARE_ID_MAX];

#define RLC_TOC_OFFSET_DWUNIT   8
#define RLC_SIZE_MULTIPLE       1024
#define RLC_TOC_UMF_SIZE_inM	23ULL
#define RLC_TOC_FORMAT_API	165ULL

#define RLC_NUM_INS_CODE0   1
#define RLC_NUM_INS_CODE1   8
#define RLC_NUM_INS_CODE2   2
#define RLC_NUM_INS_CODE3   16
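
/*
 * Walk the RLC table of contents and cache each firmware image's offset
 * and size in bytes.  TOC offsets are stored in 8-dword units; sizes are
 * in dwords, or in 1024-dword multiples when size_x16 is set.  The
 * vfflr_image_code selects how many instances of the image exist.
 */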
static void gfx_v12_1_parse_rlc_toc(struct amdgpu_device *adev, void *rlc_toc)
{
	RLC_TABLE_OF_CONTENT_V2 *ucode = rlc_toc;

	while (ucode && (ucode->id > SOC24_FIRMWARE_ID_INVALID)) {
		rlc_autoload_info[ucode->id].id = ucode->id;
		rlc_autoload_info[ucode->id].offset =
			ucode->offset * RLC_TOC_OFFSET_DWUNIT * 4;
		rlc_autoload_info[ucode->id].size =
			ucode->size_x16 ? ucode->size * RLC_SIZE_MULTIPLE * 4 :
					  ucode->size * 4;
		switch (ucode->vfflr_image_code) {
		case 0:
			rlc_autoload_info[ucode->id].num_inst =
				RLC_NUM_INS_CODE0;
			break;
		case 1:
			rlc_autoload_info[ucode->id].num_inst =
				RLC_NUM_INS_CODE1;
			break;
		case 2:
			rlc_autoload_info[ucode->id].num_inst =
				RLC_NUM_INS_CODE2;
			break;
		case 3:
			rlc_autoload_info[ucode->id].num_inst =
				RLC_NUM_INS_CODE3;
			break;
		default:
			dev_err(adev->dev,
				"Invalid instance number detected\n");
			break;
		}
		ucode++;
	}
}

static uint32_t gfx_v12_1_calc_toc_total_size(struct amdgpu_device *adev)
{
	uint32_t total_size = 0;
	SOC24_FIRMWARE_ID id;

	gfx_v12_1_parse_rlc_toc(adev, adev->psp.toc.start_addr);

	for (id = SOC24_FIRMWARE_ID_RLC_G_UCODE; id < SOC24_FIRMWARE_ID_MAX; id++)
		total_size += rlc_autoload_info[id].size;

	/* In case the offset in rlc toc ucode is aligned */
	if (total_size < rlc_autoload_info[SOC24_FIRMWARE_ID_MAX - 1].offset)
		total_size = rlc_autoload_info[SOC24_FIRMWARE_ID_MAX - 1].offset +
			rlc_autoload_info[SOC24_FIRMWARE_ID_MAX - 1].size;
	if (total_size < (RLC_TOC_UMF_SIZE_inM << 20))
		total_size = RLC_TOC_UMF_SIZE_inM << 20;

	return total_size;
}

static int gfx_v12_1_rlc_autoload_buffer_init(struct amdgpu_device *adev)
{
	int r;
	uint32_t total_size;

	total_size = gfx_v12_1_calc_toc_total_size(adev);

	r = amdgpu_bo_create_reserved(adev, total_size, 64 * 1024,
				      AMDGPU_GEM_DOMAIN_VRAM,
				      &adev->gfx.rlc.rlc_autoload_bo,
				      &adev->gfx.rlc.rlc_autoload_gpu_addr,
				      (void **)&adev->gfx.rlc.rlc_autoload_ptr);

	if (r) {
		dev_err(adev->dev, "(%d) failed to create fw autoload bo\n", r);
		return r;
	}

	return 0;
}
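
/*
 * Copy one firmware image into its TOC-assigned slot in the autoload
 * buffer.  Multi-instance images are replicated once per instance, but
 * only where the corresponding XCC (two instances per XCC) is present in
 * xcc_mask; any tail of the slot the image doesn't fill is zeroed.
 */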
static void gfx_v12_1_rlc_backdoor_autoload_copy_ucode(struct amdgpu_device *adev,
						       SOC24_FIRMWARE_ID id,
						       const void *fw_data,
						       uint32_t fw_size)
{
	uint32_t toc_offset;
	uint32_t toc_fw_size, toc_fw_inst_size;
	char *ptr = adev->gfx.rlc.rlc_autoload_ptr;
	int i, num_inst;

	if (id <= SOC24_FIRMWARE_ID_INVALID || id >= SOC24_FIRMWARE_ID_MAX)
		return;

	toc_offset = rlc_autoload_info[id].offset;
	toc_fw_size = rlc_autoload_info[id].size;
	num_inst = rlc_autoload_info[id].num_inst;
	toc_fw_inst_size = toc_fw_size / num_inst;

	if (fw_size == 0)
		fw_size = toc_fw_inst_size;

	if (fw_size > toc_fw_inst_size)
		fw_size = toc_fw_inst_size;

	for (i = 0; i < num_inst; i++) {
		if ((num_inst == RLC_NUM_INS_CODE0) ||
		    ((1 << (i / 2)) & adev->gfx.xcc_mask)) {
			memcpy(ptr + toc_offset + i * toc_fw_inst_size, fw_data, fw_size);

			if (fw_size < toc_fw_inst_size)
				memset(ptr + toc_offset + fw_size + i * toc_fw_inst_size,
				       0, toc_fw_inst_size - fw_size);
		}
	}
}

static void
gfx_v12_1_rlc_backdoor_autoload_copy_toc_ucode(struct amdgpu_device *adev)
{
	void *data;
	uint32_t size;
	uint32_t *toc_ptr;

	data = adev->psp.toc.start_addr;
	size = rlc_autoload_info[SOC24_FIRMWARE_ID_RLC_TOC].size;

	toc_ptr = (uint32_t *)data + size / 4 - 2;
	*toc_ptr = (RLC_TOC_FORMAT_API << 24) | 0x1;

	gfx_v12_1_rlc_backdoor_autoload_copy_ucode(adev, SOC24_FIRMWARE_ID_RLC_TOC,
						   data, size);
}

static void
gfx_v12_1_rlc_backdoor_autoload_copy_gfx_ucode(struct amdgpu_device *adev)
{
	const __le32 *fw_data;
	uint32_t fw_size;
	const struct gfx_firmware_header_v2_0 *cpv2_hdr;
	const struct rlc_firmware_header_v2_0 *rlc_hdr;
	const struct rlc_firmware_header_v2_1 *rlcv21_hdr;
	const struct rlc_firmware_header_v2_2 *rlcv22_hdr;
	uint16_t version_major, version_minor;

	/* mec ucode */
	cpv2_hdr = (const struct gfx_firmware_header_v2_0 *)
		adev->gfx.mec_fw->data;
	/* instruction */
	fw_data = (const __le32 *)(adev->gfx.mec_fw->data +
		le32_to_cpu(cpv2_hdr->ucode_offset_bytes));
	fw_size = le32_to_cpu(cpv2_hdr->ucode_size_bytes);
	gfx_v12_1_rlc_backdoor_autoload_copy_ucode(adev, SOC24_FIRMWARE_ID_RS64_MEC,
						   fw_data, fw_size);
	/* data */
	fw_data = (const __le32 *)(adev->gfx.mec_fw->data +
		le32_to_cpu(cpv2_hdr->data_offset_bytes));
	fw_size = le32_to_cpu(cpv2_hdr->data_size_bytes);
	gfx_v12_1_rlc_backdoor_autoload_copy_ucode(adev, SOC24_FIRMWARE_ID_RS64_MEC_P0_STACK,
						   fw_data, fw_size);
	gfx_v12_1_rlc_backdoor_autoload_copy_ucode(adev, SOC24_FIRMWARE_ID_RS64_MEC_P1_STACK,
						   fw_data, fw_size);
	gfx_v12_1_rlc_backdoor_autoload_copy_ucode(adev, SOC24_FIRMWARE_ID_RS64_MEC_P2_STACK,
						   fw_data, fw_size);
	gfx_v12_1_rlc_backdoor_autoload_copy_ucode(adev, SOC24_FIRMWARE_ID_RS64_MEC_P3_STACK,
						   fw_data, fw_size);

	/* rlc ucode */
	rlc_hdr = (const struct rlc_firmware_header_v2_0 *)
		adev->gfx.rlc_fw->data;
	fw_data = (const __le32 *)(adev->gfx.rlc_fw->data +
			le32_to_cpu(rlc_hdr->header.ucode_array_offset_bytes));
	fw_size = le32_to_cpu(rlc_hdr->header.ucode_size_bytes);
	gfx_v12_1_rlc_backdoor_autoload_copy_ucode(adev, SOC24_FIRMWARE_ID_RLC_G_UCODE,
						   fw_data, fw_size);

	version_major = le16_to_cpu(rlc_hdr->header.header_version_major);
	version_minor = le16_to_cpu(rlc_hdr->header.header_version_minor);
	if (version_major == 2) {
		if (version_minor >= 1) {
			rlcv21_hdr = (const struct rlc_firmware_header_v2_1 *)adev->gfx.rlc_fw->data;

			fw_data = (const __le32 *)(adev->gfx.rlc_fw->data +
					le32_to_cpu(rlcv21_hdr->save_restore_list_gpm_offset_bytes));
			fw_size = le32_to_cpu(rlcv21_hdr->save_restore_list_gpm_size_bytes);
			gfx_v12_1_rlc_backdoor_autoload_copy_ucode(adev, SOC24_FIRMWARE_ID_RLCG_SCRATCH,
								   fw_data, fw_size);

			fw_data = (const __le32 *)(adev->gfx.rlc_fw->data +
					le32_to_cpu(rlcv21_hdr->save_restore_list_srm_offset_bytes));
			fw_size = le32_to_cpu(rlcv21_hdr->save_restore_list_srm_size_bytes);
			gfx_v12_1_rlc_backdoor_autoload_copy_ucode(adev, SOC24_FIRMWARE_ID_RLC_SRM_ARAM,
								   fw_data, fw_size);
		}
		if (version_minor >= 2) {
			rlcv22_hdr = (const struct rlc_firmware_header_v2_2 *)adev->gfx.rlc_fw->data;

			fw_data = (const __le32 *)(adev->gfx.rlc_fw->data +
					le32_to_cpu(rlcv22_hdr->rlc_iram_ucode_offset_bytes));
			fw_size = le32_to_cpu(rlcv22_hdr->rlc_iram_ucode_size_bytes);
			gfx_v12_1_rlc_backdoor_autoload_copy_ucode(adev, SOC24_FIRMWARE_ID_RLX6_UCODE,
								   fw_data, fw_size);

			fw_data = (const __le32 *)(adev->gfx.rlc_fw->data +
					le32_to_cpu(rlcv22_hdr->rlc_dram_ucode_offset_bytes));
			fw_size = le32_to_cpu(rlcv22_hdr->rlc_dram_ucode_size_bytes);
			gfx_v12_1_rlc_backdoor_autoload_copy_ucode(adev, SOC24_FIRMWARE_ID_RLX6_DRAM_BOOT,
								   fw_data, fw_size);
		}
	}
}

static void
gfx_v12_1_rlc_backdoor_autoload_copy_sdma_ucode(struct amdgpu_device *adev)
{
	const __le32 *fw_data;
	uint32_t fw_size;
	const struct sdma_firmware_header_v3_0 *sdma_hdr;

	if (adev->sdma.instance[0].fw) {
		sdma_hdr = (const struct sdma_firmware_header_v3_0 *)
			adev->sdma.instance[0].fw->data;
		fw_data = (const __le32 *)(adev->sdma.instance[0].fw->data +
				le32_to_cpu(sdma_hdr->ucode_offset_bytes));
		fw_size = le32_to_cpu(sdma_hdr->ucode_size_bytes);

		gfx_v12_1_rlc_backdoor_autoload_copy_ucode(adev, SOC24_FIRMWARE_ID_SDMA_UCODE_TH0,
							   fw_data, fw_size);
	}
}

static void
gfx_v12_1_rlc_backdoor_autoload_copy_mes_ucode(struct amdgpu_device *adev)
{
	const __le32 *fw_data;
	unsigned int fw_size;
	const struct mes_firmware_header_v1_0 *mes_hdr;
	int pipe, ucode_id, data_id;

	for (pipe = 0; pipe < 2; pipe++) {
		if (pipe == 0) {
			ucode_id = SOC24_FIRMWARE_ID_RS64_MES_P0;
			data_id  = SOC24_FIRMWARE_ID_RS64_MES_P0_STACK;
		} else {
			ucode_id = SOC24_FIRMWARE_ID_RS64_MES_P1;
			data_id  = SOC24_FIRMWARE_ID_RS64_MES_P1_STACK;
		}

		mes_hdr = (const struct mes_firmware_header_v1_0 *)
			adev->mes.fw[pipe]->data;

		fw_data = (const __le32 *)(adev->mes.fw[pipe]->data +
				le32_to_cpu(mes_hdr->mes_ucode_offset_bytes));
		fw_size = le32_to_cpu(mes_hdr->mes_ucode_size_bytes);

		gfx_v12_1_rlc_backdoor_autoload_copy_ucode(adev, ucode_id, fw_data, fw_size);

		fw_data = (const __le32 *)(adev->mes.fw[pipe]->data +
				le32_to_cpu(mes_hdr->mes_ucode_data_offset_bytes));
		fw_size = le32_to_cpu(mes_hdr->mes_ucode_data_size_bytes);

		gfx_v12_1_rlc_backdoor_autoload_copy_ucode(adev, data_id, fw_data, fw_size);
	}
}
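
/*
 * RLC backdoor autoload, sequence 2 onwards: stage every firmware image
 * in the autoload buffer, point each XCC's IMU bootloader registers at
 * the RLC_G image, let the IMU load its own microcode, then unhalt both
 * RLC GPM threads so the RLC boots and autoloads the remaining images.
 * (Sequence 1, allocating the buffer, happens at sw_init time.)
 */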
static int gfx_v12_1_rlc_backdoor_autoload_enable(struct amdgpu_device *adev)
{
	uint32_t rlc_g_offset, rlc_g_size;
	uint64_t gpu_addr;
	uint32_t data;
	int i, num_xcc;

	/* RLC autoload sequence 2: copy ucode */
	gfx_v12_1_rlc_backdoor_autoload_copy_sdma_ucode(adev);
	gfx_v12_1_rlc_backdoor_autoload_copy_gfx_ucode(adev);
	gfx_v12_1_rlc_backdoor_autoload_copy_mes_ucode(adev);
	gfx_v12_1_rlc_backdoor_autoload_copy_toc_ucode(adev);

	rlc_g_offset = rlc_autoload_info[SOC24_FIRMWARE_ID_RLC_G_UCODE].offset;
	rlc_g_size = rlc_autoload_info[SOC24_FIRMWARE_ID_RLC_G_UCODE].size;
	gpu_addr = adev->gfx.rlc.rlc_autoload_gpu_addr + rlc_g_offset - adev->gmc.vram_start;

	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
	for (i = 0; i < num_xcc; i++) {
		WREG32_SOC15(GC, GET_INST(GC, i),
			     regGFX_IMU_RLC_BOOTLOADER_ADDR_HI,
			     upper_32_bits(gpu_addr));
		WREG32_SOC15(GC, GET_INST(GC, i),
			     regGFX_IMU_RLC_BOOTLOADER_ADDR_LO,
			     lower_32_bits(gpu_addr));
		WREG32_SOC15(GC, GET_INST(GC, i),
			     regGFX_IMU_RLC_BOOTLOADER_SIZE,
			     rlc_g_size);
	}

	if (adev->gfx.imu.funcs) {
		/* RLC autoload sequence 3: load IMU fw */
		if (adev->gfx.imu.funcs->load_microcode)
			adev->gfx.imu.funcs->load_microcode(adev);
	}

	/* unhalt rlc to start autoload */
	for (i = 0; i < num_xcc; i++) {
		data = RREG32_SOC15(GC, GET_INST(GC, i), regRLC_GPM_THREAD_ENABLE);
		data = REG_SET_FIELD(data, RLC_GPM_THREAD_ENABLE, THREAD0_ENABLE, 1);
		data = REG_SET_FIELD(data, RLC_GPM_THREAD_ENABLE, THREAD1_ENABLE, 1);
		WREG32_SOC15(GC, GET_INST(GC, i), regRLC_GPM_THREAD_ENABLE, data);
		WREG32_SOC15(GC, GET_INST(GC, i), regRLC_CNTL, RLC_CNTL__RLC_ENABLE_F32_MASK);
	}

	return 0;
}

static int gfx_v12_1_sw_init(struct amdgpu_ip_block *ip_block)
{
	int i, j, k, r, ring_id = 0;
	unsigned int num_compute_rings;
	int xcc_id, num_xcc;
	struct amdgpu_device *adev = ip_block->adev;

	switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
	case IP_VERSION(12, 1, 0):
		adev->gfx.mec.num_mec = 1;
		adev->gfx.mec.num_pipe_per_mec = 4;
		adev->gfx.mec.num_queue_per_pipe = 8;
		break;
	default:
		adev->gfx.mec.num_mec = 2;
		adev->gfx.mec.num_pipe_per_mec = 2;
		adev->gfx.mec.num_queue_per_pipe = 4;
		break;
	}

	/* recalculate compute rings to use based on hardware configuration */
	num_compute_rings = (adev->gfx.mec.num_pipe_per_mec *
			     adev->gfx.mec.num_queue_per_pipe) / 2;
	adev->gfx.num_compute_rings = min(adev->gfx.num_compute_rings,
					  num_compute_rings);

	num_xcc = NUM_XCC(adev->gfx.xcc_mask);

	/* EOP Event */
	r = amdgpu_irq_add_id(adev, SOC_V1_0_IH_CLIENTID_GRBM_CP,
			      GFX_12_1_0__SRCID__CP_EOP_INTERRUPT,
			      &adev->gfx.eop_irq);
	if (r)
		return r;

	/* Privileged reg */
	r = amdgpu_irq_add_id(adev, SOC_V1_0_IH_CLIENTID_GRBM_CP,
			      GFX_12_1_0__SRCID__CP_PRIV_REG_FAULT,
			      &adev->gfx.priv_reg_irq);
	if (r)
		return r;

	/* Privileged inst */
	r = amdgpu_irq_add_id(adev, SOC_V1_0_IH_CLIENTID_GRBM_CP,
			      GFX_12_1_0__SRCID__CP_PRIV_INSTR_FAULT,
			      &adev->gfx.priv_inst_irq);
	if (r)
		return r;

	adev->gfx.gfx_current_status = AMDGPU_GFX_NORMAL_MODE;

	r = gfx_v12_1_rlc_init(adev);
	if (r) {
		dev_err(adev->dev, "Failed to init rlc BOs!\n");
		return r;
	}

	r = gfx_v12_1_mec_init(adev);
	if (r) {
		dev_err(adev->dev, "Failed to init MEC BOs!\n");
		return r;
	}

	/* set up the compute queues - allocate horizontally across pipes */
	for (xcc_id = 0; xcc_id < num_xcc; xcc_id++) {
		ring_id = 0;
		for (i = 0; i < adev->gfx.mec.num_mec; ++i) {
			for (j = 0; j < adev->gfx.mec.num_queue_per_pipe; j++) {
				for (k = 0; k < adev->gfx.mec.num_pipe_per_mec; k++) {
					if (!amdgpu_gfx_is_mec_queue_enabled(adev,
								xcc_id, i, k, j))
						continue;

					r = gfx_v12_1_compute_ring_init(adev, ring_id,
								xcc_id, i, k, j);
					if (r)
						return r;

					ring_id++;
				}
			}
		}

		if (!adev->enable_mes_kiq) {
			r = amdgpu_gfx_kiq_init(adev, GFX12_MEC_HPD_SIZE, xcc_id);
			if (r) {
				dev_err(adev->dev, "Failed to init KIQ BOs!\n");
				return r;
			}

			r = amdgpu_gfx_kiq_init_ring(adev, xcc_id);
			if (r)
				return r;
		}

		r = amdgpu_gfx_mqd_sw_init(adev, sizeof(struct v12_1_compute_mqd), xcc_id);
		if (r)
			return r;
	}

	/* allocate visible FB for rlc auto-loading fw */
	if (adev->firmware.load_type == AMDGPU_FW_LOAD_RLC_BACKDOOR_AUTO) {
		r = gfx_v12_1_rlc_autoload_buffer_init(adev);
		if (r)
			return r;
	} else if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT) {
		r = gfx_v12_1_init_cp_compute_microcode_bo(adev);
		if (r)
			return r;
	}

	r = gfx_v12_1_gpu_early_init(adev);
	if (r)
		return r;

	r = amdgpu_gfx_sysfs_init(adev);
	if (r)
		return r;

	return 0;
}

static void gfx_v12_1_rlc_autoload_buffer_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->gfx.rlc.rlc_autoload_bo,
			&adev->gfx.rlc.rlc_autoload_gpu_addr,
			(void **)&adev->gfx.rlc.rlc_autoload_ptr);
}

static int gfx_v12_1_sw_fini(struct amdgpu_ip_block *ip_block)
{
	int i, num_xcc;
	struct amdgpu_device *adev = ip_block->adev;

	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
	for (i = 0; i < adev->gfx.num_compute_rings * num_xcc; i++)
		amdgpu_ring_fini(&adev->gfx.compute_ring[i]);

	for (i = 0; i < num_xcc; i++) {
		amdgpu_gfx_mqd_sw_fini(adev, i);

		if (!adev->enable_mes_kiq) {
			amdgpu_gfx_kiq_free_ring(&adev->gfx.kiq[i].ring);
			amdgpu_gfx_kiq_fini(adev, i);
		}
	}

	gfx_v12_1_rlc_fini(adev);
	gfx_v12_1_mec_fini(adev);

	if (adev->firmware.load_type == AMDGPU_FW_LOAD_RLC_BACKDOOR_AUTO)
		gfx_v12_1_rlc_autoload_buffer_fini(adev);

	gfx_v12_1_free_microcode(adev);

	return 0;
}

static void gfx_v12_1_xcc_select_se_sh(struct amdgpu_device *adev, u32 se_num,
				       u32 sh_num, u32 instance, int xcc_id)
{
	u32 data;

	if (instance == 0xffffffff)
		data = REG_SET_FIELD(0, GRBM_GFX_INDEX,
				     INSTANCE_BROADCAST_WRITES, 1);
	else
		data = REG_SET_FIELD(0, GRBM_GFX_INDEX, INSTANCE_INDEX,
				     instance);

	if (se_num == 0xffffffff)
		data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_BROADCAST_WRITES,
				     1);
	else
		data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_INDEX, se_num);

	if (sh_num == 0xffffffff)
		data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SA_BROADCAST_WRITES,
				     1);
	else
		data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SA_INDEX, sh_num);

	WREG32_SOC15_RLC_SHADOW_EX(reg, GC, GET_INST(GC, xcc_id), regGRBM_GFX_INDEX, data);
}

static u32 gfx_v12_1_get_sa_active_bitmap(struct amdgpu_device *adev,
					  int xcc_id)
{
	u32 gc_disabled_sa_mask, gc_user_disabled_sa_mask, sa_mask;

	gc_disabled_sa_mask = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCC_GC_SA_UNIT_DISABLE);
	gc_disabled_sa_mask = REG_GET_FIELD(gc_disabled_sa_mask,
					    CC_GC_SA_UNIT_DISABLE,
					    SA_DISABLE);
	gc_user_disabled_sa_mask = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regGC_USER_SA_UNIT_DISABLE);
	gc_user_disabled_sa_mask = REG_GET_FIELD(gc_user_disabled_sa_mask,
						 GC_USER_SA_UNIT_DISABLE,
						 SA_DISABLE);
	sa_mask = amdgpu_gfx_create_bitmask(adev->gfx.config.max_sh_per_se *
					    adev->gfx.config.max_shader_engines);

	return sa_mask & (~(gc_disabled_sa_mask | gc_user_disabled_sa_mask));
}

static u32 gfx_v12_1_get_rb_active_bitmap(struct amdgpu_device *adev,
					  int xcc_id)
{
	u32 gc_disabled_rb_mask, gc_user_disabled_rb_mask;
	u32 rb_mask;

	gc_disabled_rb_mask = RREG32_SOC15(GC, GET_INST(GC, xcc_id),
					   regCC_RB_BACKEND_DISABLE);
	gc_disabled_rb_mask = REG_GET_FIELD(gc_disabled_rb_mask,
					    CC_RB_BACKEND_DISABLE,
					    BACKEND_DISABLE);
	gc_user_disabled_rb_mask = RREG32_SOC15(GC, GET_INST(GC, xcc_id),
						regGC_USER_RB_BACKEND_DISABLE);
	gc_user_disabled_rb_mask = REG_GET_FIELD(gc_user_disabled_rb_mask,
						 GC_USER_RB_BACKEND_DISABLE,
						 BACKEND_DISABLE);
	rb_mask = amdgpu_gfx_create_bitmask(adev->gfx.config.max_backends_per_se *
					    adev->gfx.config.max_shader_engines);

	return rb_mask & (~(gc_disabled_rb_mask | gc_user_disabled_rb_mask));
}
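
/*
 * Derive the active render-backend mask: each active SA contributes two
 * RBs (the 0x3 shifted by rb_bitmap_width_per_sa per SA), and that
 * synthetic mask is OR'ed with the harvest-register view before the
 * totals are cached in gfx.config.
 */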
1349 static void gfx_v12_1_setup_rb(struct amdgpu_device *adev)
1350 {
1351 	u32 rb_bitmap_width_per_sa;
1352 	u32 max_sa;
1353 	u32 active_sa_bitmap;
1354 	u32 global_active_rb_bitmap;
1355 	u32 active_rb_bitmap = 0;
1356 	u32 i;
1357 	int xcc_id;
1358 
1359 	for (xcc_id = 0; xcc_id < NUM_XCC(adev->gfx.xcc_mask); xcc_id++) {
1360 		/* query sa bitmap from SA_UNIT_DISABLE registers */
1361 		active_sa_bitmap = gfx_v12_1_get_sa_active_bitmap(adev, xcc_id);
1362 		/* query rb bitmap from RB_BACKEND_DISABLE registers */
1363 		global_active_rb_bitmap = gfx_v12_1_get_rb_active_bitmap(adev, xcc_id);
1364 
1365 		/* generate active rb bitmap according to active sa bitmap */
1366 		max_sa = adev->gfx.config.max_shader_engines *
1367 			 adev->gfx.config.max_sh_per_se;
1368 		rb_bitmap_width_per_sa = adev->gfx.config.max_backends_per_se /
1369 					 adev->gfx.config.max_sh_per_se;
1370 		for (i = 0; i < max_sa; i++) {
1371 			if (active_sa_bitmap & (1 << i))
1372 				active_rb_bitmap |= (0x3 << (i * rb_bitmap_width_per_sa));
1373 		}
1374 
1375 		active_rb_bitmap |= global_active_rb_bitmap;
1376 	}
1377 
1378 	adev->gfx.config.backend_enable_mask = active_rb_bitmap;
1379 	adev->gfx.config.num_rbs = hweight32(active_rb_bitmap);
1380 }
1381 
1382 static void gfx_v12_1_xcc_init_compute_vmid(struct amdgpu_device *adev,
1383 					    int xcc_id)
1384 {
1385 	int i;
1386 	uint32_t sh_mem_bases;
1387 	uint32_t data;
1388 
1389 	/*
1390 	 * Configure apertures:
1391 	 * LDS:         0x20000000'00000000 - 0x20000001'00000000 (4GB)
1392 	 * Scratch:     0x10000000'00000000 - 0x10000001'00000000 (4GB)
1393 	 */
1394 	sh_mem_bases = REG_SET_FIELD(0, SH_MEM_BASES, PRIVATE_BASE,
1395 				     (adev->gmc.private_aperture_start >> 58));
1396 	sh_mem_bases = REG_SET_FIELD(sh_mem_bases, SH_MEM_BASES, SHARED_BASE,
1397 				     (adev->gmc.shared_aperture_start >> 48));
1398 
1399 	mutex_lock(&adev->srbm_mutex);
1400 	for (i = adev->vm_manager.first_kfd_vmid; i < AMDGPU_NUM_VMID; i++) {
1401 		soc_v1_0_grbm_select(adev, 0, 0, 0, i, GET_INST(GC, xcc_id));
1402 		/* CP and shaders */
1403 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regSH_MEM_CONFIG, DEFAULT_SH_MEM_CONFIG);
1404 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regSH_MEM_BASES, sh_mem_bases);
1405 
1406 		/* Enable trap for each kfd vmid. */
1407 		data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regSPI_GDBG_PER_VMID_CNTL);
1408 		data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 1);
1409 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regSPI_GDBG_PER_VMID_CNTL, data);
1410 
1411 		/* Disable VGPR deallocation instruction for each KFD vmid. */
1412 		data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regSQ_DEBUG);
1413 		data = REG_SET_FIELD(data, SQ_DEBUG, DISABLE_VGPR_DEALLOC, 1);
1414 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regSQ_DEBUG, data);
1415 	}
1416 	soc_v1_0_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
1417 	mutex_unlock(&adev->srbm_mutex);
1418 }
1419 
1420 static void gfx_v12_1_tcp_harvest(struct amdgpu_device *adev)
1421 {
1422 	/* TODO: harvest feature to be added later. */
1423 }
1424 
1425 static void gfx_v12_1_get_tcc_info(struct amdgpu_device *adev)
1426 {
1427 }
1428 
1429 static void gfx_v12_1_xcc_constants_init(struct amdgpu_device *adev,
1430 					 int xcc_id)
1431 {
1432 	u32 tmp;
1433 	int i;
1434 
1435 	/* XXX SH_MEM regs */
1436 	/* where to put LDS, scratch, GPUVM in FSA64 space */
1437 	mutex_lock(&adev->srbm_mutex);
1438 	for (i = 0; i < adev->vm_manager.id_mgr[AMDGPU_GFXHUB(0)].num_ids; i++) {
1439 		soc_v1_0_grbm_select(adev, 0, 0, 0, i, GET_INST(GC, xcc_id));
1440 		/* CP and shaders */
1441 		WREG32_SOC15(GC, GET_INST(GC, xcc_id),
1442 			     regSH_MEM_CONFIG, DEFAULT_SH_MEM_CONFIG);
1443 		if (i != 0) {
1444 			tmp = REG_SET_FIELD(0, SH_MEM_BASES, PRIVATE_BASE,
1445 				(adev->gmc.private_aperture_start >> 58));
1446 			tmp = REG_SET_FIELD(tmp, SH_MEM_BASES, SHARED_BASE,
1447 				(adev->gmc.shared_aperture_start >> 48));
1448 			WREG32_SOC15(GC, GET_INST(GC, xcc_id), regSH_MEM_BASES, tmp);
1449 		}
1450 	}
1451 	soc_v1_0_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
1452 
1453 	mutex_unlock(&adev->srbm_mutex);
1454 
1455 	gfx_v12_1_xcc_init_compute_vmid(adev, xcc_id);
1456 }
1457 
1458 static void gfx_v12_1_constants_init(struct amdgpu_device *adev)
1459 {
1460 	int i, num_xcc;
1461 
1462 	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
1463 
1464 	gfx_v12_1_setup_rb(adev);
1465 	gfx_v12_1_get_cu_info(adev, &adev->gfx.cu_info);
1466 	gfx_v12_1_get_tcc_info(adev);
1467 	adev->gfx.config.pa_sc_tile_steering_override = 0;
1468 
1469 	for (i = 0; i < num_xcc; i++)
1470 		gfx_v12_1_xcc_constants_init(adev, i);
1471 }
1472 
1473 static void gfx_v12_1_xcc_enable_gui_idle_interrupt(struct amdgpu_device *adev,
1474 						    bool enable, int xcc_id)
1475 {
1476 	u32 tmp;
1477 
1478 	if (amdgpu_sriov_vf(adev))
1479 		return;
1480 
1481 	tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_INT_CNTL_RING0);
1482 
1483 	tmp = REG_SET_FIELD(tmp, CP_INT_CNTL_RING0, CNTX_BUSY_INT_ENABLE,
1484 			    enable ? 1 : 0);
1485 	tmp = REG_SET_FIELD(tmp, CP_INT_CNTL_RING0, CNTX_EMPTY_INT_ENABLE,
1486 			    enable ? 1 : 0);
1487 	tmp = REG_SET_FIELD(tmp, CP_INT_CNTL_RING0, CMP_BUSY_INT_ENABLE,
1488 			    enable ? 1 : 0);
1489 	tmp = REG_SET_FIELD(tmp, CP_INT_CNTL_RING0, GFX_IDLE_INT_ENABLE,
1490 			    enable ? 1 : 0);
1491 
1492 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_INT_CNTL_RING0, tmp);
1493 }
1494 
1495 static int gfx_v12_1_xcc_init_csb(struct amdgpu_device *adev,
1496 				  int xcc_id)
1497 {
1498 	adev->gfx.rlc.funcs->get_csb_buffer(adev, adev->gfx.rlc.cs_ptr);
1499 
1500 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CSIB_ADDR_HI,
1501 			adev->gfx.rlc.clear_state_gpu_addr >> 32);
1502 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CSIB_ADDR_LO,
1503 			adev->gfx.rlc.clear_state_gpu_addr & 0xfffffffc);
1504 	WREG32_SOC15(GC, GET_INST(GC, xcc_id),
1505 		     regRLC_CSIB_LENGTH, adev->gfx.rlc.clear_state_size);
1506 
1507 	return 0;
1508 }
1509 
1510 static void gfx_v12_1_xcc_rlc_stop(struct amdgpu_device *adev,
1511 				   int xcc_id)
1512 {
1513 	u32 tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CNTL);
1514 
1515 	tmp = REG_SET_FIELD(tmp, RLC_CNTL, RLC_ENABLE_F32, 0);
1516 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CNTL, tmp);
1517 }
1518 
1519 static void gfx_v12_1_rlc_stop(struct amdgpu_device *adev)
1520 {
1521 	int i, num_xcc;
1522 
1523 	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
1524 	for (i = 0; i < num_xcc; i++)
1525 		gfx_v12_1_xcc_rlc_stop(adev, i);
1526 }
1527 
1528 static void gfx_v12_1_xcc_rlc_reset(struct amdgpu_device *adev,
1529 				    int xcc_id)
1530 {
1531 	WREG32_FIELD15_PREREG(GC, GET_INST(GC, xcc_id),
1532 			      GRBM_SOFT_RESET, SOFT_RESET_RLC, 1);
1533 	udelay(50);
1534 	WREG32_FIELD15_PREREG(GC, GET_INST(GC, xcc_id),
1535 			      GRBM_SOFT_RESET, SOFT_RESET_RLC, 0);
1536 	udelay(50);
1537 }
1538 
1539 static void gfx_v12_1_rlc_reset(struct amdgpu_device *adev)
1540 {
1541 	int i, num_xcc;
1542 
1543 	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
1544 	for (i = 0; i < num_xcc; i++)
1545 		gfx_v12_1_xcc_rlc_reset(adev, i);
1546 }
1547 
1548 static void gfx_v12_1_xcc_rlc_smu_handshake_cntl(struct amdgpu_device *adev,
1549 						 bool enable, int xcc_id)
1550 {
1551 	uint32_t rlc_pg_cntl;
1552 
1553 	rlc_pg_cntl = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_PG_CNTL);
1554 
1555 	if (!enable) {
1556 		/* RLC_PG_CNTL[23] = 0 (default)
1557 		 * RLC will wait for handshake acks with SMU
1558 		 * GFXOFF will be enabled
1559 		 * RLC_PG_CNTL[23] = 1
1560 		 * RLC will not issue any message to SMU
1561 		 * hence no handshake between SMU & RLC
1562 		 * GFXOFF will be disabled
1563 		 */
1564 		rlc_pg_cntl |= RLC_PG_CNTL__SMU_HANDSHAKE_DISABLE_MASK;
1565 	} else
1566 		rlc_pg_cntl &= ~RLC_PG_CNTL__SMU_HANDSHAKE_DISABLE_MASK;
1567 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_PG_CNTL, rlc_pg_cntl);
1568 }
1569 
1570 static void gfx_v12_1_xcc_rlc_start(struct amdgpu_device *adev,
1571 				    int xcc_id)
1572 {
1573 	/* TODO: enable rlc & smu handshake until smu
1574 	 * and gfxoff feature works as expected */
1575 	if (!(amdgpu_pp_feature_mask & PP_GFXOFF_MASK))
1576 		gfx_v12_1_xcc_rlc_smu_handshake_cntl(adev, false, xcc_id);
1577 
1578 	WREG32_FIELD15_PREREG(GC, GET_INST(GC, xcc_id), RLC_CNTL, RLC_ENABLE_F32, 1);
1579 	udelay(50);
1580 }
1581 
1582 static void gfx_v12_1_rlc_start(struct amdgpu_device *adev)
1583 {
1584 	int i, num_xcc;
1585 
1586 	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
	for (i = 0; i < num_xcc; i++)
		gfx_v12_1_xcc_rlc_start(adev, i);
1590 }
1591 
1592 static void gfx_v12_1_xcc_rlc_enable_srm(struct amdgpu_device *adev,
1593 					 int xcc_id)
1594 {
1595 	uint32_t tmp;
1596 
1597 	/* enable Save Restore Machine */
1598 	tmp = RREG32(SOC15_REG_OFFSET(GC, GET_INST(GC, xcc_id), regRLC_SRM_CNTL));
1599 	tmp |= RLC_SRM_CNTL__AUTO_INCR_ADDR_MASK;
1600 	tmp |= RLC_SRM_CNTL__SRM_ENABLE_MASK;
1601 	WREG32(SOC15_REG_OFFSET(GC, GET_INST(GC, xcc_id), regRLC_SRM_CNTL), tmp);
1602 }
1603 
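/* Legacy (direct) RLCG ucode load: point RLC_GPM_UCODE_ADDR at the load
 * start offset, stream the image one dword at a time through
 * RLC_GPM_UCODE_DATA, then write the firmware version to the ADDR
 * register, which by convention marks the upload as complete.
 */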
1604 static void gfx_v12_1_xcc_load_rlcg_microcode(struct amdgpu_device *adev,
1605 					      int xcc_id)
1606 {
1607 	const struct rlc_firmware_header_v2_0 *hdr;
1608 	const __le32 *fw_data;
1609 	unsigned i, fw_size;
1610 
1611 	hdr = (const struct rlc_firmware_header_v2_0 *)adev->gfx.rlc_fw->data;
1612 	fw_data = (const __le32 *)(adev->gfx.rlc_fw->data +
1613 			   le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1614 	fw_size = le32_to_cpu(hdr->header.ucode_size_bytes) / 4;
1615 
1616 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_GPM_UCODE_ADDR,
1617 		     RLCG_UCODE_LOADING_START_ADDRESS);
1618 
1619 	for (i = 0; i < fw_size; i++)
1620 		WREG32_SOC15(GC, GET_INST(GC, xcc_id),
1621 			     regRLC_GPM_UCODE_DATA,
1622 			     le32_to_cpup(fw_data++));
1623 
1624 	WREG32_SOC15(GC, GET_INST(GC, xcc_id),
1625 		     regRLC_GPM_UCODE_ADDR,
1626 		     adev->gfx.rlc_fw_version);
1627 }
1628 
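/* Load the RLC v2.2 IRAM and DRAM images into the LX6 core, then release
 * it from reset (BRESET = 0). The periodic msleep() is only a pacing aid
 * for emulation (amdgpu_emu_mode).
 */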
1629 static void gfx_v12_1_xcc_load_rlc_iram_dram_microcode(struct amdgpu_device *adev,
1630 						       int xcc_id)
1631 {
1632 	const struct rlc_firmware_header_v2_2 *hdr;
1633 	const __le32 *fw_data;
1634 	unsigned i, fw_size;
1635 	u32 tmp;
1636 
1637 	hdr = (const struct rlc_firmware_header_v2_2 *)adev->gfx.rlc_fw->data;
1638 
1639 	fw_data = (const __le32 *)(adev->gfx.rlc_fw->data +
1640 			le32_to_cpu(hdr->rlc_iram_ucode_offset_bytes));
1641 	fw_size = le32_to_cpu(hdr->rlc_iram_ucode_size_bytes) / 4;
1642 
1643 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_LX6_IRAM_ADDR, 0);
1644 
1645 	for (i = 0; i < fw_size; i++) {
1646 		if ((amdgpu_emu_mode == 1) && (i % 100 == 99))
1647 			msleep(1);
1648 		WREG32_SOC15(GC, GET_INST(GC, xcc_id),
1649 			     regRLC_LX6_IRAM_DATA,
1650 			     le32_to_cpup(fw_data++));
1651 	}
1652 
1653 	WREG32_SOC15(GC, GET_INST(GC, xcc_id),
1654 		     regRLC_LX6_IRAM_ADDR, adev->gfx.rlc_fw_version);
1655 
1656 	fw_data = (const __le32 *)(adev->gfx.rlc_fw->data +
1657 			le32_to_cpu(hdr->rlc_dram_ucode_offset_bytes));
1658 	fw_size = le32_to_cpu(hdr->rlc_dram_ucode_size_bytes) / 4;
1659 
1660 	WREG32_SOC15(GC, GET_INST(GC, xcc_id),
1661 		     regRLC_LX6_DRAM_ADDR, 0);
1662 	for (i = 0; i < fw_size; i++) {
1663 		if ((amdgpu_emu_mode == 1) && (i % 100 == 99))
1664 			msleep(1);
1665 		WREG32_SOC15(GC, GET_INST(GC, xcc_id),
1666 			     regRLC_LX6_DRAM_DATA,
1667 			     le32_to_cpup(fw_data++));
1668 	}
1669 
1670 	WREG32_SOC15(GC, GET_INST(GC, xcc_id),
1671 		     regRLC_LX6_IRAM_ADDR, adev->gfx.rlc_fw_version);
1672 
1673 	tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_LX6_CNTL);
1674 	tmp = REG_SET_FIELD(tmp, RLC_LX6_CNTL, PDEBUG_ENABLE, 1);
1675 	tmp = REG_SET_FIELD(tmp, RLC_LX6_CNTL, BRESET, 0);
1676 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_LX6_CNTL, tmp);
1677 }
1678 
1679 static int gfx_v12_1_xcc_rlc_load_microcode(struct amdgpu_device *adev,
1680 					    int xcc_id)
1681 {
1682 	const struct rlc_firmware_header_v2_0 *hdr;
1683 	uint16_t version_major;
1684 	uint16_t version_minor;
1685 
1686 	if (!adev->gfx.rlc_fw)
1687 		return -EINVAL;
1688 
1689 	hdr = (const struct rlc_firmware_header_v2_0 *)adev->gfx.rlc_fw->data;
1690 	amdgpu_ucode_print_rlc_hdr(&hdr->header);
1691 
1692 	version_major = le16_to_cpu(hdr->header.header_version_major);
1693 	version_minor = le16_to_cpu(hdr->header.header_version_minor);
1694 
1695 	if (version_major == 2) {
1696 		gfx_v12_1_xcc_load_rlcg_microcode(adev, xcc_id);
		if (amdgpu_dpm == 1 && version_minor >= 2)
			gfx_v12_1_xcc_load_rlc_iram_dram_microcode(adev, xcc_id);
1701 
1702 		return 0;
1703 	}
1704 
1705 	return -EINVAL;
1706 }
1707 
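/* Bring the RLC up according to the firmware load path: with PSP loading
 * the RLC is already running, so only the CSB (plus SRM on bare metal) is
 * programmed; otherwise the RLC is stopped, CG/PG are cleared, the ucode
 * is loaded by the driver when in direct mode, and the RLC is restarted.
 */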
1708 static int gfx_v12_1_xcc_rlc_resume(struct amdgpu_device *adev,
1709 				    int xcc_id)
1710 {
1711 	int r;
1712 
1713 	if (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP) {
1714 		gfx_v12_1_xcc_init_csb(adev, xcc_id);
1715 
1716 		if (!amdgpu_sriov_vf(adev)) /* enable RLC SRM */
1717 			gfx_v12_1_xcc_rlc_enable_srm(adev, xcc_id);
1718 	} else {
1719 		if (amdgpu_sriov_vf(adev)) {
1720 			gfx_v12_1_xcc_init_csb(adev, xcc_id);
1721 			return 0;
1722 		}
1723 
1724 		gfx_v12_1_xcc_rlc_stop(adev, xcc_id);
1725 
1726 		/* disable CG */
1727 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGCG_CGLS_CTRL, 0);
1728 
1729 		/* disable PG */
1730 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_PG_CNTL, 0);
1731 
1732 		if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT) {
1733 			/* legacy rlc firmware loading */
1734 			r = gfx_v12_1_xcc_rlc_load_microcode(adev, xcc_id);
1735 			if (r)
1736 				return r;
1737 		}
1738 
1739 		gfx_v12_1_xcc_init_csb(adev, xcc_id);
1740 
1741 		gfx_v12_1_xcc_rlc_start(adev, xcc_id);
1742 	}
1743 
1744 	return 0;
1745 }
1746 
1747 static int gfx_v12_1_rlc_resume(struct amdgpu_device *adev)
1748 {
1749 	int r, i, num_xcc;
1750 
1751 	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
1752 	for (i = 0; i < num_xcc; i++) {
1753 		r = gfx_v12_1_xcc_rlc_resume(adev, i);
1754 		if (r)
1755 			return r;
1756 	}
1757 
1758 	return 0;
1759 }
1760 
1761 static void gfx_v12_1_xcc_config_gfx_rs64(struct amdgpu_device *adev,
1762 					  int xcc_id)
1763 {
1764 	const struct gfx_firmware_header_v2_0 *mec_hdr;
1765 	uint32_t pipe_id, tmp;
1766 
1767 	mec_hdr = (const struct gfx_firmware_header_v2_0 *)
1768 		adev->gfx.mec_fw->data;
1769 
1770 	/* config mec program start addr */
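	/* The 64-bit entry point is programmed in dword units, split
	 * across the two PRGRM_CNTR_START registers, once per pipe.
	 */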
1771 	for (pipe_id = 0; pipe_id < 4; pipe_id++) {
1772 		soc_v1_0_grbm_select(adev, 1, pipe_id, 0, 0, GET_INST(GC, xcc_id));
1773 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_RS64_PRGRM_CNTR_START,
1774 					mec_hdr->ucode_start_addr_lo >> 2 |
1775 					mec_hdr->ucode_start_addr_hi << 30);
1776 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_RS64_PRGRM_CNTR_START_HI,
1777 					mec_hdr->ucode_start_addr_hi >> 2);
1778 	}
1779 	soc_v1_0_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
1780 
1781 	/* reset mec pipe */
1782 	tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_RS64_CNTL);
1783 	tmp = REG_SET_FIELD(tmp, CP_MEC_RS64_CNTL, MEC_PIPE0_RESET, 1);
1784 	tmp = REG_SET_FIELD(tmp, CP_MEC_RS64_CNTL, MEC_PIPE1_RESET, 1);
1785 	tmp = REG_SET_FIELD(tmp, CP_MEC_RS64_CNTL, MEC_PIPE2_RESET, 1);
1786 	tmp = REG_SET_FIELD(tmp, CP_MEC_RS64_CNTL, MEC_PIPE3_RESET, 1);
1787 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_RS64_CNTL, tmp);
1788 
1789 	/* clear mec pipe reset */
1790 	tmp = REG_SET_FIELD(tmp, CP_MEC_RS64_CNTL, MEC_PIPE0_RESET, 0);
1791 	tmp = REG_SET_FIELD(tmp, CP_MEC_RS64_CNTL, MEC_PIPE1_RESET, 0);
1792 	tmp = REG_SET_FIELD(tmp, CP_MEC_RS64_CNTL, MEC_PIPE2_RESET, 0);
1793 	tmp = REG_SET_FIELD(tmp, CP_MEC_RS64_CNTL, MEC_PIPE3_RESET, 0);
1794 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_RS64_CNTL, tmp);
1795 }
1796 
1797 static void gfx_v12_1_config_gfx_rs64(struct amdgpu_device *adev)
1798 {
1799 	int i, num_xcc;
1800 
1801 	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
1802 
1803 	for (i = 0; i < num_xcc; i++)
1804 		gfx_v12_1_xcc_config_gfx_rs64(adev, i);
1805 }
1806 
1807 static void gfx_v12_1_xcc_set_mec_ucode_start_addr(struct amdgpu_device *adev,
1808 						   int xcc_id)
1809 {
1810 	const struct gfx_firmware_header_v2_0 *cp_hdr;
1811 	unsigned pipe_id;
1812 
1813 	cp_hdr = (const struct gfx_firmware_header_v2_0 *)
1814 		adev->gfx.mec_fw->data;
1815 	mutex_lock(&adev->srbm_mutex);
1816 	for (pipe_id = 0; pipe_id < adev->gfx.mec.num_pipe_per_mec; pipe_id++) {
1817 		soc_v1_0_grbm_select(adev, 1, pipe_id, 0, 0, GET_INST(GC, xcc_id));
1818 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_RS64_PRGRM_CNTR_START,
1819 			     cp_hdr->ucode_start_addr_lo >> 2 |
1820 			     cp_hdr->ucode_start_addr_hi << 30);
1821 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_RS64_PRGRM_CNTR_START_HI,
1822 			     cp_hdr->ucode_start_addr_hi >> 2);
1823 	}
1824 	soc_v1_0_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
1825 	mutex_unlock(&adev->srbm_mutex);
1826 }
1827 
1828 static int gfx_v12_1_xcc_wait_for_rlc_autoload_complete(struct amdgpu_device *adev,
1829 							int xcc_id)
1830 {
1831 	uint32_t cp_status;
1832 	uint32_t bootload_status;
1833 	int i;
1834 
1835 	for (i = 0; i < adev->usec_timeout; i++) {
1836 		cp_status = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_STAT);
1837 		bootload_status = RREG32_SOC15(GC, GET_INST(GC, xcc_id),
1838 					       regRLC_RLCS_BOOTLOAD_STATUS);
1839 
1840 		if ((cp_status == 0) &&
1841 		    (REG_GET_FIELD(bootload_status,
1842 			RLC_RLCS_BOOTLOAD_STATUS, BOOTLOAD_COMPLETE) == 1)) {
1843 			break;
1844 		}
1845 		udelay(1);
1846 		if (amdgpu_emu_mode)
1847 			msleep(10);
1848 	}
1849 
1850 	if (i >= adev->usec_timeout) {
1851 		dev_err(adev->dev,
1852 			"rlc autoload: xcc%d gc ucode autoload timeout\n", xcc_id);
1853 		return -ETIMEDOUT;
1854 	}
1855 
	if (adev->firmware.load_type == AMDGPU_FW_LOAD_RLC_BACKDOOR_AUTO)
		gfx_v12_1_xcc_set_mec_ucode_start_addr(adev, xcc_id);
1859 
1860 	return 0;
1861 }
1862 
static int gfx_v12_1_wait_for_rlc_autoload_complete(struct amdgpu_device *adev)
{
	int r, xcc_id;

	for (xcc_id = 0; xcc_id < NUM_XCC(adev->gfx.xcc_mask); xcc_id++) {
		r = gfx_v12_1_xcc_wait_for_rlc_autoload_complete(adev, xcc_id);
		if (r)
			return r;
	}

	return 0;
}
1872 
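/* Toggle all four MEC RS64 pipes with a single CP_MEC_RS64_CNTL write:
 * enabling clears the per-pipe resets and marks the pipes active, while
 * disabling halts the core, re-asserts the resets and invalidates the
 * instruction cache.
 */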
1873 static void gfx_v12_1_xcc_cp_compute_enable(struct amdgpu_device *adev,
1874 					    bool enable, int xcc_id)
1875 {
1876 	u32 data;
1877 
1878 	data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_RS64_CNTL);
1879 	data = REG_SET_FIELD(data, CP_MEC_RS64_CNTL, MEC_INVALIDATE_ICACHE,
1880 						 enable ? 0 : 1);
1881 	data = REG_SET_FIELD(data, CP_MEC_RS64_CNTL, MEC_PIPE0_RESET,
1882 						 enable ? 0 : 1);
1883 	data = REG_SET_FIELD(data, CP_MEC_RS64_CNTL, MEC_PIPE1_RESET,
1884 						 enable ? 0 : 1);
1885 	data = REG_SET_FIELD(data, CP_MEC_RS64_CNTL, MEC_PIPE2_RESET,
1886 						 enable ? 0 : 1);
1887 	data = REG_SET_FIELD(data, CP_MEC_RS64_CNTL, MEC_PIPE3_RESET,
1888 						 enable ? 0 : 1);
1889 	data = REG_SET_FIELD(data, CP_MEC_RS64_CNTL, MEC_PIPE0_ACTIVE,
1890 						 enable ? 1 : 0);
	data = REG_SET_FIELD(data, CP_MEC_RS64_CNTL, MEC_PIPE1_ACTIVE,
						 enable ? 1 : 0);
1893 	data = REG_SET_FIELD(data, CP_MEC_RS64_CNTL, MEC_PIPE2_ACTIVE,
1894 						 enable ? 1 : 0);
1895 	data = REG_SET_FIELD(data, CP_MEC_RS64_CNTL, MEC_PIPE3_ACTIVE,
1896 						 enable ? 1 : 0);
1897 	data = REG_SET_FIELD(data, CP_MEC_RS64_CNTL, MEC_HALT,
1898 						 enable ? 0 : 1);
1899 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_RS64_CNTL, data);
1900 
1901 	adev->gfx.kiq[xcc_id].ring.sched.ready = enable;
1902 
1903 	udelay(50);
1904 }
1905 
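/* Create the MEC firmware BOs for RS64 loading: one shared instruction
 * image, plus a data BO carrying a private, 64KB-aligned copy of the data
 * segment for every (xcc, pipe) pair, matching the per-pipe MDBASE
 * programming done at load time.
 */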
1906 static int gfx_v12_1_init_cp_compute_microcode_bo(struct amdgpu_device *adev)
1907 {
1908 	const struct gfx_firmware_header_v2_0 *mec_hdr;
1909 	const __le32 *fw_ucode, *fw_data;
1910 	u32 fw_ucode_size, fw_data_size;
1911 	u32 *fw_ucode_ptr, *fw_data_ptr;
1912 	int i, r, xcc_id;
1913 
1914 	if (!adev->gfx.mec_fw)
1915 		return -EINVAL;
1916 
1917 	mec_hdr = (const struct gfx_firmware_header_v2_0 *)adev->gfx.mec_fw->data;
1918 	amdgpu_ucode_print_gfx_hdr(&mec_hdr->header);
1919 
1920 	fw_ucode = (const __le32 *) (adev->gfx.mec_fw->data +
1921 				le32_to_cpu(mec_hdr->ucode_offset_bytes));
1922 	fw_ucode_size = le32_to_cpu(mec_hdr->ucode_size_bytes);
1923 
1924 	fw_data = (const __le32 *) (adev->gfx.mec_fw->data +
1925 				le32_to_cpu(mec_hdr->data_offset_bytes));
1926 	fw_data_size = le32_to_cpu(mec_hdr->data_size_bytes);
1927 
1928 	if (adev->gfx.mec.mec_fw_obj == NULL) {
1929 		r = amdgpu_bo_create_reserved(adev, fw_ucode_size,
1930 					      64 * 1024, AMDGPU_GEM_DOMAIN_VRAM,
1931 					      &adev->gfx.mec.mec_fw_obj,
1932 					      &adev->gfx.mec.mec_fw_gpu_addr,
1933 					      (void **)&fw_ucode_ptr);
1934 		if (r) {
1935 			dev_err(adev->dev, "(%d) failed to create mec fw ucode bo\n", r);
1936 			gfx_v12_1_mec_fini(adev);
1937 			return r;
1938 		}
1939 
1940 		memcpy(fw_ucode_ptr, fw_ucode, fw_ucode_size);
1941 
1942 		amdgpu_bo_kunmap(adev->gfx.mec.mec_fw_obj);
1943 		amdgpu_bo_unreserve(adev->gfx.mec.mec_fw_obj);
1944 	}
1945 
1946 	if (adev->gfx.mec.mec_fw_data_obj == NULL) {
1947 		r = amdgpu_bo_create_reserved(adev,
1948 					      ALIGN(fw_data_size, 64 * 1024) *
1949 					      adev->gfx.mec.num_pipe_per_mec * NUM_XCC(adev->gfx.xcc_mask),
1950 					      64 * 1024, AMDGPU_GEM_DOMAIN_VRAM,
1951 					      &adev->gfx.mec.mec_fw_data_obj,
1952 					      &adev->gfx.mec.mec_fw_data_gpu_addr,
1953 					      (void **)&fw_data_ptr);
1954 		if (r) {
1955 			dev_err(adev->dev, "(%d) failed to create mec fw data bo\n", r);
1956 			gfx_v12_1_mec_fini(adev);
1957 			return r;
1958 		}
1959 
1960 		for (xcc_id = 0; xcc_id < NUM_XCC(adev->gfx.xcc_mask); xcc_id++) {
1961 			for (i = 0; i < adev->gfx.mec.num_pipe_per_mec; i++) {
1962 				u32 offset = (xcc_id * adev->gfx.mec.num_pipe_per_mec + i) *
1963 					     ALIGN(fw_data_size, 64 * 1024) / 4;
1964 				memcpy(fw_data_ptr + offset, fw_data, fw_data_size);
1965 			}
1966 		}
1967 
1968 		amdgpu_bo_kunmap(adev->gfx.mec.mec_fw_data_obj);
1969 		amdgpu_bo_unreserve(adev->gfx.mec.mec_fw_data_obj);
1970 	}
1971 
1972 	return 0;
1973 }
1974 
1975 static int gfx_v12_1_xcc_cp_compute_load_microcode_rs64(struct amdgpu_device *adev,
1976 							int xcc_id)
1977 {
1978 	const struct gfx_firmware_header_v2_0 *mec_hdr;
1979 	u32 fw_data_size;
1980 	u32 tmp, i, usec_timeout = 50000; /* Wait for 50 ms */
1981 
1982 	if (!adev->gfx.mec_fw)
1983 		return -EINVAL;
1984 
1985 	mec_hdr = (const struct gfx_firmware_header_v2_0 *)adev->gfx.mec_fw->data;
1986 	fw_data_size = le32_to_cpu(mec_hdr->data_size_bytes);
1987 
1988 	gfx_v12_1_xcc_cp_compute_enable(adev, false, xcc_id);
1989 
1990 	tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_CPC_IC_BASE_CNTL);
1991 	tmp = REG_SET_FIELD(tmp, CP_CPC_IC_BASE_CNTL, VMID, 0);
1992 	tmp = REG_SET_FIELD(tmp, CP_CPC_IC_BASE_CNTL, EXE_DISABLE, 0);
1993 	tmp = REG_SET_FIELD(tmp, CP_CPC_IC_BASE_CNTL, CACHE_POLICY, 0);
1994 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_CPC_IC_BASE_CNTL, tmp);
1995 
1996 	tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_DC_BASE_CNTL);
1997 	tmp = REG_SET_FIELD(tmp, CP_MEC_DC_BASE_CNTL, VMID, 0);
1998 	tmp = REG_SET_FIELD(tmp, CP_MEC_DC_BASE_CNTL, CACHE_POLICY, 0);
1999 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_DC_BASE_CNTL, tmp);
2000 
2001 	mutex_lock(&adev->srbm_mutex);
2002 	for (i = 0; i < adev->gfx.mec.num_pipe_per_mec; i++) {
2003 		soc_v1_0_grbm_select(adev, 1, i, 0, 0, GET_INST(GC, xcc_id));
2004 
2005 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_MDBASE_LO,
2006 			     lower_32_bits(adev->gfx.mec.mec_fw_data_gpu_addr +
2007 					   (xcc_id * adev->gfx.mec.num_pipe_per_mec + i) *
2008 					   ALIGN(fw_data_size, 64 * 1024)));
2009 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_MDBASE_HI,
2010 			     upper_32_bits(adev->gfx.mec.mec_fw_data_gpu_addr +
2011 					   (xcc_id * adev->gfx.mec.num_pipe_per_mec + i) *
2012 					   ALIGN(fw_data_size, 64 * 1024)));
2013 
2014 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_CPC_IC_BASE_LO,
2015 				lower_32_bits(adev->gfx.mec.mec_fw_gpu_addr));
2016 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_CPC_IC_BASE_HI,
2017 				upper_32_bits(adev->gfx.mec.mec_fw_gpu_addr));
2018 	}
	soc_v1_0_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
	mutex_unlock(&adev->srbm_mutex);
2021 
	/* Trigger an invalidation of the MEC L1 data cache */
2023 	tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_DC_OP_CNTL);
2024 	tmp = REG_SET_FIELD(tmp, CP_MEC_DC_OP_CNTL, INVALIDATE_DCACHE, 1);
2025 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_DC_OP_CNTL, tmp);
2026 
2027 	/* Wait for invalidation complete */
2028 	for (i = 0; i < usec_timeout; i++) {
2029 		tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_DC_OP_CNTL);
		if (REG_GET_FIELD(tmp, CP_MEC_DC_OP_CNTL,
				  INVALIDATE_DCACHE_COMPLETE) == 1)
2032 			break;
2033 		udelay(1);
2034 	}
2035 
2036 	if (i >= usec_timeout) {
2037 		dev_err(adev->dev, "failed to invalidate data cache\n");
2038 		return -EINVAL;
2039 	}
2040 
2041 	/* Trigger an invalidation of the L1 instruction caches */
2042 	tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_CPC_IC_OP_CNTL);
2043 	tmp = REG_SET_FIELD(tmp, CP_CPC_IC_OP_CNTL, INVALIDATE_CACHE, 1);
2044 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_CPC_IC_OP_CNTL, tmp);
2045 
2046 	/* Wait for invalidation complete */
2047 	for (i = 0; i < usec_timeout; i++) {
2048 		tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_CPC_IC_OP_CNTL);
		if (REG_GET_FIELD(tmp, CP_CPC_IC_OP_CNTL,
				  INVALIDATE_CACHE_COMPLETE) == 1)
2051 			break;
2052 		udelay(1);
2053 	}
2054 
2055 	if (i >= usec_timeout) {
2056 		dev_err(adev->dev, "failed to invalidate instruction cache\n");
2057 		return -EINVAL;
2058 	}
2059 
2060 	gfx_v12_1_xcc_set_mec_ucode_start_addr(adev, xcc_id);
2061 
2062 	return 0;
2063 }
2064 
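/* Identify the KIQ to the RLC via RLC_CP_SCHEDULERS: the low byte encodes
 * the queue as (me << 5 | pipe << 3 | queue), and bit 7 is then set,
 * presumably to activate the scheduler entry.
 */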
2065 static void gfx_v12_1_xcc_kiq_setting(struct amdgpu_ring *ring,
2066 				      int xcc_id)
2067 {
2068 	uint32_t tmp;
2069 	struct amdgpu_device *adev = ring->adev;
2070 
	/* tell the RLC which queue is the KIQ */
2072 	tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CP_SCHEDULERS);
2073 	tmp &= 0xffffff00;
2074 	tmp |= (ring->me << 5) | (ring->pipe << 3) | (ring->queue);
2075 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CP_SCHEDULERS, tmp);
2076 	tmp |= 0x80;
2077 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CP_SCHEDULERS, tmp);
2078 }
2079 
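/* Carve out the per-XCC compute doorbell window: each XCC's base is
 * offset by xcc_id * xcc_doorbell_range 64-bit doorbell slots, doubled
 * into a dword index and shifted into the byte-offset encoding the RANGE
 * registers appear to expect.
 */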
2080 static void gfx_v12_1_xcc_cp_set_doorbell_range(struct amdgpu_device *adev,
2081 						int xcc_id)
2082 {
2083 	/* disable gfx engine doorbell range */
2084 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_RB_DOORBELL_RANGE_LOWER, 0);
2085 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_RB_DOORBELL_RANGE_UPPER, 0);
2086 
2087 	/* set compute engine doorbell range */
2088 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_DOORBELL_RANGE_LOWER,
2089 		     ((adev->doorbell_index.kiq +
2090 		       xcc_id * adev->doorbell_index.xcc_doorbell_range) *
2091 		      2) << 2);
2092 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_DOORBELL_RANGE_UPPER,
2093 		     ((adev->doorbell_index.userqueue_end +
2094 		       xcc_id * adev->doorbell_index.xcc_doorbell_range) *
2095 		      2) << 2);
2096 }
2097 
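/* Fill a compute memory queue descriptor (MQD) from amdgpu_mqd_prop.
 * Current register values are read only to seed defaults; nothing is
 * committed to the HQD here, that happens later via the KIQ or the
 * direct register path in gfx_v12_1_xcc_kiq_init_register().
 */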
2098 static int gfx_v12_1_compute_mqd_init(struct amdgpu_device *adev, void *m,
2099 				      struct amdgpu_mqd_prop *prop)
2100 {
2101 	struct v12_1_compute_mqd *mqd = m;
2102 	uint64_t hqd_gpu_addr, wb_gpu_addr, eop_base_addr;
2103 	uint32_t tmp;
2104 
2105 	mqd->header = 0xC0310800;
2106 	mqd->compute_pipelinestat_enable = 0x00000001;
2107 	mqd->compute_static_thread_mgmt_se0 = 0xffffffff;
2108 	mqd->compute_static_thread_mgmt_se1 = 0xffffffff;
2109 	mqd->compute_static_thread_mgmt_se2 = 0xffffffff;
2110 	mqd->compute_static_thread_mgmt_se3 = 0xffffffff;
2111 	mqd->compute_misc_reserved = 0x00000007;
2112 
2113 	eop_base_addr = prop->eop_gpu_addr >> 8;
2114 	mqd->cp_hqd_eop_base_addr_lo = eop_base_addr;
2115 	mqd->cp_hqd_eop_base_addr_hi = upper_32_bits(eop_base_addr);
2116 
2117 	/* set the EOP size, register value is 2^(EOP_SIZE+1) dwords */
2118 	tmp = RREG32_SOC15(GC, GET_INST(GC, 0), regCP_HQD_EOP_CONTROL);
2119 	tmp = REG_SET_FIELD(tmp, CP_HQD_EOP_CONTROL, EOP_SIZE,
2120 			(order_base_2(GFX12_MEC_HPD_SIZE / 4) - 1));
2121 
2122 	mqd->cp_hqd_eop_control = tmp;
2123 
2124 	/* enable doorbell? */
2125 	tmp = RREG32_SOC15(GC, GET_INST(GC, 0), regCP_HQD_PQ_DOORBELL_CONTROL);
2126 
2127 	if (prop->use_doorbell) {
2128 		tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_DOORBELL_CONTROL,
2129 				    DOORBELL_OFFSET, prop->doorbell_index);
2130 		tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_DOORBELL_CONTROL,
2131 				    DOORBELL_EN, 1);
2132 		tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_DOORBELL_CONTROL,
2133 				    DOORBELL_SOURCE, 0);
2134 		tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_DOORBELL_CONTROL,
2135 				    DOORBELL_HIT, 0);
2136 	} else {
2137 		tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_DOORBELL_CONTROL,
2138 				    DOORBELL_EN, 0);
2139 	}
2140 
2141 	mqd->cp_hqd_pq_doorbell_control = tmp;
2142 
2143 	/* disable the queue if it's active */
2144 	mqd->cp_hqd_dequeue_request = 0;
2145 	mqd->cp_hqd_pq_rptr = 0;
2146 	mqd->cp_hqd_pq_wptr_lo = 0;
2147 	mqd->cp_hqd_pq_wptr_hi = 0;
2148 
2149 	/* set the pointer to the MQD */
2150 	mqd->cp_mqd_base_addr_lo = prop->mqd_gpu_addr & 0xfffffffc;
2151 	mqd->cp_mqd_base_addr_hi = upper_32_bits(prop->mqd_gpu_addr);
2152 
2153 	/* set MQD vmid to 0 */
2154 	tmp = RREG32_SOC15(GC, GET_INST(GC, 0), regCP_MQD_CONTROL);
2155 	tmp = REG_SET_FIELD(tmp, CP_MQD_CONTROL, VMID, 0);
2156 	mqd->cp_mqd_control = tmp;
2157 
	/* set the pointer to the HQD, this is similar to CP_RB0_BASE/_HI */
2159 	hqd_gpu_addr = prop->hqd_base_gpu_addr >> 8;
2160 	mqd->cp_hqd_pq_base_lo = hqd_gpu_addr;
2161 	mqd->cp_hqd_pq_base_hi = upper_32_bits(hqd_gpu_addr);
2162 
2163 	/* set up the HQD, this is similar to CP_RB0_CNTL */
2164 	tmp = RREG32_SOC15(GC, GET_INST(GC, 0), regCP_HQD_PQ_CONTROL);
2165 	tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, QUEUE_SIZE,
2166 			    (order_base_2(prop->queue_size / 4) - 1));
2167 	tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, RPTR_BLOCK_SIZE,
2168 			    (order_base_2(AMDGPU_GPU_PAGE_SIZE / 4) - 1));
2169 	tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, UNORD_DISPATCH, 0);
2170 	tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, TUNNEL_DISPATCH, 0);
2171 	tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, PRIV_STATE, 1);
2172 	tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, KMD_QUEUE, 1);
2173 	mqd->cp_hqd_pq_control = tmp;
2174 
2175 	/* set the wb address whether it's enabled or not */
2176 	wb_gpu_addr = prop->rptr_gpu_addr;
2177 	mqd->cp_hqd_pq_rptr_report_addr_lo = wb_gpu_addr & 0xfffffffc;
2178 	mqd->cp_hqd_pq_rptr_report_addr_hi =
2179 		upper_32_bits(wb_gpu_addr) & 0xffff;
2180 
2181 	/* only used if CP_PQ_WPTR_POLL_CNTL.CP_PQ_WPTR_POLL_CNTL__EN_MASK=1 */
2182 	wb_gpu_addr = prop->wptr_gpu_addr;
2183 	mqd->cp_hqd_pq_wptr_poll_addr_lo = wb_gpu_addr & 0xfffffffc;
2184 	mqd->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits(wb_gpu_addr) & 0xffff;
2185 
2186 	tmp = 0;
2187 	/* enable the doorbell if requested */
2188 	if (prop->use_doorbell) {
2189 		tmp = RREG32_SOC15(GC, GET_INST(GC, 0), regCP_HQD_PQ_DOORBELL_CONTROL);
2190 		tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_DOORBELL_CONTROL,
2191 				DOORBELL_OFFSET, prop->doorbell_index);
2192 
2193 		tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_DOORBELL_CONTROL,
2194 				    DOORBELL_EN, 1);
2195 		tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_DOORBELL_CONTROL,
2196 				    DOORBELL_SOURCE, 0);
2197 		tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_DOORBELL_CONTROL,
2198 				    DOORBELL_HIT, 0);
2199 	}
2200 
2201 	mqd->cp_hqd_pq_doorbell_control = tmp;
2202 
2203 	/* reset read and write pointers, similar to CP_RB0_WPTR/_RPTR */
2204 	mqd->cp_hqd_pq_rptr = RREG32_SOC15(GC, GET_INST(GC, 0), regCP_HQD_PQ_RPTR);
2205 
2206 	/* set the vmid for the queue */
2207 	mqd->cp_hqd_vmid = 0;
2208 
2209 	tmp = RREG32_SOC15(GC, GET_INST(GC, 0), regCP_HQD_PERSISTENT_STATE);
2210 	tmp = REG_SET_FIELD(tmp, CP_HQD_PERSISTENT_STATE, PRELOAD_SIZE, 0x63);
2211 	mqd->cp_hqd_persistent_state = tmp;
2212 
2213 	/* set MIN_IB_AVAIL_SIZE */
2214 	tmp = RREG32_SOC15(GC, GET_INST(GC, 0), regCP_HQD_IB_CONTROL);
2215 	tmp = REG_SET_FIELD(tmp, CP_HQD_IB_CONTROL, MIN_IB_AVAIL_SIZE, 1);
2216 	mqd->cp_hqd_ib_control = tmp;
2217 
2218 	/* set static priority for a compute queue/ring */
2219 	mqd->cp_hqd_pipe_priority = prop->hqd_pipe_priority;
2220 	mqd->cp_hqd_queue_priority = prop->hqd_queue_priority;
2221 
2222 	mqd->cp_mqd_stride_size = prop->mqd_stride_size ? prop->mqd_stride_size :
2223 		sizeof(struct v12_1_compute_mqd);
2224 
2225 	mqd->cp_hqd_active = prop->hqd_active;
2226 
2227 	return 0;
2228 }
2229 
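/* Commit the MQD image to the HQD registers directly over MMIO. The
 * caller must hold srbm_mutex with the target me/pipe/queue selected via
 * GRBM, since every CP_HQD_* access below is relative to that selection.
 */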
2230 static int gfx_v12_1_xcc_kiq_init_register(struct amdgpu_ring *ring,
2231 					   int xcc_id)
2232 {
2233 	struct amdgpu_device *adev = ring->adev;
2234 	struct v12_1_compute_mqd *mqd = ring->mqd_ptr;
2235 	int j;
2236 
2237 	/* inactivate the queue */
2238 	if (amdgpu_sriov_vf(adev))
2239 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_ACTIVE, 0);
2240 
2241 	/* disable wptr polling */
2242 	WREG32_FIELD15_PREREG(GC, GET_INST(GC, xcc_id), CP_PQ_WPTR_POLL_CNTL, EN, 0);
2243 
2244 	/* write the EOP addr */
2245 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_EOP_BASE_ADDR,
2246 	       mqd->cp_hqd_eop_base_addr_lo);
2247 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_EOP_BASE_ADDR_HI,
2248 	       mqd->cp_hqd_eop_base_addr_hi);
2249 
2250 	/* set the EOP size, register value is 2^(EOP_SIZE+1) dwords */
2251 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_EOP_CONTROL,
2252 	       mqd->cp_hqd_eop_control);
2253 
2254 	/* enable doorbell? */
2255 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_PQ_DOORBELL_CONTROL,
2256 	       mqd->cp_hqd_pq_doorbell_control);
2257 
2258 	/* disable the queue if it's active */
2259 	if (RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_ACTIVE) & 1) {
2260 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_DEQUEUE_REQUEST, 1);
2261 		for (j = 0; j < adev->usec_timeout; j++) {
2262 			if (!(RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_ACTIVE) & 1))
2263 				break;
2264 			udelay(1);
2265 		}
2266 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_DEQUEUE_REQUEST,
2267 		       mqd->cp_hqd_dequeue_request);
2268 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_PQ_RPTR,
2269 		       mqd->cp_hqd_pq_rptr);
2270 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_PQ_WPTR_LO,
2271 		       mqd->cp_hqd_pq_wptr_lo);
2272 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_PQ_WPTR_HI,
2273 		       mqd->cp_hqd_pq_wptr_hi);
2274 	}
2275 
2276 	/* set the pointer to the MQD */
2277 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MQD_BASE_ADDR,
2278 	       mqd->cp_mqd_base_addr_lo);
2279 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MQD_BASE_ADDR_HI,
2280 	       mqd->cp_mqd_base_addr_hi);
2281 
2282 	/* set MQD vmid to 0 */
2283 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MQD_CONTROL,
2284 	       mqd->cp_mqd_control);
2285 
	/* set the pointer to the HQD, this is similar to CP_RB0_BASE/_HI */
2287 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_PQ_BASE,
2288 	       mqd->cp_hqd_pq_base_lo);
2289 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_PQ_BASE_HI,
2290 	       mqd->cp_hqd_pq_base_hi);
2291 
2292 	/* set up the HQD, this is similar to CP_RB0_CNTL */
2293 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_PQ_CONTROL,
2294 	       mqd->cp_hqd_pq_control);
2295 
2296 	/* set the wb address whether it's enabled or not */
2297 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_PQ_RPTR_REPORT_ADDR,
2298 		mqd->cp_hqd_pq_rptr_report_addr_lo);
2299 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_PQ_RPTR_REPORT_ADDR_HI,
2300 		mqd->cp_hqd_pq_rptr_report_addr_hi);
2301 
2302 	/* only used if CP_PQ_WPTR_POLL_CNTL.CP_PQ_WPTR_POLL_CNTL__EN_MASK=1 */
2303 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_PQ_WPTR_POLL_ADDR,
2304 	       mqd->cp_hqd_pq_wptr_poll_addr_lo);
2305 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_PQ_WPTR_POLL_ADDR_HI,
2306 	       mqd->cp_hqd_pq_wptr_poll_addr_hi);
2307 
2308 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_PQ_DOORBELL_CONTROL,
2309 	       mqd->cp_hqd_pq_doorbell_control);
2310 
2311 	/* reset read and write pointers, similar to CP_RB0_WPTR/_RPTR */
2312 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_PQ_WPTR_LO,
2313 	       mqd->cp_hqd_pq_wptr_lo);
2314 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_PQ_WPTR_HI,
2315 	       mqd->cp_hqd_pq_wptr_hi);
2316 
2317 	/* set the vmid for the queue */
2318 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_VMID, mqd->cp_hqd_vmid);
2319 
2320 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_PERSISTENT_STATE,
2321 	       mqd->cp_hqd_persistent_state);
2322 
2323 	/* activate the queue */
2324 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_ACTIVE,
2325 	       mqd->cp_hqd_active);
2326 
2327 	if (ring->use_doorbell)
2328 		WREG32_FIELD15_PREREG(GC, GET_INST(GC, xcc_id), CP_PQ_STATUS, DOORBELL_ENABLE, 1);
2329 
2330 	return 0;
2331 }
2332 
2333 static int gfx_v12_1_xcc_kiq_init_queue(struct amdgpu_ring *ring,
2334 					int xcc_id)
2335 {
2336 	struct amdgpu_device *adev = ring->adev;
2337 	struct v12_1_compute_mqd *mqd = ring->mqd_ptr;
2338 
2339 	gfx_v12_1_xcc_kiq_setting(ring, xcc_id);
2340 
2341 	if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */
2342 		/* reset MQD to a clean status */
2343 		if (adev->gfx.kiq[xcc_id].mqd_backup)
2344 			memcpy(mqd, adev->gfx.kiq[xcc_id].mqd_backup, sizeof(*mqd));
2345 
2346 		/* reset ring buffer */
2347 		ring->wptr = 0;
2348 		amdgpu_ring_clear_ring(ring);
2349 
2350 		mutex_lock(&adev->srbm_mutex);
2351 		soc_v1_0_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0, GET_INST(GC, xcc_id));
2352 		gfx_v12_1_xcc_kiq_init_register(ring, xcc_id);
2353 		soc_v1_0_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
2354 		mutex_unlock(&adev->srbm_mutex);
2355 	} else {
2356 		memset((void *)mqd, 0, sizeof(*mqd));
2357 		if (amdgpu_sriov_vf(adev) && adev->in_suspend)
2358 			amdgpu_ring_clear_ring(ring);
2359 		mutex_lock(&adev->srbm_mutex);
2360 		soc_v1_0_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0, GET_INST(GC, xcc_id));
2361 		amdgpu_ring_init_mqd(ring);
2362 		gfx_v12_1_xcc_kiq_init_register(ring, xcc_id);
2363 		soc_v1_0_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
2364 		mutex_unlock(&adev->srbm_mutex);
2365 
2366 		if (adev->gfx.kiq[xcc_id].mqd_backup)
2367 			memcpy(adev->gfx.kiq[xcc_id].mqd_backup, mqd, sizeof(*mqd));
2368 	}
2369 
2370 	return 0;
2371 }
2372 
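/* KCQ MQD setup: on first init, generate the MQD and stash a pristine
 * copy in mqd_backup; on reset/resume, restore the backup and zero the
 * ring and wptr instead of regenerating it.
 */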
2373 static int gfx_v12_1_xcc_kcq_init_queue(struct amdgpu_ring *ring,
2374 					int xcc_id)
2375 {
2376 	struct amdgpu_device *adev = ring->adev;
2377 	struct v12_1_compute_mqd *mqd = ring->mqd_ptr;
2378 	int mqd_idx = ring - &adev->gfx.compute_ring[0];
2379 
2380 	if (!amdgpu_in_reset(adev) && !adev->in_suspend) {
2381 		memset((void *)mqd, 0, sizeof(*mqd));
2382 		mutex_lock(&adev->srbm_mutex);
2383 		soc_v1_0_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0, GET_INST(GC, xcc_id));
2384 		amdgpu_ring_init_mqd(ring);
2385 		soc_v1_0_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
2386 		mutex_unlock(&adev->srbm_mutex);
2387 
2388 		if (adev->gfx.mec.mqd_backup[mqd_idx])
2389 			memcpy_fromio(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(*mqd));
2390 	} else {
2391 		/* restore MQD to a clean status */
2392 		if (adev->gfx.mec.mqd_backup[mqd_idx])
2393 			memcpy_toio(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(*mqd));
2394 		/* reset ring buffer */
2395 		ring->wptr = 0;
2396 		atomic64_set((atomic64_t *)ring->wptr_cpu_addr, 0);
2397 		amdgpu_ring_clear_ring(ring);
2398 	}
2399 
2400 	return 0;
2401 }
2402 
2403 static int gfx_v12_1_xcc_kiq_resume(struct amdgpu_device *adev,
2404 				    int xcc_id)
2405 {
2406 	struct amdgpu_ring *ring;
2407 	int r;
2408 
2409 	ring = &adev->gfx.kiq[xcc_id].ring;
2410 
2411 	r = amdgpu_bo_reserve(ring->mqd_obj, false);
2412 	if (unlikely(r != 0))
2413 		return r;
2414 
2415 	r = amdgpu_bo_kmap(ring->mqd_obj, (void **)&ring->mqd_ptr);
2416 	if (unlikely(r != 0)) {
2417 		amdgpu_bo_unreserve(ring->mqd_obj);
2418 		return r;
2419 	}
2420 
2421 	gfx_v12_1_xcc_kiq_init_queue(ring, xcc_id);
2422 	amdgpu_bo_kunmap(ring->mqd_obj);
2423 	ring->mqd_ptr = NULL;
2424 	amdgpu_bo_unreserve(ring->mqd_obj);
2425 	ring->sched.ready = true;
2426 	return 0;
2427 }
2428 
2429 static int gfx_v12_1_xcc_kcq_resume(struct amdgpu_device *adev,
2430 				    int xcc_id)
2431 {
2432 	struct amdgpu_ring *ring = NULL;
2433 	int r = 0, i;
2434 
2435 	if (!amdgpu_async_gfx_ring)
2436 		gfx_v12_1_xcc_cp_compute_enable(adev, true, xcc_id);
2437 
2438 	for (i = 0; i < adev->gfx.num_compute_rings; i++) {
2439 		ring = &adev->gfx.compute_ring[i + xcc_id * adev->gfx.num_compute_rings];
2440 
2441 		r = amdgpu_bo_reserve(ring->mqd_obj, false);
2442 		if (unlikely(r != 0))
2443 			goto done;
2444 		r = amdgpu_bo_kmap(ring->mqd_obj, (void **)&ring->mqd_ptr);
2445 		if (!r) {
2446 			r = gfx_v12_1_xcc_kcq_init_queue(ring, xcc_id);
2447 			amdgpu_bo_kunmap(ring->mqd_obj);
2448 			ring->mqd_ptr = NULL;
2449 		}
2450 		amdgpu_bo_unreserve(ring->mqd_obj);
2451 		if (r)
2452 			goto done;
2453 	}
2454 
2455 	r = amdgpu_gfx_enable_kcq(adev, xcc_id);
2456 done:
2457 	return r;
2458 }
2459 
2460 static int gfx_v12_1_xcc_cp_resume(struct amdgpu_device *adev, uint16_t xcc_mask)
2461 {
2462 	int r, i, xcc_id;
2463 	struct amdgpu_ring *ring;
2464 
2465 	for_each_inst(xcc_id, xcc_mask) {
2466 		if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT) {
2467 			/* legacy firmware loading */
2468 			r = gfx_v12_1_xcc_cp_compute_load_microcode_rs64(adev, xcc_id);
2469 			if (r)
2470 				return r;
2471 		}
2472 
2473 		if (!(adev->flags & AMD_IS_APU))
2474 			gfx_v12_1_xcc_enable_gui_idle_interrupt(adev, false, xcc_id);
2475 
2476 		gfx_v12_1_xcc_cp_set_doorbell_range(adev, xcc_id);
2477 
2478 		gfx_v12_1_xcc_cp_compute_enable(adev, true, xcc_id);
2479 
2480 		if (adev->enable_mes_kiq && adev->mes.kiq_hw_init)
2481 			r = amdgpu_mes_kiq_hw_init(adev, xcc_id);
2482 		else
2483 			r = gfx_v12_1_xcc_kiq_resume(adev, xcc_id);
2484 		if (r)
2485 			return r;
2486 
2487 		r = gfx_v12_1_xcc_kcq_resume(adev, xcc_id);
2488 		if (r)
2489 			return r;
2490 
2491 		for (i = 0; i < adev->gfx.num_compute_rings; i++) {
2492 			ring = &adev->gfx.compute_ring[i + xcc_id * adev->gfx.num_compute_rings];
2493 			r = amdgpu_ring_test_helper(ring);
2494 			if (r)
2495 				return r;
2496 		}
2497 	}
2498 
2499 	return 0;
2500 }
2501 
2502 static int gfx_v12_1_cp_resume(struct amdgpu_device *adev)
2503 {
2504 	int num_xcc, num_xcp, num_xcc_per_xcp;
2505 	uint16_t xcc_mask;
2506 	int r = 0;
2507 
2508 	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
2509 	if (amdgpu_sriov_vf(adev)) {
2510 		enum amdgpu_gfx_partition mode;
2511 
2512 		mode = amdgpu_xcp_query_partition_mode(adev->xcp_mgr,
2513 						       AMDGPU_XCP_FL_NONE);
2514 		if (mode == AMDGPU_UNKNOWN_COMPUTE_PARTITION_MODE)
2515 			return -EINVAL;
2516 		if (adev->gfx.funcs &&
2517 		    adev->gfx.funcs->get_xccs_per_xcp) {
2518 			num_xcc_per_xcp = adev->gfx.funcs->get_xccs_per_xcp(adev);
2519 			adev->gfx.num_xcc_per_xcp = num_xcc_per_xcp;
2520 			num_xcp = num_xcc / num_xcc_per_xcp;
2521 		} else {
2522 			return -EINVAL;
2523 		}
2524 		r = amdgpu_xcp_init(adev->xcp_mgr, num_xcp, mode);
2525 
2526 	} else {
2527 		if (amdgpu_xcp_query_partition_mode(adev->xcp_mgr,
2528 						    AMDGPU_XCP_FL_NONE) ==
2529 		    AMDGPU_UNKNOWN_COMPUTE_PARTITION_MODE)
2530 			r = amdgpu_xcp_switch_partition_mode(adev->xcp_mgr,
2531 							     amdgpu_user_partt_mode);
2532 	}
2533 
2534 	if (r)
2535 		return r;
2536 
2537 	xcc_mask = GENMASK(NUM_XCC(adev->gfx.xcc_mask) - 1, 0);
2538 
2539 	return gfx_v12_1_xcc_cp_resume(adev, xcc_mask);
2540 }
2541 
2542 static int gfx_v12_1_gfxhub_enable(struct amdgpu_device *adev)
2543 {
2544 	int r, i;
2545 	bool value;
2546 
2547 	r = adev->gfxhub.funcs->gart_enable(adev);
2548 	if (r)
2549 		return r;
2550 
	value = amdgpu_vm_fault_stop != AMDGPU_VM_FAULT_STOP_ALWAYS;
2553 
2554 	adev->gfxhub.funcs->set_fault_enable_default(adev, value);
	/* TODO: investigate why this TLB flush is needed;
	 * are we missing a flush somewhere else?
	 */
2557 	for_each_set_bit(i, adev->vmhubs_mask, AMDGPU_MAX_VMHUBS) {
2558 		if (AMDGPU_IS_GFXHUB(i))
2559 			adev->gmc.gmc_funcs->flush_gpu_tlb(adev, 0, AMDGPU_GFXHUB(i), 0);
2560 	}
2561 
2562 	return 0;
2563 }
2564 
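/* Decode GB_ADDR_CONFIG_READ into the gfx config. Most fields are log2
 * encoded and expanded to element counts with 1 << field;
 * PIPE_INTERLEAVE_SIZE starts at 256 bytes (1 << (8 + n)).
 */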
2565 static int get_gb_addr_config(struct amdgpu_device *adev)
2566 {
2567 	u32 gb_addr_config;
2568 
2569 	gb_addr_config = RREG32_SOC15(GC, GET_INST(GC, 0), regGB_ADDR_CONFIG_READ);
2570 	if (gb_addr_config == 0)
2571 		return -EINVAL;
2572 
2573 	adev->gfx.config.gb_addr_config_fields.num_pkrs =
2574 		1 << REG_GET_FIELD(gb_addr_config, GB_ADDR_CONFIG_READ, NUM_PKRS);
2575 
2576 	adev->gfx.config.gb_addr_config = gb_addr_config;
2577 
2578 	adev->gfx.config.gb_addr_config_fields.num_pipes = 1 <<
2579 			REG_GET_FIELD(adev->gfx.config.gb_addr_config,
2580 				      GB_ADDR_CONFIG_READ, NUM_PIPES);
2581 
2582 	adev->gfx.config.max_tile_pipes =
2583 		adev->gfx.config.gb_addr_config_fields.num_pipes;
2584 
2585 	adev->gfx.config.gb_addr_config_fields.max_compress_frags = 1 <<
2586 			REG_GET_FIELD(adev->gfx.config.gb_addr_config,
2587 				      GB_ADDR_CONFIG_READ, MAX_COMPRESSED_FRAGS);
2588 	adev->gfx.config.gb_addr_config_fields.num_rb_per_se = 1 <<
2589 			REG_GET_FIELD(adev->gfx.config.gb_addr_config,
2590 				      GB_ADDR_CONFIG_READ, NUM_RB_PER_SE);
2591 	adev->gfx.config.gb_addr_config_fields.num_se = 1 <<
2592 			REG_GET_FIELD(adev->gfx.config.gb_addr_config,
2593 				      GB_ADDR_CONFIG_READ, NUM_SHADER_ENGINES);
2594 	adev->gfx.config.gb_addr_config_fields.pipe_interleave_size = 1 << (8 +
2595 			REG_GET_FIELD(adev->gfx.config.gb_addr_config,
2596 				      GB_ADDR_CONFIG_READ, PIPE_INTERLEAVE_SIZE));
2597 
2598 	return 0;
2599 }
2600 
2601 static void gfx_v12_1_xcc_disable_gpa_mode(struct amdgpu_device *adev,
2602 					   int xcc_id)
2603 {
2604 	uint32_t data;
2605 
2606 	data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCPC_PSP_DEBUG);
2607 	data |= CPC_PSP_DEBUG__GPA_OVERRIDE_MASK;
2608 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCPC_PSP_DEBUG, data);
2609 
2610 	data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCPG_PSP_DEBUG);
2611 	data |= CPG_PSP_DEBUG__GPA_OVERRIDE_MASK;
2612 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCPG_PSP_DEBUG, data);
2613 }
2614 
2615 static void gfx_v12_1_xcc_setup_tcp_thrashing_ctrl(struct amdgpu_device *adev,
2616 					 int xcc_id)
2617 {
2618 	uint32_t val;
2619 
	/* Tune the TCP UTCL0 thrashing-control thresholds */
2621 	val = RREG32_SOC15(GC, GET_INST(GC, xcc_id),
2622 					regTCP_UTCL0_THRASHING_CTRL);
2623 	val = REG_SET_FIELD(val, TCP_UTCL0_THRASHING_CTRL, THRASHING_EN, 0x2);
2624 	val = REG_SET_FIELD(val, TCP_UTCL0_THRASHING_CTRL,
2625 					RETRY_FRAGMENT_THRESHOLD_UP_EN, 0x1);
2626 	val = REG_SET_FIELD(val, TCP_UTCL0_THRASHING_CTRL,
2627 					RETRY_FRAGMENT_THRESHOLD_DOWN_EN, 0x1);
2628 
2629 	WREG32_SOC15(GC, GET_INST(GC, xcc_id),
2630 					regTCP_UTCL0_THRASHING_CTRL, val);
2631 }
2632 
2633 static void gfx_v12_1_xcc_enable_atomics(struct amdgpu_device *adev,
2634 					 int xcc_id)
2635 {
2636 	uint32_t data;
2637 
2638 	/* Set the TCP UTCL0 register to enable atomics */
2639 	data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regTCP_UTCL0_CNTL1);
2640 	data = REG_SET_FIELD(data, TCP_UTCL0_CNTL1, ATOMIC_REQUESTER_EN, 0x1);
2641 
2642 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regTCP_UTCL0_CNTL1, data);
2643 }
2644 
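/* Called from gfx_v12_1_init_golden_registers(); writing 0xf presumably
 * disables DRAM bursting in the GL1 and GLARB arbiters (the field layout
 * of these registers is not visible in this file).
 */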
2645 static void gfx_v12_1_xcc_disable_burst(struct amdgpu_device *adev,
2646 					int xcc_id)
2647 {
2648 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regGL1_DRAM_BURST_CTRL, 0xf);
2649 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regGLARB_DRAM_BURST_CTRL, 0xf);
2650 }
2651 
2652 static void gfx_v12_1_init_golden_registers(struct amdgpu_device *adev)
2653 {
2654 	int i;
2655 
2656 	for (i = 0; i < NUM_XCC(adev->gfx.xcc_mask); i++) {
2657 		gfx_v12_1_xcc_disable_burst(adev, i);
2658 		gfx_v12_1_xcc_enable_atomics(adev, i);
2659 		gfx_v12_1_xcc_setup_tcp_thrashing_ctrl(adev, i);
2660 	}
2661 }
2662 
2663 static int gfx_v12_1_hw_init(struct amdgpu_ip_block *ip_block)
2664 {
2665 	int r, i, num_xcc;
2666 	struct amdgpu_device *adev = ip_block->adev;
2667 
2668 	if (adev->firmware.load_type == AMDGPU_FW_LOAD_RLC_BACKDOOR_AUTO) {
2669 		/* rlc autoload firmware */
2670 		r = gfx_v12_1_rlc_backdoor_autoload_enable(adev);
2671 		if (r)
2672 			return r;
2673 	} else {
2674 		if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT) {
2675 			num_xcc = NUM_XCC(adev->gfx.xcc_mask);
2676 
2677 			if (adev->gfx.imu.funcs) {
2678 				if (adev->gfx.imu.funcs->load_microcode)
2679 					adev->gfx.imu.funcs->load_microcode(adev);
2680 			}
2681 
2682 			for (i = 0; i < num_xcc; i++) {
2683 				/* disable gpa mode in backdoor loading */
2684 				gfx_v12_1_xcc_disable_gpa_mode(adev, i);
2685 			}
2686 		}
2687 	}
2688 
2689 	if ((adev->firmware.load_type == AMDGPU_FW_LOAD_RLC_BACKDOOR_AUTO) ||
2690 	    (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP)) {
2691 		r = gfx_v12_1_wait_for_rlc_autoload_complete(adev);
2692 		if (r) {
			dev_err(adev->dev, "(%d) failed to wait for rlc autoload to complete\n", r);
2694 			return r;
2695 		}
2696 	}
2697 
2698 	adev->gfx.is_poweron = true;
2699 
2700 	if (get_gb_addr_config(adev))
		DRM_WARN("Invalid gb_addr_config!\n");
2702 
2703 	if (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP)
2704 		gfx_v12_1_config_gfx_rs64(adev);
2705 
2706 	r = gfx_v12_1_gfxhub_enable(adev);
2707 	if (r)
2708 		return r;
2709 
2710 	gfx_v12_1_init_golden_registers(adev);
2711 
2712 	gfx_v12_1_constants_init(adev);
2713 
2714 	if (adev->nbio.funcs->gc_doorbell_init)
2715 		adev->nbio.funcs->gc_doorbell_init(adev);
2716 
2717 	r = gfx_v12_1_rlc_resume(adev);
2718 	if (r)
2719 		return r;
2720 
2721 	/*
2722 	 * init golden registers and rlc resume may override some registers,
2723 	 * reconfig them here
2724 	 */
2725 	gfx_v12_1_tcp_harvest(adev);
2726 
	return gfx_v12_1_cp_resume(adev);
2732 }
2733 
2734 static void gfx_v12_1_xcc_fini(struct amdgpu_device *adev,
2735 			      int xcc_id)
2736 {
2737 	uint32_t tmp;
2738 
2739 	if (!adev->no_hw_access) {
2740 		if (amdgpu_gfx_disable_kcq(adev, xcc_id))
2741 			DRM_ERROR("KCQ disable failed\n");
2742 
2743 		amdgpu_mes_kiq_hw_fini(adev, xcc_id);
2744 	}
2745 
2746 	if (amdgpu_sriov_vf(adev)) {
2747 		/* Program KIQ position of RLC_CP_SCHEDULERS during destroy */
2748 		tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CP_SCHEDULERS);
2749 		tmp &= 0xffffff00;
2750 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CP_SCHEDULERS, tmp);
2751 	}
2752 	gfx_v12_1_xcc_cp_compute_enable(adev, false, xcc_id);
2753 	gfx_v12_1_xcc_enable_gui_idle_interrupt(adev, false, xcc_id);
2754 }
2755 
2756 static int gfx_v12_1_hw_fini(struct amdgpu_ip_block *ip_block)
2757 {
2758 	struct amdgpu_device *adev = ip_block->adev;
2759 	int i, num_xcc;
2760 
2761 	amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
2762 	amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
2763 
2764 	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
	for (i = 0; i < num_xcc; i++)
		gfx_v12_1_xcc_fini(adev, i);
2768 
2769 	adev->gfxhub.funcs->gart_disable(adev);
2770 
2771 	adev->gfx.is_poweron = false;
2772 
2773 	return 0;
2774 }
2775 
2776 static int gfx_v12_1_suspend(struct amdgpu_ip_block *ip_block)
2777 {
2778 	return gfx_v12_1_hw_fini(ip_block);
2779 }
2780 
2781 static int gfx_v12_1_resume(struct amdgpu_ip_block *ip_block)
2782 {
2783 	return gfx_v12_1_hw_init(ip_block);
2784 }
2785 
2786 static bool gfx_v12_1_is_idle(struct amdgpu_ip_block *ip_block)
2787 {
2788 	struct amdgpu_device *adev = ip_block->adev;
2789 	int i, num_xcc;
2790 
2791 	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
2792 	for (i = 0; i < num_xcc; i++) {
2793 		if (REG_GET_FIELD(RREG32_SOC15(GC, GET_INST(GC, i),
2794 				regGRBM_STATUS), GRBM_STATUS, GUI_ACTIVE))
2795 			return false;
2796 	}
2797 	return true;
2798 }
2799 
2800 static int gfx_v12_1_wait_for_idle(struct amdgpu_ip_block *ip_block)
2801 {
2802 	unsigned i;
2803 	struct amdgpu_device *adev = ip_block->adev;
2804 
2805 	for (i = 0; i < adev->usec_timeout; i++) {
2806 		if (gfx_v12_1_is_idle(ip_block))
2807 			return 0;
2808 		udelay(1);
2809 	}
2810 	return -ETIMEDOUT;
2811 }
2812 
2813 static uint64_t gfx_v12_1_get_gpu_clock_counter(struct amdgpu_device *adev)
2814 {
2815 	uint64_t clock = 0;
2816 
2817 	if (adev->smuio.funcs &&
2818 	    adev->smuio.funcs->get_gpu_clock_counter)
2819 		clock = adev->smuio.funcs->get_gpu_clock_counter(adev);
2820 	else
2821 		dev_warn(adev->dev, "query gpu clock counter is not supported\n");
2822 
2823 	return clock;
2824 }
2825 
2826 static int gfx_v12_1_early_init(struct amdgpu_ip_block *ip_block)
2827 {
2828 	struct amdgpu_device *adev = ip_block->adev;
2829 
2830 	adev->gfx.funcs = &gfx_v12_1_gfx_funcs;
2831 
2832 	adev->gfx.num_compute_rings = min(amdgpu_gfx_get_num_kcq(adev),
2833 					  AMDGPU_MAX_COMPUTE_RINGS);
2834 
2835 	gfx_v12_1_set_kiq_pm4_funcs(adev);
2836 	gfx_v12_1_set_ring_funcs(adev);
2837 	gfx_v12_1_set_irq_funcs(adev);
2838 	gfx_v12_1_set_rlc_funcs(adev);
2839 	gfx_v12_1_set_mqd_funcs(adev);
2840 	gfx_v12_1_set_imu_funcs(adev);
2841 
2842 	gfx_v12_1_init_rlcg_reg_access_ctrl(adev);
2843 
2844 	return gfx_v12_1_init_microcode(adev);
2845 }
2846 
2847 static int gfx_v12_1_late_init(struct amdgpu_ip_block *ip_block)
2848 {
2849 	struct amdgpu_device *adev = ip_block->adev;
2850 	int r;
2851 
2852 	r = amdgpu_irq_get(adev, &adev->gfx.priv_reg_irq, 0);
2853 	if (r)
2854 		return r;
2855 
2856 	r = amdgpu_irq_get(adev, &adev->gfx.priv_inst_irq, 0);
2857 	if (r)
2858 		return r;
2859 
2860 	return 0;
2861 }
2862 
2863 static bool gfx_v12_1_is_rlc_enabled(struct amdgpu_device *adev)
2864 {
2865 	uint32_t rlc_cntl;
2866 
2867 	/* if RLC is not enabled, do nothing */
2868 	rlc_cntl = RREG32_SOC15(GC, GET_INST(GC, 0), regRLC_CNTL);
	return REG_GET_FIELD(rlc_cntl, RLC_CNTL, RLC_ENABLE_F32) != 0;
2870 }
2871 
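/* RLC safe-mode handshake: request entry by writing CMD with MESSAGE = 1,
 * then poll until the RLC acknowledges by clearing the CMD field. The
 * exit path below writes CMD with MESSAGE = 0 and does not wait.
 */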
2872 static void gfx_v12_1_xcc_set_safe_mode(struct amdgpu_device *adev,
2873 					int xcc_id)
2874 {
2875 	uint32_t data;
2876 	unsigned i;
2877 
2878 	data = RLC_SAFE_MODE__CMD_MASK;
2879 	data |= (1 << RLC_SAFE_MODE__MESSAGE__SHIFT);
2880 
2881 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_SAFE_MODE, data);
2882 
2883 	/* wait for RLC_SAFE_MODE */
2884 	for (i = 0; i < adev->usec_timeout; i++) {
2885 		if (!REG_GET_FIELD(RREG32_SOC15(GC, GET_INST(GC, xcc_id),
2886 						regRLC_SAFE_MODE), RLC_SAFE_MODE, CMD))
2887 			break;
2888 		udelay(1);
2889 	}
2890 }
2891 
2892 static void gfx_v12_1_xcc_unset_safe_mode(struct amdgpu_device *adev,
2893 					  int xcc_id)
2894 {
2895 	WREG32_SOC15(GC, GET_INST(GC, xcc_id),
2896 		     regRLC_SAFE_MODE, RLC_SAFE_MODE__CMD_MASK);
2897 }
2898 
2899 static void gfx_v12_1_update_perf_clk(struct amdgpu_device *adev,
2900 				      bool enable)
2901 {
2902 	int i, num_xcc;
2903 
2904 	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
2905 	for (i = 0; i < num_xcc; i++)
2906 		gfx_v12_1_xcc_update_perf_clk(adev, enable, i);
2907 }
2908 
2909 static void gfx_v12_1_update_spm_vmid(struct amdgpu_device *adev,
2910 				      int xcc_id,
2911 				      struct amdgpu_ring *ring,
2912 				      unsigned vmid)
2913 {
2914 	u32 reg, data;
2915 
2916 	reg = SOC15_REG_OFFSET(GC, GET_INST(GC, xcc_id), regRLC_SPM_MC_CNTL);
2917 	if (amdgpu_sriov_is_pp_one_vf(adev))
2918 		data = RREG32_NO_KIQ(reg);
2919 	else
2920 		data = RREG32(reg);
2921 
2922 	data &= ~RLC_SPM_MC_CNTL__RLC_SPM_VMID_MASK;
2923 	data |= (vmid & RLC_SPM_MC_CNTL__RLC_SPM_VMID_MASK) << RLC_SPM_MC_CNTL__RLC_SPM_VMID__SHIFT;
2924 
2925 	if (amdgpu_sriov_is_pp_one_vf(adev))
2926 		WREG32_SOC15_NO_KIQ(GC, GET_INST(GC, xcc_id), regRLC_SPM_MC_CNTL, data);
2927 	else
2928 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_SPM_MC_CNTL, data);
2929 
	if (ring &&
	    amdgpu_sriov_is_pp_one_vf(adev) &&
	    (ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
	     ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE))
		amdgpu_ring_emit_wreg(ring, reg, data);
2937 }
2938 
2939 static const struct amdgpu_rlc_funcs gfx_v12_1_rlc_funcs = {
2940 	.is_rlc_enabled = gfx_v12_1_is_rlc_enabled,
2941 	.set_safe_mode = gfx_v12_1_xcc_set_safe_mode,
2942 	.unset_safe_mode = gfx_v12_1_xcc_unset_safe_mode,
2943 	.init = gfx_v12_1_rlc_init,
2944 	.get_csb_size = gfx_v12_1_get_csb_size,
2945 	.get_csb_buffer = gfx_v12_1_get_csb_buffer,
2946 	.resume = gfx_v12_1_rlc_resume,
2947 	.stop = gfx_v12_1_rlc_stop,
2948 	.reset = gfx_v12_1_rlc_reset,
2949 	.start = gfx_v12_1_rlc_start,
2950 	.update_spm_vmid = gfx_v12_1_update_spm_vmid,
2951 };
2952 
2953 #if 0
2954 static void gfx_v12_cntl_power_gating(struct amdgpu_device *adev, bool enable)
2955 {
2956 	/* TODO */
2957 }
2958 
2959 static void gfx_v12_cntl_pg(struct amdgpu_device *adev, bool enable)
2960 {
2961 	/* TODO */
2962 }
2963 #endif
2964 
2965 static int gfx_v12_1_set_powergating_state(struct amdgpu_ip_block *ip_block,
2966 					   enum amd_powergating_state state)
2967 {
2968 	struct amdgpu_device *adev = ip_block->adev;
2969 	bool enable = (state == AMD_PG_STATE_GATE);
2970 
2971 	if (amdgpu_sriov_vf(adev))
2972 		return 0;
2973 
2974 	switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
2975 	case IP_VERSION(12, 1, 0):
2976 		amdgpu_gfx_off_ctrl(adev, enable);
2977 		break;
2978 	default:
2979 		break;
2980 	}
2981 
2982 	return 0;
2983 }
2984 
2985 static void gfx_v12_1_xcc_update_coarse_grain_clock_gating(struct amdgpu_device *adev,
2986 							   bool enable, int xcc_id)
2987 {
2988 	uint32_t def, data;
2989 
2990 	if (!(adev->cg_flags &
2991 	      (AMD_CG_SUPPORT_GFX_CGCG |
2992 	      AMD_CG_SUPPORT_GFX_CGLS |
2993 	      AMD_CG_SUPPORT_GFX_3D_CGCG |
2994 	      AMD_CG_SUPPORT_GFX_3D_CGLS)))
2995 		return;
2996 
2997 	if (enable) {
2998 		def = data = RREG32_SOC15(GC, GET_INST(GC, xcc_id),
2999 					  regRLC_CGTT_MGCG_OVERRIDE);
3000 
3001 		/* unset CGCG override */
3002 		if (adev->cg_flags & AMD_CG_SUPPORT_GFX_CGCG)
3003 			data &= ~RLC_CGTT_MGCG_OVERRIDE__GFXIP_CGCG_OVERRIDE_MASK;
3004 		if (adev->cg_flags & AMD_CG_SUPPORT_GFX_CGLS)
3005 			data &= ~RLC_CGTT_MGCG_OVERRIDE__GFXIP_CGLS_OVERRIDE_MASK;
3006 		if (adev->cg_flags & AMD_CG_SUPPORT_GFX_3D_CGCG ||
3007 		    adev->cg_flags & AMD_CG_SUPPORT_GFX_3D_CGLS)
3008 			data &= ~RLC_CGTT_MGCG_OVERRIDE__GFXIP_GFX3D_CG_OVERRIDE_MASK;
3009 
3010 		/* update CGCG override bits */
3011 		if (def != data)
3012 			WREG32_SOC15(GC, GET_INST(GC, xcc_id),
3013 				     regRLC_CGTT_MGCG_OVERRIDE, data);
3014 
		/* enable cgcg FSM (0x0000363F) */
3016 		def = data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGCG_CGLS_CTRL);
3017 
3018 		if (adev->cg_flags & AMD_CG_SUPPORT_GFX_CGCG) {
3019 			data &= ~RLC_CGCG_CGLS_CTRL__CGCG_GFX_IDLE_THRESHOLD_MASK;
3020 			data |= (0x36 << RLC_CGCG_CGLS_CTRL__CGCG_GFX_IDLE_THRESHOLD__SHIFT) |
3021 				 RLC_CGCG_CGLS_CTRL__CGCG_EN_MASK;
3022 		}
3023 
3024 		if (adev->cg_flags & AMD_CG_SUPPORT_GFX_CGLS) {
3025 			data &= ~RLC_CGCG_CGLS_CTRL__CGLS_REP_COMPANSAT_DELAY_MASK;
3026 			data |= (0x000F << RLC_CGCG_CGLS_CTRL__CGLS_REP_COMPANSAT_DELAY__SHIFT) |
3027 				 RLC_CGCG_CGLS_CTRL__CGLS_EN_MASK;
3028 		}
3029 
3030 		if (def != data)
3031 			WREG32_SOC15(GC, GET_INST(GC, xcc_id),
3032 				     regRLC_CGCG_CGLS_CTRL, data);
3033 
		/* set IDLE_POLL_COUNT (0x00900100) */
3035 		def = data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_RB_WPTR_POLL_CNTL);
3036 
3037 		data &= ~CP_RB_WPTR_POLL_CNTL__POLL_FREQUENCY_MASK;
3038 		data &= ~CP_RB_WPTR_POLL_CNTL__IDLE_POLL_COUNT_MASK;
3039 		data |= (0x0100 << CP_RB_WPTR_POLL_CNTL__POLL_FREQUENCY__SHIFT) |
3040 			(0x0090 << CP_RB_WPTR_POLL_CNTL__IDLE_POLL_COUNT__SHIFT);
3041 
3042 		if (def != data)
3043 			WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_RB_WPTR_POLL_CNTL, data);
3044 
3045 		data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_INT_CNTL);
3046 		data = REG_SET_FIELD(data, CP_INT_CNTL, CNTX_BUSY_INT_ENABLE, 1);
3047 		data = REG_SET_FIELD(data, CP_INT_CNTL, CNTX_EMPTY_INT_ENABLE, 1);
3048 		data = REG_SET_FIELD(data, CP_INT_CNTL, CMP_BUSY_INT_ENABLE, 1);
3049 		data = REG_SET_FIELD(data, CP_INT_CNTL, GFX_IDLE_INT_ENABLE, 1);
3050 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_INT_CNTL, data);
3051 	} else {
3052 		/* Program RLC_CGCG_CGLS_CTRL */
3053 		def = data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGCG_CGLS_CTRL);
3054 
3055 		if (adev->cg_flags & AMD_CG_SUPPORT_GFX_CGCG)
3056 			data &= ~RLC_CGCG_CGLS_CTRL__CGCG_EN_MASK;
3057 
3058 		if (adev->cg_flags & AMD_CG_SUPPORT_GFX_CGLS)
3059 			data &= ~RLC_CGCG_CGLS_CTRL__CGLS_EN_MASK;
3060 
3061 		if (def != data)
3062 			WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGCG_CGLS_CTRL, data);
3063 	}
3064 }
3065 
3066 static void gfx_v12_1_xcc_update_medium_grain_clock_gating(struct amdgpu_device *adev,
3067 							   bool enable, int xcc_id)
3068 {
	uint32_t data, def;

	if (!(adev->cg_flags & (AMD_CG_SUPPORT_GFX_MGCG | AMD_CG_SUPPORT_GFX_MGLS)))
3071 		return;
3072 
3073 	/* It is disabled by HW by default */
3074 	if (enable) {
3075 		if (adev->cg_flags & AMD_CG_SUPPORT_GFX_MGCG) {
3076 			/* 1 - RLC_CGTT_MGCG_OVERRIDE */
3077 			def = data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGTT_MGCG_OVERRIDE);
3078 
3079 			data &= ~(RLC_CGTT_MGCG_OVERRIDE__GRBM_CGTT_SCLK_OVERRIDE_MASK |
3080 				  RLC_CGTT_MGCG_OVERRIDE__RLC_CGTT_SCLK_OVERRIDE_MASK |
3081 				  RLC_CGTT_MGCG_OVERRIDE__GFXIP_MGCG_OVERRIDE_MASK);
3082 
3083 			if (def != data)
3084 				WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGTT_MGCG_OVERRIDE, data);
3085 		}
3086 	} else {
3087 		if (adev->cg_flags & AMD_CG_SUPPORT_GFX_MGCG) {
3088 			def = data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGTT_MGCG_OVERRIDE);
3089 
3090 			data |= (RLC_CGTT_MGCG_OVERRIDE__RLC_CGTT_SCLK_OVERRIDE_MASK |
3091 				 RLC_CGTT_MGCG_OVERRIDE__GRBM_CGTT_SCLK_OVERRIDE_MASK |
3092 				 RLC_CGTT_MGCG_OVERRIDE__GFXIP_MGCG_OVERRIDE_MASK);
3093 
3094 			if (def != data)
3095 				WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGTT_MGCG_OVERRIDE, data);
3096 		}
3097 	}
3098 }
3099 
3100 static void gfx_v12_1_xcc_update_repeater_fgcg(struct amdgpu_device *adev,
3101 					       bool enable, int xcc_id)
3102 {
3103 	uint32_t def, data;
3104 
3105 	if (!(adev->cg_flags & AMD_CG_SUPPORT_REPEATER_FGCG))
3106 		return;
3107 
3108 	def = data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGTT_MGCG_OVERRIDE);
3109 
3110 	if (enable)
3111 		data &= ~(RLC_CGTT_MGCG_OVERRIDE__GFXIP_REPEATER_FGCG_OVERRIDE_MASK |
3112 				  RLC_CGTT_MGCG_OVERRIDE__RLC_REPEATER_FGCG_OVERRIDE_MASK);
3113 	else
3114 		data |= RLC_CGTT_MGCG_OVERRIDE__GFXIP_REPEATER_FGCG_OVERRIDE_MASK |
3115 				RLC_CGTT_MGCG_OVERRIDE__RLC_REPEATER_FGCG_OVERRIDE_MASK;
3116 
3117 	if (def != data)
3118 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGTT_MGCG_OVERRIDE, data);
3119 }
3120 
3121 static void gfx_v12_1_xcc_update_sram_fgcg(struct amdgpu_device *adev,
3122 					   bool enable, int xcc_id)
3123 {
3124 	uint32_t def, data;
3125 
3126 	if (!(adev->cg_flags & AMD_CG_SUPPORT_GFX_FGCG))
3127 		return;
3128 
3129 	def = data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGTT_MGCG_OVERRIDE);
3130 
3131 	if (enable)
3132 		data &= ~RLC_CGTT_MGCG_OVERRIDE__GFXIP_FGCG_OVERRIDE_MASK;
3133 	else
3134 		data |= RLC_CGTT_MGCG_OVERRIDE__GFXIP_FGCG_OVERRIDE_MASK;
3135 
3136 	if (def != data)
3137 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGTT_MGCG_OVERRIDE, data);
3138 }
3139 
3140 static void gfx_v12_1_xcc_update_perf_clk(struct amdgpu_device *adev,
3141 					  bool enable, int xcc_id)
3142 {
3143 	uint32_t def, data;
3144 
3145 	if (!(adev->cg_flags & AMD_CG_SUPPORT_GFX_PERF_CLK))
3146 		return;
3147 
3148 	def = data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGTT_MGCG_OVERRIDE);
3149 
3150 	if (enable)
3151 		data &= ~RLC_CGTT_MGCG_OVERRIDE__PERFMON_CLOCK_STATE_MASK;
3152 	else
3153 		data |= RLC_CGTT_MGCG_OVERRIDE__PERFMON_CLOCK_STATE_MASK;
3154 
3155 	if (def != data)
3156 		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGTT_MGCG_OVERRIDE, data);
3157 }
3158 
3159 static int gfx_v12_1_xcc_update_gfx_clock_gating(struct amdgpu_device *adev,
3160 					     bool enable, int xcc_id)
3161 {
3162 	amdgpu_gfx_rlc_enter_safe_mode(adev, xcc_id);
3163 
3164 	gfx_v12_1_xcc_update_coarse_grain_clock_gating(adev, enable, xcc_id);
3165 
3166 	gfx_v12_1_xcc_update_medium_grain_clock_gating(adev, enable, xcc_id);
3167 
3168 	gfx_v12_1_xcc_update_repeater_fgcg(adev, enable, xcc_id);
3169 
3170 	gfx_v12_1_xcc_update_sram_fgcg(adev, enable, xcc_id);
3171 
3172 	gfx_v12_1_xcc_update_perf_clk(adev, enable, xcc_id);
3173 
3174 	if (adev->cg_flags &
3175 	    (AMD_CG_SUPPORT_GFX_MGCG |
3176 	     AMD_CG_SUPPORT_GFX_CGLS |
3177 	     AMD_CG_SUPPORT_GFX_CGCG |
3178 	     AMD_CG_SUPPORT_GFX_3D_CGCG |
3179 	     AMD_CG_SUPPORT_GFX_3D_CGLS))
3180 		gfx_v12_1_xcc_enable_gui_idle_interrupt(adev, enable, xcc_id);
3181 
3182 	amdgpu_gfx_rlc_exit_safe_mode(adev, xcc_id);
3183 
3184 	return 0;
3185 }
3186 
3187 static int gfx_v12_1_set_clockgating_state(struct amdgpu_ip_block *ip_block,
3188 					   enum amd_clockgating_state state)
3189 {
3190 	struct amdgpu_device *adev = ip_block->adev;
3191 	int i, num_xcc;
3192 
3193 	if (amdgpu_sriov_vf(adev))
3194 		return 0;
3195 
3196 	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
3197 	switch (adev->ip_versions[GC_HWIP][0]) {
3198 	case IP_VERSION(12, 1, 0):
3199 		for (i = 0; i < num_xcc; i++)
3200 			gfx_v12_1_xcc_update_gfx_clock_gating(adev,
3201 				  state == AMD_CG_STATE_GATE, i);
3202 		break;
3203 	default:
3204 		break;
3205 	}
3206 
3207 	return 0;
3208 }
3209 
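/*
 * Report the current clock-gating state by reading the hardware back: a
 * cleared override bit in RLC_CGTT_MGCG_OVERRIDE means the corresponding
 * feature is active, while the CGCG/CGLS state comes from the enable bits
 * in RLC_CGCG_CGLS_CTRL.
 */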
3210 static void gfx_v12_1_get_clockgating_state(struct amdgpu_ip_block *ip_block, u64 *flags)
3211 {
3212 	struct amdgpu_device *adev = ip_block->adev;
3213 	int data;
3214 
3215 	/* AMD_CG_SUPPORT_GFX_MGCG */
3216 	data = RREG32_SOC15(GC, GET_INST(GC, 0), regRLC_CGTT_MGCG_OVERRIDE);
3217 	if (!(data & RLC_CGTT_MGCG_OVERRIDE__GFXIP_MGCG_OVERRIDE_MASK))
3218 		*flags |= AMD_CG_SUPPORT_GFX_MGCG;
3219 
3220 	/* AMD_CG_SUPPORT_REPEATER_FGCG */
3221 	if (!(data & RLC_CGTT_MGCG_OVERRIDE__GFXIP_REPEATER_FGCG_OVERRIDE_MASK))
3222 		*flags |= AMD_CG_SUPPORT_REPEATER_FGCG;
3223 
3224 	/* AMD_CG_SUPPORT_GFX_FGCG */
3225 	if (!(data & RLC_CGTT_MGCG_OVERRIDE__GFXIP_FGCG_OVERRIDE_MASK))
3226 		*flags |= AMD_CG_SUPPORT_GFX_FGCG;
3227 
3228 	/* AMD_CG_SUPPORT_GFX_PERF_CLK */
3229 	if (!(data & RLC_CGTT_MGCG_OVERRIDE__PERFMON_CLOCK_STATE_MASK))
3230 		*flags |= AMD_CG_SUPPORT_GFX_PERF_CLK;
3231 
3232 	/* AMD_CG_SUPPORT_GFX_CGCG */
3233 	data = RREG32_SOC15(GC, GET_INST(GC, 0), regRLC_CGCG_CGLS_CTRL);
3234 	if (data & RLC_CGCG_CGLS_CTRL__CGCG_EN_MASK)
3235 		*flags |= AMD_CG_SUPPORT_GFX_CGCG;
3236 
3237 	/* AMD_CG_SUPPORT_GFX_CGLS */
3238 	if (data & RLC_CGCG_CGLS_CTRL__CGLS_EN_MASK)
3239 		*flags |= AMD_CG_SUPPORT_GFX_CGLS;
3240 }
3241 
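/*
 * Ring pointer helpers. The CP writes a 32-bit rptr back to system memory;
 * the wptr is kept as a 64-bit shadow in the writeback buffer and committed
 * to the hardware through the ring doorbell. Rings without a doorbell are
 * not supported on gfx12, hence the BUG() fallbacks.
 */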
3242 static u64 gfx_v12_1_ring_get_rptr_compute(struct amdgpu_ring *ring)
3243 {
3244 	/* gfx12 hardware uses a 32-bit rptr */
3245 	return *(uint32_t *)ring->rptr_cpu_addr;
3246 }
3247 
3248 static u64 gfx_v12_1_ring_get_wptr_compute(struct amdgpu_ring *ring)
3249 {
3250 	u64 wptr;
3251 
3252 	/* XXX check if swapping is necessary on BE */
3253 	if (ring->use_doorbell)
3254 		wptr = atomic64_read((atomic64_t *)ring->wptr_cpu_addr);
3255 	else
3256 		BUG();
3257 	return wptr;
3258 }
3259 
3260 static void gfx_v12_1_ring_set_wptr_compute(struct amdgpu_ring *ring)
3261 {
3262 	struct amdgpu_device *adev = ring->adev;
3263 
3264 	/* XXX check if swapping is necessary on BE */
3265 	if (ring->use_doorbell) {
3266 		atomic64_set((atomic64_t *)ring->wptr_cpu_addr,
3267 			     ring->wptr);
3268 		WDOORBELL64(ring->doorbell_index, ring->wptr);
3269 	} else {
3270 		BUG(); /* only the doorbell method is supported on gfx12 for now */
3271 	}
3272 }
3273 
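/*
 * Emit an INDIRECT_BUFFER packet for a compute IB. The IB address must be
 * dword aligned and the VMID is carried in bits 31:24 of the control word.
 */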
3274 static void gfx_v12_1_ring_emit_ib_compute(struct amdgpu_ring *ring,
3275 					   struct amdgpu_job *job,
3276 					   struct amdgpu_ib *ib,
3277 					   uint32_t flags)
3278 {
3279 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
3280 	u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);
3281 
3282 	/* Currently there is a high probability of a wave ID mismatch
3283 	 * between ME and GDS, leading to a hw deadlock, because ME generates
3284 	 * different wave IDs than the GDS expects. This situation happens
3285 	 * randomly when at least 5 compute pipes use GDS ordered append.
3286 	 * The wave IDs generated by ME are also wrong after suspend/resume.
3287 	 * Those are probably bugs somewhere else in the kernel driver.
3288 	 *
3289 	 * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and
3290 	 * GDS to 0 for this ring (me/pipe).
3291 	 */
3292 	if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
3293 		amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
3294 		amdgpu_ring_write(ring, regGDS_COMPUTE_MAX_WAVE_ID);
3295 	}
3296 
3297 	amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
3298 	BUG_ON(ib->gpu_addr & 0x3); /* Dword align */
3299 	amdgpu_ring_write(ring,
3300 #ifdef __BIG_ENDIAN
3301 				(2 << 0) |
3302 #endif
3303 				lower_32_bits(ib->gpu_addr));
3304 	amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr));
3305 	amdgpu_ring_write(ring, control);
3306 }
3307 
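/*
 * Emit a fence via RELEASE_MEM: write back/flush the GCR caches at
 * end-of-pipe, then write the 32- or 64-bit sequence number (per
 * AMDGPU_FENCE_FLAG_64BIT) to "addr", optionally raising an interrupt.
 */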
3308 static void gfx_v12_1_ring_emit_fence(struct amdgpu_ring *ring, u64 addr,
3309 				     u64 seq, unsigned flags)
3310 {
3311 	bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
3312 	bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
3313 
3314 	/* RELEASE_MEM - flush caches, send int */
3315 	amdgpu_ring_write(ring, PACKET3(PACKET3_RELEASE_MEM, 6));
3316 	amdgpu_ring_write(ring, (PACKET3_RELEASE_MEM_GCR_SEQ(1) |
3317 				 PACKET3_RELEASE_MEM_GCR_GLV_WB |
3318 				 PACKET3_RELEASE_MEM_GCR_GL2_WB |
3319 				 PACKET3_RELEASE_MEM_GCR_GL2_SCOPE(2) |
3320 				 PACKET3_RELEASE_MEM_TEMPORAL(3) |
3321 				 PACKET3_RELEASE_MEM_EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) |
3322 				 PACKET3_RELEASE_MEM_EVENT_INDEX(5)));
3323 	amdgpu_ring_write(ring, (PACKET3_RELEASE_MEM_DATA_SEL(write64bit ? 2 : 1) |
3324 				 PACKET3_RELEASE_MEM_INT_SEL(int_sel ? 2 : 0)));
3325 
3326 	/*
3327 	 * The address must be Qword aligned for a 64-bit write, and Dword
3328 	 * aligned when only the low 32 bits are sent (high bits discarded).
3329 	 */
3330 	if (write64bit)
3331 		BUG_ON(addr & 0x7);
3332 	else
3333 		BUG_ON(addr & 0x3);
3334 	amdgpu_ring_write(ring, lower_32_bits(addr));
3335 	amdgpu_ring_write(ring, upper_32_bits(addr));
3336 	amdgpu_ring_write(ring, lower_32_bits(seq));
3337 	amdgpu_ring_write(ring, upper_32_bits(seq));
3338 	amdgpu_ring_write(ring, 0);
3339 }
3340 
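/*
 * Make later packets wait until this ring's most recently emitted fence
 * has signaled, by polling the fence writeback address via WAIT_REG_MEM
 * in memory space (mem_space = 1).
 */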
3341 static void gfx_v12_1_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
3342 {
3343 	int usepfp = (ring->funcs->type == AMDGPU_RING_TYPE_GFX);
3344 	uint32_t seq = ring->fence_drv.sync_seq;
3345 	uint64_t addr = ring->fence_drv.gpu_addr;
3346 
3347 	gfx_v12_1_wait_reg_mem(ring, usepfp, 1, 0, lower_32_bits(addr),
3348 			       upper_32_bits(addr), seq, 0xffffffff, 4);
3349 }
3350 
3351 static void gfx_v12_1_ring_invalidate_tlbs(struct amdgpu_ring *ring,
3352 				   uint16_t pasid, uint32_t flush_type,
3353 				   bool all_hub, uint8_t dst_sel)
3354 {
3355 	amdgpu_ring_write(ring, PACKET3(PACKET3_INVALIDATE_TLBS, 0));
3356 	amdgpu_ring_write(ring,
3357 			  PACKET3_INVALIDATE_TLBS_DST_SEL(dst_sel) |
3358 			  PACKET3_INVALIDATE_TLBS_ALL_HUB(all_hub) |
3359 			  PACKET3_INVALIDATE_TLBS_PASID(pasid) |
3360 			  PACKET3_INVALIDATE_TLBS_FLUSH_TYPE(flush_type));
3361 }
3362 
3363 static void gfx_v12_1_ring_emit_vm_flush(struct amdgpu_ring *ring,
3364 					 unsigned vmid, uint64_t pd_addr)
3365 {
3366 	amdgpu_gmc_emit_flush_gpu_tlb(ring, vmid, pd_addr);
3367 
3368 	/* compute doesn't have PFP */
3369 	if (ring->funcs->type == AMDGPU_RING_TYPE_GFX) {
3370 		/* sync PFP to ME, otherwise we might get invalid PFP reads */
3371 		amdgpu_ring_write(ring, PACKET3(PACKET3_PFP_SYNC_ME, 0));
3372 		amdgpu_ring_write(ring, 0x0);
3373 	}
3374 }
3375 
3376 static void gfx_v12_1_ring_emit_fence_kiq(struct amdgpu_ring *ring, u64 addr,
3377 					  u64 seq, unsigned int flags)
3378 {
3379 	struct amdgpu_device *adev = ring->adev;
3380 
3381 	/* we only allocate 32 bits for each fence seq writeback address */
3382 	BUG_ON(flags & AMDGPU_FENCE_FLAG_64BIT);
3383 
3384 	/* write fence seq to the "addr" */
3385 	amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
3386 	amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
3387 				 WRITE_DATA_DST_SEL(5) | WR_CONFIRM));
3388 	amdgpu_ring_write(ring, lower_32_bits(addr));
3389 	amdgpu_ring_write(ring, upper_32_bits(addr));
3390 	amdgpu_ring_write(ring, lower_32_bits(seq));
3391 
3392 	if (flags & AMDGPU_FENCE_FLAG_INT) {
3393 		/* set register to trigger INT */
3394 		amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
3395 		amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
3396 					 WRITE_DATA_DST_SEL(0) | WR_CONFIRM));
3397 		amdgpu_ring_write(ring, SOC15_REG_OFFSET(GC, GET_INST(GC, 0), regCPC_INT_STATUS));
3398 		amdgpu_ring_write(ring, 0);
3399 		amdgpu_ring_write(ring, 0x20000000); /* src_id is 178 */
3400 	}
3401 }
3402 
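/*
 * KIQ register read: COPY_DATA from register space to the writeback buffer
 * slot at reg_val_offs, with write confirmation so the caller can poll the
 * result from the CPU.
 */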
3403 static void gfx_v12_1_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
3404 				     uint32_t reg_val_offs)
3405 {
3406 	struct amdgpu_device *adev = ring->adev;
3407 
3408 	reg = soc_v1_0_normalize_xcc_reg_offset(reg);
3409 
3410 	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
3411 	amdgpu_ring_write(ring, 0 |	/* src: register */
3412 				(5 << 8) |	/* dst: memory */
3413 				(1 << 20));	/* write confirm */
3414 	amdgpu_ring_write(ring, reg);
3415 	amdgpu_ring_write(ring, 0);
3416 	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
3417 				reg_val_offs * 4));
3418 	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
3419 				reg_val_offs * 4));
3420 }
3421 
3422 static void gfx_v12_1_ring_emit_wreg(struct amdgpu_ring *ring,
3423 				     uint32_t reg,
3424 				     uint32_t val)
3425 {
3426 	uint32_t cmd = 0;
3427 
3428 	reg = soc_v1_0_normalize_xcc_reg_offset(reg);
3429 
3430 	switch (ring->funcs->type) {
3431 	case AMDGPU_RING_TYPE_KIQ:
3432 		cmd = (1 << 16); /* no inc addr */
3433 		break;
3434 	default:
3435 		cmd = WR_CONFIRM;
3436 		break;
3437 	}
3438 	amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
3439 	amdgpu_ring_write(ring, cmd);
3440 	amdgpu_ring_write(ring, reg);
3441 	amdgpu_ring_write(ring, 0);
3442 	amdgpu_ring_write(ring, val);
3443 }
3444 
3445 static void gfx_v12_1_ring_emit_reg_wait(struct amdgpu_ring *ring, uint32_t reg,
3446 					uint32_t val, uint32_t mask)
3447 {
3448 	gfx_v12_1_wait_reg_mem(ring, 0, 0, 0, reg, 0, val, mask, 0x20);
3449 }
3450 
3451 static void gfx_v12_1_ring_emit_reg_write_reg_wait(struct amdgpu_ring *ring,
3452 						   uint32_t reg0, uint32_t reg1,
3453 						   uint32_t ref, uint32_t mask)
3454 {
3455 	int usepfp = (ring->funcs->type == AMDGPU_RING_TYPE_GFX);
3456 
3457 	gfx_v12_1_wait_reg_mem(ring, usepfp, 0, 1, reg0, reg1,
3458 			       ref, mask, 0x20);
3459 }
3460 
3461 static void gfx_v12_1_xcc_set_compute_eop_interrupt_state(struct amdgpu_device *adev,
3462 							int me, int pipe,
3463 							enum amdgpu_interrupt_state state,
3464 							int xcc_id)
3465 {
3466 	u32 mec_int_cntl, mec_int_cntl_reg;
3467 
3468 	/*
3469 	 * amdgpu controls only the first MEC. That's why this function only
3470 	 * handles the setting of interrupts for this specific MEC. All other
3471 	 * pipes' interrupts are set by amdkfd.
3472 	 */
3473 
3474 	if (me == 1) {
3475 		switch (pipe) {
3476 		case 0:
3477 			mec_int_cntl_reg = SOC15_REG_OFFSET(
3478 					GC, GET_INST(GC, xcc_id),
3479 					regCP_ME1_PIPE0_INT_CNTL);
3480 			break;
3481 		case 1:
3482 			mec_int_cntl_reg = SOC15_REG_OFFSET(
3483 					GC, GET_INST(GC, xcc_id),
3484 					regCP_ME1_PIPE1_INT_CNTL);
3485 			break;
3486 		case 2:
3487 			mec_int_cntl_reg = SOC15_REG_OFFSET(
3488 					GC, GET_INST(GC, xcc_id),
3489 					regCP_ME1_PIPE2_INT_CNTL);
3490 			break;
3491 		case 3:
3492 			mec_int_cntl_reg = SOC15_REG_OFFSET(
3493 					GC, GET_INST(GC, xcc_id),
3494 					regCP_ME1_PIPE3_INT_CNTL);
3495 			break;
3496 		default:
3497 			DRM_DEBUG("invalid pipe %d\n", pipe);
3498 			return;
3499 		}
3500 	} else {
3501 		DRM_DEBUG("invalid me %d\n", me);
3502 		return;
3503 	}
3504 
3505 	switch (state) {
3506 	case AMDGPU_IRQ_STATE_DISABLE:
3507 		mec_int_cntl = RREG32_XCC(mec_int_cntl_reg, xcc_id);
3508 		mec_int_cntl = REG_SET_FIELD(mec_int_cntl, CP_ME1_PIPE0_INT_CNTL,
3509 					     TIME_STAMP_INT_ENABLE, 0);
3510 		mec_int_cntl = REG_SET_FIELD(mec_int_cntl, CP_ME1_PIPE0_INT_CNTL,
3511 					     GENERIC0_INT_ENABLE, 0);
3512 		WREG32_XCC(mec_int_cntl_reg, mec_int_cntl, xcc_id);
3513 		break;
3514 	case AMDGPU_IRQ_STATE_ENABLE:
3515 		mec_int_cntl = RREG32_XCC(mec_int_cntl_reg, xcc_id);
3516 		mec_int_cntl = REG_SET_FIELD(mec_int_cntl, CP_ME1_PIPE0_INT_CNTL,
3517 					     TIME_STAMP_INT_ENABLE, 1);
3518 		mec_int_cntl = REG_SET_FIELD(mec_int_cntl, CP_ME1_PIPE0_INT_CNTL,
3519 					     GENERIC0_INT_ENABLE, 1);
3520 		WREG32_XCC(mec_int_cntl_reg, mec_int_cntl, xcc_id);
3521 		break;
3522 	default:
3523 		break;
3524 	}
3525 }
3526 
3527 static int gfx_v12_1_set_eop_interrupt_state(struct amdgpu_device *adev,
3528 					    struct amdgpu_irq_src *src,
3529 					    unsigned type,
3530 					    enum amdgpu_interrupt_state state)
3531 {
3532 	int i, num_xcc;
3533 
3534 	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
3535 	for (i = 0; i < num_xcc; i++) {
3536 		switch (type) {
3537 		case AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP:
3538 			gfx_v12_1_xcc_set_compute_eop_interrupt_state(
3539 					adev, 1, 0, state, i);
3540 			break;
3541 		case AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE1_EOP:
3542 			gfx_v12_1_xcc_set_compute_eop_interrupt_state(
3543 					adev, 1, 1, state, i);
3544 			break;
3545 		case AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE2_EOP:
3546 			gfx_v12_1_xcc_set_compute_eop_interrupt_state(
3547 					adev, 1, 2, state, i);
3548 			break;
3549 		case AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE3_EOP:
3550 			gfx_v12_1_xcc_set_compute_eop_interrupt_state(
3551 					adev, 1, 3, state, i);
3552 			break;
3553 		default:
3554 			break;
3555 		}
3556 	}
3557 
3558 	return 0;
3559 }
3560 
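/*
 * CP EOP interrupt handler. MES-managed queues are looked up by the queue
 * id carried in src_data[0]; for kernel rings, me/pipe/queue are decoded
 * from ring_id (me: bits 3:2, pipe: bits 1:0, queue: bits 6:4) and the
 * matching ring's fences are processed.
 */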
3561 static int gfx_v12_1_eop_irq(struct amdgpu_device *adev,
3562 			     struct amdgpu_irq_src *source,
3563 			     struct amdgpu_iv_entry *entry)
3564 {
3565 	int i, xcc_id;
3566 	u8 me_id, pipe_id, queue_id;
3567 	struct amdgpu_ring *ring;
3568 	uint32_t mes_queue_id = entry->src_data[0];
3569 
3570 	DRM_DEBUG("IH: CP EOP\n");
3571 
3572 	if (adev->enable_mes && (mes_queue_id & AMDGPU_FENCE_MES_QUEUE_FLAG)) {
3573 		struct amdgpu_mes_queue *queue;
3574 
3575 		mes_queue_id &= AMDGPU_FENCE_MES_QUEUE_ID_MASK;
3576 
3577 		spin_lock(&adev->mes.queue_id_lock);
3578 		queue = idr_find(&adev->mes.queue_id_idr, mes_queue_id);
3579 		if (queue) {
3580 			DRM_DEBUG("process mes queue id = %d\n", mes_queue_id);
3581 			amdgpu_fence_process(queue->ring);
3582 		}
3583 		spin_unlock(&adev->mes.queue_id_lock);
3584 	} else {
3585 		me_id = (entry->ring_id & 0x0c) >> 2;
3586 		pipe_id = (entry->ring_id & 0x03) >> 0;
3587 		queue_id = (entry->ring_id & 0x70) >> 4;
3588 		xcc_id = gfx_v12_1_ih_to_xcc_inst(adev, entry->node_id);
3589 
3590 		if (xcc_id == -EINVAL)
3591 			return -EINVAL;
3592 
3593 		switch (me_id) {
3594 		case 0:
3595 			if (pipe_id == 0)
3596 				amdgpu_fence_process(&adev->gfx.gfx_ring[0]);
3597 			else
3598 				amdgpu_fence_process(&adev->gfx.gfx_ring[1]);
3599 			break;
3600 		case 1:
3601 		case 2:
3602 			for (i = 0; i < adev->gfx.num_compute_rings; i++) {
3603 				ring = &adev->gfx.compute_ring
3604 						[i +
3605 						 xcc_id * adev->gfx.num_compute_rings];
3606 				/* Per-queue interrupt is supported for MEC starting from VI.
3607 				 * The interrupt can only be enabled/disabled per pipe instead
3608 				 * of per queue.
3609 				 */
3610 				if ((ring->me == me_id) &&
3611 				    (ring->pipe == pipe_id) &&
3612 				    (ring->queue == queue_id))
3613 					amdgpu_fence_process(ring);
3614 			}
3615 			break;
3616 		}
3617 	}
3618 
3619 	return 0;
3620 }
3621 
3622 static int gfx_v12_1_set_priv_reg_fault_state(struct amdgpu_device *adev,
3623 					      struct amdgpu_irq_src *source,
3624 					      unsigned type,
3625 					      enum amdgpu_interrupt_state state)
3626 {
3627 	int i, num_xcc;
3628 
3629 	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
3630 	switch (state) {
3631 	case AMDGPU_IRQ_STATE_DISABLE:
3632 	case AMDGPU_IRQ_STATE_ENABLE:
3633 		for (i = 0; i < num_xcc; i++)
3634 			WREG32_FIELD15_PREREG(GC, GET_INST(GC, i), CP_INT_CNTL_RING0,
3635 					      PRIV_REG_INT_ENABLE,
3636 					      state == AMDGPU_IRQ_STATE_ENABLE ? 1 : 0);
3637 		break;
3638 	default:
3639 		break;
3640 	}
3641 
3642 	return 0;
3643 }
3644 
3645 static int gfx_v12_1_set_priv_inst_fault_state(struct amdgpu_device *adev,
3646 					       struct amdgpu_irq_src *source,
3647 					       unsigned type,
3648 					       enum amdgpu_interrupt_state state)
3649 {
3650 	int i, num_xcc;
3651 
3652 	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
3653 	switch (state) {
3654 	case AMDGPU_IRQ_STATE_DISABLE:
3655 	case AMDGPU_IRQ_STATE_ENABLE:
3656 		for (i = 0; i < num_xcc; i++)
3657 			WREG32_FIELD15_PREREG(GC, GET_INST(GC, i), CP_INT_CNTL_RING0,
3658 				       PRIV_INSTR_INT_ENABLE,
3659 				       state == AMDGPU_IRQ_STATE_ENABLE ? 1 : 0);
3660 		break;
3661 	default:
3662 		break;
3663 	}
3664 
3665 	return 0;
3666 }
3667 
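/*
 * Route a privileged register/instruction fault to the scheduler of the
 * ring it originated on, using the same ring_id decoding as the EOP path.
 */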
3668 static void gfx_v12_1_handle_priv_fault(struct amdgpu_device *adev,
3669 					struct amdgpu_iv_entry *entry)
3670 {
3671 	u8 me_id, pipe_id, queue_id;
3672 	struct amdgpu_ring *ring;
3673 	int i, xcc_id;
3674 
3675 	me_id = (entry->ring_id & 0x0c) >> 2;
3676 	pipe_id = (entry->ring_id & 0x03) >> 0;
3677 	queue_id = (entry->ring_id & 0x70) >> 4;
3678 	xcc_id = gfx_v12_1_ih_to_xcc_inst(adev, entry->node_id);
3679 
3680 	if (xcc_id == -EINVAL)
3681 		return;
3682 
3683 	switch (me_id) {
3684 	case 0:
3685 		for (i = 0; i < adev->gfx.num_gfx_rings; i++) {
3686 			ring = &adev->gfx.gfx_ring[i];
3687 			/* we only enable one gfx queue per pipe for now */
3688 			if (ring->me == me_id && ring->pipe == pipe_id)
3689 				drm_sched_fault(&ring->sched);
3690 		}
3691 		break;
3692 	case 1:
3693 	case 2:
3694 		for (i = 0; i < adev->gfx.num_compute_rings; i++) {
3695 			ring = &adev->gfx.compute_ring
3696 					[i +
3697 					 xcc_id * adev->gfx.num_compute_rings];
3698 			if (ring->me == me_id && ring->pipe == pipe_id &&
3699 			    ring->queue == queue_id)
3700 				drm_sched_fault(&ring->sched);
3701 		}
3702 		break;
3703 	default:
3704 		BUG();
3705 		break;
3706 	}
3707 }
3708 
3709 static int gfx_v12_1_priv_reg_irq(struct amdgpu_device *adev,
3710 				  struct amdgpu_irq_src *source,
3711 				  struct amdgpu_iv_entry *entry)
3712 {
3713 	DRM_ERROR("Illegal register access in command stream\n");
3714 	gfx_v12_1_handle_priv_fault(adev, entry);
3715 	return 0;
3716 }
3717 
3718 static int gfx_v12_1_priv_inst_irq(struct amdgpu_device *adev,
3719 				   struct amdgpu_irq_src *source,
3720 				   struct amdgpu_iv_entry *entry)
3721 {
3722 	DRM_ERROR("Illegal instruction in command stream\n");
3723 	gfx_v12_1_handle_priv_fault(adev, entry);
3724 	return 0;
3725 }
3726 
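/*
 * Full memory sync: invalidate the GLV, GLK and GLI caches and
 * invalidate/write back the GL2 cache across the whole address range with
 * a single ACQUIRE_MEM packet.
 */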
3727 static void gfx_v12_1_emit_mem_sync(struct amdgpu_ring *ring)
3728 {
3729 	const unsigned int gcr_cntl =
3730 			PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(1) |
3731 			PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(1) |
3732 			PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(1) |
3733 			PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(1) |
3734 			PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(1) |
3735 			PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_SCOPE(2);
3736 
3737 	/* ACQUIRE_MEM - make one or more surfaces valid for use by the subsequent operations */
3738 	amdgpu_ring_write(ring, PACKET3(PACKET3_ACQUIRE_MEM, 6));
3739 	amdgpu_ring_write(ring, 0); /* CP_COHER_CNTL */
3740 	amdgpu_ring_write(ring, 0xffffffff);  /* CP_COHER_SIZE */
3741 	amdgpu_ring_write(ring, 0xffffff);  /* CP_COHER_SIZE_HI */
3742 	amdgpu_ring_write(ring, 0); /* CP_COHER_BASE */
3743 	amdgpu_ring_write(ring, 0);  /* CP_COHER_BASE_HI */
3744 	amdgpu_ring_write(ring, 0x0000000A); /* POLL_INTERVAL */
3745 	amdgpu_ring_write(ring, gcr_cntl); /* GCR_CNTL */
3746 }
3747 
3748 static const struct amd_ip_funcs gfx_v12_1_ip_funcs = {
3749 	.name = "gfx_v12_1",
3750 	.early_init = gfx_v12_1_early_init,
3751 	.late_init = gfx_v12_1_late_init,
3752 	.sw_init = gfx_v12_1_sw_init,
3753 	.sw_fini = gfx_v12_1_sw_fini,
3754 	.hw_init = gfx_v12_1_hw_init,
3755 	.hw_fini = gfx_v12_1_hw_fini,
3756 	.suspend = gfx_v12_1_suspend,
3757 	.resume = gfx_v12_1_resume,
3758 	.is_idle = gfx_v12_1_is_idle,
3759 	.wait_for_idle = gfx_v12_1_wait_for_idle,
3760 	.set_clockgating_state = gfx_v12_1_set_clockgating_state,
3761 	.set_powergating_state = gfx_v12_1_set_powergating_state,
3762 	.get_clockgating_state = gfx_v12_1_get_clockgating_state,
3763 };
3764 
3765 static const struct amdgpu_ring_funcs gfx_v12_1_ring_funcs_compute = {
3766 	.type = AMDGPU_RING_TYPE_COMPUTE,
3767 	.align_mask = 0xff,
3768 	.nop = PACKET3(PACKET3_NOP, 0x3FFF),
3769 	.support_64bit_ptrs = true,
3770 	.get_rptr = gfx_v12_1_ring_get_rptr_compute,
3771 	.get_wptr = gfx_v12_1_ring_get_wptr_compute,
3772 	.set_wptr = gfx_v12_1_ring_set_wptr_compute,
3773 	.emit_frame_size =
3774 		7 + /* gfx_v12_1_ring_emit_pipeline_sync */
3775 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
3776 		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
3777 		2 + /* gfx_v12_1_ring_emit_vm_flush */
3778 		8 + 8 + 8 + /* gfx_v12_1_ring_emit_fence x3 for user fence, vm fence */
3779 		8, /* gfx_v12_1_emit_mem_sync */
3780 	.emit_ib_size =	7, /* gfx_v12_1_ring_emit_ib_compute */
3781 	.emit_ib = gfx_v12_1_ring_emit_ib_compute,
3782 	.emit_fence = gfx_v12_1_ring_emit_fence,
3783 	.emit_pipeline_sync = gfx_v12_1_ring_emit_pipeline_sync,
3784 	.emit_vm_flush = gfx_v12_1_ring_emit_vm_flush,
3785 	.test_ring = gfx_v12_1_ring_test_ring,
3786 	.test_ib = gfx_v12_1_ring_test_ib,
3787 	.insert_nop = amdgpu_ring_insert_nop,
3788 	.pad_ib = amdgpu_ring_generic_pad_ib,
3789 	.emit_wreg = gfx_v12_1_ring_emit_wreg,
3790 	.emit_reg_wait = gfx_v12_1_ring_emit_reg_wait,
3791 	.emit_reg_write_reg_wait = gfx_v12_1_ring_emit_reg_write_reg_wait,
3792 	.emit_mem_sync = gfx_v12_1_emit_mem_sync,
3793 };
3794 
3795 static const struct amdgpu_ring_funcs gfx_v12_1_ring_funcs_kiq = {
3796 	.type = AMDGPU_RING_TYPE_KIQ,
3797 	.align_mask = 0xff,
3798 	.nop = PACKET3(PACKET3_NOP, 0x3FFF),
3799 	.support_64bit_ptrs = true,
3800 	.get_rptr = gfx_v12_1_ring_get_rptr_compute,
3801 	.get_wptr = gfx_v12_1_ring_get_wptr_compute,
3802 	.set_wptr = gfx_v12_1_ring_set_wptr_compute,
3803 	.emit_frame_size =
3804 		7 + /* gfx_v12_1_ring_emit_pipeline_sync */
3805 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
3806 		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
3807 		2 + /* gfx_v12_1_ring_emit_vm_flush */
3808 		8 + 8 + 8, /* gfx_v12_1_ring_emit_fence_kiq x3 for user fence, vm fence */
3809 	.emit_ib_size =	7, /* gfx_v12_1_ring_emit_ib_compute */
3810 	.emit_ib = gfx_v12_1_ring_emit_ib_compute,
3811 	.emit_fence = gfx_v12_1_ring_emit_fence_kiq,
3812 	.test_ring = gfx_v12_1_ring_test_ring,
3813 	.test_ib = gfx_v12_1_ring_test_ib,
3814 	.insert_nop = amdgpu_ring_insert_nop,
3815 	.pad_ib = amdgpu_ring_generic_pad_ib,
3816 	.emit_rreg = gfx_v12_1_ring_emit_rreg,
3817 	.emit_wreg = gfx_v12_1_ring_emit_wreg,
3818 	.emit_reg_wait = gfx_v12_1_ring_emit_reg_wait,
3819 	.emit_reg_write_reg_wait = gfx_v12_1_ring_emit_reg_write_reg_wait,
3820 };
3821 
3822 static void gfx_v12_1_set_ring_funcs(struct amdgpu_device *adev)
3823 {
3824 	int i, j, num_xcc;
3825 
3826 	num_xcc = NUM_XCC(adev->gfx.xcc_mask);
3827 	for (i = 0; i < num_xcc; i++) {
3828 		adev->gfx.kiq[i].ring.funcs = &gfx_v12_1_ring_funcs_kiq;
3829 
3830 		for (j = 0; j < adev->gfx.num_compute_rings; j++)
3831 			adev->gfx.compute_ring[j + i * adev->gfx.num_compute_rings].funcs =
3832 						&gfx_v12_1_ring_funcs_compute;
3833 	}
3834 }
3835 
3836 static const struct amdgpu_irq_src_funcs gfx_v12_1_eop_irq_funcs = {
3837 	.set = gfx_v12_1_set_eop_interrupt_state,
3838 	.process = gfx_v12_1_eop_irq,
3839 };
3840 
3841 static const struct amdgpu_irq_src_funcs gfx_v12_1_priv_reg_irq_funcs = {
3842 	.set = gfx_v12_1_set_priv_reg_fault_state,
3843 	.process = gfx_v12_1_priv_reg_irq,
3844 };
3845 
3846 static const struct amdgpu_irq_src_funcs gfx_v12_1_priv_inst_irq_funcs = {
3847 	.set = gfx_v12_1_set_priv_inst_fault_state,
3848 	.process = gfx_v12_1_priv_inst_irq,
3849 };
3850 
3851 static void gfx_v12_1_set_irq_funcs(struct amdgpu_device *adev)
3852 {
3853 	adev->gfx.eop_irq.num_types = AMDGPU_CP_IRQ_LAST;
3854 	adev->gfx.eop_irq.funcs = &gfx_v12_1_eop_irq_funcs;
3855 
3856 	adev->gfx.priv_reg_irq.num_types = 1;
3857 	adev->gfx.priv_reg_irq.funcs = &gfx_v12_1_priv_reg_irq_funcs;
3858 
3859 	adev->gfx.priv_inst_irq.num_types = 1;
3860 	adev->gfx.priv_inst_irq.funcs = &gfx_v12_1_priv_inst_irq_funcs;
3861 }
3862 
3863 static void gfx_v12_1_set_imu_funcs(struct amdgpu_device *adev)
3864 {
3865 	if (adev->flags & AMD_IS_APU)
3866 		adev->gfx.imu.mode = MISSION_MODE;
3867 	else
3868 		adev->gfx.imu.mode = DEBUG_MODE;
3869 	if (!amdgpu_sriov_vf(adev))
3870 		adev->gfx.imu.funcs = &gfx_v12_1_imu_funcs;
3871 }
3872 
3873 static void gfx_v12_1_set_rlc_funcs(struct amdgpu_device *adev)
3874 {
3875 	adev->gfx.rlc.funcs = &gfx_v12_1_rlc_funcs;
3876 }
3877 
3878 static void gfx_v12_1_set_mqd_funcs(struct amdgpu_device *adev)
3879 {
3880 	/* set compute eng mqd */
3881 	adev->mqds[AMDGPU_HW_IP_COMPUTE].mqd_size =
3882 		sizeof(struct v12_1_compute_mqd);
3883 	adev->mqds[AMDGPU_HW_IP_COMPUTE].init_mqd =
3884 		gfx_v12_1_compute_mqd_init;
3885 }
3886 
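/*
 * Program the user-configurable WGP disable mask for the currently
 * selected shader array; a zero bitmap leaves the hardware defaults alone.
 */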
3887 static void gfx_v12_1_set_user_cu_inactive_bitmap_per_sh(struct amdgpu_device *adev,
3888 							  u32 bitmap, int xcc_id)
3889 {
3890 	u32 data;
3891 
3892 	if (!bitmap)
3893 		return;
3894 
3895 	data = bitmap << GC_USER_SHADER_ARRAY_CONFIG__INACTIVE_WGPS__SHIFT;
3896 	data &= GC_USER_SHADER_ARRAY_CONFIG__INACTIVE_WGPS_MASK;
3897 
3898 	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regGC_USER_SHADER_ARRAY_CONFIG, data);
3899 }
3900 
3901 static u32 gfx_v12_1_get_cu_active_bitmap_per_sh(struct amdgpu_device *adev,
3902 						 int xcc_id)
3903 {
3904 	u32 data, mask;
3905 
3906 	data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCC_GC_SHADER_ARRAY_CONFIG);
3907 	data |= RREG32_SOC15(GC, GET_INST(GC, xcc_id), regGC_USER_SHADER_ARRAY_CONFIG);
3908 
3909 	data &= CC_GC_SHADER_ARRAY_CONFIG__INACTIVE_WGPS_MASK;
3910 	data >>= CC_GC_SHADER_ARRAY_CONFIG__INACTIVE_WGPS__SHIFT;
3911 
3912 	mask = amdgpu_gfx_create_bitmask(adev->gfx.config.max_cu_per_sh);
3913 
3914 	return (~data) & mask;
3915 }
3916 
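/*
 * Populate cu_info with the per-XCC/SE/SA active bitmaps and the total
 * active count, honoring any module-level CU disable masks. GRBM broadcast
 * is restored before grbm_idx_mutex is released.
 */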
3917 static int gfx_v12_1_get_cu_info(struct amdgpu_device *adev,
3918 				 struct amdgpu_cu_info *cu_info)
3919 {
3920 	int i, j, k, counter, xcc_id, active_cu_number = 0;
3921 	u32 mask, bitmap;
3922 	unsigned int disable_masks[2 * 2];
3923 
3924 	if (!adev || !cu_info)
3925 		return -EINVAL;
3926 
3927 	if (adev->gfx.config.max_shader_engines > 2 ||
3928 	    adev->gfx.config.max_sh_per_se > 2) {
3929 		dev_err(adev->dev,
3930 			"Max SE (%d) or max SA per SE (%d) is greater than expected\n",
3931 			adev->gfx.config.max_shader_engines,
3932 			adev->gfx.config.max_sh_per_se);
3933 		return -EINVAL;
3934 	}
3935 
3936 	amdgpu_gfx_parse_disable_cu(disable_masks,
3937 				    adev->gfx.config.max_shader_engines,
3938 				    adev->gfx.config.max_sh_per_se);
3939 
3940 	mutex_lock(&adev->grbm_idx_mutex);
3941 	for (xcc_id = 0; xcc_id < NUM_XCC(adev->gfx.xcc_mask); xcc_id++) {
3942 		for (i = 0; i < adev->gfx.config.max_shader_engines; i++) {
3943 			for (j = 0; j < adev->gfx.config.max_sh_per_se; j++) {
3944 				bitmap = i * adev->gfx.config.max_sh_per_se + j;
3945 				if (!((gfx_v12_1_get_sa_active_bitmap(adev, xcc_id) >> bitmap) & 1))
3946 					continue;
3947 				mask = 1;
3948 				counter = 0;
3949 				gfx_v12_1_xcc_select_se_sh(adev, i, j, 0xffffffff, xcc_id);
3950 				gfx_v12_1_set_user_cu_inactive_bitmap_per_sh(
3951 					adev,
3952 					disable_masks[i * adev->gfx.config.max_sh_per_se + j],
3953 					xcc_id);
3954 				bitmap = gfx_v12_1_get_cu_active_bitmap_per_sh(adev, xcc_id);
3955 
3956 				cu_info->bitmap[xcc_id][i][j] = bitmap;
3957 
3958 				for (k = 0; k < adev->gfx.config.max_cu_per_sh; k++) {
3959 					if (bitmap & mask)
3960 						counter++;
3961 
3962 					mask <<= 1;
3963 				}
3964 				active_cu_number += counter;
3965 			}
3966 		}
3967 		gfx_v12_1_xcc_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff, xcc_id);
3968 	}
3969 	mutex_unlock(&adev->grbm_idx_mutex);
3970 
3971 	cu_info->number = active_cu_number;
3972 	cu_info->simd_per_cu = NUM_SIMD_PER_CU_GFX12_1;
3973 	cu_info->lds_size = 320;
3974 
3975 	return 0;
3976 }
3977 
3978 const struct amdgpu_ip_block_version gfx_v12_1_ip_block = {
3979 	.type = AMD_IP_BLOCK_TYPE_GFX,
3980 	.major = 12,
3981 	.minor = 1,
3982 	.rev = 0,
3983 	.funcs = &gfx_v12_1_ip_funcs,
3984 };
3985 
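/*
 * XCP (partition) hooks: resume re-initializes constants, the RLC (bare
 * metal only) and the CP for every XCC in inst_mask; suspend tears the
 * same instances down.
 */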
3986 static int gfx_v12_1_xcp_resume(void *handle, uint32_t inst_mask)
3987 {
3988 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
3989 	uint32_t tmp_mask;
3990 	int i, r;
3991 
3992 	/* TODO: initialize golden registers */
3993 	/* gfx_v12_1_init_golden_registers(adev); */
3994 
3995 	tmp_mask = inst_mask;
3996 	for_each_inst(i, tmp_mask)
3997 		gfx_v12_1_xcc_constants_init(adev, i);
3998 
3999 	if (!amdgpu_sriov_vf(adev)) {
4000 		tmp_mask = inst_mask;
4001 		for_each_inst(i, tmp_mask) {
4002 			r = gfx_v12_1_xcc_rlc_resume(adev, i);
4003 			if (r)
4004 				return r;
4005 		}
4006 	}
4007 
4008 	r = gfx_v12_1_xcc_cp_resume(adev, inst_mask);
4009 
4010 	return r;
4011 }
4012 
4013 static int gfx_v12_1_xcp_suspend(void *handle, uint32_t inst_mask)
4014 {
4015 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
4016 	int i;
4017 
4018 	for_each_inst(i, inst_mask)
4019 		gfx_v12_1_xcc_fini(adev, i);
4020 
4021 	return 0;
4022 }
4023 
4024 struct amdgpu_xcp_ip_funcs gfx_v12_1_xcp_funcs = {
4025 	.suspend = &gfx_v12_1_xcp_suspend,
4026 	.resume = &gfx_v12_1_xcp_resume
4027 };
4028