1 /* 2 * Copyright 2014-2018 Advanced Micro Devices, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 
21 */ 22 #include "amdgpu.h" 23 #include "amdgpu_amdkfd.h" 24 #include "gc/gc_9_0_offset.h" 25 #include "gc/gc_9_0_sh_mask.h" 26 #include "vega10_enum.h" 27 #include "sdma0/sdma0_4_0_offset.h" 28 #include "sdma0/sdma0_4_0_sh_mask.h" 29 #include "sdma1/sdma1_4_0_offset.h" 30 #include "sdma1/sdma1_4_0_sh_mask.h" 31 #include "athub/athub_1_0_offset.h" 32 #include "athub/athub_1_0_sh_mask.h" 33 #include "oss/osssys_4_0_offset.h" 34 #include "oss/osssys_4_0_sh_mask.h" 35 #include "soc15_common.h" 36 #include "v9_structs.h" 37 #include "soc15.h" 38 #include "soc15d.h" 39 #include "gfx_v9_0.h" 40 #include "amdgpu_amdkfd_gfx_v9.h" 41 #include <uapi/linux/kfd_ioctl.h> 42 43 enum hqd_dequeue_request_type { 44 NO_ACTION = 0, 45 DRAIN_PIPE, 46 RESET_WAVES, 47 SAVE_WAVES 48 }; 49 50 static void kgd_gfx_v9_lock_srbm(struct amdgpu_device *adev, uint32_t mec, uint32_t pipe, 51 uint32_t queue, uint32_t vmid, uint32_t inst) 52 { 53 mutex_lock(&adev->srbm_mutex); 54 soc15_grbm_select(adev, mec, pipe, queue, vmid, GET_INST(GC, inst)); 55 } 56 57 static void kgd_gfx_v9_unlock_srbm(struct amdgpu_device *adev, uint32_t inst) 58 { 59 soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, inst)); 60 mutex_unlock(&adev->srbm_mutex); 61 } 62 63 void kgd_gfx_v9_acquire_queue(struct amdgpu_device *adev, uint32_t pipe_id, 64 uint32_t queue_id, uint32_t inst) 65 { 66 uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; 67 uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); 68 69 kgd_gfx_v9_lock_srbm(adev, mec, pipe, queue_id, 0, inst); 70 } 71 72 uint64_t kgd_gfx_v9_get_queue_mask(struct amdgpu_device *adev, 73 uint32_t pipe_id, uint32_t queue_id) 74 { 75 unsigned int bit = pipe_id * adev->gfx.mec.num_queue_per_pipe + 76 queue_id; 77 78 return 1ull << bit; 79 } 80 81 void kgd_gfx_v9_release_queue(struct amdgpu_device *adev, uint32_t inst) 82 { 83 kgd_gfx_v9_unlock_srbm(adev, inst); 84 } 85 86 void kgd_gfx_v9_program_sh_mem_settings(struct amdgpu_device *adev, uint32_t vmid, 87 
uint32_t sh_mem_config, 88 uint32_t sh_mem_ape1_base, 89 uint32_t sh_mem_ape1_limit, 90 uint32_t sh_mem_bases, uint32_t inst) 91 { 92 kgd_gfx_v9_lock_srbm(adev, 0, 0, 0, vmid, inst); 93 94 WREG32_SOC15_RLC(GC, GET_INST(GC, inst), mmSH_MEM_CONFIG, sh_mem_config); 95 WREG32_SOC15_RLC(GC, GET_INST(GC, inst), mmSH_MEM_BASES, sh_mem_bases); 96 /* APE1 no longer exists on GFX9 */ 97 98 kgd_gfx_v9_unlock_srbm(adev, inst); 99 } 100 101 int kgd_gfx_v9_set_pasid_vmid_mapping(struct amdgpu_device *adev, u32 pasid, 102 unsigned int vmid, uint32_t inst) 103 { 104 /* 105 * We have to assume that there is no outstanding mapping. 106 * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because 107 * a mapping is in progress or because a mapping finished 108 * and the SW cleared it. 109 * So the protocol is to always wait & clear. 110 */ 111 uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid | 112 ATC_VMID0_PASID_MAPPING__VALID_MASK; 113 114 /* 115 * need to do this twice, once for gfx and once for mmhub 116 * for ATC add 16 to VMID for mmhub, for IH different registers. 117 * ATC_VMID0..15 registers are separate from ATC_VMID16..31. 
118 */ 119 120 WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) + vmid, 121 pasid_mapping); 122 123 while (!(RREG32(SOC15_REG_OFFSET( 124 ATHUB, 0, 125 mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) & 126 (1U << vmid))) 127 cpu_relax(); 128 129 WREG32(SOC15_REG_OFFSET(ATHUB, 0, 130 mmATC_VMID_PASID_MAPPING_UPDATE_STATUS), 131 1U << vmid); 132 133 /* Mapping vmid to pasid also for IH block */ 134 WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT) + vmid, 135 pasid_mapping); 136 137 WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID16_PASID_MAPPING) + vmid, 138 pasid_mapping); 139 140 while (!(RREG32(SOC15_REG_OFFSET( 141 ATHUB, 0, 142 mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) & 143 (1U << (vmid + 16)))) 144 cpu_relax(); 145 146 WREG32(SOC15_REG_OFFSET(ATHUB, 0, 147 mmATC_VMID_PASID_MAPPING_UPDATE_STATUS), 148 1U << (vmid + 16)); 149 150 /* Mapping vmid to pasid also for IH block */ 151 WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT_MM) + vmid, 152 pasid_mapping); 153 return 0; 154 } 155 156 /* TODO - RING0 form of field is obsolete, seems to date back to SI 157 * but still works 158 */ 159 160 int kgd_gfx_v9_init_interrupts(struct amdgpu_device *adev, uint32_t pipe_id, 161 uint32_t inst) 162 { 163 uint32_t mec; 164 uint32_t pipe; 165 166 mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; 167 pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); 168 169 kgd_gfx_v9_lock_srbm(adev, mec, pipe, 0, 0, inst); 170 171 WREG32_SOC15(GC, GET_INST(GC, inst), mmCPC_INT_CNTL, 172 CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK | 173 CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK); 174 175 kgd_gfx_v9_unlock_srbm(adev, inst); 176 177 return 0; 178 } 179 180 static uint32_t get_sdma_rlc_reg_offset(struct amdgpu_device *adev, 181 unsigned int engine_id, 182 unsigned int queue_id) 183 { 184 uint32_t sdma_engine_reg_base = 0; 185 uint32_t sdma_rlc_reg_offset; 186 187 switch (engine_id) { 188 default: 189 dev_warn(adev->dev, 190 "Invalid sdma engine id (%d), using engine id 0\n", 191 
engine_id); 192 fallthrough; 193 case 0: 194 sdma_engine_reg_base = SOC15_REG_OFFSET(SDMA0, 0, 195 mmSDMA0_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL; 196 break; 197 case 1: 198 sdma_engine_reg_base = SOC15_REG_OFFSET(SDMA1, 0, 199 mmSDMA1_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL; 200 break; 201 } 202 203 sdma_rlc_reg_offset = sdma_engine_reg_base 204 + queue_id * (mmSDMA0_RLC1_RB_CNTL - mmSDMA0_RLC0_RB_CNTL); 205 206 pr_debug("RLC register offset for SDMA%d RLC%d: 0x%x\n", engine_id, 207 queue_id, sdma_rlc_reg_offset); 208 209 return sdma_rlc_reg_offset; 210 } 211 212 static inline struct v9_mqd *get_mqd(void *mqd) 213 { 214 return (struct v9_mqd *)mqd; 215 } 216 217 static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd) 218 { 219 return (struct v9_sdma_mqd *)mqd; 220 } 221 222 int kgd_gfx_v9_hqd_load(struct amdgpu_device *adev, void *mqd, 223 uint32_t pipe_id, uint32_t queue_id, 224 uint32_t __user *wptr, uint32_t wptr_shift, 225 uint32_t wptr_mask, struct mm_struct *mm, 226 uint32_t inst) 227 { 228 struct v9_mqd *m; 229 uint32_t *mqd_hqd; 230 uint32_t reg, hqd_base, data; 231 232 m = get_mqd(mqd); 233 234 kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst); 235 236 /* HQD registers extend from CP_MQD_BASE_ADDR to CP_HQD_EOP_WPTR_MEM. */ 237 mqd_hqd = &m->cp_mqd_base_addr_lo; 238 hqd_base = SOC15_REG_OFFSET(GC, GET_INST(GC, inst), mmCP_MQD_BASE_ADDR); 239 240 for (reg = hqd_base; 241 reg <= SOC15_REG_OFFSET(GC, GET_INST(GC, inst), mmCP_HQD_PQ_WPTR_HI); reg++) 242 WREG32_XCC(reg, mqd_hqd[reg - hqd_base], inst); 243 244 245 /* Activate doorbell logic before triggering WPTR poll. */ 246 data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control, 247 CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1); 248 WREG32_SOC15_RLC(GC, GET_INST(GC, inst), mmCP_HQD_PQ_DOORBELL_CONTROL, data); 249 250 if (wptr) { 251 /* Don't read wptr with get_user because the user 252 * context may not be accessible (if this function 253 * runs in a work queue). 
Instead trigger a one-shot 254 * polling read from memory in the CP. This assumes 255 * that wptr is GPU-accessible in the queue's VMID via 256 * ATC or SVM. WPTR==RPTR before starting the poll so 257 * the CP starts fetching new commands from the right 258 * place. 259 * 260 * Guessing a 64-bit WPTR from a 32-bit RPTR is a bit 261 * tricky. Assume that the queue didn't overflow. The 262 * number of valid bits in the 32-bit RPTR depends on 263 * the queue size. The remaining bits are taken from 264 * the saved 64-bit WPTR. If the WPTR wrapped, add the 265 * queue size. 266 */ 267 uint32_t queue_size = 268 2 << REG_GET_FIELD(m->cp_hqd_pq_control, 269 CP_HQD_PQ_CONTROL, QUEUE_SIZE); 270 uint64_t guessed_wptr = m->cp_hqd_pq_rptr & (queue_size - 1); 271 272 if ((m->cp_hqd_pq_wptr_lo & (queue_size - 1)) < guessed_wptr) 273 guessed_wptr += queue_size; 274 guessed_wptr += m->cp_hqd_pq_wptr_lo & ~(queue_size - 1); 275 guessed_wptr += (uint64_t)m->cp_hqd_pq_wptr_hi << 32; 276 277 WREG32_SOC15_RLC(GC, GET_INST(GC, inst), mmCP_HQD_PQ_WPTR_LO, 278 lower_32_bits(guessed_wptr)); 279 WREG32_SOC15_RLC(GC, GET_INST(GC, inst), mmCP_HQD_PQ_WPTR_HI, 280 upper_32_bits(guessed_wptr)); 281 WREG32_SOC15_RLC(GC, GET_INST(GC, inst), mmCP_HQD_PQ_WPTR_POLL_ADDR, 282 lower_32_bits((uintptr_t)wptr)); 283 WREG32_SOC15_RLC(GC, GET_INST(GC, inst), mmCP_HQD_PQ_WPTR_POLL_ADDR_HI, 284 upper_32_bits((uintptr_t)wptr)); 285 WREG32_SOC15_RLC(GC, GET_INST(GC, inst), mmCP_PQ_WPTR_POLL_CNTL1, 286 (uint32_t)kgd_gfx_v9_get_queue_mask(adev, pipe_id, queue_id)); 287 } 288 289 /* Start the EOP fetcher */ 290 WREG32_SOC15_RLC(GC, GET_INST(GC, inst), mmCP_HQD_EOP_RPTR, 291 REG_SET_FIELD(m->cp_hqd_eop_rptr, CP_HQD_EOP_RPTR, INIT_FETCHER, 1)); 292 293 data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1); 294 WREG32_SOC15_RLC(GC, GET_INST(GC, inst), mmCP_HQD_ACTIVE, data); 295 296 kgd_gfx_v9_release_queue(adev, inst); 297 298 return 0; 299 } 300 301 int kgd_gfx_v9_hiq_mqd_load(struct amdgpu_device *adev, 
void *mqd, 302 uint32_t pipe_id, uint32_t queue_id, 303 uint32_t doorbell_off, uint32_t inst) 304 { 305 struct amdgpu_ring *kiq_ring = &adev->gfx.kiq[inst].ring; 306 struct v9_mqd *m; 307 uint32_t mec, pipe; 308 int r; 309 310 m = get_mqd(mqd); 311 312 kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst); 313 314 mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; 315 pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); 316 317 pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n", 318 mec, pipe, queue_id); 319 320 spin_lock(&adev->gfx.kiq[inst].ring_lock); 321 r = amdgpu_ring_alloc(kiq_ring, 7); 322 if (r) { 323 pr_err("Failed to alloc KIQ (%d).\n", r); 324 goto out_unlock; 325 } 326 327 amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5)); 328 amdgpu_ring_write(kiq_ring, 329 PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */ 330 PACKET3_MAP_QUEUES_VMID(m->cp_hqd_vmid) | /* VMID */ 331 PACKET3_MAP_QUEUES_QUEUE(queue_id) | 332 PACKET3_MAP_QUEUES_PIPE(pipe) | 333 PACKET3_MAP_QUEUES_ME((mec - 1)) | 334 PACKET3_MAP_QUEUES_QUEUE_TYPE(0) | /*queue_type: normal compute queue */ 335 PACKET3_MAP_QUEUES_ALLOC_FORMAT(0) | /* alloc format: all_on_one_pipe */ 336 PACKET3_MAP_QUEUES_ENGINE_SEL(1) | /* engine_sel: hiq */ 337 PACKET3_MAP_QUEUES_NUM_QUEUES(1)); /* num_queues: must be 1 */ 338 amdgpu_ring_write(kiq_ring, 339 PACKET3_MAP_QUEUES_DOORBELL_OFFSET(doorbell_off)); 340 amdgpu_ring_write(kiq_ring, m->cp_mqd_base_addr_lo); 341 amdgpu_ring_write(kiq_ring, m->cp_mqd_base_addr_hi); 342 amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_lo); 343 amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi); 344 amdgpu_ring_commit(kiq_ring); 345 346 out_unlock: 347 spin_unlock(&adev->gfx.kiq[inst].ring_lock); 348 kgd_gfx_v9_release_queue(adev, inst); 349 350 return r; 351 } 352 353 int kgd_gfx_v9_hqd_dump(struct amdgpu_device *adev, 354 uint32_t pipe_id, uint32_t queue_id, 355 uint32_t (**dump)[2], uint32_t *n_regs, uint32_t inst) 356 { 357 uint32_t i = 0, reg; 358 
#define HQD_N_REGS 56 359 #define DUMP_REG(addr) do { \ 360 if (WARN_ON_ONCE(i >= HQD_N_REGS)) \ 361 break; \ 362 (*dump)[i][0] = (addr) << 2; \ 363 (*dump)[i++][1] = RREG32(addr); \ 364 } while (0) 365 366 *dump = kmalloc_array(HQD_N_REGS, sizeof(**dump), GFP_KERNEL); 367 if (*dump == NULL) 368 return -ENOMEM; 369 370 kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst); 371 372 for (reg = SOC15_REG_OFFSET(GC, GET_INST(GC, inst), mmCP_MQD_BASE_ADDR); 373 reg <= SOC15_REG_OFFSET(GC, GET_INST(GC, inst), mmCP_HQD_PQ_WPTR_HI); reg++) 374 DUMP_REG(reg); 375 376 kgd_gfx_v9_release_queue(adev, inst); 377 378 WARN_ON_ONCE(i != HQD_N_REGS); 379 *n_regs = i; 380 381 return 0; 382 } 383 384 static int kgd_hqd_sdma_load(struct amdgpu_device *adev, void *mqd, 385 uint32_t __user *wptr, struct mm_struct *mm) 386 { 387 struct v9_sdma_mqd *m; 388 uint32_t sdma_rlc_reg_offset; 389 unsigned long end_jiffies; 390 uint32_t data; 391 uint64_t data64; 392 uint64_t __user *wptr64 = (uint64_t __user *)wptr; 393 394 m = get_sdma_mqd(mqd); 395 sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id, 396 m->sdma_queue_id); 397 398 WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL, 399 m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)); 400 401 end_jiffies = msecs_to_jiffies(2000) + jiffies; 402 while (true) { 403 data = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_CONTEXT_STATUS); 404 if (data & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) 405 break; 406 if (time_after(jiffies, end_jiffies)) { 407 pr_err("SDMA RLC not idle in %s\n", __func__); 408 return -ETIME; 409 } 410 usleep_range(500, 1000); 411 } 412 413 WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL_OFFSET, 414 m->sdmax_rlcx_doorbell_offset); 415 416 data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL, 417 ENABLE, 1); 418 WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL, data); 419 WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR, 420 m->sdmax_rlcx_rb_rptr); 421 WREG32(sdma_rlc_reg_offset + 
mmSDMA0_RLC0_RB_RPTR_HI, 422 m->sdmax_rlcx_rb_rptr_hi); 423 424 WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 1); 425 if (read_user_wptr(mm, wptr64, data64)) { 426 WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR, 427 lower_32_bits(data64)); 428 WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR_HI, 429 upper_32_bits(data64)); 430 } else { 431 WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR, 432 m->sdmax_rlcx_rb_rptr); 433 WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR_HI, 434 m->sdmax_rlcx_rb_rptr_hi); 435 } 436 WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 0); 437 438 WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base); 439 WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_BASE_HI, 440 m->sdmax_rlcx_rb_base_hi); 441 WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, 442 m->sdmax_rlcx_rb_rptr_addr_lo); 443 WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, 444 m->sdmax_rlcx_rb_rptr_addr_hi); 445 446 data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL, 447 RB_ENABLE, 1); 448 WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL, data); 449 450 return 0; 451 } 452 453 static int kgd_hqd_sdma_dump(struct amdgpu_device *adev, 454 uint32_t engine_id, uint32_t queue_id, 455 uint32_t (**dump)[2], uint32_t *n_regs) 456 { 457 uint32_t sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, 458 engine_id, queue_id); 459 uint32_t i = 0, reg; 460 #undef HQD_N_REGS 461 #define HQD_N_REGS (19+6+7+10) 462 463 *dump = kmalloc_array(HQD_N_REGS, sizeof(**dump), GFP_KERNEL); 464 if (*dump == NULL) 465 return -ENOMEM; 466 467 for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++) 468 DUMP_REG(sdma_rlc_reg_offset + reg); 469 for (reg = mmSDMA0_RLC0_STATUS; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; reg++) 470 DUMP_REG(sdma_rlc_reg_offset + reg); 471 for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN; 472 reg <= mmSDMA0_RLC0_MINOR_PTR_UPDATE; reg++) 473 DUMP_REG(sdma_rlc_reg_offset + reg); 474 for (reg = 
mmSDMA0_RLC0_MIDCMD_DATA0; 475 reg <= mmSDMA0_RLC0_MIDCMD_CNTL; reg++) 476 DUMP_REG(sdma_rlc_reg_offset + reg); 477 478 WARN_ON_ONCE(i != HQD_N_REGS); 479 *n_regs = i; 480 481 return 0; 482 } 483 484 bool kgd_gfx_v9_hqd_is_occupied(struct amdgpu_device *adev, 485 uint64_t queue_address, uint32_t pipe_id, 486 uint32_t queue_id, uint32_t inst) 487 { 488 uint32_t act; 489 bool retval = false; 490 uint32_t low, high; 491 492 kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst); 493 act = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_ACTIVE); 494 if (act) { 495 low = lower_32_bits(queue_address >> 8); 496 high = upper_32_bits(queue_address >> 8); 497 498 if (low == RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_BASE) && 499 high == RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_BASE_HI)) 500 retval = true; 501 } 502 kgd_gfx_v9_release_queue(adev, inst); 503 return retval; 504 } 505 506 static bool kgd_hqd_sdma_is_occupied(struct amdgpu_device *adev, void *mqd) 507 { 508 struct v9_sdma_mqd *m; 509 uint32_t sdma_rlc_reg_offset; 510 uint32_t sdma_rlc_rb_cntl; 511 512 m = get_sdma_mqd(mqd); 513 sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id, 514 m->sdma_queue_id); 515 516 sdma_rlc_rb_cntl = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL); 517 518 if (sdma_rlc_rb_cntl & SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK) 519 return true; 520 521 return false; 522 } 523 524 int kgd_gfx_v9_hqd_destroy(struct amdgpu_device *adev, void *mqd, 525 enum kfd_preempt_type reset_type, 526 unsigned int utimeout, uint32_t pipe_id, 527 uint32_t queue_id, uint32_t inst) 528 { 529 enum hqd_dequeue_request_type type; 530 unsigned long end_jiffies; 531 uint32_t temp; 532 struct v9_mqd *m = get_mqd(mqd); 533 534 if (amdgpu_in_reset(adev)) 535 return -EIO; 536 537 kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst); 538 539 if (m->cp_hqd_vmid == 0) 540 WREG32_FIELD15_RLC(GC, GET_INST(GC, inst), RLC_CP_SCHEDULERS, scheduler1, 0); 541 542 switch (reset_type) { 543 case 
KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN: 544 type = DRAIN_PIPE; 545 break; 546 case KFD_PREEMPT_TYPE_WAVEFRONT_RESET: 547 type = RESET_WAVES; 548 break; 549 case KFD_PREEMPT_TYPE_WAVEFRONT_SAVE: 550 type = SAVE_WAVES; 551 break; 552 default: 553 type = DRAIN_PIPE; 554 break; 555 } 556 557 WREG32_SOC15_RLC(GC, GET_INST(GC, inst), mmCP_HQD_DEQUEUE_REQUEST, type); 558 559 end_jiffies = (utimeout * HZ / 1000) + jiffies; 560 while (true) { 561 temp = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_ACTIVE); 562 if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK)) 563 break; 564 if (time_after(jiffies, end_jiffies)) { 565 pr_err("cp queue preemption time out.\n"); 566 kgd_gfx_v9_release_queue(adev, inst); 567 return -ETIME; 568 } 569 usleep_range(500, 1000); 570 } 571 572 kgd_gfx_v9_release_queue(adev, inst); 573 return 0; 574 } 575 576 static int kgd_hqd_sdma_destroy(struct amdgpu_device *adev, void *mqd, 577 unsigned int utimeout) 578 { 579 struct v9_sdma_mqd *m; 580 uint32_t sdma_rlc_reg_offset; 581 uint32_t temp; 582 unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies; 583 584 m = get_sdma_mqd(mqd); 585 sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id, 586 m->sdma_queue_id); 587 588 temp = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL); 589 temp = temp & ~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK; 590 WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL, temp); 591 592 while (true) { 593 temp = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_CONTEXT_STATUS); 594 if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) 595 break; 596 if (time_after(jiffies, end_jiffies)) { 597 pr_err("SDMA RLC not idle in %s\n", __func__); 598 return -ETIME; 599 } 600 usleep_range(500, 1000); 601 } 602 603 WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL, 0); 604 WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL, 605 RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL) | 606 SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK); 607 608 m->sdmax_rlcx_rb_rptr = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR); 
609 m->sdmax_rlcx_rb_rptr_hi = 610 RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_HI); 611 612 return 0; 613 } 614 615 bool kgd_gfx_v9_get_atc_vmid_pasid_mapping_info(struct amdgpu_device *adev, 616 uint8_t vmid, uint16_t *p_pasid) 617 { 618 uint32_t value; 619 620 value = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) 621 + vmid); 622 *p_pasid = value & ATC_VMID0_PASID_MAPPING__PASID_MASK; 623 624 return !!(value & ATC_VMID0_PASID_MAPPING__VALID_MASK); 625 } 626 627 int kgd_gfx_v9_wave_control_execute(struct amdgpu_device *adev, 628 uint32_t gfx_index_val, 629 uint32_t sq_cmd, uint32_t inst) 630 { 631 uint32_t data = 0; 632 633 mutex_lock(&adev->grbm_idx_mutex); 634 635 WREG32_SOC15_RLC_SHADOW(GC, GET_INST(GC, inst), mmGRBM_GFX_INDEX, gfx_index_val); 636 WREG32_SOC15(GC, GET_INST(GC, inst), mmSQ_CMD, sq_cmd); 637 638 data = REG_SET_FIELD(data, GRBM_GFX_INDEX, 639 INSTANCE_BROADCAST_WRITES, 1); 640 data = REG_SET_FIELD(data, GRBM_GFX_INDEX, 641 SH_BROADCAST_WRITES, 1); 642 data = REG_SET_FIELD(data, GRBM_GFX_INDEX, 643 SE_BROADCAST_WRITES, 1); 644 645 WREG32_SOC15_RLC_SHADOW(GC, GET_INST(GC, inst), mmGRBM_GFX_INDEX, data); 646 mutex_unlock(&adev->grbm_idx_mutex); 647 648 return 0; 649 } 650 651 /* 652 * GFX9 helper for wave launch stall requirements on debug trap setting. 653 * 654 * vmid: 655 * Target VMID to stall/unstall. 656 * 657 * stall: 658 * 0-unstall wave launch (enable), 1-stall wave launch (disable). 659 * After wavefront launch has been stalled, allocated waves must drain from 660 * SPI in order for debug trap settings to take effect on those waves. 661 * This is roughly a ~96 clock cycle wait on SPI where a read on 662 * SPI_GDBG_WAVE_CNTL translates to ~32 clock cycles. 663 * KGD_GFX_V9_WAVE_LAUNCH_SPI_DRAIN_LATENCY indicates the number of reads required. 
664 * 665 * NOTE: We can afford to clear the entire STALL_VMID field on unstall 666 * because GFX9.4.1 cannot support multi-process debugging due to trap 667 * configuration and masking being limited to global scope. Always assume 668 * single process conditions. 669 */ 670 #define KGD_GFX_V9_WAVE_LAUNCH_SPI_DRAIN_LATENCY 3 671 void kgd_gfx_v9_set_wave_launch_stall(struct amdgpu_device *adev, 672 uint32_t vmid, 673 bool stall) 674 { 675 int i; 676 uint32_t data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL)); 677 678 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 1)) 679 data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL, STALL_VMID, 680 stall ? 1 << vmid : 0); 681 else 682 data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL, STALL_RA, 683 stall ? 1 : 0); 684 685 WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), data); 686 687 if (!stall) 688 return; 689 690 for (i = 0; i < KGD_GFX_V9_WAVE_LAUNCH_SPI_DRAIN_LATENCY; i++) 691 RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL)); 692 } 693 694 /* 695 * restore_dbg_registers is ignored here but is a general interface requirement 696 * for devices that support GFXOFF and where the RLC save/restore list 697 * does not support hw registers for debugging i.e. the driver has to manually 698 * initialize the debug mode registers after it has disabled GFX off during the 699 * debug session. 
700 */ 701 uint32_t kgd_gfx_v9_enable_debug_trap(struct amdgpu_device *adev, 702 bool restore_dbg_registers, 703 uint32_t vmid) 704 { 705 mutex_lock(&adev->grbm_idx_mutex); 706 707 kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true); 708 709 WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0); 710 711 kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false); 712 713 mutex_unlock(&adev->grbm_idx_mutex); 714 715 return 0; 716 } 717 718 /* 719 * keep_trap_enabled is ignored here but is a general interface requirement 720 * for devices that support multi-process debugging where the performance 721 * overhead from trap temporary setup needs to be bypassed when the debug 722 * session has ended. 723 */ 724 uint32_t kgd_gfx_v9_disable_debug_trap(struct amdgpu_device *adev, 725 bool keep_trap_enabled, 726 uint32_t vmid) 727 { 728 mutex_lock(&adev->grbm_idx_mutex); 729 730 kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true); 731 732 WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0); 733 734 kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false); 735 736 mutex_unlock(&adev->grbm_idx_mutex); 737 738 return 0; 739 } 740 741 int kgd_gfx_v9_validate_trap_override_request(struct amdgpu_device *adev, 742 uint32_t trap_override, 743 uint32_t *trap_mask_supported) 744 { 745 *trap_mask_supported &= KFD_DBG_TRAP_MASK_DBG_ADDRESS_WATCH; 746 747 /* The SPI_GDBG_TRAP_MASK register is global and affects all 748 * processes. Only allow OR-ing the address-watch bit, since 749 * this only affects processes under the debugger. Other bits 750 * should stay 0 to avoid the debugger interfering with other 751 * processes. 
752 */ 753 if (trap_override != KFD_DBG_TRAP_OVERRIDE_OR) 754 return -EINVAL; 755 756 return 0; 757 } 758 759 uint32_t kgd_gfx_v9_set_wave_launch_trap_override(struct amdgpu_device *adev, 760 uint32_t vmid, 761 uint32_t trap_override, 762 uint32_t trap_mask_bits, 763 uint32_t trap_mask_request, 764 uint32_t *trap_mask_prev, 765 uint32_t kfd_dbg_cntl_prev) 766 { 767 uint32_t data, wave_cntl_prev; 768 769 mutex_lock(&adev->grbm_idx_mutex); 770 771 wave_cntl_prev = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL)); 772 773 kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true); 774 775 data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK)); 776 *trap_mask_prev = REG_GET_FIELD(data, SPI_GDBG_TRAP_MASK, EXCP_EN); 777 778 trap_mask_bits = (trap_mask_bits & trap_mask_request) | 779 (*trap_mask_prev & ~trap_mask_request); 780 781 data = REG_SET_FIELD(data, SPI_GDBG_TRAP_MASK, EXCP_EN, trap_mask_bits); 782 data = REG_SET_FIELD(data, SPI_GDBG_TRAP_MASK, REPLACE, trap_override); 783 WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), data); 784 785 /* We need to preserve wave launch mode stall settings. */ 786 WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), wave_cntl_prev); 787 788 mutex_unlock(&adev->grbm_idx_mutex); 789 790 return 0; 791 } 792 793 uint32_t kgd_gfx_v9_set_wave_launch_mode(struct amdgpu_device *adev, 794 uint8_t wave_launch_mode, 795 uint32_t vmid) 796 { 797 uint32_t data = 0; 798 bool is_mode_set = !!wave_launch_mode; 799 800 mutex_lock(&adev->grbm_idx_mutex); 801 802 kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true); 803 804 data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2, 805 VMID_MASK, is_mode_set ? 1 << vmid : 0); 806 data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2, 807 MODE, is_mode_set ? 
wave_launch_mode : 0); 808 WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL2), data); 809 810 kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false); 811 812 mutex_unlock(&adev->grbm_idx_mutex); 813 814 return 0; 815 } 816 817 #define TCP_WATCH_STRIDE (mmTCP_WATCH1_ADDR_H - mmTCP_WATCH0_ADDR_H) 818 uint32_t kgd_gfx_v9_set_address_watch(struct amdgpu_device *adev, 819 uint64_t watch_address, 820 uint32_t watch_address_mask, 821 uint32_t watch_id, 822 uint32_t watch_mode, 823 uint32_t debug_vmid, 824 uint32_t inst) 825 { 826 uint32_t watch_address_high; 827 uint32_t watch_address_low; 828 uint32_t watch_address_cntl; 829 830 watch_address_cntl = 0; 831 832 watch_address_low = lower_32_bits(watch_address); 833 watch_address_high = upper_32_bits(watch_address) & 0xffff; 834 835 watch_address_cntl = REG_SET_FIELD(watch_address_cntl, 836 TCP_WATCH0_CNTL, 837 VMID, 838 debug_vmid); 839 watch_address_cntl = REG_SET_FIELD(watch_address_cntl, 840 TCP_WATCH0_CNTL, 841 MODE, 842 watch_mode); 843 watch_address_cntl = REG_SET_FIELD(watch_address_cntl, 844 TCP_WATCH0_CNTL, 845 MASK, 846 watch_address_mask >> 6); 847 848 /* Turning off this watch point until we set all the registers */ 849 watch_address_cntl = REG_SET_FIELD(watch_address_cntl, 850 TCP_WATCH0_CNTL, 851 VALID, 852 0); 853 854 WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) + 855 (watch_id * TCP_WATCH_STRIDE)), 856 watch_address_cntl); 857 858 WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_H) + 859 (watch_id * TCP_WATCH_STRIDE)), 860 watch_address_high); 861 862 WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_L) + 863 (watch_id * TCP_WATCH_STRIDE)), 864 watch_address_low); 865 866 /* Enable the watch point */ 867 watch_address_cntl = REG_SET_FIELD(watch_address_cntl, 868 TCP_WATCH0_CNTL, 869 VALID, 870 1); 871 872 WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) + 873 (watch_id * TCP_WATCH_STRIDE)), 874 watch_address_cntl); 875 876 return 0; 877 } 878 879 uint32_t 
kgd_gfx_v9_clear_address_watch(struct amdgpu_device *adev, 880 uint32_t watch_id) 881 { 882 uint32_t watch_address_cntl; 883 884 watch_address_cntl = 0; 885 886 WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) + 887 (watch_id * TCP_WATCH_STRIDE)), 888 watch_address_cntl); 889 890 return 0; 891 } 892 893 /* kgd_gfx_v9_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values 894 * The values read are: 895 * ib_offload_wait_time -- Wait Count for Indirect Buffer Offloads. 896 * atomic_offload_wait_time -- Wait Count for L2 and GDS Atomics Offloads. 897 * wrm_offload_wait_time -- Wait Count for WAIT_REG_MEM Offloads. 898 * gws_wait_time -- Wait Count for Global Wave Syncs. 899 * que_sleep_wait_time -- Wait Count for Dequeue Retry. 900 * sch_wave_wait_time -- Wait Count for Scheduling Wave Message. 901 * sem_rearm_wait_time -- Wait Count for Semaphore re-arm. 902 * deq_retry_wait_time -- Wait Count for Global Wave Syncs. 903 */ 904 void kgd_gfx_v9_get_iq_wait_times(struct amdgpu_device *adev, 905 uint32_t *wait_times, 906 uint32_t inst) 907 908 { 909 *wait_times = RREG32_SOC15_RLC(GC, GET_INST(GC, inst), 910 mmCP_IQ_WAIT_TIME2); 911 } 912 913 void kgd_gfx_v9_set_vm_context_page_table_base(struct amdgpu_device *adev, 914 uint32_t vmid, uint64_t page_table_base) 915 { 916 if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) { 917 pr_err("trying to set page table base for wrong VMID %u\n", 918 vmid); 919 return; 920 } 921 922 adev->mmhub.funcs->setup_vm_pt_regs(adev, vmid, page_table_base); 923 924 adev->gfxhub.funcs->setup_vm_pt_regs(adev, vmid, page_table_base); 925 } 926 927 static void lock_spi_csq_mutexes(struct amdgpu_device *adev) 928 { 929 mutex_lock(&adev->srbm_mutex); 930 mutex_lock(&adev->grbm_idx_mutex); 931 932 } 933 934 static void unlock_spi_csq_mutexes(struct amdgpu_device *adev) 935 { 936 mutex_unlock(&adev->grbm_idx_mutex); 937 mutex_unlock(&adev->srbm_mutex); 938 } 939 940 /** 941 * get_wave_count: Read device registers to get number of waves in flight 
for 942 * a particular queue. The method also returns the VMID associated with the 943 * queue. 944 * 945 * @adev: Handle of device whose registers are to be read 946 * @queue_idx: Index of queue in the queue-map bit-field 947 * @wave_cnt: Output parameter updated with number of waves in flight 948 * @vmid: Output parameter updated with VMID of queue whose wave count 949 * is being collected 950 * @inst: xcc's instance number on a multi-XCC setup 951 */ 952 static void get_wave_count(struct amdgpu_device *adev, int queue_idx, 953 struct kfd_cu_occupancy *queue_cnt, uint32_t inst) 954 { 955 int pipe_idx; 956 int queue_slot; 957 unsigned int reg_val; 958 unsigned int wave_cnt; 959 /* 960 * Program GRBM with appropriate MEID, PIPEID, QUEUEID and VMID 961 * parameters to read out waves in flight. Get VMID if there are 962 * non-zero waves in flight. 963 */ 964 pipe_idx = queue_idx / adev->gfx.mec.num_queue_per_pipe; 965 queue_slot = queue_idx % adev->gfx.mec.num_queue_per_pipe; 966 soc15_grbm_select(adev, 1, pipe_idx, queue_slot, 0, GET_INST(GC, inst)); 967 reg_val = RREG32_SOC15_IP(GC, SOC15_REG_OFFSET(GC, GET_INST(GC, inst), 968 mmSPI_CSQ_WF_ACTIVE_COUNT_0) + queue_slot); 969 wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK; 970 if (wave_cnt != 0) { 971 queue_cnt->wave_cnt += wave_cnt; 972 queue_cnt->doorbell_off = 973 (RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_DOORBELL_CONTROL) & 974 CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET_MASK) >> 975 CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT; 976 } 977 } 978 979 /** 980 * kgd_gfx_v9_get_cu_occupancy: Reads relevant registers associated with each 981 * shader engine and aggregates the number of waves that are in flight for the 982 * process whose pasid is provided as a parameter. The process could have ZERO 983 * or more queues running and submitting waves to compute units. 
 *
 * @adev: Handle of device from which to get number of waves in flight
 * @cu_occupancy: Array that gets filled with wave_cnt and doorbell offset
 *	for comparison later.
 * @max_waves_per_cu: Output parameter updated with maximum number of waves
 *	possible per Compute Unit
 * @inst: xcc's instance number on a multi-XCC setup
 *
 * Note: It's possible that the device has too many queues (oversubscription)
 * in which case a VMID could be remapped to a different PASID. This could lead
 * to an inaccurate wave count. The following is a high-level sequence:
 *    Time T1: vmid = getVmid(); vmid is associated with Pasid P1
 *    Time T2: pasid = getPasid(vmid); vmid is associated with Pasid P2
 * In the sequence above, the wave count obtained at time T1 will be
 * incorrectly lost or added to the total wave count.
 *
 * The registers that provide the waves in flight are:
 *
 *  SPI_CSQ_WF_ACTIVE_STATUS - bit-map of queues per pipe. The bit is ON if a
 *  queue is slotted, OFF if there is no queue. A process could have ZERO or
 *  more queues slotted and submitting waves to be run on compute units. Even
 *  when there is a queue it is possible there could be zero wavefronts; this
 *  can happen when the queue is waiting on top-of-pipe events - e.g. a
 *  waitRegMem command.
 *
 *  For each bit that is ON from above:
 *
 *    Read (SPI_CSQ_WF_ACTIVE_COUNT_0 + queue_idx) register. It provides the
 *    number of waves that are in flight for the queue at specified index. The
 *    index ranges from 0 to 7.
 *
 *    If non-zero waves are in flight, store the corresponding doorbell offset
 *    of the queue, along with the wave count.
 *
 *  Determine if the queue belongs to the process by comparing the doorbell
 *  offset against the process's queues. If it matches, aggregate the wave
 *  count for the process.
1021 * 1022 * Reading registers referenced above involves programming GRBM appropriately 1023 */ 1024 void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, 1025 struct kfd_cu_occupancy *cu_occupancy, 1026 int *max_waves_per_cu, uint32_t inst) 1027 { 1028 int qidx; 1029 int se_idx; 1030 int se_cnt; 1031 int queue_map; 1032 int max_queue_cnt; 1033 DECLARE_BITMAP(cp_queue_bitmap, AMDGPU_MAX_QUEUES); 1034 1035 lock_spi_csq_mutexes(adev); 1036 soc15_grbm_select(adev, 1, 0, 0, 0, GET_INST(GC, inst)); 1037 1038 /* 1039 * Iterate through the shader engines and arrays of the device 1040 * to get number of waves in flight 1041 */ 1042 bitmap_complement(cp_queue_bitmap, adev->gfx.mec_bitmap[0].queue_bitmap, 1043 AMDGPU_MAX_QUEUES); 1044 max_queue_cnt = adev->gfx.mec.num_pipe_per_mec * 1045 adev->gfx.mec.num_queue_per_pipe; 1046 se_cnt = adev->gfx.config.max_shader_engines; 1047 for (se_idx = 0; se_idx < se_cnt; se_idx++) { 1048 amdgpu_gfx_select_se_sh(adev, se_idx, 0, 0xffffffff, inst); 1049 queue_map = RREG32_SOC15(GC, GET_INST(GC, inst), mmSPI_CSQ_WF_ACTIVE_STATUS); 1050 1051 /* 1052 * Assumption: queue map encodes following schema: four 1053 * pipes per each micro-engine, with each pipe mapping 1054 * eight queues. 
This schema is true for GFX9 devices 1055 * and must be verified for newer device families 1056 */ 1057 for (qidx = 0; qidx < max_queue_cnt; qidx++) { 1058 /* Skip qeueus that are not associated with 1059 * compute functions 1060 */ 1061 if (!test_bit(qidx, cp_queue_bitmap)) 1062 continue; 1063 1064 if (!(queue_map & (1 << qidx))) 1065 continue; 1066 1067 /* Get number of waves in flight and aggregate them */ 1068 get_wave_count(adev, qidx, &cu_occupancy[qidx], 1069 inst); 1070 } 1071 } 1072 1073 amdgpu_gfx_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff, inst); 1074 soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, inst)); 1075 unlock_spi_csq_mutexes(adev); 1076 1077 /* Update the output parameters and return */ 1078 *max_waves_per_cu = adev->gfx.cu_info.simd_per_cu * 1079 adev->gfx.cu_info.max_waves_per_simd; 1080 } 1081 1082 void kgd_gfx_v9_build_grace_period_packet_info(struct amdgpu_device *adev, 1083 uint32_t wait_times, 1084 uint32_t grace_period, 1085 uint32_t *reg_offset, 1086 uint32_t *reg_data) 1087 { 1088 *reg_data = wait_times; 1089 1090 /* 1091 * The CP cannot handle a 0 grace period input and will result in 1092 * an infinite grace period being set so set to 1 to prevent this. 
1093 */ 1094 if (grace_period == 0) 1095 grace_period = 1; 1096 1097 *reg_data = REG_SET_FIELD(*reg_data, 1098 CP_IQ_WAIT_TIME2, 1099 SCH_WAVE, 1100 grace_period); 1101 1102 *reg_offset = SOC15_REG_OFFSET(GC, 0, mmCP_IQ_WAIT_TIME2); 1103 } 1104 1105 void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev, 1106 uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr, uint32_t inst) 1107 { 1108 kgd_gfx_v9_lock_srbm(adev, 0, 0, 0, vmid, inst); 1109 1110 /* 1111 * Program TBA registers 1112 */ 1113 WREG32_SOC15(GC, GET_INST(GC, inst), mmSQ_SHADER_TBA_LO, 1114 lower_32_bits(tba_addr >> 8)); 1115 WREG32_SOC15(GC, GET_INST(GC, inst), mmSQ_SHADER_TBA_HI, 1116 upper_32_bits(tba_addr >> 8)); 1117 1118 /* 1119 * Program TMA registers 1120 */ 1121 WREG32_SOC15(GC, GET_INST(GC, inst), mmSQ_SHADER_TMA_LO, 1122 lower_32_bits(tma_addr >> 8)); 1123 WREG32_SOC15(GC, GET_INST(GC, inst), mmSQ_SHADER_TMA_HI, 1124 upper_32_bits(tma_addr >> 8)); 1125 1126 kgd_gfx_v9_unlock_srbm(adev, inst); 1127 } 1128 1129 uint64_t kgd_gfx_v9_hqd_get_pq_addr(struct amdgpu_device *adev, 1130 uint32_t pipe_id, uint32_t queue_id, 1131 uint32_t inst) 1132 { 1133 uint32_t low, high; 1134 uint64_t queue_addr = 0; 1135 1136 if (!adev->debug_exp_resets && 1137 !adev->gfx.num_gfx_rings) 1138 return 0; 1139 1140 kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst); 1141 amdgpu_gfx_rlc_enter_safe_mode(adev, inst); 1142 1143 if (!RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_ACTIVE)) 1144 goto unlock_out; 1145 1146 low = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_BASE); 1147 high = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_BASE_HI); 1148 1149 /* only concerned with user queues. 
*/ 1150 if (!high) 1151 goto unlock_out; 1152 1153 queue_addr = (((queue_addr | high) << 32) | low) << 8; 1154 1155 unlock_out: 1156 amdgpu_gfx_rlc_exit_safe_mode(adev, inst); 1157 kgd_gfx_v9_release_queue(adev, inst); 1158 1159 return queue_addr; 1160 } 1161 1162 /* assume queue acquired */ 1163 static int kgd_gfx_v9_hqd_dequeue_wait(struct amdgpu_device *adev, uint32_t inst, 1164 unsigned int utimeout) 1165 { 1166 unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies; 1167 1168 while (true) { 1169 uint32_t temp = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_ACTIVE); 1170 1171 if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK)) 1172 return 0; 1173 1174 if (time_after(jiffies, end_jiffies)) 1175 return -ETIME; 1176 1177 usleep_range(500, 1000); 1178 } 1179 } 1180 1181 uint64_t kgd_gfx_v9_hqd_reset(struct amdgpu_device *adev, 1182 uint32_t pipe_id, uint32_t queue_id, 1183 uint32_t inst, unsigned int utimeout) 1184 { 1185 uint32_t low, high, pipe_reset_data = 0; 1186 uint64_t queue_addr = 0; 1187 1188 kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst); 1189 amdgpu_gfx_rlc_enter_safe_mode(adev, inst); 1190 1191 if (!RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_ACTIVE)) 1192 goto unlock_out; 1193 1194 low = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_BASE); 1195 high = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_BASE_HI); 1196 1197 /* only concerned with user queues. 
*/ 1198 if (!high) 1199 goto unlock_out; 1200 1201 queue_addr = (((queue_addr | high) << 32) | low) << 8; 1202 1203 pr_debug("Attempting queue reset on XCC %i pipe id %i queue id %i\n", 1204 inst, pipe_id, queue_id); 1205 1206 /* assume previous dequeue request issued will take affect after reset */ 1207 WREG32_SOC15(GC, GET_INST(GC, inst), mmSPI_COMPUTE_QUEUE_RESET, 0x1); 1208 1209 if (!kgd_gfx_v9_hqd_dequeue_wait(adev, inst, utimeout)) 1210 goto unlock_out; 1211 1212 pr_debug("Attempting pipe reset on XCC %i pipe id %i\n", inst, pipe_id); 1213 1214 pipe_reset_data = REG_SET_FIELD(pipe_reset_data, CP_MEC_CNTL, MEC_ME1_PIPE0_RESET, 1); 1215 pipe_reset_data = pipe_reset_data << pipe_id; 1216 1217 WREG32_SOC15(GC, GET_INST(GC, inst), mmCP_MEC_CNTL, pipe_reset_data); 1218 WREG32_SOC15(GC, GET_INST(GC, inst), mmCP_MEC_CNTL, 0); 1219 1220 if (kgd_gfx_v9_hqd_dequeue_wait(adev, inst, utimeout)) 1221 queue_addr = 0; 1222 1223 unlock_out: 1224 pr_debug("queue reset on XCC %i pipe id %i queue id %i %s\n", 1225 inst, pipe_id, queue_id, !!queue_addr ? "succeeded!" 
: "failed!"); 1226 amdgpu_gfx_rlc_exit_safe_mode(adev, inst); 1227 kgd_gfx_v9_release_queue(adev, inst); 1228 1229 return queue_addr; 1230 } 1231 1232 const struct kfd2kgd_calls gfx_v9_kfd2kgd = { 1233 .program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings, 1234 .set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping, 1235 .init_interrupts = kgd_gfx_v9_init_interrupts, 1236 .hqd_load = kgd_gfx_v9_hqd_load, 1237 .hiq_mqd_load = kgd_gfx_v9_hiq_mqd_load, 1238 .hqd_sdma_load = kgd_hqd_sdma_load, 1239 .hqd_dump = kgd_gfx_v9_hqd_dump, 1240 .hqd_sdma_dump = kgd_hqd_sdma_dump, 1241 .hqd_is_occupied = kgd_gfx_v9_hqd_is_occupied, 1242 .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied, 1243 .hqd_destroy = kgd_gfx_v9_hqd_destroy, 1244 .hqd_sdma_destroy = kgd_hqd_sdma_destroy, 1245 .wave_control_execute = kgd_gfx_v9_wave_control_execute, 1246 .get_atc_vmid_pasid_mapping_info = 1247 kgd_gfx_v9_get_atc_vmid_pasid_mapping_info, 1248 .set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base, 1249 .enable_debug_trap = kgd_gfx_v9_enable_debug_trap, 1250 .disable_debug_trap = kgd_gfx_v9_disable_debug_trap, 1251 .validate_trap_override_request = kgd_gfx_v9_validate_trap_override_request, 1252 .set_wave_launch_trap_override = kgd_gfx_v9_set_wave_launch_trap_override, 1253 .set_wave_launch_mode = kgd_gfx_v9_set_wave_launch_mode, 1254 .set_address_watch = kgd_gfx_v9_set_address_watch, 1255 .clear_address_watch = kgd_gfx_v9_clear_address_watch, 1256 .get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times, 1257 .build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info, 1258 .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy, 1259 .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings, 1260 .hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr, 1261 .hqd_reset = kgd_gfx_v9_hqd_reset 1262 }; 1263