// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright 2016-2022 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include "kfd_priv.h"
#include "kfd_mqd_manager.h"
#include "v9_structs.h"
#include "gc/gc_9_0_offset.h"
#include "gc/gc_9_0_sh_mask.h"
#include "sdma0/sdma0_4_0_sh_mask.h"
#include "amdgpu_amdkfd.h"
#include "kfd_device_queue_manager.h"

static void update_mqd(struct mqd_manager *mm, void *mqd,
		       struct queue_properties *q,
		       struct mqd_update_info *minfo);

static uint64_t mqd_stride_v9(struct mqd_manager *mm,
			      struct queue_properties *q)
{
	if (mm->dev->kfd->cwsr_enabled &&
	    q->type == KFD_QUEUE_TYPE_COMPUTE)
		return ALIGN(q->ctl_stack_size, PAGE_SIZE) +
		       ALIGN(sizeof(struct v9_mqd), PAGE_SIZE);

	return mm->mqd_size;
}

static inline struct v9_mqd *get_mqd(void *mqd)
{
	return (struct v9_mqd *)mqd;
}

static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd)
{
	return (struct v9_sdma_mqd *)mqd;
}

static void update_cu_mask(struct mqd_manager *mm, void *mqd,
			   struct mqd_update_info *minfo, uint32_t inst)
{
	struct v9_mqd *m;
	uint32_t se_mask[KFD_MAX_NUM_SE] = {0};

	if (!minfo || !minfo->cu_mask.ptr)
		return;

	mqd_symmetrically_map_cu_mask(mm,
		minfo->cu_mask.ptr, minfo->cu_mask.count, se_mask, inst);

	m = get_mqd(mqd);

	m->compute_static_thread_mgmt_se0 = se_mask[0];
	m->compute_static_thread_mgmt_se1 = se_mask[1];
	m->compute_static_thread_mgmt_se2 = se_mask[2];
	m->compute_static_thread_mgmt_se3 = se_mask[3];
	if (KFD_GC_VERSION(mm->dev) != IP_VERSION(9, 4, 3) &&
	    KFD_GC_VERSION(mm->dev) != IP_VERSION(9, 4, 4) &&
	    KFD_GC_VERSION(mm->dev) != IP_VERSION(9, 5, 0)) {
		m->compute_static_thread_mgmt_se4 = se_mask[4];
		m->compute_static_thread_mgmt_se5 = se_mask[5];
		m->compute_static_thread_mgmt_se6 = se_mask[6];
		m->compute_static_thread_mgmt_se7 = se_mask[7];

		pr_debug("update cu mask to %#x %#x %#x %#x %#x %#x %#x %#x\n",
			m->compute_static_thread_mgmt_se0,
			m->compute_static_thread_mgmt_se1,
			m->compute_static_thread_mgmt_se2,
			m->compute_static_thread_mgmt_se3,
			m->compute_static_thread_mgmt_se4,
			m->compute_static_thread_mgmt_se5,
			m->compute_static_thread_mgmt_se6,
			m->compute_static_thread_mgmt_se7);
	} else {
		pr_debug("inst: %u, update cu mask to %#x %#x %#x %#x\n",
			inst, m->compute_static_thread_mgmt_se0,
			m->compute_static_thread_mgmt_se1,
			m->compute_static_thread_mgmt_se2,
			m->compute_static_thread_mgmt_se3);
	}
}

static void set_priority(struct v9_mqd *m, struct queue_properties *q)
{
	m->cp_hqd_pipe_priority = pipe_priority_map[q->priority];
	m->cp_hqd_queue_priority = q->priority;
}

static bool mqd_on_vram(struct amdgpu_device *adev)
{
	if (adev->apu_prefer_gtt)
		return false;

	switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
	case IP_VERSION(9, 4, 3):
	case IP_VERSION(9, 5, 0):
		return true;
	default:
		return false;
	}
}

static struct kfd_mem_obj *allocate_mqd(struct mqd_manager *mm,
		struct queue_properties *q)
{
	int retval;
	struct kfd_node *node = mm->dev;
	struct kfd_mem_obj *mqd_mem_obj = NULL;

	/* For V9 only, due to a HW bug, the control stack of a user mode
	 * compute queue needs to be allocated just behind the page boundary
	 * of its regular MQD buffer. So we allocate an enlarged MQD buffer:
	 * the first page of the buffer serves as the regular MQD buffer
	 * purpose and the remaining is for control stack. Although the two
	 * parts are in the same buffer object, they need different memory
	 * types: MQD part needs UC (uncached) as usual, while control stack
	 * needs NC (non coherent), which is different from the UC type which
	 * is used when control stack is allocated in user space.
	 *
	 * Because of all those, we use the gtt allocation function instead
	 * of sub-allocation function for this enlarged MQD buffer. Moreover,
	 * in order to achieve two memory types in a single buffer object, we
	 * pass a special bo flag AMDGPU_GEM_CREATE_CP_MQD_GFX9 to instruct
	 * amdgpu memory functions to do so.
	 */
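	/* Resulting per-XCC layout (a sketch; the sizes match the allocation
	 * below and mqd_stride_v9()): page 0 holds the MQD, padded to
	 * ALIGN(sizeof(struct v9_mqd), PAGE_SIZE); the control stack follows
	 * at the first page boundary, padded to
	 * ALIGN(q->ctl_stack_size, PAGE_SIZE).
	 */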
	if (node->kfd->cwsr_enabled && (q->type == KFD_QUEUE_TYPE_COMPUTE)) {
		mqd_mem_obj = kzalloc_obj(struct kfd_mem_obj);
		if (!mqd_mem_obj)
			return NULL;
		retval = amdgpu_amdkfd_alloc_kernel_mem(node->adev,
			(ALIGN(q->ctl_stack_size, PAGE_SIZE) +
			ALIGN(sizeof(struct v9_mqd), PAGE_SIZE)) *
			NUM_XCC(node->xcc_mask),
			mqd_on_vram(node->adev) ? AMDGPU_GEM_DOMAIN_VRAM :
			AMDGPU_GEM_DOMAIN_GTT,
			&(mqd_mem_obj->mem),
			&(mqd_mem_obj->gpu_addr),
			(void *)&(mqd_mem_obj->cpu_ptr), true);

		if (retval) {
			kfree(mqd_mem_obj);
			return NULL;
		}
	} else {
		retval = kfd_gtt_sa_allocate(node, sizeof(struct v9_mqd),
				&mqd_mem_obj);
		if (retval)
			return NULL;
	}

	return mqd_mem_obj;
}

static void init_mqd(struct mqd_manager *mm, void **mqd,
			struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
			struct queue_properties *q)
{
	uint64_t addr;
	struct v9_mqd *m;

	m = (struct v9_mqd *) mqd_mem_obj->cpu_ptr;
	addr = mqd_mem_obj->gpu_addr;

	memset(m, 0, sizeof(struct v9_mqd));

	m->header = 0xC0310800;
	m->compute_pipelinestat_enable = 1;
	m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF;
	m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF;
	m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF;
	m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF;
	m->compute_static_thread_mgmt_se4 = 0xFFFFFFFF;
	m->compute_static_thread_mgmt_se5 = 0xFFFFFFFF;
	m->compute_static_thread_mgmt_se6 = 0xFFFFFFFF;
	m->compute_static_thread_mgmt_se7 = 0xFFFFFFFF;

	m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK |
			0x53 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT;

	m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT;
	m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__UNORD_DISPATCH_MASK;

	m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT;

	m->cp_mqd_base_addr_lo = lower_32_bits(addr);
	m->cp_mqd_base_addr_hi = upper_32_bits(addr);

	m->cp_hqd_quantum = 1 << CP_HQD_QUANTUM__QUANTUM_EN__SHIFT |
			1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT |
			1 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT;

	/* Set cp_hqd_hq_scheduler0 bit 14 to 1 to have the CP set up the
	 * DISPATCH_PTR. This is required for the kfd debugger
	 */
	m->cp_hqd_hq_status0 = 1 << 14;

	if (q->format == KFD_QUEUE_FORMAT_AQL)
		m->cp_hqd_aql_control =
			1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT;

	if (q->tba_addr) {
		m->compute_pgm_rsrc2 |=
			(1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT);
	}

	if (mm->dev->kfd->cwsr_enabled && q->ctx_save_restore_area_address) {
		m->cp_hqd_persistent_state |=
			(1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT);
		m->cp_hqd_ctx_save_base_addr_lo =
			lower_32_bits(q->ctx_save_restore_area_address);
		m->cp_hqd_ctx_save_base_addr_hi =
			upper_32_bits(q->ctx_save_restore_area_address);
		m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size;
		m->cp_hqd_cntl_stack_size = q->ctl_stack_size;
		m->cp_hqd_cntl_stack_offset = q->ctl_stack_size;
		m->cp_hqd_wg_state_offset = q->ctl_stack_size;
	}

	*mqd = m;
	if (gart_addr)
		*gart_addr = addr;
	update_mqd(mm, m, q, NULL);
}

static int load_mqd(struct mqd_manager *mm, void *mqd,
			uint32_t pipe_id, uint32_t queue_id,
			struct queue_properties *p, struct mm_struct *mms)
{
	/* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */
	uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 4 : 0);
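	/* One 64-byte AQL packet is 16 (2^4) dwords, hence the shift of 4
	 * when converting the dword-based HW write pointer to packet units.
	 */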

	return mm->dev->kfd2kgd->hqd_load(mm->dev->adev, mqd, pipe_id, queue_id,
					  (uint32_t __user *)p->write_ptr,
					  wptr_shift, 0, mms, 0);
}

static void update_mqd(struct mqd_manager *mm, void *mqd,
		       struct queue_properties *q,
		       struct mqd_update_info *minfo)
{
	struct v9_mqd *m;

	m = get_mqd(mqd);

	m->cp_hqd_pq_control &= ~CP_HQD_PQ_CONTROL__QUEUE_SIZE_MASK;
	m->cp_hqd_pq_control |= order_base_2(q->queue_size / 4) - 1;
	pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control);

	m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8);
	m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8);

	m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
	m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
	m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr);
	m->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr);

	m->cp_hqd_pq_doorbell_control =
		q->doorbell_off <<
			CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT;
	pr_debug("cp_hqd_pq_doorbell_control 0x%x\n",
			m->cp_hqd_pq_doorbell_control);

	m->cp_hqd_ib_control =
		3 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT |
		1 << CP_HQD_IB_CONTROL__IB_EXE_DISABLE__SHIFT;

	/*
	 * HW does not clamp this field correctly. Maximum EOP queue size
	 * is constrained by per-SE EOP done signal count, which is 8-bit.
	 * Limit is 0xFF EOP entries (= 0x7F8 dwords). CP will not submit
	 * more than (EOP entry count - 1) so a queue size of 0x800 dwords
	 * is safe, giving a maximum field value of 0xA.
	 *
	 * Also, do calculation only if EOP is used (size > 0), otherwise
	 * the order_base_2 calculation provides incorrect result.
	 */
	m->cp_hqd_eop_control = q->eop_ring_buffer_size ?
		min(0xA, order_base_2(q->eop_ring_buffer_size / 4) - 1) : 0;
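
	/* Worked example: an 8 KiB EOP buffer is 0x800 dwords, so
	 * order_base_2(0x800) - 1 = 0xA, the maximum safe value above.
	 */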

	m->cp_hqd_eop_base_addr_lo =
			lower_32_bits(q->eop_ring_buffer_address >> 8);
	m->cp_hqd_eop_base_addr_hi =
			upper_32_bits(q->eop_ring_buffer_address >> 8);

	m->cp_hqd_iq_timer = 0;

	m->cp_hqd_vmid = q->vmid;

	if (q->format == KFD_QUEUE_FORMAT_AQL) {
		m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK |
				2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT |
				1 << CP_HQD_PQ_CONTROL__QUEUE_FULL_EN__SHIFT |
				1 << CP_HQD_PQ_CONTROL__WPP_CLAMP_EN__SHIFT;
		m->cp_hqd_pq_doorbell_control |= 1 <<
			CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_BIF_DROP__SHIFT;
	}
	if (mm->dev->kfd->cwsr_enabled && q->ctx_save_restore_area_address)
		m->cp_hqd_ctx_save_control = 0;

	if (KFD_GC_VERSION(mm->dev) != IP_VERSION(9, 4, 3) &&
	    KFD_GC_VERSION(mm->dev) != IP_VERSION(9, 4, 4) &&
	    KFD_GC_VERSION(mm->dev) != IP_VERSION(9, 5, 0))
		update_cu_mask(mm, mqd, minfo, 0);
	set_priority(m, q);

	if (minfo && KFD_GC_VERSION(mm->dev) >= IP_VERSION(9, 4, 2)) {
		if (minfo->update_flag & UPDATE_FLAG_IS_GWS)
			m->compute_resource_limits |=
				COMPUTE_RESOURCE_LIMITS__FORCE_SIMD_DIST_MASK;
		else
			m->compute_resource_limits &=
				~COMPUTE_RESOURCE_LIMITS__FORCE_SIMD_DIST_MASK;
	}

	q->is_active = QUEUE_IS_ACTIVE(*q);
}

static bool check_preemption_failed(struct mqd_manager *mm, void *mqd)
{
	struct v9_mqd *m = (struct v9_mqd *)mqd;
	uint32_t doorbell_id = m->queue_doorbell_id0;

	m->queue_doorbell_id0 = 0;

	return kfd_check_hiq_mqd_doorbell_id(mm->dev, doorbell_id, 0);
}

static int get_wave_state(struct mqd_manager *mm, void *mqd,
			  struct queue_properties *q,
			  void __user *ctl_stack,
			  u32 *ctl_stack_used_size,
			  u32 *save_area_used_size)
{
	struct v9_mqd *m;
	struct kfd_context_save_area_header header;

	/* Control stack is located one page after MQD. */
	void *mqd_ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE);

	m = get_mqd(mqd);

	*ctl_stack_used_size = m->cp_hqd_cntl_stack_size -
		m->cp_hqd_cntl_stack_offset;
	*save_area_used_size = m->cp_hqd_wg_state_offset -
		m->cp_hqd_cntl_stack_size;

	header.wave_state.control_stack_size = *ctl_stack_used_size;
	header.wave_state.wave_state_size = *save_area_used_size;

	header.wave_state.wave_state_offset = m->cp_hqd_wg_state_offset;
	header.wave_state.control_stack_offset = m->cp_hqd_cntl_stack_offset;

	if (copy_to_user(ctl_stack, &header, sizeof(header.wave_state)))
		return -EFAULT;

	if (copy_to_user(ctl_stack + m->cp_hqd_cntl_stack_offset,
				mqd_ctl_stack + m->cp_hqd_cntl_stack_offset,
				*ctl_stack_used_size))
		return -EFAULT;

	return 0;
}

static void get_checkpoint_info(struct mqd_manager *mm, void *mqd, u32 *ctl_stack_size)
{
	struct v9_mqd *m = get_mqd(mqd);

	*ctl_stack_size = m->cp_hqd_cntl_stack_size * NUM_XCC(mm->dev->xcc_mask);
}

static void checkpoint_mqd(struct mqd_manager *mm, void *mqd, void *mqd_dst, void *ctl_stack_dst)
{
	struct v9_mqd *m;
	/* Control stack is located one page after MQD. */
	void *ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE);

	m = get_mqd(mqd);

	memcpy(mqd_dst, m, sizeof(struct v9_mqd));
	memcpy(ctl_stack_dst, ctl_stack, m->cp_hqd_cntl_stack_size);
}
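
/* Multi-XCC checkpoint: the per-XCC MQD images are packed back to back in
 * mqd_dst (sizeof(struct v9_mqd) apart) and the per-XCC control stacks in
 * ctl_stack_dst (cp_hqd_cntl_stack_size apart), matching the offsets below.
 */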
static void checkpoint_mqd_v9_4_3(struct mqd_manager *mm,
				  void *mqd,
				  void *mqd_dst,
				  void *ctl_stack_dst)
{
	struct v9_mqd *m;
	int xcc;
	uint64_t size = get_mqd(mqd)->cp_mqd_stride_size;

	for (xcc = 0; xcc < NUM_XCC(mm->dev->xcc_mask); xcc++) {
		m = get_mqd(mqd + size * xcc);

		checkpoint_mqd(mm, m,
				(uint8_t *)mqd_dst + sizeof(*m) * xcc,
				(uint8_t *)ctl_stack_dst + m->cp_hqd_cntl_stack_size * xcc);
	}
}

static void restore_mqd(struct mqd_manager *mm, void **mqd,
			struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
			struct queue_properties *qp,
			const void *mqd_src,
			const void *ctl_stack_src, u32 ctl_stack_size)
{
	uint64_t addr;
	struct v9_mqd *m;
	void *ctl_stack;

	m = (struct v9_mqd *) mqd_mem_obj->cpu_ptr;
	addr = mqd_mem_obj->gpu_addr;

	memcpy(m, mqd_src, sizeof(*m));

	*mqd = m;
	if (gart_addr)
		*gart_addr = addr;

	/* Control stack is located one page after MQD. */
	ctl_stack = (void *)((uintptr_t)*mqd + PAGE_SIZE);
	memcpy(ctl_stack, ctl_stack_src, ctl_stack_size);

	m->cp_hqd_pq_doorbell_control =
		qp->doorbell_off <<
			CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT;
	pr_debug("cp_hqd_pq_doorbell_control 0x%x\n",
			m->cp_hqd_pq_doorbell_control);

	qp->is_active = 0;
}

static void init_mqd_hiq(struct mqd_manager *mm, void **mqd,
			struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
			struct queue_properties *q)
{
	struct v9_mqd *m;

	init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q);

	m = get_mqd(*mqd);

	m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT |
			1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT;
}

static int destroy_hiq_mqd(struct mqd_manager *mm, void *mqd,
			enum kfd_preempt_type type, unsigned int timeout,
			uint32_t pipe_id, uint32_t queue_id)
{
	int err;
	struct v9_mqd *m;
	u32 doorbell_off;

	m = get_mqd(mqd);

	doorbell_off = m->cp_hqd_pq_doorbell_control >>
			CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT;
	err = amdgpu_amdkfd_unmap_hiq(mm->dev->adev, doorbell_off, 0);
	if (err)
		pr_debug("Destroy HIQ MQD failed: %d\n", err);

	return err;
}

static void init_mqd_sdma(struct mqd_manager *mm, void **mqd,
		struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
		struct queue_properties *q)
{
	struct v9_sdma_mqd *m;

	m = (struct v9_sdma_mqd *) mqd_mem_obj->cpu_ptr;

	memset(m, 0, sizeof(struct v9_sdma_mqd));

	*mqd = m;
	if (gart_addr)
		*gart_addr = mqd_mem_obj->gpu_addr;

	mm->update_mqd(mm, m, q, NULL);
}

#define SDMA_RLC_DUMMY_DEFAULT 0xf

static void update_mqd_sdma(struct mqd_manager *mm, void *mqd,
			struct queue_properties *q,
			struct mqd_update_info *minfo)
{
	struct v9_sdma_mqd *m;

	m = get_sdma_mqd(mqd);
	m->sdmax_rlcx_rb_cntl = order_base_2(q->queue_size / 4)
		<< SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT |
		q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT |
		1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT |
		6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT;
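	/* RB_SIZE above is log2 of the ring length in dwords; e.g. a 4 KiB
	 * ring is 1024 dwords, giving order_base_2(1024) = 10.
	 */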

	m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8);
	m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8);
	m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
	m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
	m->sdmax_rlcx_doorbell_offset =
		q->doorbell_off << SDMA0_RLC0_DOORBELL_OFFSET__OFFSET__SHIFT;

	m->sdma_engine_id = q->sdma_engine_id;
	m->sdma_queue_id = q->sdma_queue_id;
	m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT;
	/* Allow context switch so we don't cross-process starve with a massive
	 * command buffer of long-running SDMA commands
	 */
	m->sdmax_rlcx_ib_cntl |= SDMA0_GFX_IB_CNTL__SWITCH_INSIDE_IB_MASK;

	q->is_active = QUEUE_IS_ACTIVE(*q);
}

static void checkpoint_mqd_sdma(struct mqd_manager *mm,
				void *mqd,
				void *mqd_dst,
				void *ctl_stack_dst)
{
	struct v9_sdma_mqd *m;

	m = get_sdma_mqd(mqd);

	memcpy(mqd_dst, m, sizeof(struct v9_sdma_mqd));
}

static void restore_mqd_sdma(struct mqd_manager *mm, void **mqd,
			struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
			struct queue_properties *qp,
			const void *mqd_src,
			const void *ctl_stack_src, const u32 ctl_stack_size)
{
	uint64_t addr;
	struct v9_sdma_mqd *m;

	m = (struct v9_sdma_mqd *) mqd_mem_obj->cpu_ptr;
	addr = mqd_mem_obj->gpu_addr;

	memcpy(m, mqd_src, sizeof(*m));

	m->sdmax_rlcx_doorbell_offset =
		qp->doorbell_off << SDMA0_RLC0_DOORBELL_OFFSET__OFFSET__SHIFT;

	*mqd = m;
	if (gart_addr)
		*gart_addr = addr;

	qp->is_active = 0;
}

static void init_mqd_hiq_v9_4_3(struct mqd_manager *mm, void **mqd,
			struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
			struct queue_properties *q)
{
	struct v9_mqd *m;
	int xcc = 0;
	struct kfd_mem_obj xcc_mqd_mem_obj;
	uint64_t xcc_gart_addr = 0;

	memset(&xcc_mqd_mem_obj, 0x0, sizeof(struct kfd_mem_obj));

	for (xcc = 0; xcc < NUM_XCC(mm->dev->xcc_mask); xcc++) {
		kfd_get_hiq_xcc_mqd(mm->dev, &xcc_mqd_mem_obj, xcc);

		init_mqd(mm, (void **)&m, &xcc_mqd_mem_obj, &xcc_gart_addr, q);

		m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK |
					1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT |
					1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT;
		if (amdgpu_sriov_multi_vf_mode(mm->dev->adev))
			m->cp_hqd_pq_doorbell_control |= 1 <<
				CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_MODE__SHIFT;
		m->cp_mqd_stride_size = kfd_hiq_mqd_stride(mm->dev);
		if (xcc == 0) {
			/* Set no_update_rptr = 0 in Master XCC */
			m->cp_hqd_pq_control &= ~CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK;

			/* Set the MQD pointer and gart address to XCC0 MQD */
			*mqd = m;
			*gart_addr = xcc_gart_addr;
		}
	}
}

static int hiq_load_mqd_kiq_v9_4_3(struct mqd_manager *mm, void *mqd,
			uint32_t pipe_id, uint32_t queue_id,
			struct queue_properties *p, struct mm_struct *mms)
{
	uint32_t xcc_mask = mm->dev->xcc_mask;
	int xcc_id, err = 0, inst = 0;
	void *xcc_mqd;
	uint64_t hiq_mqd_size = kfd_hiq_mqd_stride(mm->dev);

	for_each_inst(xcc_id, xcc_mask) {
		xcc_mqd = mqd + hiq_mqd_size * inst;
		err = mm->dev->kfd2kgd->hiq_mqd_load(mm->dev->adev, xcc_mqd,
						     pipe_id, queue_id,
						     p->doorbell_off, xcc_id);
		if (err) {
			pr_debug("Failed to load HIQ MQD for XCC: %d\n", inst);
			break;
		}
		++inst;
	}

	return err;
}

static int destroy_hiq_mqd_v9_4_3(struct mqd_manager *mm, void *mqd,
			enum kfd_preempt_type type, unsigned int timeout,
			uint32_t pipe_id, uint32_t queue_id)
{
	uint32_t xcc_mask = mm->dev->xcc_mask;
	int xcc_id, err = 0, inst = 0;
	uint64_t hiq_mqd_size = kfd_hiq_mqd_stride(mm->dev);
	struct v9_mqd *m;
	u32 doorbell_off;

	for_each_inst(xcc_id, xcc_mask) {
		m = get_mqd(mqd + hiq_mqd_size * inst);

		doorbell_off = m->cp_hqd_pq_doorbell_control >>
				CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT;

		err = amdgpu_amdkfd_unmap_hiq(mm->dev->adev, doorbell_off, xcc_id);
		if (err) {
			pr_debug("Destroy HIQ MQD failed for xcc: %d\n", inst);
			break;
		}
		++inst;
	}

	return err;
}

static bool check_preemption_failed_v9_4_3(struct mqd_manager *mm, void *mqd)
{
	uint64_t hiq_mqd_size = kfd_hiq_mqd_stride(mm->dev);
	uint32_t xcc_mask = mm->dev->xcc_mask;
	int inst = 0, xcc_id;
	struct v9_mqd *m;
	bool ret = false;

	for_each_inst(xcc_id, xcc_mask) {
		m = get_mqd(mqd + hiq_mqd_size * inst);
		ret |= kfd_check_hiq_mqd_doorbell_id(mm->dev,
					m->queue_doorbell_id0, inst);
		m->queue_doorbell_id0 = 0;
		++inst;
	}

	return ret;
}

static void get_xcc_mqd(struct kfd_mem_obj *mqd_mem_obj,
			struct kfd_mem_obj *xcc_mqd_mem_obj,
			uint64_t offset)
{
	xcc_mqd_mem_obj->mem = (offset == 0) ?
				mqd_mem_obj->mem : NULL;
	xcc_mqd_mem_obj->gpu_addr = mqd_mem_obj->gpu_addr + offset;
	xcc_mqd_mem_obj->cpu_ptr = (uint32_t *)((uintptr_t)mqd_mem_obj->cpu_ptr
					+ offset);
}
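
/* On multi-XCC parts each user queue carries one MQD per XCC, laid out
 * mqd_stride(mm, q) bytes apart in the same buffer; the XCC0 ("master")
 * MQD is the one returned through *mqd and *gart_addr.
 */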
static void init_mqd_v9_4_3(struct mqd_manager *mm, void **mqd,
			struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
			struct queue_properties *q)
{
	struct v9_mqd *m;
	int xcc = 0;
	struct kfd_mem_obj xcc_mqd_mem_obj;
	uint64_t xcc_gart_addr = 0;
	uint64_t xcc_ctx_save_restore_area_address;
	uint64_t offset = mm->mqd_stride(mm, q);
	uint32_t local_xcc_start = mm->dev->dqm->current_logical_xcc_start++;

	memset(&xcc_mqd_mem_obj, 0x0, sizeof(struct kfd_mem_obj));
	for (xcc = 0; xcc < NUM_XCC(mm->dev->xcc_mask); xcc++) {
		get_xcc_mqd(mqd_mem_obj, &xcc_mqd_mem_obj, offset*xcc);

		init_mqd(mm, (void **)&m, &xcc_mqd_mem_obj, &xcc_gart_addr, q);
		if (amdgpu_sriov_multi_vf_mode(mm->dev->adev))
			m->cp_hqd_pq_doorbell_control |= 1 <<
				CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_MODE__SHIFT;
		m->cp_mqd_stride_size = offset;

		/*
		 * Update the CWSR address for each XCC if CWSR is enabled
		 * and CWSR area is allocated in thunk
		 */
		if (mm->dev->kfd->cwsr_enabled &&
		    q->ctx_save_restore_area_address) {
			xcc_ctx_save_restore_area_address =
				q->ctx_save_restore_area_address +
				(xcc * q->ctx_save_restore_area_size);

			m->cp_hqd_ctx_save_base_addr_lo =
				lower_32_bits(xcc_ctx_save_restore_area_address);
			m->cp_hqd_ctx_save_base_addr_hi =
				upper_32_bits(xcc_ctx_save_restore_area_address);
		}

		if (q->format == KFD_QUEUE_FORMAT_AQL) {
			m->compute_tg_chunk_size = 1;
			m->compute_current_logic_xcc_id =
					(local_xcc_start + xcc) %
					NUM_XCC(mm->dev->xcc_mask);

			switch (xcc) {
			case 0:
				/* Master XCC */
				m->cp_hqd_pq_control &=
					~CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK;
				break;
			default:
				break;
			}
		} else {
			/* PM4 Queue */
			m->compute_current_logic_xcc_id = 0;
			m->compute_tg_chunk_size = 0;
			m->pm4_target_xcc_in_xcp = q->pm4_target_xcc;
		}

		if (xcc == 0) {
			/* Set the MQD pointer and gart address to XCC0 MQD */
			*mqd = m;
			*gart_addr = xcc_gart_addr;
		}
	}

	if (mqd_on_vram(mm->dev->adev))
		amdgpu_device_flush_hdp(mm->dev->adev, NULL);
}

static void update_mqd_v9_4_3(struct mqd_manager *mm, void *mqd,
			struct queue_properties *q, struct mqd_update_info *minfo)
{
	struct v9_mqd *m;
	int xcc = 0;
	uint64_t size = mm->mqd_stride(mm, q);

	for (xcc = 0; xcc < NUM_XCC(mm->dev->xcc_mask); xcc++) {
		m = get_mqd(mqd + size * xcc);
		update_mqd(mm, m, q, minfo);

		if (amdgpu_sriov_multi_vf_mode(mm->dev->adev))
			m->cp_hqd_pq_doorbell_control |= 1 <<
				CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_MODE__SHIFT;
		update_cu_mask(mm, m, minfo, xcc);

		if (q->format == KFD_QUEUE_FORMAT_AQL) {
			switch (xcc) {
			case 0:
				/* Master XCC */
				m->cp_hqd_pq_control &=
					~CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK;
				break;
			default:
				break;
			}
			m->compute_tg_chunk_size = 1;
		} else {
			/* PM4 Queue */
			m->compute_current_logic_xcc_id = 0;
			m->compute_tg_chunk_size = 0;
			m->pm4_target_xcc_in_xcp = q->pm4_target_xcc;
		}
	}

	if (mqd_on_vram(mm->dev->adev))
		amdgpu_device_flush_hdp(mm->dev->adev, NULL);
}

static void restore_mqd_v9_4_3(struct mqd_manager *mm, void **mqd,
			struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
			struct queue_properties *qp,
			const void *mqd_src,
			const void *ctl_stack_src, u32 ctl_stack_size)
{
	struct kfd_mem_obj xcc_mqd_mem_obj;
	u32 mqd_ctl_stack_size;
	struct v9_mqd *m;
	u32 num_xcc;
	int xcc;

	uint64_t offset = mm->mqd_stride(mm, qp);

	mm->dev->dqm->current_logical_xcc_start++;

	num_xcc = NUM_XCC(mm->dev->xcc_mask);
	mqd_ctl_stack_size = ctl_stack_size / num_xcc;

	memset(&xcc_mqd_mem_obj, 0x0, sizeof(struct kfd_mem_obj));

	/* Set the MQD pointer and gart address to XCC0 MQD */
	*mqd = mqd_mem_obj->cpu_ptr;
	if (gart_addr)
		*gart_addr = mqd_mem_obj->gpu_addr;

	for (xcc = 0; xcc < num_xcc; xcc++) {
		get_xcc_mqd(mqd_mem_obj, &xcc_mqd_mem_obj, offset * xcc);
		restore_mqd(mm, (void **)&m,
				&xcc_mqd_mem_obj,
				NULL,
				qp,
				(uint8_t *)mqd_src + xcc * sizeof(*m),
				(uint8_t *)ctl_stack_src + xcc * mqd_ctl_stack_size,
				mqd_ctl_stack_size);
	}

	if (mqd_on_vram(mm->dev->adev))
		amdgpu_device_flush_hdp(mm->dev->adev, NULL);
}

static int destroy_mqd_v9_4_3(struct mqd_manager *mm, void *mqd,
			enum kfd_preempt_type type, unsigned int timeout,
			uint32_t pipe_id, uint32_t queue_id)
{
	uint32_t xcc_mask = mm->dev->xcc_mask;
	int xcc_id, err = 0, inst = 0;
	void *xcc_mqd;
	struct v9_mqd *m;
	uint64_t mqd_offset;

	m = get_mqd(mqd);
	mqd_offset = m->cp_mqd_stride_size;

	for_each_inst(xcc_id, xcc_mask) {
		xcc_mqd = mqd + mqd_offset * inst;
		err = mm->dev->kfd2kgd->hqd_destroy(mm->dev->adev, xcc_mqd,
						    type, timeout, pipe_id,
						    queue_id, xcc_id);
		if (err) {
			pr_debug("Destroy MQD failed for xcc: %d\n", inst);
			break;
		}
		++inst;
	}

	return err;
}

static int load_mqd_v9_4_3(struct mqd_manager *mm, void *mqd,
			uint32_t pipe_id, uint32_t queue_id,
			struct queue_properties *p, struct mm_struct *mms)
{
	/* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */
	uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 4 : 0);
	uint32_t xcc_mask = mm->dev->xcc_mask;
	int xcc_id, err = 0, inst = 0;
	void *xcc_mqd;
	uint64_t mqd_stride_size = mm->mqd_stride(mm, p);

	for_each_inst(xcc_id, xcc_mask) {
		xcc_mqd = mqd + mqd_stride_size * inst;
		err = mm->dev->kfd2kgd->hqd_load(
			mm->dev->adev, xcc_mqd, pipe_id, queue_id,
			(uint32_t __user *)p->write_ptr, wptr_shift, 0, mms,
			xcc_id);
		if (err) {
			pr_debug("Load MQD failed for xcc: %d\n", inst);
			break;
		}
		++inst;
	}

	return err;
}

static int get_wave_state_v9_4_3(struct mqd_manager *mm, void *mqd,
				 struct queue_properties *q,
				 void __user *ctl_stack,
				 u32 *ctl_stack_used_size,
				 u32 *save_area_used_size)
{
	int xcc, err = 0;
	void *xcc_mqd;
	void __user *xcc_ctl_stack;
	uint64_t mqd_stride_size = mm->mqd_stride(mm, q);
	u32 tmp_ctl_stack_used_size = 0, tmp_save_area_used_size = 0;

	for (xcc = 0; xcc < NUM_XCC(mm->dev->xcc_mask); xcc++) {
		xcc_mqd = mqd + mqd_stride_size * xcc;
		xcc_ctl_stack = (void __user *)((uintptr_t)ctl_stack +
					q->ctx_save_restore_area_size * xcc);

		err = get_wave_state(mm, xcc_mqd, q, xcc_ctl_stack,
				     &tmp_ctl_stack_used_size,
				     &tmp_save_area_used_size);
		if (err)
			break;

		/*
		 * Report the ctl_stack_used_size and save_area_used_size of
		 * XCC 0 when passing the info to user space.
		 * For multi XCC, user space has to look at the header info
		 * of each control stack area to determine the control stack
		 * size and save area used.
		 */
		if (xcc == 0) {
			*ctl_stack_used_size = tmp_ctl_stack_used_size;
			*save_area_used_size = tmp_save_area_used_size;
		}
	}

	return err;
}

#if defined(CONFIG_DEBUG_FS)

static int debugfs_show_mqd(struct seq_file *m, void *data)
{
	seq_hex_dump(m, "    ", DUMP_PREFIX_OFFSET, 32, 4,
		     data, sizeof(struct v9_mqd), false);
	return 0;
}

static int debugfs_show_mqd_sdma(struct seq_file *m, void *data)
{
	seq_hex_dump(m, "    ", DUMP_PREFIX_OFFSET, 32, 4,
		     data, sizeof(struct v9_sdma_mqd), false);
	return 0;
}

#endif
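
/* Entry point: selects the per-queue-type callbacks. GC 9.4.3, 9.4.4 and
 * 9.5.0 (multi-XCC) use the *_v9_4_3 variants for CP and HIQ queues; all
 * other v9 parts use the single-XCC defaults.
 */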
struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type,
		struct kfd_node *dev)
{
	struct mqd_manager *mqd;

	if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
		return NULL;

	mqd = kzalloc_obj(*mqd);
	if (!mqd)
		return NULL;

	mqd->dev = dev;

	switch (type) {
	case KFD_MQD_TYPE_CP:
		mqd->allocate_mqd = allocate_mqd;
		mqd->free_mqd = kfd_free_mqd_cp;
		mqd->is_occupied = kfd_is_occupied_cp;
		mqd->get_checkpoint_info = get_checkpoint_info;
		mqd->mqd_size = sizeof(struct v9_mqd);
		mqd->mqd_stride = mqd_stride_v9;
#if defined(CONFIG_DEBUG_FS)
		mqd->debugfs_show_mqd = debugfs_show_mqd;
#endif
		if (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3) ||
		    KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 4) ||
		    KFD_GC_VERSION(dev) == IP_VERSION(9, 5, 0)) {
			mqd->init_mqd = init_mqd_v9_4_3;
			mqd->load_mqd = load_mqd_v9_4_3;
			mqd->update_mqd = update_mqd_v9_4_3;
			mqd->destroy_mqd = destroy_mqd_v9_4_3;
			mqd->get_wave_state = get_wave_state_v9_4_3;
			mqd->checkpoint_mqd = checkpoint_mqd_v9_4_3;
			mqd->restore_mqd = restore_mqd_v9_4_3;
		} else {
			mqd->init_mqd = init_mqd;
			mqd->load_mqd = load_mqd;
			mqd->update_mqd = update_mqd;
			mqd->destroy_mqd = kfd_destroy_mqd_cp;
			mqd->get_wave_state = get_wave_state;
			mqd->checkpoint_mqd = checkpoint_mqd;
			mqd->restore_mqd = restore_mqd;
		}
		break;
	case KFD_MQD_TYPE_HIQ:
		mqd->allocate_mqd = allocate_hiq_mqd;
		mqd->free_mqd = free_mqd_hiq_sdma;
		mqd->update_mqd = update_mqd;
		mqd->is_occupied = kfd_is_occupied_cp;
		mqd->mqd_size = sizeof(struct v9_mqd);
		mqd->mqd_stride = kfd_mqd_stride;
#if defined(CONFIG_DEBUG_FS)
		mqd->debugfs_show_mqd = debugfs_show_mqd;
#endif
		if (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3) ||
		    KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 4) ||
		    KFD_GC_VERSION(dev) == IP_VERSION(9, 5, 0)) {
			mqd->init_mqd = init_mqd_hiq_v9_4_3;
			mqd->load_mqd = hiq_load_mqd_kiq_v9_4_3;
			mqd->destroy_mqd = destroy_hiq_mqd_v9_4_3;
			mqd->check_preemption_failed = check_preemption_failed_v9_4_3;
		} else {
			mqd->init_mqd = init_mqd_hiq;
			mqd->load_mqd = kfd_hiq_load_mqd_kiq;
			mqd->destroy_mqd = destroy_hiq_mqd;
			mqd->check_preemption_failed = check_preemption_failed;
		}
		break;
	case KFD_MQD_TYPE_DIQ:
		mqd->allocate_mqd = allocate_mqd;
		mqd->init_mqd = init_mqd_hiq;
		mqd->free_mqd = kfd_free_mqd_cp;
		mqd->load_mqd = load_mqd;
		mqd->update_mqd = update_mqd;
		mqd->destroy_mqd = kfd_destroy_mqd_cp;
		mqd->is_occupied = kfd_is_occupied_cp;
		mqd->mqd_size = sizeof(struct v9_mqd);
#if defined(CONFIG_DEBUG_FS)
		mqd->debugfs_show_mqd = debugfs_show_mqd;
#endif
		break;
	case KFD_MQD_TYPE_SDMA:
		mqd->allocate_mqd = allocate_sdma_mqd;
		mqd->init_mqd = init_mqd_sdma;
		mqd->free_mqd = free_mqd_hiq_sdma;
		mqd->load_mqd = kfd_load_mqd_sdma;
		mqd->update_mqd = update_mqd_sdma;
		mqd->destroy_mqd = kfd_destroy_mqd_sdma;
		mqd->is_occupied = kfd_is_occupied_sdma;
		mqd->checkpoint_mqd = checkpoint_mqd_sdma;
		mqd->restore_mqd = restore_mqd_sdma;
		mqd->mqd_size = sizeof(struct v9_sdma_mqd);
		mqd->mqd_stride = kfd_mqd_stride;
#if defined(CONFIG_DEBUG_FS)
		mqd->debugfs_show_mqd = debugfs_show_mqd_sdma;
#endif
		break;
	default:
		kfree(mqd);
		return NULL;
	}

	return mqd;
}