// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright 2014-2022 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include <linux/slab.h>
#include "kfd_priv.h"
#include "kfd_topology.h"
#include "kfd_svm.h"

void print_queue_properties(struct queue_properties *q)
{
	if (!q)
		return;

	pr_debug("Printing queue properties:\n");
	pr_debug("Queue Type: %u\n", q->type);
	pr_debug("Queue Size: %llu\n", q->queue_size);
	pr_debug("Queue percent: %u\n", q->queue_percent);
	pr_debug("Queue Address: 0x%llX\n", q->queue_address);
	pr_debug("Queue Id: %u\n", q->queue_id);
	pr_debug("Queue Process Vmid: %u\n", q->vmid);
	pr_debug("Queue Read Pointer: 0x%px\n", q->read_ptr);
	pr_debug("Queue Write Pointer: 0x%px\n", q->write_ptr);
	pr_debug("Queue Doorbell Pointer: 0x%p\n", q->doorbell_ptr);
	pr_debug("Queue Doorbell Offset: %u\n", q->doorbell_off);
}

void print_queue(struct queue *q)
{
	if (!q)
		return;
	pr_debug("Printing queue:\n");
	pr_debug("Queue Type: %u\n", q->properties.type);
	pr_debug("Queue Size: %llu\n", q->properties.queue_size);
	pr_debug("Queue percent: %u\n", q->properties.queue_percent);
	pr_debug("Queue Address: 0x%llX\n", q->properties.queue_address);
	pr_debug("Queue Id: %u\n", q->properties.queue_id);
	pr_debug("Queue Process Vmid: %u\n", q->properties.vmid);
	pr_debug("Queue Read Pointer: 0x%px\n", q->properties.read_ptr);
	pr_debug("Queue Write Pointer: 0x%px\n", q->properties.write_ptr);
	pr_debug("Queue Doorbell Pointer: 0x%p\n", q->properties.doorbell_ptr);
	pr_debug("Queue Doorbell Offset: %u\n", q->properties.doorbell_off);
	pr_debug("Queue MQD Address: 0x%p\n", q->mqd);
	pr_debug("Queue MQD Gart: 0x%llX\n", q->gart_mqd_addr);
	pr_debug("Queue Process Address: 0x%p\n", q->process);
	pr_debug("Queue Device Address: 0x%p\n", q->device);
}

int init_queue(struct queue **q, const struct queue_properties *properties)
{
	struct queue *tmp_q;

	tmp_q = kzalloc(sizeof(*tmp_q), GFP_KERNEL);
	if (!tmp_q)
		return -ENOMEM;

	memcpy(&tmp_q->properties, properties, sizeof(*properties));

	*q = tmp_q;
	return 0;
}

void uninit_queue(struct queue *q)
{
	kfree(q);
}

#if IS_ENABLED(CONFIG_HSA_AMD_SVM)

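/*
 * Take a queue reference on the SVM ranges backing [addr, addr + size).
 * The range must be fully covered by registered SVM ranges that are
 * mapped to this GPU with the always-mapped flag set; otherwise -EINVAL
 * is returned and no references are taken.
 */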
static int kfd_queue_buffer_svm_get(struct kfd_process_device *pdd, u64 addr, u64 size)
{
	struct kfd_process *p = pdd->process;
	struct list_head update_list;
	struct svm_range *prange;
	int ret = -EINVAL;

	INIT_LIST_HEAD(&update_list);
	addr >>= PAGE_SHIFT;
	size >>= PAGE_SHIFT;

	mutex_lock(&p->svms.lock);

	/*
	 * The range may be split into multiple svm pranges aligned to the
	 * granularity boundary.
	 */
	while (size) {
		uint32_t gpuid, gpuidx;
		int r;

		prange = svm_range_from_addr(&p->svms, addr, NULL);
		if (!prange)
			break;

		if (!prange->mapped_to_gpu)
			break;

		r = kfd_process_gpuid_from_node(p, pdd->dev, &gpuid, &gpuidx);
		if (r < 0)
			break;
		if (!test_bit(gpuidx, prange->bitmap_access) &&
		    !test_bit(gpuidx, prange->bitmap_aip))
			break;

		if (!(prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED))
			break;

		list_add(&prange->update_list, &update_list);

		if (prange->last - prange->start + 1 >= size) {
			size = 0;
			break;
		}

		size -= prange->last - prange->start + 1;
		addr += prange->last - prange->start + 1;
	}
	if (size) {
		pr_debug("[0x%llx 0x%llx] not registered\n", addr, addr + size - 1);
		goto out_unlock;
	}

	list_for_each_entry(prange, &update_list, update_list)
		atomic_inc(&prange->queue_refcount);
	ret = 0;

out_unlock:
	mutex_unlock(&p->svms.lock);
	return ret;
}

static void kfd_queue_buffer_svm_put(struct kfd_process_device *pdd, u64 addr, u64 size)
{
	struct kfd_process *p = pdd->process;
	struct svm_range *prange, *pchild;
	struct interval_tree_node *node;
	unsigned long last;

	addr >>= PAGE_SHIFT;
	last = addr + (size >> PAGE_SHIFT) - 1;

	mutex_lock(&p->svms.lock);

	node = interval_tree_iter_first(&p->svms.objects, addr, last);
	while (node) {
		struct interval_tree_node *next_node;
		unsigned long next_start;

		prange = container_of(node, struct svm_range, it_node);
		next_node = interval_tree_iter_next(node, addr, last);
		next_start = min(node->last, last) + 1;

		if (atomic_add_unless(&prange->queue_refcount, -1, 0)) {
			list_for_each_entry(pchild, &prange->child_list, child_list)
				atomic_add_unless(&pchild->queue_refcount, -1, 0);
		}

		node = next_node;
		addr = next_start;
	}

	mutex_unlock(&p->svms.lock);
}
#else

static int kfd_queue_buffer_svm_get(struct kfd_process_device *pdd, u64 addr, u64 size)
{
	return -EINVAL;
}

static void kfd_queue_buffer_svm_put(struct kfd_process_device *pdd, u64 addr, u64 size)
{
}

#endif

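/*
 * Look up the GPU VM mapping that backs a user mode queue buffer. The
 * mapping must start exactly at addr and, if expected_size is non-zero,
 * cover exactly that many bytes. On success *pbo holds a reference to the
 * backing BO and the bo_va queue_refcount is raised so the mapping cannot
 * be released while the queue is using it.
 */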
int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct amdgpu_bo **pbo,
			 u64 expected_size)
{
	struct amdgpu_bo_va_mapping *mapping;
	u64 user_addr;
	u64 size;

	user_addr = (u64)addr >> AMDGPU_GPU_PAGE_SHIFT;
	size = expected_size >> AMDGPU_GPU_PAGE_SHIFT;

	mapping = amdgpu_vm_bo_lookup_mapping(vm, user_addr);
	if (!mapping)
		goto out_err;

	if (user_addr != mapping->start ||
	    (size != 0 && user_addr + size - 1 != mapping->last)) {
		pr_debug("expected size 0x%llx not equal to mapping addr 0x%llx size 0x%llx\n",
			 expected_size, mapping->start << AMDGPU_GPU_PAGE_SHIFT,
			 (mapping->last - mapping->start + 1) << AMDGPU_GPU_PAGE_SHIFT);
		goto out_err;
	}

	*pbo = amdgpu_bo_ref(mapping->bo_va->base.bo);
	mapping->bo_va->queue_refcount++;
	return 0;

out_err:
	*pbo = NULL;
	return -EINVAL;
}

/* FIXME: remove this function, just call amdgpu_bo_unref directly */
void kfd_queue_buffer_put(struct amdgpu_bo **bo)
{
	amdgpu_bo_unref(bo);
}

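/*
 * Validate and take references on every buffer a user queue depends on:
 * the read/write pointer pages, the ring buffer and, for compute queues,
 * the EOP buffer and the CWSR area. The CWSR area may be backed either by
 * a GPU VM mapping or by SVM ranges. On failure, all references taken so
 * far are dropped again before returning.
 */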
int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_properties *properties)
{
	struct kfd_topology_device *topo_dev;
	u64 expected_queue_size;
	struct amdgpu_vm *vm;
	u32 total_cwsr_size;
	int err;

	topo_dev = kfd_topology_device_by_id(pdd->dev->id);
	if (!topo_dev)
		return -EINVAL;

	/* AQL queues on GFX7 and GFX8 appear twice their actual size */
	if (properties->type == KFD_QUEUE_TYPE_COMPUTE &&
	    properties->format == KFD_QUEUE_FORMAT_AQL &&
	    topo_dev->node_props.gfx_target_version >= 70000 &&
	    topo_dev->node_props.gfx_target_version < 90000)
		expected_queue_size = properties->queue_size / 2;
	else
		expected_queue_size = properties->queue_size;

	vm = drm_priv_to_vm(pdd->drm_priv);
	err = amdgpu_bo_reserve(vm->root.bo, false);
	if (err)
		return err;

	err = kfd_queue_buffer_get(vm, properties->write_ptr, &properties->wptr_bo, PAGE_SIZE);
	if (err)
		goto out_err_unreserve;

	err = kfd_queue_buffer_get(vm, properties->read_ptr, &properties->rptr_bo, PAGE_SIZE);
	if (err)
		goto out_err_unreserve;

	err = kfd_queue_buffer_get(vm, (void *)properties->queue_address,
				   &properties->ring_bo, expected_queue_size);
	if (err)
		goto out_err_unreserve;

	/* only compute queue requires EOP buffer and CWSR area */
	if (properties->type != KFD_QUEUE_TYPE_COMPUTE)
		goto out_unreserve;

	/* EOP buffer is not required for all ASICs */
	if (properties->eop_ring_buffer_address) {
		if (properties->eop_ring_buffer_size != topo_dev->node_props.eop_buffer_size) {
			pr_debug("queue eop bo size 0x%x not equal to node eop buf size 0x%x\n",
				 properties->eop_ring_buffer_size,
				 topo_dev->node_props.eop_buffer_size);
			err = -EINVAL;
			goto out_err_unreserve;
		}
		err = kfd_queue_buffer_get(vm, (void *)properties->eop_ring_buffer_address,
					   &properties->eop_buf_bo,
					   properties->eop_ring_buffer_size);
		if (err)
			goto out_err_unreserve;
	}

	if (properties->ctl_stack_size != topo_dev->node_props.ctl_stack_size) {
		pr_debug("queue ctl stack size 0x%x not equal to node ctl stack size 0x%x\n",
			 properties->ctl_stack_size,
			 topo_dev->node_props.ctl_stack_size);
		err = -EINVAL;
		goto out_err_unreserve;
	}

	if (properties->ctx_save_restore_area_size != topo_dev->node_props.cwsr_size) {
		pr_debug("queue cwsr size 0x%x not equal to node cwsr size 0x%x\n",
			 properties->ctx_save_restore_area_size,
			 topo_dev->node_props.cwsr_size);
		err = -EINVAL;
		goto out_err_unreserve;
	}

	total_cwsr_size = (topo_dev->node_props.cwsr_size + topo_dev->node_props.debug_memory_size)
			  * NUM_XCC(pdd->dev->xcc_mask);
	total_cwsr_size = ALIGN(total_cwsr_size, PAGE_SIZE);

	err = kfd_queue_buffer_get(vm, (void *)properties->ctx_save_restore_area_address,
				   &properties->cwsr_bo, total_cwsr_size);
	if (!err)
		goto out_unreserve;

	amdgpu_bo_unreserve(vm->root.bo);

	err = kfd_queue_buffer_svm_get(pdd, properties->ctx_save_restore_area_address,
				       total_cwsr_size);
	if (err)
		goto out_err_release;

	return 0;

out_unreserve:
	amdgpu_bo_unreserve(vm->root.bo);
	return 0;

out_err_unreserve:
	amdgpu_bo_unreserve(vm->root.bo);
out_err_release:
	/* FIXME: make a _locked version of this that can be called before
	 * dropping the VM reservation.
	 */
	kfd_queue_unref_bo_vas(pdd, properties);
	kfd_queue_release_buffers(pdd, properties);
	return err;
}

int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_properties *properties)
{
	struct kfd_topology_device *topo_dev;
	u32 total_cwsr_size;

	kfd_queue_buffer_put(&properties->wptr_bo);
	kfd_queue_buffer_put(&properties->rptr_bo);
	kfd_queue_buffer_put(&properties->ring_bo);
	kfd_queue_buffer_put(&properties->eop_buf_bo);
	kfd_queue_buffer_put(&properties->cwsr_bo);

	topo_dev = kfd_topology_device_by_id(pdd->dev->id);
	if (!topo_dev)
		return -EINVAL;
	total_cwsr_size = (topo_dev->node_props.cwsr_size + topo_dev->node_props.debug_memory_size)
			  * NUM_XCC(pdd->dev->xcc_mask);
	total_cwsr_size = ALIGN(total_cwsr_size, PAGE_SIZE);

	kfd_queue_buffer_svm_put(pdd, properties->ctx_save_restore_area_address, total_cwsr_size);
	return 0;
}

void kfd_queue_unref_bo_va(struct amdgpu_vm *vm, struct amdgpu_bo **bo)
{
	if (*bo) {
		struct amdgpu_bo_va *bo_va;

		bo_va = amdgpu_vm_bo_find(vm, *bo);
		if (bo_va && bo_va->queue_refcount)
			bo_va->queue_refcount--;
	}
}

int kfd_queue_unref_bo_vas(struct kfd_process_device *pdd,
			   struct queue_properties *properties)
{
	struct amdgpu_vm *vm;
	int err;

	vm = drm_priv_to_vm(pdd->drm_priv);
	err = amdgpu_bo_reserve(vm->root.bo, false);
	if (err)
		return err;

	kfd_queue_unref_bo_va(vm, &properties->wptr_bo);
	kfd_queue_unref_bo_va(vm, &properties->rptr_bo);
	kfd_queue_unref_bo_va(vm, &properties->ring_bo);
	kfd_queue_unref_bo_va(vm, &properties->eop_buf_bo);
	kfd_queue_unref_bo_va(vm, &properties->cwsr_bo);

	amdgpu_bo_unreserve(vm->root.bo);
	return 0;
}

#define SGPR_SIZE_PER_CU	0x4000
#define LDS_SIZE_PER_CU		0x10000
#define HWREG_SIZE_PER_CU	0x1000
#define DEBUGGER_BYTES_ALIGN	64
#define DEBUGGER_BYTES_PER_WAVE	32

static u32 kfd_get_vgpr_size_per_cu(u32 gfxv)
{
	u32 vgpr_size = 0x40000;

	if ((gfxv / 100 * 100) == 90400 ||	/* GFX_VERSION_AQUA_VANJARAM */
	    gfxv == 90010 ||			/* GFX_VERSION_ALDEBARAN */
	    gfxv == 90008 ||			/* GFX_VERSION_ARCTURUS */
	    gfxv == 90500)
		vgpr_size = 0x80000;
	else if (gfxv == 110000 ||		/* GFX_VERSION_PLUM_BONITO */
		 gfxv == 110001 ||		/* GFX_VERSION_WHEAT_NAS */
		 gfxv == 120000 ||		/* GFX_VERSION_GFX1200 */
		 gfxv == 120001)		/* GFX_VERSION_GFX1201 */
		vgpr_size = 0x60000;

	return vgpr_size;
}

#define WG_CONTEXT_DATA_SIZE_PER_CU(gfxv, props)	\
	(kfd_get_vgpr_size_per_cu(gfxv) + SGPR_SIZE_PER_CU +\
	 (((gfxv) == 90500) ? (props->lds_size_in_kb << 10) : LDS_SIZE_PER_CU) +\
	 HWREG_SIZE_PER_CU)

#define CNTL_STACK_BYTES_PER_WAVE(gfxv)	\
	((gfxv) >= 100100 ? 12 : 8)	/* GFX_VERSION_NAVI10 */

#define SIZEOF_HSA_USER_CONTEXT_SAVE_AREA_HEADER 40

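/*
 * Derive the per-node CWSR (compute wave save/restore) layout from the GFX
 * IP version and the CU/wave counts: control stack size, debugger memory
 * size, total CWSR area size and the EOP buffer size.
 */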
void kfd_queue_ctx_save_restore_size(struct kfd_topology_device *dev)
{
	struct kfd_node_properties *props = &dev->node_props;
	u32 gfxv = props->gfx_target_version;
	u32 ctl_stack_size;
	u32 wg_data_size;
	u32 wave_num;
	u32 cu_num;

	if (gfxv < 80001)	/* GFX_VERSION_CARRIZO */
		return;

	cu_num = props->simd_count / props->simd_per_cu / NUM_XCC(dev->gpu->xcc_mask);
	wave_num = (gfxv < 100100) ?	/* GFX_VERSION_NAVI10 */
		   min(cu_num * 40, props->array_count / props->simd_arrays_per_engine * 512)
		   : cu_num * 32;

	wg_data_size = ALIGN(cu_num * WG_CONTEXT_DATA_SIZE_PER_CU(gfxv, props), PAGE_SIZE);
	ctl_stack_size = wave_num * CNTL_STACK_BYTES_PER_WAVE(gfxv) + 8;
	ctl_stack_size = ALIGN(SIZEOF_HSA_USER_CONTEXT_SAVE_AREA_HEADER + ctl_stack_size,
			       PAGE_SIZE);

	if ((gfxv / 10000 * 10000) == 100000) {
		/* HW design limits control stack size to 0x7000.
		 * This is insufficient for theoretical PM4 cases
		 * but sufficient for AQL, limited by SPI events.
		 */
		ctl_stack_size = min(ctl_stack_size, 0x7000);
	}

	props->ctl_stack_size = ctl_stack_size;
	props->debug_memory_size = ALIGN(wave_num * DEBUGGER_BYTES_PER_WAVE, DEBUGGER_BYTES_ALIGN);
	props->cwsr_size = ctl_stack_size + wg_data_size;

	if (gfxv == 80002)	/* GFX_VERSION_TONGA */
		props->eop_buffer_size = 0x8000;
	else if ((gfxv / 100 * 100) == 90400)	/* GFX_VERSION_AQUA_VANJARAM */
		props->eop_buffer_size = 4096;
	else if (gfxv >= 80000)
		props->eop_buffer_size = 4096;
}