1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright 2023 Advanced Micro Devices, Inc. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be included in 13 * all copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 21 * OTHER DEALINGS IN THE SOFTWARE. 22 * 23 */ 24 25 #include <drm/drm_auth.h> 26 #include <drm/drm_exec.h> 27 #include <linux/pm_runtime.h> 28 #include <drm/drm_drv.h> 29 30 #include "amdgpu.h" 31 #include "amdgpu_reset.h" 32 #include "amdgpu_vm.h" 33 #include "amdgpu_userq.h" 34 #include "amdgpu_hmm.h" 35 #include "amdgpu_userq_fence.h" 36 37 u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev) 38 { 39 int i; 40 u32 userq_ip_mask = 0; 41 42 for (i = 0; i < AMDGPU_HW_IP_NUM; i++) { 43 if (adev->userq_funcs[i]) 44 userq_ip_mask |= (1 << i); 45 } 46 47 return userq_ip_mask; 48 } 49 50 static bool amdgpu_userq_is_reset_type_supported(struct amdgpu_device *adev, 51 enum amdgpu_ring_type ring_type, int reset_type) 52 { 53 54 if (ring_type < 0 || ring_type >= AMDGPU_RING_TYPE_MAX) 55 return false; 56 57 switch (ring_type) { 58 case AMDGPU_RING_TYPE_GFX: 59 if (adev->gfx.gfx_supported_reset & reset_type) 60 return true; 61 break; 62 case AMDGPU_RING_TYPE_COMPUTE: 63 if (adev->gfx.compute_supported_reset & reset_type) 64 return true; 65 break; 66 case AMDGPU_RING_TYPE_SDMA: 67 if (adev->sdma.supported_reset & reset_type) 68 return true; 69 break; 70 case AMDGPU_RING_TYPE_VCN_DEC: 71 case AMDGPU_RING_TYPE_VCN_ENC: 72 if (adev->vcn.supported_reset & reset_type) 73 return true; 74 break; 75 case AMDGPU_RING_TYPE_VCN_JPEG: 76 if (adev->jpeg.supported_reset & reset_type) 77 return true; 78 break; 79 default: 80 break; 81 } 82 return false; 83 } 84 85 static void amdgpu_userq_mgr_reset_work(struct work_struct *work) 86 { 87 struct amdgpu_userq_mgr *uq_mgr = 88 container_of(work, struct amdgpu_userq_mgr, 89 reset_work); 90 struct amdgpu_device *adev = uq_mgr->adev; 91 const int queue_types[] = { 92 AMDGPU_RING_TYPE_COMPUTE, 93 AMDGPU_RING_TYPE_GFX, 94 AMDGPU_RING_TYPE_SDMA 95 }; 96 const int num_queue_types = ARRAY_SIZE(queue_types); 97 bool gpu_reset = false; 98 int i, r; 99 100 if (unlikely(adev->debug_disable_gpu_ring_reset)) { 101 dev_err(adev->dev, "userq reset disabled by debug mask\n"); 102 return; 103 } 104 105 /* 106 * If GPU recovery feature is disabled system-wide, 107 * skip all reset detection logic 108 */ 109 if (!amdgpu_gpu_recovery) 110 return; 111 112 /* 113 * Iterate through all queue types to detect and reset problematic queues 114 * Process each queue type in the defined order 115 */ 116 for (i = 0; i < num_queue_types; i++) { 117 int ring_type = queue_types[i]; 118 const struct amdgpu_userq_funcs *funcs = 119 adev->userq_funcs[ring_type]; 120 121 if (!amdgpu_userq_is_reset_type_supported(adev, ring_type, 122 AMDGPU_RESET_TYPE_PER_QUEUE)) 123 continue; 124 125 if (atomic_read(&uq_mgr->userq_count[ring_type]) > 0 && 126 funcs && funcs->detect_and_reset) { 127 r = funcs->detect_and_reset(adev, ring_type); 128 if (r) { 129 gpu_reset = true; 130 break; 131 } 132 } 133 } 134 135 if (gpu_reset) { 136 struct amdgpu_reset_context reset_context; 137 138 memset(&reset_context, 0, sizeof(reset_context)); 139 140 reset_context.method = AMD_RESET_METHOD_NONE; 141 reset_context.reset_req_dev = adev; 142 reset_context.src = AMDGPU_RESET_SRC_USERQ; 143 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 144 /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/ 145 146 amdgpu_device_gpu_recover(adev, NULL, &reset_context); 147 } 148 } 149 150 static void amdgpu_userq_hang_detect_work(struct work_struct *work) 151 { 152 struct amdgpu_usermode_queue *queue = 153 container_of(work, struct amdgpu_usermode_queue, 154 hang_detect_work.work); 155 156 /* 157 * Don't schedule the work here! Scheduling or queue work from one reset 158 * handler to another is illegal if you don't take extra precautions! 159 */ 160 amdgpu_userq_mgr_reset_work(&queue->userq_mgr->reset_work); 161 } 162 163 /* 164 * Start hang detection for a user queue fence. A delayed work will be scheduled 165 * to reset the queues when the fence doesn't signal in time. 166 */ 167 void amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue *queue) 168 { 169 struct amdgpu_device *adev; 170 unsigned long timeout_ms; 171 172 adev = queue->userq_mgr->adev; 173 /* Determine timeout based on queue type */ 174 switch (queue->queue_type) { 175 case AMDGPU_RING_TYPE_GFX: 176 timeout_ms = adev->gfx_timeout; 177 break; 178 case AMDGPU_RING_TYPE_COMPUTE: 179 timeout_ms = adev->compute_timeout; 180 break; 181 case AMDGPU_RING_TYPE_SDMA: 182 timeout_ms = adev->sdma_timeout; 183 break; 184 default: 185 timeout_ms = adev->gfx_timeout; 186 break; 187 } 188 189 queue_delayed_work(adev->reset_domain->wq, &queue->hang_detect_work, 190 msecs_to_jiffies(timeout_ms)); 191 } 192 193 void amdgpu_userq_process_fence_irq(struct amdgpu_device *adev, u32 doorbell) 194 { 195 struct xarray *xa = &adev->userq_doorbell_xa; 196 struct amdgpu_usermode_queue *queue; 197 unsigned long flags; 198 int r; 199 200 xa_lock_irqsave(xa, flags); 201 queue = xa_load(xa, doorbell); 202 if (queue) { 203 r = amdgpu_userq_fence_driver_process(queue->fence_drv); 204 /* 205 * We are in interrupt context here, this *can't* wait for 206 * reset work to finish. 207 */ 208 if (r >= 0) 209 cancel_delayed_work(&queue->hang_detect_work); 210 211 /* Restart the timer when there are still fences pending */ 212 if (r == 1) 213 amdgpu_userq_start_hang_detect_work(queue); 214 } 215 xa_unlock_irqrestore(xa, flags); 216 } 217 218 int amdgpu_userq_input_va_validate(struct amdgpu_device *adev, 219 struct amdgpu_usermode_queue *queue, 220 u64 addr, u64 expected_size, 221 u64 *va_out) 222 { 223 struct amdgpu_bo_va_mapping *va_map; 224 struct amdgpu_vm *vm = queue->vm; 225 u64 user_addr; 226 u64 size; 227 228 /* Caller must hold vm->root.bo reservation */ 229 dma_resv_assert_held(queue->vm->root.bo->tbo.base.resv); 230 231 user_addr = (addr & AMDGPU_GMC_HOLE_MASK) >> AMDGPU_GPU_PAGE_SHIFT; 232 size = expected_size >> AMDGPU_GPU_PAGE_SHIFT; 233 234 va_map = amdgpu_vm_bo_lookup_mapping(vm, user_addr); 235 if (!va_map) 236 return -EINVAL; 237 238 /* Only validate the userq whether resident in the VM mapping range */ 239 if (user_addr >= va_map->start && 240 va_map->last - user_addr + 1 >= size) { 241 va_map->bo_va->userq_va_mapped = true; 242 *va_out = user_addr; 243 return 0; 244 } 245 246 return -EINVAL; 247 } 248 249 static bool amdgpu_userq_buffer_va_mapped(struct amdgpu_vm *vm, u64 addr) 250 { 251 struct amdgpu_bo_va_mapping *mapping; 252 bool r; 253 254 dma_resv_assert_held(vm->root.bo->tbo.base.resv); 255 256 mapping = amdgpu_vm_bo_lookup_mapping(vm, addr); 257 if (!IS_ERR_OR_NULL(mapping) && mapping->bo_va->userq_va_mapped) 258 r = true; 259 else 260 r = false; 261 262 return r; 263 } 264 265 static bool amdgpu_userq_buffer_vas_mapped(struct amdgpu_usermode_queue *queue) 266 { 267 int i, r = 0; 268 269 for (i = 0; i < ARRAY_SIZE(queue->userq_vas.va_array); i++) { 270 if (!queue->userq_vas.va_array[i]) 271 continue; 272 r += amdgpu_userq_buffer_va_mapped(queue->vm, 273 queue->userq_vas.va_array[i]); 274 dev_dbg(queue->userq_mgr->adev->dev, 275 "validate the userq mapping:%p va:%llx r:%d\n", 276 queue, queue->userq_vas.va_array[i], r); 277 } 278 279 if (r != 0) 280 return true; 281 282 return false; 283 } 284 285 286 287 static int amdgpu_userq_preempt_helper(struct amdgpu_usermode_queue *queue) 288 { 289 struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr; 290 struct amdgpu_device *adev = uq_mgr->adev; 291 const struct amdgpu_userq_funcs *userq_funcs = 292 adev->userq_funcs[queue->queue_type]; 293 int r; 294 295 if (queue->state == AMDGPU_USERQ_STATE_MAPPED) { 296 r = userq_funcs->preempt(queue); 297 if (r) { 298 queue->state = AMDGPU_USERQ_STATE_HUNG; 299 return r; 300 } else { 301 queue->state = AMDGPU_USERQ_STATE_PREEMPTED; 302 } 303 } 304 return 0; 305 } 306 307 static int amdgpu_userq_restore_helper(struct amdgpu_usermode_queue *queue) 308 { 309 struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr; 310 struct amdgpu_device *adev = uq_mgr->adev; 311 const struct amdgpu_userq_funcs *userq_funcs = 312 adev->userq_funcs[queue->queue_type]; 313 int r = 0; 314 315 if (queue->state == AMDGPU_USERQ_STATE_PREEMPTED) { 316 r = userq_funcs->restore(queue); 317 if (r) { 318 queue->state = AMDGPU_USERQ_STATE_HUNG; 319 } else { 320 queue->state = AMDGPU_USERQ_STATE_MAPPED; 321 } 322 } 323 324 return r; 325 } 326 327 static int amdgpu_userq_unmap_helper(struct amdgpu_usermode_queue *queue) 328 { 329 struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr; 330 struct amdgpu_device *adev = uq_mgr->adev; 331 const struct amdgpu_userq_funcs *userq_funcs = 332 adev->userq_funcs[queue->queue_type]; 333 int r; 334 335 if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) || 336 (queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) { 337 338 r = userq_funcs->unmap(queue); 339 if (r) { 340 queue->state = AMDGPU_USERQ_STATE_HUNG; 341 return r; 342 } else { 343 queue->state = AMDGPU_USERQ_STATE_UNMAPPED; 344 } 345 } 346 347 return 0; 348 } 349 350 static int amdgpu_userq_map_helper(struct amdgpu_usermode_queue *queue) 351 { 352 struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr; 353 struct amdgpu_device *adev = uq_mgr->adev; 354 const struct amdgpu_userq_funcs *userq_funcs = 355 adev->userq_funcs[queue->queue_type]; 356 int r; 357 358 if (queue->state == AMDGPU_USERQ_STATE_UNMAPPED) { 359 r = userq_funcs->map(queue); 360 if (r) { 361 queue->state = AMDGPU_USERQ_STATE_HUNG; 362 return r; 363 } else { 364 queue->state = AMDGPU_USERQ_STATE_MAPPED; 365 } 366 } 367 368 return 0; 369 } 370 371 static void amdgpu_userq_wait_for_last_fence(struct amdgpu_usermode_queue *queue) 372 { 373 struct dma_fence *f = queue->last_fence; 374 375 if (!f) 376 return; 377 378 dma_fence_wait(f, false); 379 } 380 381 static void amdgpu_userq_cleanup(struct amdgpu_usermode_queue *queue) 382 { 383 struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr; 384 struct amdgpu_device *adev = uq_mgr->adev; 385 386 /* Wait for mode-1 reset to complete */ 387 down_read(&adev->reset_domain->sem); 388 389 /* Use interrupt-safe locking since IRQ handlers may access these XArrays */ 390 xa_erase_irq(&adev->userq_doorbell_xa, queue->doorbell_index); 391 amdgpu_userq_fence_driver_free(queue); 392 queue->fence_drv = NULL; 393 394 up_read(&adev->reset_domain->sem); 395 } 396 397 /** 398 * amdgpu_userq_ensure_ev_fence - ensure a valid, unsignaled eviction fence exists 399 * @uq_mgr: the usermode queue manager for this process 400 * @evf_mgr: the eviction fence manager to check and rearm 401 * 402 * Ensures that a valid and not yet signaled eviction fence is attached to the 403 * usermode queue before any queue operations proceed. If it is signalled, then 404 * rearm a new eviction fence. 405 */ 406 void 407 amdgpu_userq_ensure_ev_fence(struct amdgpu_userq_mgr *uq_mgr, 408 struct amdgpu_eviction_fence_mgr *evf_mgr) 409 { 410 struct dma_fence *ev_fence; 411 412 retry: 413 /* Flush any pending resume work to create ev_fence */ 414 flush_delayed_work(&uq_mgr->resume_work); 415 416 mutex_lock(&uq_mgr->userq_mutex); 417 ev_fence = amdgpu_evf_mgr_get_fence(evf_mgr); 418 if (dma_fence_is_signaled(ev_fence)) { 419 dma_fence_put(ev_fence); 420 mutex_unlock(&uq_mgr->userq_mutex); 421 /* 422 * Looks like there was no pending resume work, 423 * add one now to create a valid eviction fence 424 */ 425 schedule_delayed_work(&uq_mgr->resume_work, 0); 426 goto retry; 427 } 428 dma_fence_put(ev_fence); 429 } 430 431 432 433 static int 434 amdgpu_userq_get_doorbell_index(struct amdgpu_userq_mgr *uq_mgr, 435 struct amdgpu_db_info *db_info, 436 struct drm_file *filp, 437 u64 *index) 438 { 439 u64 doorbell_index; 440 struct drm_gem_object *gobj; 441 struct amdgpu_userq_obj *db_obj = db_info->db_obj; 442 int r, db_size; 443 444 gobj = drm_gem_object_lookup(filp, db_info->doorbell_handle); 445 if (gobj == NULL) { 446 drm_file_err(uq_mgr->file, "Can't find GEM object for doorbell\n"); 447 return -EINVAL; 448 } 449 450 db_obj->obj = amdgpu_bo_ref(gem_to_amdgpu_bo(gobj)); 451 drm_gem_object_put(gobj); 452 453 r = amdgpu_bo_reserve(db_obj->obj, true); 454 if (r) { 455 drm_file_err(uq_mgr->file, "[Usermode queues] Failed to pin doorbell object\n"); 456 goto unref_bo; 457 } 458 459 /* Pin the BO before generating the index, unpin in queue destroy */ 460 r = amdgpu_bo_pin(db_obj->obj, AMDGPU_GEM_DOMAIN_DOORBELL); 461 if (r) { 462 drm_file_err(uq_mgr->file, "[Usermode queues] Failed to pin doorbell object\n"); 463 goto unresv_bo; 464 } 465 466 switch (db_info->queue_type) { 467 case AMDGPU_HW_IP_GFX: 468 case AMDGPU_HW_IP_COMPUTE: 469 case AMDGPU_HW_IP_DMA: 470 db_size = sizeof(u64); 471 break; 472 default: 473 drm_file_err(uq_mgr->file, "[Usermode queues] IP %d not support\n", 474 db_info->queue_type); 475 r = -EINVAL; 476 goto unpin_bo; 477 } 478 479 /* Validate doorbell_offset is within the doorbell BO */ 480 if ((u64)db_info->doorbell_offset * db_size + db_size > 481 amdgpu_bo_size(db_obj->obj)) { 482 r = -EINVAL; 483 goto unpin_bo; 484 } 485 486 doorbell_index = amdgpu_doorbell_index_on_bar(uq_mgr->adev, db_obj->obj, 487 db_info->doorbell_offset, db_size); 488 drm_dbg_driver(adev_to_drm(uq_mgr->adev), 489 "[Usermode queues] doorbell index=%lld\n", doorbell_index); 490 amdgpu_bo_unreserve(db_obj->obj); 491 *index = doorbell_index; 492 return 0; 493 494 unpin_bo: 495 amdgpu_bo_unpin(db_obj->obj); 496 unresv_bo: 497 amdgpu_bo_unreserve(db_obj->obj); 498 unref_bo: 499 amdgpu_bo_unref(&db_obj->obj); 500 return r; 501 } 502 503 static int 504 amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_usermode_queue *queue) 505 { 506 struct amdgpu_device *adev = uq_mgr->adev; 507 const struct amdgpu_userq_funcs *uq_funcs = adev->userq_funcs[queue->queue_type]; 508 int r = 0; 509 510 cancel_delayed_work_sync(&uq_mgr->resume_work); 511 512 /* Cancel any pending hang detection work and cleanup */ 513 cancel_delayed_work_sync(&queue->hang_detect_work); 514 515 mutex_lock(&uq_mgr->userq_mutex); 516 amdgpu_userq_wait_for_last_fence(queue); 517 518 #if defined(CONFIG_DEBUG_FS) 519 debugfs_remove_recursive(queue->debugfs_queue); 520 #endif 521 r = amdgpu_userq_unmap_helper(queue); 522 atomic_dec(&uq_mgr->userq_count[queue->queue_type]); 523 amdgpu_userq_cleanup(queue); 524 mutex_unlock(&uq_mgr->userq_mutex); 525 526 cancel_delayed_work_sync(&queue->hang_detect_work); 527 uq_funcs->mqd_destroy(queue); 528 queue->userq_mgr = NULL; 529 530 amdgpu_bo_reserve(queue->db_obj.obj, true); 531 amdgpu_bo_unpin(queue->db_obj.obj); 532 amdgpu_bo_unreserve(queue->db_obj.obj); 533 amdgpu_bo_unref(&queue->db_obj.obj); 534 535 amdgpu_bo_reserve(queue->wptr_obj.obj, true); 536 amdgpu_bo_unpin(queue->wptr_obj.obj); 537 amdgpu_bo_unreserve(queue->wptr_obj.obj); 538 amdgpu_bo_unref(&queue->wptr_obj.obj); 539 kfree(queue); 540 541 pm_runtime_put_autosuspend(adev_to_drm(adev)->dev); 542 543 return r; 544 } 545 546 static void amdgpu_userq_kref_destroy(struct kref *kref) 547 { 548 int r; 549 struct amdgpu_usermode_queue *queue = 550 container_of(kref, struct amdgpu_usermode_queue, refcount); 551 struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr; 552 553 r = amdgpu_userq_destroy(uq_mgr, queue); 554 if (r) 555 drm_file_err(uq_mgr->file, "Failed to destroy usermode queue %d\n", r); 556 } 557 558 struct amdgpu_usermode_queue *amdgpu_userq_get(struct amdgpu_userq_mgr *uq_mgr, u32 qid) 559 { 560 struct amdgpu_usermode_queue *queue; 561 562 xa_lock(&uq_mgr->userq_xa); 563 queue = xa_load(&uq_mgr->userq_xa, qid); 564 if (queue) 565 kref_get(&queue->refcount); 566 xa_unlock(&uq_mgr->userq_xa); 567 568 return queue; 569 } 570 571 void amdgpu_userq_put(struct amdgpu_usermode_queue *queue) 572 { 573 if (queue) 574 kref_put(&queue->refcount, amdgpu_userq_kref_destroy); 575 } 576 577 static int amdgpu_userq_priority_permit(struct drm_file *filp, 578 int priority) 579 { 580 if (priority < AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_HIGH) 581 return 0; 582 583 if (capable(CAP_SYS_NICE)) 584 return 0; 585 586 if (drm_is_current_master(filp)) 587 return 0; 588 589 return -EACCES; 590 } 591 592 static int 593 amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args) 594 { 595 struct amdgpu_fpriv *fpriv = filp->driver_priv; 596 struct amdgpu_userq_mgr *uq_mgr = &fpriv->userq_mgr; 597 struct amdgpu_device *adev = uq_mgr->adev; 598 const struct amdgpu_userq_funcs *uq_funcs; 599 struct amdgpu_usermode_queue *queue; 600 struct amdgpu_db_info db_info; 601 uint64_t index; 602 int priority; 603 u32 qid; 604 int r; 605 606 priority = 607 (args->in.flags & AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_MASK) 608 >> AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_SHIFT; 609 r = amdgpu_userq_priority_permit(filp, priority); 610 if (r) 611 return r; 612 613 r = pm_runtime_resume_and_get(adev_to_drm(adev)->dev); 614 if (r < 0) { 615 drm_file_err(uq_mgr->file, "pm_runtime_resume_and_get() failed for userqueue create\n"); 616 return r; 617 } 618 619 uq_funcs = adev->userq_funcs[args->in.ip_type]; 620 if (!uq_funcs) { 621 r = -EINVAL; 622 goto err_pm_runtime; 623 } 624 625 queue = kzalloc_obj(struct amdgpu_usermode_queue); 626 if (!queue) { 627 r = -ENOMEM; 628 goto err_pm_runtime; 629 } 630 631 kref_init(&queue->refcount); 632 queue->doorbell_handle = args->in.doorbell_handle; 633 queue->queue_type = args->in.ip_type; 634 queue->vm = &fpriv->vm; 635 queue->priority = priority; 636 queue->userq_mgr = uq_mgr; 637 INIT_DELAYED_WORK(&queue->hang_detect_work, 638 amdgpu_userq_hang_detect_work); 639 640 r = amdgpu_userq_fence_driver_alloc(adev, &queue->fence_drv); 641 if (r) 642 goto free_queue; 643 644 xa_init_flags(&queue->fence_drv_xa, XA_FLAGS_ALLOC); 645 mutex_init(&queue->fence_drv_lock); 646 /* Make sure the queue can actually run with those virtual addresses. */ 647 r = amdgpu_bo_reserve(fpriv->vm.root.bo, false); 648 if (r) 649 goto free_fence_drv; 650 651 if (amdgpu_userq_input_va_validate(adev, queue, args->in.queue_va, 652 args->in.queue_size, 653 &queue->userq_vas.va.queue_rb) || 654 amdgpu_userq_input_va_validate(adev, queue, args->in.rptr_va, 655 AMDGPU_GPU_PAGE_SIZE, 656 &queue->userq_vas.va.rptr) || 657 amdgpu_userq_input_va_validate(adev, queue, args->in.wptr_va, 658 AMDGPU_GPU_PAGE_SIZE, 659 &queue->userq_vas.va.wptr)) { 660 r = -EINVAL; 661 amdgpu_bo_unreserve(fpriv->vm.root.bo); 662 goto free_fence_drv; 663 } 664 amdgpu_bo_unreserve(fpriv->vm.root.bo); 665 666 /* Convert relative doorbell offset into absolute doorbell index */ 667 db_info.queue_type = queue->queue_type; 668 db_info.doorbell_handle = queue->doorbell_handle; 669 db_info.db_obj = &queue->db_obj; 670 db_info.doorbell_offset = args->in.doorbell_offset; 671 r = amdgpu_userq_get_doorbell_index(uq_mgr, &db_info, filp, &index); 672 if (r) { 673 drm_file_err(uq_mgr->file, "Failed to get doorbell for queue\n"); 674 goto free_fence_drv; 675 } 676 677 queue->doorbell_index = index; 678 r = uq_funcs->mqd_create(queue, &args->in); 679 if (r) { 680 drm_file_err(uq_mgr->file, "Failed to create Queue\n"); 681 goto clean_doorbell_bo; 682 } 683 684 /* Update VM owner at userq submit-time for page-fault attribution. */ 685 amdgpu_vm_set_task_info(&fpriv->vm); 686 687 r = xa_err(xa_store_irq(&adev->userq_doorbell_xa, index, queue, 688 GFP_KERNEL)); 689 if (r) 690 goto clean_mqd; 691 692 amdgpu_userq_ensure_ev_fence(&fpriv->userq_mgr, &fpriv->evf_mgr); 693 694 /* don't map the queue if scheduling is halted */ 695 if (!adev->userq_halt_for_enforce_isolation || 696 ((queue->queue_type != AMDGPU_HW_IP_GFX) && 697 (queue->queue_type != AMDGPU_HW_IP_COMPUTE))) { 698 r = amdgpu_userq_map_helper(queue); 699 if (r) { 700 drm_file_err(uq_mgr->file, "Failed to map Queue\n"); 701 mutex_unlock(&uq_mgr->userq_mutex); 702 goto erase_doorbell; 703 } 704 } 705 706 atomic_inc(&uq_mgr->userq_count[queue->queue_type]); 707 mutex_unlock(&uq_mgr->userq_mutex); 708 709 r = xa_alloc(&uq_mgr->userq_xa, &qid, queue, 710 XA_LIMIT(1, AMDGPU_MAX_USERQ_COUNT), 711 GFP_KERNEL); 712 if (r) { 713 /* 714 * This drops the last reference which should take care of 715 * all cleanup. 716 */ 717 amdgpu_userq_put(queue); 718 return r; 719 } 720 721 amdgpu_debugfs_userq_init(filp, queue, qid); 722 args->out.queue_id = qid; 723 return 0; 724 725 erase_doorbell: 726 xa_erase_irq(&adev->userq_doorbell_xa, index); 727 clean_mqd: 728 uq_funcs->mqd_destroy(queue); 729 clean_doorbell_bo: 730 amdgpu_bo_reserve(queue->db_obj.obj, true); 731 amdgpu_bo_unpin(queue->db_obj.obj); 732 amdgpu_bo_unreserve(queue->db_obj.obj); 733 amdgpu_bo_unref(&queue->db_obj.obj); 734 free_fence_drv: 735 amdgpu_userq_fence_driver_free(queue); 736 free_queue: 737 kfree(queue); 738 err_pm_runtime: 739 pm_runtime_put_autosuspend(adev_to_drm(adev)->dev); 740 return r; 741 } 742 743 static int amdgpu_userq_input_args_validate(struct drm_device *dev, 744 union drm_amdgpu_userq *args, 745 struct drm_file *filp) 746 { 747 struct amdgpu_device *adev = drm_to_adev(dev); 748 749 switch (args->in.op) { 750 case AMDGPU_USERQ_OP_CREATE: 751 if (args->in.flags & ~(AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_MASK | 752 AMDGPU_USERQ_CREATE_FLAGS_QUEUE_SECURE)) 753 return -EINVAL; 754 /* Usermode queues are only supported for GFX IP as of now */ 755 if (args->in.ip_type != AMDGPU_HW_IP_GFX && 756 args->in.ip_type != AMDGPU_HW_IP_DMA && 757 args->in.ip_type != AMDGPU_HW_IP_COMPUTE) { 758 drm_file_err(filp, "Usermode queue doesn't support IP type %u\n", 759 args->in.ip_type); 760 return -EINVAL; 761 } 762 763 if ((args->in.flags & AMDGPU_USERQ_CREATE_FLAGS_QUEUE_SECURE) && 764 (args->in.ip_type != AMDGPU_HW_IP_GFX) && 765 (args->in.ip_type != AMDGPU_HW_IP_COMPUTE) && 766 !amdgpu_is_tmz(adev)) { 767 drm_file_err(filp, "Secure only supported on GFX/Compute queues\n"); 768 return -EINVAL; 769 } 770 771 if (args->in.queue_va == AMDGPU_BO_INVALID_OFFSET || 772 args->in.queue_va == 0 || 773 args->in.queue_size == 0) { 774 drm_file_err(filp, "invalidate userq queue va or size\n"); 775 return -EINVAL; 776 } 777 778 if (!is_power_of_2(args->in.queue_size)) { 779 drm_file_err(filp, "Queue size must be a power of 2\n"); 780 return -EINVAL; 781 } 782 783 if (args->in.queue_size < AMDGPU_GPU_PAGE_SIZE) { 784 drm_file_err(filp, "Queue size smaller than AMDGPU_GPU_PAGE_SIZE\n"); 785 return -EINVAL; 786 } 787 788 if (!args->in.wptr_va || !args->in.rptr_va) { 789 drm_file_err(filp, "invalidate userq queue rptr or wptr\n"); 790 return -EINVAL; 791 } 792 break; 793 case AMDGPU_USERQ_OP_FREE: 794 if (args->in.ip_type || 795 args->in.doorbell_handle || 796 args->in.doorbell_offset || 797 args->in.flags || 798 args->in.queue_va || 799 args->in.queue_size || 800 args->in.rptr_va || 801 args->in.wptr_va || 802 args->in.mqd || 803 args->in.mqd_size) 804 return -EINVAL; 805 break; 806 default: 807 return -EINVAL; 808 } 809 810 return 0; 811 } 812 813 bool amdgpu_userq_enabled(struct drm_device *dev) 814 { 815 struct amdgpu_device *adev = drm_to_adev(dev); 816 int i; 817 818 for (i = 0; i < AMDGPU_HW_IP_NUM; i++) { 819 if (adev->userq_funcs[i]) 820 return true; 821 } 822 823 return false; 824 } 825 826 int amdgpu_userq_ioctl(struct drm_device *dev, void *data, 827 struct drm_file *filp) 828 { 829 union drm_amdgpu_userq *args = data; 830 struct amdgpu_fpriv *fpriv = filp->driver_priv; 831 struct amdgpu_usermode_queue *queue; 832 int r = 0; 833 834 if (!amdgpu_userq_enabled(dev)) 835 return -ENOTSUPP; 836 837 if (amdgpu_userq_input_args_validate(dev, args, filp) < 0) 838 return -EINVAL; 839 840 switch (args->in.op) { 841 case AMDGPU_USERQ_OP_CREATE: 842 r = amdgpu_userq_create(filp, args); 843 if (r) 844 drm_file_err(filp, "Failed to create usermode queue\n"); 845 break; 846 847 case AMDGPU_USERQ_OP_FREE: { 848 xa_lock(&fpriv->userq_mgr.userq_xa); 849 queue = __xa_erase(&fpriv->userq_mgr.userq_xa, args->in.queue_id); 850 xa_unlock(&fpriv->userq_mgr.userq_xa); 851 if (!queue) 852 return -ENOENT; 853 854 amdgpu_userq_put(queue); 855 break; 856 } 857 858 default: 859 drm_dbg_driver(dev, "Invalid user queue op specified: %d\n", args->in.op); 860 return -EINVAL; 861 } 862 863 return r; 864 } 865 866 static int 867 amdgpu_userq_restore_all(struct amdgpu_userq_mgr *uq_mgr) 868 { 869 struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr); 870 struct amdgpu_vm *vm = &fpriv->vm; 871 struct amdgpu_usermode_queue *queue; 872 unsigned long queue_id; 873 int ret = 0, r; 874 875 876 if (amdgpu_bo_reserve(vm->root.bo, false)) 877 return false; 878 879 mutex_lock(&uq_mgr->userq_mutex); 880 /* Resume all the queues for this process */ 881 xa_for_each(&uq_mgr->userq_xa, queue_id, queue) { 882 883 if (!amdgpu_userq_buffer_vas_mapped(queue)) { 884 drm_file_err(uq_mgr->file, 885 "trying restore queue without va mapping\n"); 886 queue->state = AMDGPU_USERQ_STATE_INVALID_VA; 887 continue; 888 } 889 890 r = amdgpu_userq_restore_helper(queue); 891 if (r) 892 ret = r; 893 894 } 895 mutex_unlock(&uq_mgr->userq_mutex); 896 amdgpu_bo_unreserve(vm->root.bo); 897 898 if (ret) 899 drm_file_err(uq_mgr->file, 900 "Failed to map all the queues, restore failed ret=%d\n", ret); 901 return ret; 902 } 903 904 static int amdgpu_userq_validate_vm(void *param, struct amdgpu_bo *bo) 905 { 906 struct ttm_operation_ctx ctx = { false, false }; 907 908 amdgpu_bo_placement_from_domain(bo, bo->allowed_domains); 909 return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); 910 } 911 912 /* Handle all BOs on the invalidated list, validate them and update the PTs */ 913 static int 914 amdgpu_userq_bo_validate(struct amdgpu_device *adev, struct drm_exec *exec, 915 struct amdgpu_vm *vm) 916 { 917 struct ttm_operation_ctx ctx = { false, false }; 918 struct amdgpu_bo_va *bo_va; 919 struct amdgpu_bo *bo; 920 int ret; 921 922 spin_lock(&vm->status_lock); 923 while (!list_empty(&vm->invalidated)) { 924 bo_va = list_first_entry(&vm->invalidated, 925 struct amdgpu_bo_va, 926 base.vm_status); 927 spin_unlock(&vm->status_lock); 928 929 bo = bo_va->base.bo; 930 ret = drm_exec_prepare_obj(exec, &bo->tbo.base, 2); 931 if (unlikely(ret)) 932 return ret; 933 934 amdgpu_bo_placement_from_domain(bo, bo->allowed_domains); 935 ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); 936 if (ret) 937 return ret; 938 939 /* This moves the bo_va to the done list */ 940 ret = amdgpu_vm_bo_update(adev, bo_va, false); 941 if (ret) 942 return ret; 943 944 spin_lock(&vm->status_lock); 945 } 946 spin_unlock(&vm->status_lock); 947 948 return 0; 949 } 950 951 /* Make sure the whole VM is ready to be used */ 952 static int 953 amdgpu_userq_vm_validate(struct amdgpu_userq_mgr *uq_mgr) 954 { 955 struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr); 956 bool invalidated = false, new_addition = false; 957 struct ttm_operation_ctx ctx = { true, false }; 958 struct amdgpu_device *adev = uq_mgr->adev; 959 struct amdgpu_hmm_range *range; 960 struct amdgpu_vm *vm = &fpriv->vm; 961 unsigned long key, tmp_key; 962 struct amdgpu_bo_va *bo_va; 963 struct amdgpu_bo *bo; 964 struct drm_exec exec; 965 struct xarray xa; 966 int ret; 967 968 xa_init(&xa); 969 970 retry_lock: 971 drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0); 972 drm_exec_until_all_locked(&exec) { 973 ret = amdgpu_vm_lock_pd(vm, &exec, 1); 974 drm_exec_retry_on_contention(&exec); 975 if (unlikely(ret)) 976 goto unlock_all; 977 978 ret = amdgpu_vm_lock_done_list(vm, &exec, 1); 979 drm_exec_retry_on_contention(&exec); 980 if (unlikely(ret)) 981 goto unlock_all; 982 983 /* This validates PDs, PTs and per VM BOs */ 984 ret = amdgpu_vm_validate(adev, vm, NULL, 985 amdgpu_userq_validate_vm, 986 NULL); 987 if (unlikely(ret)) 988 goto unlock_all; 989 990 /* This locks and validates the remaining evicted BOs */ 991 ret = amdgpu_userq_bo_validate(adev, &exec, vm); 992 drm_exec_retry_on_contention(&exec); 993 if (unlikely(ret)) 994 goto unlock_all; 995 } 996 997 if (invalidated) { 998 xa_for_each(&xa, tmp_key, range) { 999 bo = range->bo; 1000 amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU); 1001 ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); 1002 if (ret) 1003 goto unlock_all; 1004 1005 amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm, range); 1006 1007 amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_GTT); 1008 ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); 1009 if (ret) 1010 goto unlock_all; 1011 } 1012 invalidated = false; 1013 } 1014 1015 ret = amdgpu_vm_handle_moved(adev, vm, NULL); 1016 if (ret) 1017 goto unlock_all; 1018 1019 key = 0; 1020 /* Validate User Ptr BOs */ 1021 list_for_each_entry(bo_va, &vm->done, base.vm_status) { 1022 bo = bo_va->base.bo; 1023 if (!bo) 1024 continue; 1025 1026 if (!amdgpu_ttm_tt_is_userptr(bo->tbo.ttm)) 1027 continue; 1028 1029 range = xa_load(&xa, key); 1030 if (range && range->bo != bo) { 1031 xa_erase(&xa, key); 1032 amdgpu_hmm_range_free(range); 1033 range = NULL; 1034 } 1035 1036 if (!range) { 1037 range = amdgpu_hmm_range_alloc(bo); 1038 if (!range) { 1039 ret = -ENOMEM; 1040 goto unlock_all; 1041 } 1042 1043 xa_store(&xa, key, range, GFP_KERNEL); 1044 new_addition = true; 1045 } 1046 key++; 1047 } 1048 1049 if (new_addition) { 1050 drm_exec_fini(&exec); 1051 xa_for_each(&xa, tmp_key, range) { 1052 if (!range) 1053 continue; 1054 bo = range->bo; 1055 ret = amdgpu_ttm_tt_get_user_pages(bo, range); 1056 if (ret) 1057 goto free_ranges; 1058 } 1059 1060 invalidated = true; 1061 new_addition = false; 1062 goto retry_lock; 1063 } 1064 1065 ret = amdgpu_vm_update_pdes(adev, vm, false); 1066 if (ret) 1067 goto unlock_all; 1068 1069 /* 1070 * We need to wait for all VM updates to finish before restarting the 1071 * queues. Using the done list like that is now ok since everything is 1072 * locked in place. 1073 */ 1074 list_for_each_entry(bo_va, &vm->done, base.vm_status) 1075 dma_fence_wait(bo_va->last_pt_update, false); 1076 dma_fence_wait(vm->last_update, false); 1077 1078 ret = amdgpu_evf_mgr_rearm(&fpriv->evf_mgr, &exec); 1079 if (ret) 1080 drm_file_err(uq_mgr->file, "Failed to replace eviction fence\n"); 1081 1082 unlock_all: 1083 drm_exec_fini(&exec); 1084 free_ranges: 1085 xa_for_each(&xa, tmp_key, range) { 1086 if (!range) 1087 continue; 1088 bo = range->bo; 1089 amdgpu_hmm_range_free(range); 1090 } 1091 xa_destroy(&xa); 1092 return ret; 1093 } 1094 1095 static void amdgpu_userq_restore_worker(struct work_struct *work) 1096 { 1097 struct amdgpu_userq_mgr *uq_mgr = work_to_uq_mgr(work, resume_work.work); 1098 struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr); 1099 struct dma_fence *ev_fence; 1100 int ret; 1101 1102 ev_fence = amdgpu_evf_mgr_get_fence(&fpriv->evf_mgr); 1103 if (!dma_fence_is_signaled(ev_fence)) 1104 goto put_fence; 1105 1106 ret = amdgpu_userq_vm_validate(uq_mgr); 1107 if (ret) { 1108 drm_file_err(uq_mgr->file, "Failed to validate BOs to restore ret=%d\n", ret); 1109 goto put_fence; 1110 } 1111 1112 amdgpu_userq_restore_all(uq_mgr); 1113 1114 put_fence: 1115 dma_fence_put(ev_fence); 1116 } 1117 1118 static int 1119 amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr) 1120 { 1121 struct amdgpu_usermode_queue *queue; 1122 unsigned long queue_id; 1123 int ret = 0, r; 1124 1125 /* Try to unmap all the queues in this process ctx */ 1126 xa_for_each(&uq_mgr->userq_xa, queue_id, queue) { 1127 r = amdgpu_userq_preempt_helper(queue); 1128 if (r) 1129 ret = r; 1130 } 1131 1132 if (ret) { 1133 drm_file_err(uq_mgr->file, 1134 "Couldn't unmap all the queues, eviction failed ret=%d\n", ret); 1135 amdgpu_reset_domain_schedule(uq_mgr->adev->reset_domain, 1136 &uq_mgr->reset_work); 1137 flush_work(&uq_mgr->reset_work); 1138 } 1139 return ret; 1140 } 1141 1142 static void 1143 amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr) 1144 { 1145 struct amdgpu_usermode_queue *queue; 1146 unsigned long queue_id; 1147 1148 xa_for_each(&uq_mgr->userq_xa, queue_id, queue) { 1149 struct dma_fence *f = queue->last_fence; 1150 1151 if (!f) 1152 continue; 1153 1154 dma_fence_wait(f, false); 1155 } 1156 } 1157 1158 void 1159 amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr) 1160 { 1161 /* Wait for any pending userqueue fence work to finish */ 1162 amdgpu_userq_wait_for_signal(uq_mgr); 1163 amdgpu_userq_evict_all(uq_mgr); 1164 } 1165 1166 int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *file_priv, 1167 struct amdgpu_device *adev) 1168 { 1169 mutex_init(&userq_mgr->userq_mutex); 1170 xa_init_flags(&userq_mgr->userq_xa, XA_FLAGS_ALLOC); 1171 userq_mgr->adev = adev; 1172 userq_mgr->file = file_priv; 1173 1174 INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userq_restore_worker); 1175 INIT_WORK(&userq_mgr->reset_work, amdgpu_userq_mgr_reset_work); 1176 return 0; 1177 } 1178 1179 void amdgpu_userq_mgr_cancel_reset_work(struct amdgpu_device *adev) 1180 { 1181 struct xarray *xa = &adev->userq_doorbell_xa; 1182 struct amdgpu_usermode_queue *queue; 1183 unsigned long flags, queue_id; 1184 1185 xa_lock_irqsave(xa, flags); 1186 xa_for_each(xa, queue_id, queue) { 1187 cancel_delayed_work(&queue->hang_detect_work); 1188 cancel_work(&queue->userq_mgr->reset_work); 1189 } 1190 xa_unlock_irqrestore(xa, flags); 1191 } 1192 1193 void amdgpu_userq_mgr_cancel_resume(struct amdgpu_userq_mgr *userq_mgr) 1194 { 1195 cancel_delayed_work_sync(&userq_mgr->resume_work); 1196 } 1197 1198 void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr) 1199 { 1200 struct amdgpu_usermode_queue *queue; 1201 unsigned long queue_id = 0; 1202 1203 for (;;) { 1204 xa_lock(&userq_mgr->userq_xa); 1205 queue = xa_find(&userq_mgr->userq_xa, &queue_id, ULONG_MAX, 1206 XA_PRESENT); 1207 if (queue) 1208 __xa_erase(&userq_mgr->userq_xa, queue_id); 1209 xa_unlock(&userq_mgr->userq_xa); 1210 1211 if (!queue) 1212 break; 1213 1214 amdgpu_userq_put(queue); 1215 } 1216 1217 xa_destroy(&userq_mgr->userq_xa); 1218 1219 /* 1220 * Drain any in-flight reset_work. By this point all queues are freed 1221 * and userq_count is 0, so if reset_work starts now it exits early. 1222 * We still need to wait in case it was already executing gpu_recover. 1223 */ 1224 cancel_work_sync(&userq_mgr->reset_work); 1225 1226 mutex_destroy(&userq_mgr->userq_mutex); 1227 } 1228 1229 int amdgpu_userq_suspend(struct amdgpu_device *adev) 1230 { 1231 u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev); 1232 struct amdgpu_usermode_queue *queue; 1233 struct amdgpu_userq_mgr *uqm; 1234 unsigned long queue_id; 1235 int r; 1236 1237 if (!ip_mask) 1238 return 0; 1239 1240 xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) { 1241 uqm = queue->userq_mgr; 1242 cancel_delayed_work_sync(&uqm->resume_work); 1243 guard(mutex)(&uqm->userq_mutex); 1244 if (adev->in_s0ix) 1245 r = amdgpu_userq_preempt_helper(queue); 1246 else 1247 r = amdgpu_userq_unmap_helper(queue); 1248 if (r) 1249 return r; 1250 } 1251 return 0; 1252 } 1253 1254 int amdgpu_userq_resume(struct amdgpu_device *adev) 1255 { 1256 u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev); 1257 struct amdgpu_usermode_queue *queue; 1258 struct amdgpu_userq_mgr *uqm; 1259 unsigned long queue_id; 1260 int r; 1261 1262 if (!ip_mask) 1263 return 0; 1264 1265 xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) { 1266 uqm = queue->userq_mgr; 1267 guard(mutex)(&uqm->userq_mutex); 1268 if (adev->in_s0ix) 1269 r = amdgpu_userq_restore_helper(queue); 1270 else 1271 r = amdgpu_userq_map_helper(queue); 1272 if (r) 1273 return r; 1274 } 1275 1276 return 0; 1277 } 1278 1279 int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev, 1280 u32 idx) 1281 { 1282 u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev); 1283 struct amdgpu_usermode_queue *queue; 1284 struct amdgpu_userq_mgr *uqm; 1285 unsigned long queue_id; 1286 int ret = 0, r; 1287 1288 /* only need to stop gfx/compute */ 1289 if (!(ip_mask & ((1 << AMDGPU_HW_IP_GFX) | (1 << AMDGPU_HW_IP_COMPUTE)))) 1290 return 0; 1291 1292 if (adev->userq_halt_for_enforce_isolation) 1293 dev_warn(adev->dev, "userq scheduling already stopped!\n"); 1294 adev->userq_halt_for_enforce_isolation = true; 1295 xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) { 1296 uqm = queue->userq_mgr; 1297 cancel_delayed_work_sync(&uqm->resume_work); 1298 mutex_lock(&uqm->userq_mutex); 1299 if (((queue->queue_type == AMDGPU_HW_IP_GFX) || 1300 (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) && 1301 (queue->xcp_id == idx)) { 1302 r = amdgpu_userq_preempt_helper(queue); 1303 if (r) 1304 ret = r; 1305 } 1306 mutex_unlock(&uqm->userq_mutex); 1307 } 1308 1309 return ret; 1310 } 1311 1312 int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev, 1313 u32 idx) 1314 { 1315 u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev); 1316 struct amdgpu_usermode_queue *queue; 1317 struct amdgpu_userq_mgr *uqm; 1318 unsigned long queue_id; 1319 int ret = 0, r; 1320 1321 /* only need to stop gfx/compute */ 1322 if (!(ip_mask & ((1 << AMDGPU_HW_IP_GFX) | (1 << AMDGPU_HW_IP_COMPUTE)))) 1323 return 0; 1324 1325 if (!adev->userq_halt_for_enforce_isolation) 1326 dev_warn(adev->dev, "userq scheduling already started!\n"); 1327 1328 adev->userq_halt_for_enforce_isolation = false; 1329 1330 xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) { 1331 uqm = queue->userq_mgr; 1332 mutex_lock(&uqm->userq_mutex); 1333 if (((queue->queue_type == AMDGPU_HW_IP_GFX) || 1334 (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) && 1335 (queue->xcp_id == idx)) { 1336 r = amdgpu_userq_restore_helper(queue); 1337 if (r) 1338 ret = r; 1339 } 1340 mutex_unlock(&uqm->userq_mutex); 1341 } 1342 1343 return ret; 1344 } 1345 1346 void amdgpu_userq_gem_va_unmap_validate(struct amdgpu_device *adev, 1347 struct amdgpu_bo_va_mapping *mapping, 1348 uint64_t saddr) 1349 { 1350 u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev); 1351 struct amdgpu_bo_va *bo_va = mapping->bo_va; 1352 struct dma_resv *resv = bo_va->base.bo->tbo.base.resv; 1353 1354 if (!ip_mask) 1355 return; 1356 1357 dev_warn_once(adev->dev, "now unmapping a vital queue va:%llx\n", saddr); 1358 /** 1359 * The userq VA mapping reservation should include the eviction fence, 1360 * if the eviction fence can't signal successfully during unmapping, 1361 * then driver will warn to flag this improper unmap of the userq VA. 1362 * Note: The eviction fence may be attached to different BOs, and this 1363 * unmap is only for one kind of userq VAs, so at this point suppose 1364 * the eviction fence is always unsignaled. 1365 */ 1366 dma_resv_wait_timeout(resv, DMA_RESV_USAGE_BOOKKEEP, 1367 false, MAX_SCHEDULE_TIMEOUT); 1368 } 1369 1370 void amdgpu_userq_pre_reset(struct amdgpu_device *adev) 1371 { 1372 const struct amdgpu_userq_funcs *userq_funcs; 1373 struct amdgpu_usermode_queue *queue; 1374 unsigned long queue_id; 1375 1376 /* TODO: We probably need a new lock for the queue state */ 1377 xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) { 1378 if (queue->state != AMDGPU_USERQ_STATE_MAPPED) 1379 continue; 1380 1381 userq_funcs = adev->userq_funcs[queue->queue_type]; 1382 userq_funcs->unmap(queue); 1383 /* just mark all queues as hung at this point. 1384 * if unmap succeeds, we could map again 1385 * in amdgpu_userq_post_reset() if vram is not lost 1386 */ 1387 queue->state = AMDGPU_USERQ_STATE_HUNG; 1388 amdgpu_userq_fence_driver_force_completion(queue); 1389 } 1390 } 1391 1392 int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost) 1393 { 1394 /* if any queue state is AMDGPU_USERQ_STATE_UNMAPPED 1395 * at this point, we should be able to map it again 1396 * and continue if vram is not lost. 1397 */ 1398 struct amdgpu_usermode_queue *queue; 1399 const struct amdgpu_userq_funcs *userq_funcs; 1400 unsigned long queue_id; 1401 int r = 0; 1402 1403 xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) { 1404 if (queue->state == AMDGPU_USERQ_STATE_HUNG && !vram_lost) { 1405 userq_funcs = adev->userq_funcs[queue->queue_type]; 1406 /* Re-map queue */ 1407 r = userq_funcs->map(queue); 1408 if (r) { 1409 dev_err(adev->dev, "Failed to remap queue %ld\n", queue_id); 1410 continue; 1411 } 1412 queue->state = AMDGPU_USERQ_STATE_MAPPED; 1413 } 1414 } 1415 1416 return r; 1417 } 1418