1 // SPDX-License-Identifier: GPL-2.0 OR MIT 2 /* 3 * Copyright 2020-2021 Advanced Micro Devices, Inc. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be included in 13 * all copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 21 * OTHER DEALINGS IN THE SOFTWARE. 22 */ 23 24 #include <linux/types.h> 25 #include <linux/sched/task.h> 26 #include "amdgpu_sync.h" 27 #include "amdgpu_object.h" 28 #include "amdgpu_vm.h" 29 #include "amdgpu_mn.h" 30 #include "amdgpu.h" 31 #include "amdgpu_xgmi.h" 32 #include "kfd_priv.h" 33 #include "kfd_svm.h" 34 #include "kfd_migrate.h" 35 36 #define AMDGPU_SVM_RANGE_RESTORE_DELAY_MS 1 37 38 /* Long enough to ensure no retry fault comes after svm range is restored and 39 * page table is updated. 40 */ 41 #define AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING 2000 42 43 static void svm_range_evict_svm_bo_worker(struct work_struct *work); 44 static bool 45 svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni, 46 const struct mmu_notifier_range *range, 47 unsigned long cur_seq); 48 49 static const struct mmu_interval_notifier_ops svm_range_mn_ops = { 50 .invalidate = svm_range_cpu_invalidate_pagetables, 51 }; 52 53 /** 54 * svm_range_unlink - unlink svm_range from lists and interval tree 55 * @prange: svm range structure to be removed 56 * 57 * Remove the svm_range from the svms and svm_bo lists and the svms 58 * interval tree. 
59 * 60 * Context: The caller must hold svms->lock 61 */ 62 static void svm_range_unlink(struct svm_range *prange) 63 { 64 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, 65 prange, prange->start, prange->last); 66 67 if (prange->svm_bo) { 68 spin_lock(&prange->svm_bo->list_lock); 69 list_del(&prange->svm_bo_list); 70 spin_unlock(&prange->svm_bo->list_lock); 71 } 72 73 list_del(&prange->list); 74 if (prange->it_node.start != 0 && prange->it_node.last != 0) 75 interval_tree_remove(&prange->it_node, &prange->svms->objects); 76 } 77 78 static void 79 svm_range_add_notifier_locked(struct mm_struct *mm, struct svm_range *prange) 80 { 81 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, 82 prange, prange->start, prange->last); 83 84 mmu_interval_notifier_insert_locked(&prange->notifier, mm, 85 prange->start << PAGE_SHIFT, 86 prange->npages << PAGE_SHIFT, 87 &svm_range_mn_ops); 88 } 89 90 /** 91 * svm_range_add_to_svms - add svm range to svms 92 * @prange: svm range structure to be added 93 * 94 * Add the svm range to svms interval tree and link list 95 * 96 * Context: The caller must hold svms->lock 97 */ 98 static void svm_range_add_to_svms(struct svm_range *prange) 99 { 100 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, 101 prange, prange->start, prange->last); 102 103 list_add_tail(&prange->list, &prange->svms->list); 104 prange->it_node.start = prange->start; 105 prange->it_node.last = prange->last; 106 interval_tree_insert(&prange->it_node, &prange->svms->objects); 107 } 108 109 static void svm_range_remove_notifier(struct svm_range *prange) 110 { 111 pr_debug("remove notifier svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", 112 prange->svms, prange, 113 prange->notifier.interval_tree.start >> PAGE_SHIFT, 114 prange->notifier.interval_tree.last >> PAGE_SHIFT); 115 116 if (prange->notifier.interval_tree.start != 0 && 117 prange->notifier.interval_tree.last != 0) 118 mmu_interval_notifier_remove(&prange->notifier); 119 } 120 121 static int 122 svm_range_dma_map_dev(struct device *dev, dma_addr_t **dma_addr, 123 unsigned long *hmm_pfns, uint64_t npages) 124 { 125 enum dma_data_direction dir = DMA_BIDIRECTIONAL; 126 dma_addr_t *addr = *dma_addr; 127 struct page *page; 128 int i, r; 129 130 if (!addr) { 131 addr = kvmalloc_array(npages, sizeof(*addr), 132 GFP_KERNEL | __GFP_ZERO); 133 if (!addr) 134 return -ENOMEM; 135 *dma_addr = addr; 136 } 137 138 for (i = 0; i < npages; i++) { 139 if (WARN_ONCE(addr[i] && !dma_mapping_error(dev, addr[i]), 140 "leaking dma mapping\n")) 141 dma_unmap_page(dev, addr[i], PAGE_SIZE, dir); 142 143 page = hmm_pfn_to_page(hmm_pfns[i]); 144 addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir); 145 r = dma_mapping_error(dev, addr[i]); 146 if (r) { 147 pr_debug("failed %d dma_map_page\n", r); 148 return r; 149 } 150 pr_debug("dma mapping 0x%llx for page addr 0x%lx\n", 151 addr[i] >> PAGE_SHIFT, page_to_pfn(page)); 152 } 153 return 0; 154 } 155 156 static int 157 svm_range_dma_map(struct svm_range *prange, unsigned long *bitmap, 158 unsigned long *hmm_pfns) 159 { 160 struct kfd_process *p; 161 uint32_t gpuidx; 162 int r; 163 164 p = container_of(prange->svms, struct kfd_process, svms); 165 166 for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) { 167 struct kfd_process_device *pdd; 168 struct amdgpu_device *adev; 169 170 pr_debug("mapping to gpu idx 0x%x\n", gpuidx); 171 pdd = kfd_process_device_from_gpuidx(p, gpuidx); 172 if (!pdd) { 173 pr_debug("failed to find device idx %d\n", gpuidx); 174 return -EINVAL; 175 } 176 adev = (struct 
amdgpu_device *)pdd->dev->kgd; 177 178 r = svm_range_dma_map_dev(adev->dev, &prange->dma_addr[gpuidx], 179 hmm_pfns, prange->npages); 180 if (r) 181 break; 182 } 183 184 return r; 185 } 186 187 void svm_range_dma_unmap(struct device *dev, dma_addr_t *dma_addr, 188 unsigned long offset, unsigned long npages) 189 { 190 enum dma_data_direction dir = DMA_BIDIRECTIONAL; 191 int i; 192 193 if (!dma_addr) 194 return; 195 196 for (i = offset; i < offset + npages; i++) { 197 if (!dma_addr[i] || dma_mapping_error(dev, dma_addr[i])) 198 continue; 199 pr_debug("dma unmapping 0x%llx\n", dma_addr[i] >> PAGE_SHIFT); 200 dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir); 201 dma_addr[i] = 0; 202 } 203 } 204 205 void svm_range_free_dma_mappings(struct svm_range *prange) 206 { 207 struct kfd_process_device *pdd; 208 dma_addr_t *dma_addr; 209 struct device *dev; 210 struct kfd_process *p; 211 uint32_t gpuidx; 212 213 p = container_of(prange->svms, struct kfd_process, svms); 214 215 for (gpuidx = 0; gpuidx < MAX_GPU_INSTANCE; gpuidx++) { 216 dma_addr = prange->dma_addr[gpuidx]; 217 if (!dma_addr) 218 continue; 219 220 pdd = kfd_process_device_from_gpuidx(p, gpuidx); 221 if (!pdd) { 222 pr_debug("failed to find device idx %d\n", gpuidx); 223 continue; 224 } 225 dev = &pdd->dev->pdev->dev; 226 svm_range_dma_unmap(dev, dma_addr, 0, prange->npages); 227 kvfree(dma_addr); 228 prange->dma_addr[gpuidx] = NULL; 229 } 230 } 231 232 static void svm_range_free(struct svm_range *prange) 233 { 234 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, prange, 235 prange->start, prange->last); 236 237 svm_range_vram_node_free(prange); 238 svm_range_free_dma_mappings(prange); 239 mutex_destroy(&prange->lock); 240 mutex_destroy(&prange->migrate_mutex); 241 kfree(prange); 242 } 243 244 static void 245 svm_range_set_default_attributes(int32_t *location, int32_t *prefetch_loc, 246 uint8_t *granularity, uint32_t *flags) 247 { 248 *location = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 249 *prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 250 *granularity = 9; 251 *flags = 252 KFD_IOCTL_SVM_FLAG_HOST_ACCESS | KFD_IOCTL_SVM_FLAG_COHERENT; 253 } 254 255 static struct 256 svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start, 257 uint64_t last) 258 { 259 uint64_t size = last - start + 1; 260 struct svm_range *prange; 261 struct kfd_process *p; 262 263 prange = kzalloc(sizeof(*prange), GFP_KERNEL); 264 if (!prange) 265 return NULL; 266 prange->npages = size; 267 prange->svms = svms; 268 prange->start = start; 269 prange->last = last; 270 INIT_LIST_HEAD(&prange->list); 271 INIT_LIST_HEAD(&prange->update_list); 272 INIT_LIST_HEAD(&prange->remove_list); 273 INIT_LIST_HEAD(&prange->insert_list); 274 INIT_LIST_HEAD(&prange->svm_bo_list); 275 INIT_LIST_HEAD(&prange->deferred_list); 276 INIT_LIST_HEAD(&prange->child_list); 277 atomic_set(&prange->invalid, 0); 278 prange->validate_timestamp = 0; 279 mutex_init(&prange->migrate_mutex); 280 mutex_init(&prange->lock); 281 282 p = container_of(svms, struct kfd_process, svms); 283 if (p->xnack_enabled) 284 bitmap_fill(prange->bitmap_access, MAX_GPU_INSTANCE); 285 286 svm_range_set_default_attributes(&prange->preferred_loc, 287 &prange->prefetch_loc, 288 &prange->granularity, &prange->flags); 289 290 pr_debug("svms 0x%p [0x%llx 0x%llx]\n", svms, start, last); 291 292 return prange; 293 } 294 295 static bool svm_bo_ref_unless_zero(struct svm_range_bo *svm_bo) 296 { 297 if (!svm_bo || !kref_get_unless_zero(&svm_bo->kref)) 298 return false; 299 300 return true; 301 } 302 303 static 
struct svm_range_bo *svm_range_bo_ref(struct svm_range_bo *svm_bo) 304 { 305 if (svm_bo) 306 kref_get(&svm_bo->kref); 307 308 return svm_bo; 309 } 310 311 static void svm_range_bo_release(struct kref *kref) 312 { 313 struct svm_range_bo *svm_bo; 314 315 svm_bo = container_of(kref, struct svm_range_bo, kref); 316 spin_lock(&svm_bo->list_lock); 317 while (!list_empty(&svm_bo->range_list)) { 318 struct svm_range *prange = 319 list_first_entry(&svm_bo->range_list, 320 struct svm_range, svm_bo_list); 321 /* list_del_init tells a concurrent svm_range_vram_node_new when 322 * it's safe to reuse the svm_bo pointer and svm_bo_list head. 323 */ 324 list_del_init(&prange->svm_bo_list); 325 spin_unlock(&svm_bo->list_lock); 326 327 pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms, 328 prange->start, prange->last); 329 mutex_lock(&prange->lock); 330 prange->svm_bo = NULL; 331 mutex_unlock(&prange->lock); 332 333 spin_lock(&svm_bo->list_lock); 334 } 335 spin_unlock(&svm_bo->list_lock); 336 if (!dma_fence_is_signaled(&svm_bo->eviction_fence->base)) { 337 /* We're not in the eviction worker. 338 * Signal the fence and synchronize with any 339 * pending eviction work. 340 */ 341 dma_fence_signal(&svm_bo->eviction_fence->base); 342 cancel_work_sync(&svm_bo->eviction_work); 343 } 344 dma_fence_put(&svm_bo->eviction_fence->base); 345 amdgpu_bo_unref(&svm_bo->bo); 346 kfree(svm_bo); 347 } 348 349 static void svm_range_bo_unref(struct svm_range_bo *svm_bo) 350 { 351 if (!svm_bo) 352 return; 353 354 kref_put(&svm_bo->kref, svm_range_bo_release); 355 } 356 357 static bool 358 svm_range_validate_svm_bo(struct amdgpu_device *adev, struct svm_range *prange) 359 { 360 struct amdgpu_device *bo_adev; 361 362 mutex_lock(&prange->lock); 363 if (!prange->svm_bo) { 364 mutex_unlock(&prange->lock); 365 return false; 366 } 367 if (prange->ttm_res) { 368 /* We still have a reference, all is well */ 369 mutex_unlock(&prange->lock); 370 return true; 371 } 372 if (svm_bo_ref_unless_zero(prange->svm_bo)) { 373 /* 374 * Migrate from GPU to GPU, remove range from source bo_adev 375 * svm_bo range list, and return false to allocate svm_bo from 376 * destination adev. 377 */ 378 bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev); 379 if (bo_adev != adev) { 380 mutex_unlock(&prange->lock); 381 382 spin_lock(&prange->svm_bo->list_lock); 383 list_del_init(&prange->svm_bo_list); 384 spin_unlock(&prange->svm_bo->list_lock); 385 386 svm_range_bo_unref(prange->svm_bo); 387 return false; 388 } 389 if (READ_ONCE(prange->svm_bo->evicting)) { 390 struct dma_fence *f; 391 struct svm_range_bo *svm_bo; 392 /* The BO is getting evicted, 393 * we need to get a new one 394 */ 395 mutex_unlock(&prange->lock); 396 svm_bo = prange->svm_bo; 397 f = dma_fence_get(&svm_bo->eviction_fence->base); 398 svm_range_bo_unref(prange->svm_bo); 399 /* wait for the fence to avoid long spin-loop 400 * at list_empty_careful 401 */ 402 dma_fence_wait(f, false); 403 dma_fence_put(f); 404 } else { 405 /* The BO was still around and we got 406 * a new reference to it 407 */ 408 mutex_unlock(&prange->lock); 409 pr_debug("reuse old bo svms 0x%p [0x%lx 0x%lx]\n", 410 prange->svms, prange->start, prange->last); 411 412 prange->ttm_res = &prange->svm_bo->bo->tbo.mem; 413 return true; 414 } 415 416 } else { 417 mutex_unlock(&prange->lock); 418 } 419 420 /* We need a new svm_bo. Spin-loop to wait for concurrent 421 * svm_range_bo_release to finish removing this range from 422 * its range list. After this, it is safe to reuse the 423 * svm_bo pointer and svm_bo_list head. 
424 */ 425 while (!list_empty_careful(&prange->svm_bo_list)) 426 ; 427 428 return false; 429 } 430 431 static struct svm_range_bo *svm_range_bo_new(void) 432 { 433 struct svm_range_bo *svm_bo; 434 435 svm_bo = kzalloc(sizeof(*svm_bo), GFP_KERNEL); 436 if (!svm_bo) 437 return NULL; 438 439 kref_init(&svm_bo->kref); 440 INIT_LIST_HEAD(&svm_bo->range_list); 441 spin_lock_init(&svm_bo->list_lock); 442 443 return svm_bo; 444 } 445 446 int 447 svm_range_vram_node_new(struct amdgpu_device *adev, struct svm_range *prange, 448 bool clear) 449 { 450 struct amdgpu_bo_param bp; 451 struct svm_range_bo *svm_bo; 452 struct amdgpu_bo_user *ubo; 453 struct amdgpu_bo *bo; 454 struct kfd_process *p; 455 struct mm_struct *mm; 456 int r; 457 458 p = container_of(prange->svms, struct kfd_process, svms); 459 pr_debug("pasid: %x svms 0x%p [0x%lx 0x%lx]\n", p->pasid, prange->svms, 460 prange->start, prange->last); 461 462 if (svm_range_validate_svm_bo(adev, prange)) 463 return 0; 464 465 svm_bo = svm_range_bo_new(); 466 if (!svm_bo) { 467 pr_debug("failed to alloc svm bo\n"); 468 return -ENOMEM; 469 } 470 mm = get_task_mm(p->lead_thread); 471 if (!mm) { 472 pr_debug("failed to get mm\n"); 473 kfree(svm_bo); 474 return -ESRCH; 475 } 476 svm_bo->svms = prange->svms; 477 svm_bo->eviction_fence = 478 amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1), 479 mm, 480 svm_bo); 481 mmput(mm); 482 INIT_WORK(&svm_bo->eviction_work, svm_range_evict_svm_bo_worker); 483 svm_bo->evicting = 0; 484 memset(&bp, 0, sizeof(bp)); 485 bp.size = prange->npages * PAGE_SIZE; 486 bp.byte_align = PAGE_SIZE; 487 bp.domain = AMDGPU_GEM_DOMAIN_VRAM; 488 bp.flags = AMDGPU_GEM_CREATE_NO_CPU_ACCESS; 489 bp.flags |= clear ? AMDGPU_GEM_CREATE_VRAM_CLEARED : 0; 490 bp.flags |= AMDGPU_AMDKFD_CREATE_SVM_BO; 491 bp.type = ttm_bo_type_device; 492 bp.resv = NULL; 493 494 r = amdgpu_bo_create_user(adev, &bp, &ubo); 495 if (r) { 496 pr_debug("failed %d to create bo\n", r); 497 goto create_bo_failed; 498 } 499 bo = &ubo->bo; 500 r = amdgpu_bo_reserve(bo, true); 501 if (r) { 502 pr_debug("failed %d to reserve bo\n", r); 503 goto reserve_bo_failed; 504 } 505 506 r = dma_resv_reserve_shared(bo->tbo.base.resv, 1); 507 if (r) { 508 pr_debug("failed %d to reserve bo\n", r); 509 amdgpu_bo_unreserve(bo); 510 goto reserve_bo_failed; 511 } 512 amdgpu_bo_fence(bo, &svm_bo->eviction_fence->base, true); 513 514 amdgpu_bo_unreserve(bo); 515 516 svm_bo->bo = bo; 517 prange->svm_bo = svm_bo; 518 prange->ttm_res = &bo->tbo.mem; 519 prange->offset = 0; 520 521 spin_lock(&svm_bo->list_lock); 522 list_add(&prange->svm_bo_list, &svm_bo->range_list); 523 spin_unlock(&svm_bo->list_lock); 524 525 return 0; 526 527 reserve_bo_failed: 528 amdgpu_bo_unref(&bo); 529 create_bo_failed: 530 dma_fence_put(&svm_bo->eviction_fence->base); 531 kfree(svm_bo); 532 prange->ttm_res = NULL; 533 534 return r; 535 } 536 537 void svm_range_vram_node_free(struct svm_range *prange) 538 { 539 svm_range_bo_unref(prange->svm_bo); 540 prange->ttm_res = NULL; 541 } 542 543 struct amdgpu_device * 544 svm_range_get_adev_by_id(struct svm_range *prange, uint32_t gpu_id) 545 { 546 struct kfd_process_device *pdd; 547 struct kfd_process *p; 548 int32_t gpu_idx; 549 550 p = container_of(prange->svms, struct kfd_process, svms); 551 552 gpu_idx = kfd_process_gpuidx_from_gpuid(p, gpu_id); 553 if (gpu_idx < 0) { 554 pr_debug("failed to get device by id 0x%x\n", gpu_id); 555 return NULL; 556 } 557 pdd = kfd_process_device_from_gpuidx(p, gpu_idx); 558 if (!pdd) { 559 pr_debug("failed to get device by idx 0x%x\n", 
gpu_idx); 560 return NULL; 561 } 562 563 return (struct amdgpu_device *)pdd->dev->kgd; 564 } 565 566 static int svm_range_bo_validate(void *param, struct amdgpu_bo *bo) 567 { 568 struct ttm_operation_ctx ctx = { false, false }; 569 570 amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_VRAM); 571 572 return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); 573 } 574 575 static int 576 svm_range_check_attr(struct kfd_process *p, 577 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs) 578 { 579 uint32_t i; 580 int gpuidx; 581 582 for (i = 0; i < nattr; i++) { 583 switch (attrs[i].type) { 584 case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: 585 if (attrs[i].value != KFD_IOCTL_SVM_LOCATION_SYSMEM && 586 attrs[i].value != KFD_IOCTL_SVM_LOCATION_UNDEFINED && 587 kfd_process_gpuidx_from_gpuid(p, 588 attrs[i].value) < 0) { 589 pr_debug("no GPU 0x%x found\n", attrs[i].value); 590 return -EINVAL; 591 } 592 break; 593 case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: 594 if (attrs[i].value != KFD_IOCTL_SVM_LOCATION_SYSMEM && 595 kfd_process_gpuidx_from_gpuid(p, 596 attrs[i].value) < 0) { 597 pr_debug("no GPU 0x%x found\n", attrs[i].value); 598 return -EINVAL; 599 } 600 break; 601 case KFD_IOCTL_SVM_ATTR_ACCESS: 602 case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: 603 case KFD_IOCTL_SVM_ATTR_NO_ACCESS: 604 gpuidx = kfd_process_gpuidx_from_gpuid(p, 605 attrs[i].value); 606 if (gpuidx < 0) { 607 pr_debug("no GPU 0x%x found\n", attrs[i].value); 608 return -EINVAL; 609 } 610 break; 611 case KFD_IOCTL_SVM_ATTR_SET_FLAGS: 612 break; 613 case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: 614 break; 615 case KFD_IOCTL_SVM_ATTR_GRANULARITY: 616 break; 617 default: 618 pr_debug("unknown attr type 0x%x\n", attrs[i].type); 619 return -EINVAL; 620 } 621 } 622 623 return 0; 624 } 625 626 static void 627 svm_range_apply_attrs(struct kfd_process *p, struct svm_range *prange, 628 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs) 629 { 630 uint32_t i; 631 int gpuidx; 632 633 for (i = 0; i < nattr; i++) { 634 switch (attrs[i].type) { 635 case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: 636 prange->preferred_loc = attrs[i].value; 637 break; 638 case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: 639 prange->prefetch_loc = attrs[i].value; 640 break; 641 case KFD_IOCTL_SVM_ATTR_ACCESS: 642 case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: 643 case KFD_IOCTL_SVM_ATTR_NO_ACCESS: 644 gpuidx = kfd_process_gpuidx_from_gpuid(p, 645 attrs[i].value); 646 if (attrs[i].type == KFD_IOCTL_SVM_ATTR_NO_ACCESS) { 647 bitmap_clear(prange->bitmap_access, gpuidx, 1); 648 bitmap_clear(prange->bitmap_aip, gpuidx, 1); 649 } else if (attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS) { 650 bitmap_set(prange->bitmap_access, gpuidx, 1); 651 bitmap_clear(prange->bitmap_aip, gpuidx, 1); 652 } else { 653 bitmap_clear(prange->bitmap_access, gpuidx, 1); 654 bitmap_set(prange->bitmap_aip, gpuidx, 1); 655 } 656 break; 657 case KFD_IOCTL_SVM_ATTR_SET_FLAGS: 658 prange->flags |= attrs[i].value; 659 break; 660 case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: 661 prange->flags &= ~attrs[i].value; 662 break; 663 case KFD_IOCTL_SVM_ATTR_GRANULARITY: 664 prange->granularity = attrs[i].value; 665 break; 666 default: 667 WARN_ONCE(1, "svm_range_check_attrs wasn't called?"); 668 } 669 } 670 } 671 672 /** 673 * svm_range_debug_dump - print all range information from svms 674 * @svms: svm range list header 675 * 676 * debug output svm range start, end, prefetch location from svms 677 * interval tree and link list 678 * 679 * Context: The caller must hold svms->lock 680 */ 681 static void svm_range_debug_dump(struct svm_range_list *svms) 682 { 683 struct 
interval_tree_node *node; 684 struct svm_range *prange; 685 686 pr_debug("dump svms 0x%p list\n", svms); 687 pr_debug("range\tstart\tpage\tend\t\tlocation\n"); 688 689 list_for_each_entry(prange, &svms->list, list) { 690 pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n", 691 prange, prange->start, prange->npages, 692 prange->start + prange->npages - 1, 693 prange->actual_loc); 694 } 695 696 pr_debug("dump svms 0x%p interval tree\n", svms); 697 pr_debug("range\tstart\tpage\tend\t\tlocation\n"); 698 node = interval_tree_iter_first(&svms->objects, 0, ~0ULL); 699 while (node) { 700 prange = container_of(node, struct svm_range, it_node); 701 pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n", 702 prange, prange->start, prange->npages, 703 prange->start + prange->npages - 1, 704 prange->actual_loc); 705 node = interval_tree_iter_next(node, 0, ~0ULL); 706 } 707 } 708 709 static bool 710 svm_range_is_same_attrs(struct svm_range *old, struct svm_range *new) 711 { 712 return (old->prefetch_loc == new->prefetch_loc && 713 old->flags == new->flags && 714 old->granularity == new->granularity); 715 } 716 717 static int 718 svm_range_split_array(void *ppnew, void *ppold, size_t size, 719 uint64_t old_start, uint64_t old_n, 720 uint64_t new_start, uint64_t new_n) 721 { 722 unsigned char *new, *old, *pold; 723 uint64_t d; 724 725 if (!ppold) 726 return 0; 727 pold = *(unsigned char **)ppold; 728 if (!pold) 729 return 0; 730 731 new = kvmalloc_array(new_n, size, GFP_KERNEL); 732 if (!new) 733 return -ENOMEM; 734 735 d = (new_start - old_start) * size; 736 memcpy(new, pold + d, new_n * size); 737 738 old = kvmalloc_array(old_n, size, GFP_KERNEL); 739 if (!old) { 740 kvfree(new); 741 return -ENOMEM; 742 } 743 744 d = (new_start == old_start) ? new_n * size : 0; 745 memcpy(old, pold + d, old_n * size); 746 747 kvfree(pold); 748 *(void **)ppold = old; 749 *(void **)ppnew = new; 750 751 return 0; 752 } 753 754 static int 755 svm_range_split_pages(struct svm_range *new, struct svm_range *old, 756 uint64_t start, uint64_t last) 757 { 758 uint64_t npages = last - start + 1; 759 int i, r; 760 761 for (i = 0; i < MAX_GPU_INSTANCE; i++) { 762 r = svm_range_split_array(&new->dma_addr[i], &old->dma_addr[i], 763 sizeof(*old->dma_addr[i]), old->start, 764 npages, new->start, new->npages); 765 if (r) 766 return r; 767 } 768 769 return 0; 770 } 771 772 static int 773 svm_range_split_nodes(struct svm_range *new, struct svm_range *old, 774 uint64_t start, uint64_t last) 775 { 776 uint64_t npages = last - start + 1; 777 778 pr_debug("svms 0x%p new prange 0x%p start 0x%lx [0x%llx 0x%llx]\n", 779 new->svms, new, new->start, start, last); 780 781 if (new->start == old->start) { 782 new->offset = old->offset; 783 old->offset += new->npages; 784 } else { 785 new->offset = old->offset + npages; 786 } 787 788 new->svm_bo = svm_range_bo_ref(old->svm_bo); 789 new->ttm_res = old->ttm_res; 790 791 spin_lock(&new->svm_bo->list_lock); 792 list_add(&new->svm_bo_list, &new->svm_bo->range_list); 793 spin_unlock(&new->svm_bo->list_lock); 794 795 return 0; 796 } 797 798 /** 799 * svm_range_split_adjust - split range and adjust 800 * 801 * @new: new range 802 * @old: the old range 803 * @start: the old range adjust to start address in pages 804 * @last: the old range adjust to last address in pages 805 * 806 * Copy system memory dma_addr or vram ttm_res in old range to new 807 * range from new_start up to size new->npages, the remaining old range is from 808 * start to last 809 * 810 * Return: 811 * 0 - OK, -ENOMEM - out of memory 812 */ 813 static int 
814 svm_range_split_adjust(struct svm_range *new, struct svm_range *old, 815 uint64_t start, uint64_t last) 816 { 817 int r; 818 819 pr_debug("svms 0x%p new 0x%lx old [0x%lx 0x%lx] => [0x%llx 0x%llx]\n", 820 new->svms, new->start, old->start, old->last, start, last); 821 822 if (new->start < old->start || 823 new->last > old->last) { 824 WARN_ONCE(1, "invalid new range start or last\n"); 825 return -EINVAL; 826 } 827 828 r = svm_range_split_pages(new, old, start, last); 829 if (r) 830 return r; 831 832 if (old->actual_loc && old->ttm_res) { 833 r = svm_range_split_nodes(new, old, start, last); 834 if (r) 835 return r; 836 } 837 838 old->npages = last - start + 1; 839 old->start = start; 840 old->last = last; 841 new->flags = old->flags; 842 new->preferred_loc = old->preferred_loc; 843 new->prefetch_loc = old->prefetch_loc; 844 new->actual_loc = old->actual_loc; 845 new->granularity = old->granularity; 846 bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE); 847 bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE); 848 849 return 0; 850 } 851 852 /** 853 * svm_range_split - split a range in 2 ranges 854 * 855 * @prange: the svm range to split 856 * @start: the remaining range start address in pages 857 * @last: the remaining range last address in pages 858 * @new: the result new range generated 859 * 860 * Two cases only: 861 * case 1: if start == prange->start 862 * prange ==> prange[start, last] 863 * new range [last + 1, prange->last] 864 * 865 * case 2: if last == prange->last 866 * prange ==> prange[start, last] 867 * new range [prange->start, start - 1] 868 * 869 * Return: 870 * 0 - OK, -ENOMEM - out of memory, -EINVAL - invalid start, last 871 */ 872 static int 873 svm_range_split(struct svm_range *prange, uint64_t start, uint64_t last, 874 struct svm_range **new) 875 { 876 uint64_t old_start = prange->start; 877 uint64_t old_last = prange->last; 878 struct svm_range_list *svms; 879 int r = 0; 880 881 pr_debug("svms 0x%p [0x%llx 0x%llx] to [0x%llx 0x%llx]\n", prange->svms, 882 old_start, old_last, start, last); 883 884 if (old_start != start && old_last != last) 885 return -EINVAL; 886 if (start < old_start || last > old_last) 887 return -EINVAL; 888 889 svms = prange->svms; 890 if (old_start == start) 891 *new = svm_range_new(svms, last + 1, old_last); 892 else 893 *new = svm_range_new(svms, old_start, start - 1); 894 if (!*new) 895 return -ENOMEM; 896 897 r = svm_range_split_adjust(*new, prange, start, last); 898 if (r) { 899 pr_debug("failed %d split [0x%llx 0x%llx] to [0x%llx 0x%llx]\n", 900 r, old_start, old_last, start, last); 901 svm_range_free(*new); 902 *new = NULL; 903 } 904 905 return r; 906 } 907 908 static int 909 svm_range_split_tail(struct svm_range *prange, struct svm_range *new, 910 uint64_t new_last, struct list_head *insert_list) 911 { 912 struct svm_range *tail; 913 int r = svm_range_split(prange, prange->start, new_last, &tail); 914 915 if (!r) 916 list_add(&tail->insert_list, insert_list); 917 return r; 918 } 919 920 static int 921 svm_range_split_head(struct svm_range *prange, struct svm_range *new, 922 uint64_t new_start, struct list_head *insert_list) 923 { 924 struct svm_range *head; 925 int r = svm_range_split(prange, new_start, prange->last, &head); 926 927 if (!r) 928 list_add(&head->insert_list, insert_list); 929 return r; 930 } 931 932 static void 933 svm_range_add_child(struct svm_range *prange, struct mm_struct *mm, 934 struct svm_range *pchild, enum svm_work_list_ops op) 935 { 936 pr_debug("add child 0x%p [0x%lx 0x%lx] to 
prange 0x%p child list %d\n", 937 pchild, pchild->start, pchild->last, prange, op); 938 939 pchild->work_item.mm = mm; 940 pchild->work_item.op = op; 941 list_add_tail(&pchild->child_list, &prange->child_list); 942 } 943 944 /** 945 * svm_range_split_by_granularity - collect ranges within granularity boundary 946 * 947 * @p: the process with svms list 948 * @mm: mm structure 949 * @addr: the vm fault address in pages, to split the prange 950 * @parent: parent range if prange is from child list 951 * @prange: prange to split 952 * 953 * Trims @prange to be a single aligned block of prange->granularity if 954 * possible. The head and tail are added to the child_list in @parent. 955 * 956 * Context: caller must hold mmap_read_lock and prange->lock 957 * 958 * Return: 959 * 0 - OK, otherwise error code 960 */ 961 int 962 svm_range_split_by_granularity(struct kfd_process *p, struct mm_struct *mm, 963 unsigned long addr, struct svm_range *parent, 964 struct svm_range *prange) 965 { 966 struct svm_range *head, *tail; 967 unsigned long start, last, size; 968 int r; 969 970 /* Align splited range start and size to granularity size, then a single 971 * PTE will be used for whole range, this reduces the number of PTE 972 * updated and the L1 TLB space used for translation. 973 */ 974 size = 1UL << prange->granularity; 975 start = ALIGN_DOWN(addr, size); 976 last = ALIGN(addr + 1, size) - 1; 977 978 pr_debug("svms 0x%p split [0x%lx 0x%lx] to [0x%lx 0x%lx] size 0x%lx\n", 979 prange->svms, prange->start, prange->last, start, last, size); 980 981 if (start > prange->start) { 982 r = svm_range_split(prange, start, prange->last, &head); 983 if (r) 984 return r; 985 svm_range_add_child(parent, mm, head, SVM_OP_ADD_RANGE); 986 } 987 988 if (last < prange->last) { 989 r = svm_range_split(prange, prange->start, last, &tail); 990 if (r) 991 return r; 992 svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE); 993 } 994 995 /* xnack on, update mapping on GPUs with ACCESS_IN_PLACE */ 996 if (p->xnack_enabled && prange->work_item.op == SVM_OP_ADD_RANGE) { 997 prange->work_item.op = SVM_OP_ADD_RANGE_AND_MAP; 998 pr_debug("change prange 0x%p [0x%lx 0x%lx] op %d\n", 999 prange, prange->start, prange->last, 1000 SVM_OP_ADD_RANGE_AND_MAP); 1001 } 1002 return 0; 1003 } 1004 1005 static uint64_t 1006 svm_range_get_pte_flags(struct amdgpu_device *adev, struct svm_range *prange) 1007 { 1008 struct amdgpu_device *bo_adev; 1009 uint32_t flags = prange->flags; 1010 uint32_t mapping_flags = 0; 1011 uint64_t pte_flags; 1012 bool snoop = !prange->ttm_res; 1013 bool coherent = flags & KFD_IOCTL_SVM_FLAG_COHERENT; 1014 1015 if (prange->svm_bo && prange->ttm_res) 1016 bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev); 1017 1018 switch (adev->asic_type) { 1019 case CHIP_ARCTURUS: 1020 if (prange->svm_bo && prange->ttm_res) { 1021 if (bo_adev == adev) { 1022 mapping_flags |= coherent ? 1023 AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW; 1024 } else { 1025 mapping_flags |= coherent ? 1026 AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; 1027 if (amdgpu_xgmi_same_hive(adev, bo_adev)) 1028 snoop = true; 1029 } 1030 } else { 1031 mapping_flags |= coherent ? 1032 AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; 1033 } 1034 break; 1035 case CHIP_ALDEBARAN: 1036 if (prange->svm_bo && prange->ttm_res) { 1037 if (bo_adev == adev) { 1038 mapping_flags |= coherent ? 1039 AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW; 1040 if (adev->gmc.xgmi.connected_to_cpu) 1041 snoop = true; 1042 } else { 1043 mapping_flags |= coherent ? 
1044 AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; 1045 if (amdgpu_xgmi_same_hive(adev, bo_adev)) 1046 snoop = true; 1047 } 1048 } else { 1049 mapping_flags |= coherent ? 1050 AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; 1051 } 1052 break; 1053 default: 1054 mapping_flags |= coherent ? 1055 AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; 1056 } 1057 1058 mapping_flags |= AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE; 1059 1060 if (flags & KFD_IOCTL_SVM_FLAG_GPU_RO) 1061 mapping_flags &= ~AMDGPU_VM_PAGE_WRITEABLE; 1062 if (flags & KFD_IOCTL_SVM_FLAG_GPU_EXEC) 1063 mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE; 1064 1065 pte_flags = AMDGPU_PTE_VALID; 1066 pte_flags |= prange->ttm_res ? 0 : AMDGPU_PTE_SYSTEM; 1067 pte_flags |= snoop ? AMDGPU_PTE_SNOOPED : 0; 1068 1069 pte_flags |= amdgpu_gem_va_map_flags(adev, mapping_flags); 1070 1071 pr_debug("svms 0x%p [0x%lx 0x%lx] vram %d PTE 0x%llx mapping 0x%x\n", 1072 prange->svms, prange->start, prange->last, 1073 prange->ttm_res ? 1:0, pte_flags, mapping_flags); 1074 1075 return pte_flags; 1076 } 1077 1078 static int 1079 svm_range_unmap_from_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm, 1080 uint64_t start, uint64_t last, 1081 struct dma_fence **fence) 1082 { 1083 uint64_t init_pte_value = 0; 1084 1085 pr_debug("[0x%llx 0x%llx]\n", start, last); 1086 1087 return amdgpu_vm_bo_update_mapping(adev, adev, vm, false, true, NULL, 1088 start, last, init_pte_value, 0, 1089 NULL, NULL, fence, NULL); 1090 } 1091 1092 static int 1093 svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start, 1094 unsigned long last) 1095 { 1096 DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE); 1097 struct kfd_process_device *pdd; 1098 struct dma_fence *fence = NULL; 1099 struct amdgpu_device *adev; 1100 struct kfd_process *p; 1101 uint32_t gpuidx; 1102 int r = 0; 1103 1104 bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip, 1105 MAX_GPU_INSTANCE); 1106 p = container_of(prange->svms, struct kfd_process, svms); 1107 1108 for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) { 1109 pr_debug("unmap from gpu idx 0x%x\n", gpuidx); 1110 pdd = kfd_process_device_from_gpuidx(p, gpuidx); 1111 if (!pdd) { 1112 pr_debug("failed to find device idx %d\n", gpuidx); 1113 return -EINVAL; 1114 } 1115 adev = (struct amdgpu_device *)pdd->dev->kgd; 1116 1117 r = svm_range_unmap_from_gpu(adev, drm_priv_to_vm(pdd->drm_priv), 1118 start, last, &fence); 1119 if (r) 1120 break; 1121 1122 if (fence) { 1123 r = dma_fence_wait(fence, false); 1124 dma_fence_put(fence); 1125 fence = NULL; 1126 if (r) 1127 break; 1128 } 1129 amdgpu_amdkfd_flush_gpu_tlb_pasid((struct kgd_dev *)adev, 1130 p->pasid, TLB_FLUSH_HEAVYWEIGHT); 1131 } 1132 1133 return r; 1134 } 1135 1136 static int 1137 svm_range_map_to_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm, 1138 struct svm_range *prange, dma_addr_t *dma_addr, 1139 struct amdgpu_device *bo_adev, struct dma_fence **fence) 1140 { 1141 struct amdgpu_bo_va bo_va; 1142 bool table_freed = false; 1143 uint64_t pte_flags; 1144 int r = 0; 1145 1146 pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms, prange->start, 1147 prange->last); 1148 1149 if (prange->svm_bo && prange->ttm_res) { 1150 bo_va.is_xgmi = amdgpu_xgmi_same_hive(adev, bo_adev); 1151 prange->mapping.bo_va = &bo_va; 1152 } 1153 1154 prange->mapping.start = prange->start; 1155 prange->mapping.last = prange->last; 1156 prange->mapping.offset = prange->offset; 1157 pte_flags = svm_range_get_pte_flags(adev, prange); 1158 1159 r = amdgpu_vm_bo_update_mapping(adev, bo_adev, vm, false, false, NULL, 1160 
prange->mapping.start, 1161 prange->mapping.last, pte_flags, 1162 prange->mapping.offset, 1163 prange->ttm_res, 1164 dma_addr, &vm->last_update, 1165 &table_freed); 1166 if (r) { 1167 pr_debug("failed %d to map to gpu 0x%lx\n", r, prange->start); 1168 goto out; 1169 } 1170 1171 r = amdgpu_vm_update_pdes(adev, vm, false); 1172 if (r) { 1173 pr_debug("failed %d to update directories 0x%lx\n", r, 1174 prange->start); 1175 goto out; 1176 } 1177 1178 if (fence) 1179 *fence = dma_fence_get(vm->last_update); 1180 1181 if (table_freed) { 1182 struct kfd_process *p; 1183 1184 p = container_of(prange->svms, struct kfd_process, svms); 1185 amdgpu_amdkfd_flush_gpu_tlb_pasid((struct kgd_dev *)adev, 1186 p->pasid, TLB_FLUSH_LEGACY); 1187 } 1188 out: 1189 prange->mapping.bo_va = NULL; 1190 return r; 1191 } 1192 1193 static int svm_range_map_to_gpus(struct svm_range *prange, 1194 unsigned long *bitmap, bool wait) 1195 { 1196 struct kfd_process_device *pdd; 1197 struct amdgpu_device *bo_adev; 1198 struct amdgpu_device *adev; 1199 struct kfd_process *p; 1200 struct dma_fence *fence = NULL; 1201 uint32_t gpuidx; 1202 int r = 0; 1203 1204 if (prange->svm_bo && prange->ttm_res) 1205 bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev); 1206 else 1207 bo_adev = NULL; 1208 1209 p = container_of(prange->svms, struct kfd_process, svms); 1210 for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) { 1211 pr_debug("mapping to gpu idx 0x%x\n", gpuidx); 1212 pdd = kfd_process_device_from_gpuidx(p, gpuidx); 1213 if (!pdd) { 1214 pr_debug("failed to find device idx %d\n", gpuidx); 1215 return -EINVAL; 1216 } 1217 adev = (struct amdgpu_device *)pdd->dev->kgd; 1218 1219 pdd = kfd_bind_process_to_device(pdd->dev, p); 1220 if (IS_ERR(pdd)) 1221 return -EINVAL; 1222 1223 if (bo_adev && adev != bo_adev && 1224 !amdgpu_xgmi_same_hive(adev, bo_adev)) { 1225 pr_debug("cannot map to device idx %d\n", gpuidx); 1226 continue; 1227 } 1228 1229 r = svm_range_map_to_gpu(adev, drm_priv_to_vm(pdd->drm_priv), 1230 prange, prange->dma_addr[gpuidx], 1231 bo_adev, wait ? 
&fence : NULL); 1232 if (r) 1233 break; 1234 1235 if (fence) { 1236 r = dma_fence_wait(fence, false); 1237 dma_fence_put(fence); 1238 fence = NULL; 1239 if (r) { 1240 pr_debug("failed %d to dma fence wait\n", r); 1241 break; 1242 } 1243 } 1244 } 1245 1246 return r; 1247 } 1248 1249 struct svm_validate_context { 1250 struct kfd_process *process; 1251 struct svm_range *prange; 1252 bool intr; 1253 unsigned long bitmap[MAX_GPU_INSTANCE]; 1254 struct ttm_validate_buffer tv[MAX_GPU_INSTANCE+1]; 1255 struct list_head validate_list; 1256 struct ww_acquire_ctx ticket; 1257 }; 1258 1259 static int svm_range_reserve_bos(struct svm_validate_context *ctx) 1260 { 1261 struct kfd_process_device *pdd; 1262 struct amdgpu_device *adev; 1263 struct amdgpu_vm *vm; 1264 uint32_t gpuidx; 1265 int r; 1266 1267 INIT_LIST_HEAD(&ctx->validate_list); 1268 for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) { 1269 pdd = kfd_process_device_from_gpuidx(ctx->process, gpuidx); 1270 if (!pdd) { 1271 pr_debug("failed to find device idx %d\n", gpuidx); 1272 return -EINVAL; 1273 } 1274 adev = (struct amdgpu_device *)pdd->dev->kgd; 1275 vm = drm_priv_to_vm(pdd->drm_priv); 1276 1277 ctx->tv[gpuidx].bo = &vm->root.base.bo->tbo; 1278 ctx->tv[gpuidx].num_shared = 4; 1279 list_add(&ctx->tv[gpuidx].head, &ctx->validate_list); 1280 } 1281 if (ctx->prange->svm_bo && ctx->prange->ttm_res) { 1282 ctx->tv[MAX_GPU_INSTANCE].bo = &ctx->prange->svm_bo->bo->tbo; 1283 ctx->tv[MAX_GPU_INSTANCE].num_shared = 1; 1284 list_add(&ctx->tv[MAX_GPU_INSTANCE].head, &ctx->validate_list); 1285 } 1286 1287 r = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->validate_list, 1288 ctx->intr, NULL); 1289 if (r) { 1290 pr_debug("failed %d to reserve bo\n", r); 1291 return r; 1292 } 1293 1294 for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) { 1295 pdd = kfd_process_device_from_gpuidx(ctx->process, gpuidx); 1296 if (!pdd) { 1297 pr_debug("failed to find device idx %d\n", gpuidx); 1298 r = -EINVAL; 1299 goto unreserve_out; 1300 } 1301 adev = (struct amdgpu_device *)pdd->dev->kgd; 1302 1303 r = amdgpu_vm_validate_pt_bos(adev, drm_priv_to_vm(pdd->drm_priv), 1304 svm_range_bo_validate, NULL); 1305 if (r) { 1306 pr_debug("failed %d validate pt bos\n", r); 1307 goto unreserve_out; 1308 } 1309 } 1310 1311 return 0; 1312 1313 unreserve_out: 1314 ttm_eu_backoff_reservation(&ctx->ticket, &ctx->validate_list); 1315 return r; 1316 } 1317 1318 static void svm_range_unreserve_bos(struct svm_validate_context *ctx) 1319 { 1320 ttm_eu_backoff_reservation(&ctx->ticket, &ctx->validate_list); 1321 } 1322 1323 /* 1324 * Validation+GPU mapping with concurrent invalidation (MMU notifiers) 1325 * 1326 * To prevent concurrent destruction or change of range attributes, the 1327 * svm_read_lock must be held. The caller must not hold the svm_write_lock 1328 * because that would block concurrent evictions and lead to deadlocks. To 1329 * serialize concurrent migrations or validations of the same range, the 1330 * prange->migrate_mutex must be held. 1331 * 1332 * For VRAM ranges, the SVM BO must be allocated and valid (protected by its 1333 * eviction fence. 1334 * 1335 * The following sequence ensures race-free validation and GPU mapping: 1336 * 1337 * 1. Reserve page table (and SVM BO if range is in VRAM) 1338 * 2. hmm_range_fault to get page addresses (if system memory) 1339 * 3. DMA-map pages (if system memory) 1340 * 4-a. Take notifier lock 1341 * 4-b. Check that pages still valid (mmu_interval_read_retry) 1342 * 4-c. 
Check that the range was not split or otherwise invalidated 1343 * 4-d. Update GPU page table 1344 * 4.e. Release notifier lock 1345 * 5. Release page table (and SVM BO) reservation 1346 */ 1347 static int svm_range_validate_and_map(struct mm_struct *mm, 1348 struct svm_range *prange, 1349 int32_t gpuidx, bool intr, bool wait) 1350 { 1351 struct svm_validate_context ctx; 1352 struct hmm_range *hmm_range; 1353 int r = 0; 1354 1355 ctx.process = container_of(prange->svms, struct kfd_process, svms); 1356 ctx.prange = prange; 1357 ctx.intr = intr; 1358 1359 if (gpuidx < MAX_GPU_INSTANCE) { 1360 bitmap_zero(ctx.bitmap, MAX_GPU_INSTANCE); 1361 bitmap_set(ctx.bitmap, gpuidx, 1); 1362 } else if (ctx.process->xnack_enabled) { 1363 bitmap_copy(ctx.bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE); 1364 1365 /* If prefetch range to GPU, or GPU retry fault migrate range to 1366 * GPU, which has ACCESS attribute to the range, create mapping 1367 * on that GPU. 1368 */ 1369 if (prange->actual_loc) { 1370 gpuidx = kfd_process_gpuidx_from_gpuid(ctx.process, 1371 prange->actual_loc); 1372 if (gpuidx < 0) { 1373 WARN_ONCE(1, "failed get device by id 0x%x\n", 1374 prange->actual_loc); 1375 return -EINVAL; 1376 } 1377 if (test_bit(gpuidx, prange->bitmap_access)) 1378 bitmap_set(ctx.bitmap, gpuidx, 1); 1379 } 1380 } else { 1381 bitmap_or(ctx.bitmap, prange->bitmap_access, 1382 prange->bitmap_aip, MAX_GPU_INSTANCE); 1383 } 1384 1385 if (bitmap_empty(ctx.bitmap, MAX_GPU_INSTANCE)) 1386 return 0; 1387 1388 if (prange->actual_loc && !prange->ttm_res) { 1389 /* This should never happen. actual_loc gets set by 1390 * svm_migrate_ram_to_vram after allocating a BO. 1391 */ 1392 WARN(1, "VRAM BO missing during validation\n"); 1393 return -EINVAL; 1394 } 1395 1396 svm_range_reserve_bos(&ctx); 1397 1398 if (!prange->actual_loc) { 1399 r = amdgpu_hmm_range_get_pages(&prange->notifier, mm, NULL, 1400 prange->start << PAGE_SHIFT, 1401 prange->npages, &hmm_range, 1402 false, true); 1403 if (r) { 1404 pr_debug("failed %d to get svm range pages\n", r); 1405 goto unreserve_out; 1406 } 1407 1408 r = svm_range_dma_map(prange, ctx.bitmap, 1409 hmm_range->hmm_pfns); 1410 if (r) { 1411 pr_debug("failed %d to dma map range\n", r); 1412 goto unreserve_out; 1413 } 1414 1415 prange->validated_once = true; 1416 } 1417 1418 svm_range_lock(prange); 1419 if (!prange->actual_loc) { 1420 if (amdgpu_hmm_range_get_pages_done(hmm_range)) { 1421 pr_debug("hmm update the range, need validate again\n"); 1422 r = -EAGAIN; 1423 goto unlock_out; 1424 } 1425 } 1426 if (!list_empty(&prange->child_list)) { 1427 pr_debug("range split by unmap in parallel, validate again\n"); 1428 r = -EAGAIN; 1429 goto unlock_out; 1430 } 1431 1432 r = svm_range_map_to_gpus(prange, ctx.bitmap, wait); 1433 1434 unlock_out: 1435 svm_range_unlock(prange); 1436 unreserve_out: 1437 svm_range_unreserve_bos(&ctx); 1438 1439 if (!r) 1440 prange->validate_timestamp = ktime_to_us(ktime_get()); 1441 1442 return r; 1443 } 1444 1445 /** 1446 * svm_range_list_lock_and_flush_work - flush pending deferred work 1447 * 1448 * @svms: the svm range list 1449 * @mm: the mm structure 1450 * 1451 * Context: Returns with mmap write lock held, pending deferred work flushed 1452 * 1453 */ 1454 static void 1455 svm_range_list_lock_and_flush_work(struct svm_range_list *svms, 1456 struct mm_struct *mm) 1457 { 1458 retry_flush_work: 1459 flush_work(&svms->deferred_list_work); 1460 mmap_write_lock(mm); 1461 1462 if (list_empty(&svms->deferred_range_list)) 1463 return; 1464 mmap_write_unlock(mm); 1465 
pr_debug("retry flush\n"); 1466 goto retry_flush_work; 1467 } 1468 1469 static void svm_range_restore_work(struct work_struct *work) 1470 { 1471 struct delayed_work *dwork = to_delayed_work(work); 1472 struct amdkfd_process_info *process_info; 1473 struct svm_range_list *svms; 1474 struct svm_range *prange; 1475 struct kfd_process *p; 1476 struct mm_struct *mm; 1477 int evicted_ranges; 1478 int invalid; 1479 int r; 1480 1481 svms = container_of(dwork, struct svm_range_list, restore_work); 1482 evicted_ranges = atomic_read(&svms->evicted_ranges); 1483 if (!evicted_ranges) 1484 return; 1485 1486 pr_debug("restore svm ranges\n"); 1487 1488 /* kfd_process_notifier_release destroys this worker thread. So during 1489 * the lifetime of this thread, kfd_process and mm will be valid. 1490 */ 1491 p = container_of(svms, struct kfd_process, svms); 1492 process_info = p->kgd_process_info; 1493 mm = p->mm; 1494 if (!mm) 1495 return; 1496 1497 mutex_lock(&process_info->lock); 1498 svm_range_list_lock_and_flush_work(svms, mm); 1499 mutex_lock(&svms->lock); 1500 1501 evicted_ranges = atomic_read(&svms->evicted_ranges); 1502 1503 list_for_each_entry(prange, &svms->list, list) { 1504 invalid = atomic_read(&prange->invalid); 1505 if (!invalid) 1506 continue; 1507 1508 pr_debug("restoring svms 0x%p prange 0x%p [0x%lx %lx] inv %d\n", 1509 prange->svms, prange, prange->start, prange->last, 1510 invalid); 1511 1512 /* 1513 * If range is migrating, wait for migration is done. 1514 */ 1515 mutex_lock(&prange->migrate_mutex); 1516 1517 r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE, 1518 false, true); 1519 if (r) 1520 pr_debug("failed %d to map 0x%lx to gpus\n", r, 1521 prange->start); 1522 1523 mutex_unlock(&prange->migrate_mutex); 1524 if (r) 1525 goto out_reschedule; 1526 1527 if (atomic_cmpxchg(&prange->invalid, invalid, 0) != invalid) 1528 goto out_reschedule; 1529 } 1530 1531 if (atomic_cmpxchg(&svms->evicted_ranges, evicted_ranges, 0) != 1532 evicted_ranges) 1533 goto out_reschedule; 1534 1535 evicted_ranges = 0; 1536 1537 r = kgd2kfd_resume_mm(mm); 1538 if (r) { 1539 /* No recovery from this failure. Probably the CP is 1540 * hanging. No point trying again. 1541 */ 1542 pr_debug("failed %d to resume KFD\n", r); 1543 } 1544 1545 pr_debug("restore svm ranges successfully\n"); 1546 1547 out_reschedule: 1548 mutex_unlock(&svms->lock); 1549 mmap_write_unlock(mm); 1550 mutex_unlock(&process_info->lock); 1551 1552 /* If validation failed, reschedule another attempt */ 1553 if (evicted_ranges) { 1554 pr_debug("reschedule to restore svm range\n"); 1555 schedule_delayed_work(&svms->restore_work, 1556 msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS)); 1557 } 1558 } 1559 1560 /** 1561 * svm_range_evict - evict svm range 1562 * 1563 * Stop all queues of the process to ensure GPU doesn't access the memory, then 1564 * return to let CPU evict the buffer and proceed CPU pagetable update. 1565 * 1566 * Don't need use lock to sync cpu pagetable invalidation with GPU execution. 1567 * If invalidation happens while restore work is running, restore work will 1568 * restart to ensure to get the latest CPU pages mapping to GPU, then start 1569 * the queues. 
1570 */ 1571 static int 1572 svm_range_evict(struct svm_range *prange, struct mm_struct *mm, 1573 unsigned long start, unsigned long last) 1574 { 1575 struct svm_range_list *svms = prange->svms; 1576 struct kfd_process *p; 1577 int r = 0; 1578 1579 p = container_of(svms, struct kfd_process, svms); 1580 1581 pr_debug("invalidate svms 0x%p prange [0x%lx 0x%lx] [0x%lx 0x%lx]\n", 1582 svms, prange->start, prange->last, start, last); 1583 1584 if (!p->xnack_enabled) { 1585 int evicted_ranges; 1586 1587 atomic_inc(&prange->invalid); 1588 evicted_ranges = atomic_inc_return(&svms->evicted_ranges); 1589 if (evicted_ranges != 1) 1590 return r; 1591 1592 pr_debug("evicting svms 0x%p range [0x%lx 0x%lx]\n", 1593 prange->svms, prange->start, prange->last); 1594 1595 /* First eviction, stop the queues */ 1596 r = kgd2kfd_quiesce_mm(mm); 1597 if (r) 1598 pr_debug("failed to quiesce KFD\n"); 1599 1600 pr_debug("schedule to restore svm %p ranges\n", svms); 1601 schedule_delayed_work(&svms->restore_work, 1602 msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS)); 1603 } else { 1604 struct svm_range *pchild; 1605 unsigned long s, l; 1606 1607 pr_debug("invalidate unmap svms 0x%p [0x%lx 0x%lx] from GPUs\n", 1608 prange->svms, start, last); 1609 list_for_each_entry(pchild, &prange->child_list, child_list) { 1610 mutex_lock_nested(&pchild->lock, 1); 1611 s = max(start, pchild->start); 1612 l = min(last, pchild->last); 1613 if (l >= s) 1614 svm_range_unmap_from_gpus(pchild, s, l); 1615 mutex_unlock(&pchild->lock); 1616 } 1617 s = max(start, prange->start); 1618 l = min(last, prange->last); 1619 if (l >= s) 1620 svm_range_unmap_from_gpus(prange, s, l); 1621 } 1622 1623 return r; 1624 } 1625 1626 static struct svm_range *svm_range_clone(struct svm_range *old) 1627 { 1628 struct svm_range *new; 1629 1630 new = svm_range_new(old->svms, old->start, old->last); 1631 if (!new) 1632 return NULL; 1633 1634 if (old->svm_bo) { 1635 new->ttm_res = old->ttm_res; 1636 new->offset = old->offset; 1637 new->svm_bo = svm_range_bo_ref(old->svm_bo); 1638 spin_lock(&new->svm_bo->list_lock); 1639 list_add(&new->svm_bo_list, &new->svm_bo->range_list); 1640 spin_unlock(&new->svm_bo->list_lock); 1641 } 1642 new->flags = old->flags; 1643 new->preferred_loc = old->preferred_loc; 1644 new->prefetch_loc = old->prefetch_loc; 1645 new->actual_loc = old->actual_loc; 1646 new->granularity = old->granularity; 1647 bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE); 1648 bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE); 1649 1650 return new; 1651 } 1652 1653 /** 1654 * svm_range_handle_overlap - split overlap ranges 1655 * @svms: svm range list header 1656 * @new: range added with this attributes 1657 * @start: range added start address, in pages 1658 * @last: range last address, in pages 1659 * @update_list: output, the ranges attributes are updated. For set_attr, this 1660 * will do validation and map to GPUs. For unmap, this will be 1661 * removed and unmap from GPUs 1662 * @insert_list: output, the ranges will be inserted into svms, attributes are 1663 * not changes. For set_attr, this will add into svms. 1664 * @remove_list:output, the ranges will be removed from svms 1665 * @left: the remaining range after overlap, For set_attr, this will be added 1666 * as new range. 1667 * 1668 * Total have 5 overlap cases. 1669 * 1670 * This function handles overlap of an address interval with existing 1671 * struct svm_ranges for applying new attributes. This may require 1672 * splitting existing struct svm_ranges. 
All changes should be applied to 1673 * the range_list and interval tree transactionally. If any split operation 1674 * fails, the entire update fails. Therefore the existing overlapping 1675 * svm_ranges are cloned and the original svm_ranges left unchanged. If the 1676 * transaction succeeds, the modified clones are added and the originals 1677 * freed. Otherwise the clones are removed and the old svm_ranges remain. 1678 * 1679 * Context: The caller must hold svms->lock 1680 */ 1681 static int 1682 svm_range_handle_overlap(struct svm_range_list *svms, struct svm_range *new, 1683 unsigned long start, unsigned long last, 1684 struct list_head *update_list, 1685 struct list_head *insert_list, 1686 struct list_head *remove_list, 1687 unsigned long *left) 1688 { 1689 struct interval_tree_node *node; 1690 struct svm_range *prange; 1691 struct svm_range *tmp; 1692 int r = 0; 1693 1694 INIT_LIST_HEAD(update_list); 1695 INIT_LIST_HEAD(insert_list); 1696 INIT_LIST_HEAD(remove_list); 1697 1698 node = interval_tree_iter_first(&svms->objects, start, last); 1699 while (node) { 1700 struct interval_tree_node *next; 1701 struct svm_range *old; 1702 unsigned long next_start; 1703 1704 pr_debug("found overlap node [0x%lx 0x%lx]\n", node->start, 1705 node->last); 1706 1707 old = container_of(node, struct svm_range, it_node); 1708 next = interval_tree_iter_next(node, start, last); 1709 next_start = min(node->last, last) + 1; 1710 1711 if (node->start < start || node->last > last) { 1712 /* node intersects the updated range, clone+split it */ 1713 prange = svm_range_clone(old); 1714 if (!prange) { 1715 r = -ENOMEM; 1716 goto out; 1717 } 1718 1719 list_add(&old->remove_list, remove_list); 1720 list_add(&prange->insert_list, insert_list); 1721 1722 if (node->start < start) { 1723 pr_debug("change old range start\n"); 1724 r = svm_range_split_head(prange, new, start, 1725 insert_list); 1726 if (r) 1727 goto out; 1728 } 1729 if (node->last > last) { 1730 pr_debug("change old range last\n"); 1731 r = svm_range_split_tail(prange, new, last, 1732 insert_list); 1733 if (r) 1734 goto out; 1735 } 1736 } else { 1737 /* The node is contained within start..last, 1738 * just update it 1739 */ 1740 prange = old; 1741 } 1742 1743 if (!svm_range_is_same_attrs(prange, new)) 1744 list_add(&prange->update_list, update_list); 1745 1746 /* insert a new node if needed */ 1747 if (node->start > start) { 1748 prange = svm_range_new(prange->svms, start, 1749 node->start - 1); 1750 if (!prange) { 1751 r = -ENOMEM; 1752 goto out; 1753 } 1754 1755 list_add(&prange->insert_list, insert_list); 1756 list_add(&prange->update_list, update_list); 1757 } 1758 1759 node = next; 1760 start = next_start; 1761 } 1762 1763 if (left && start <= last) 1764 *left = last - start + 1; 1765 1766 out: 1767 if (r) 1768 list_for_each_entry_safe(prange, tmp, insert_list, insert_list) 1769 svm_range_free(prange); 1770 1771 return r; 1772 } 1773 1774 static void 1775 svm_range_update_notifier_and_interval_tree(struct mm_struct *mm, 1776 struct svm_range *prange) 1777 { 1778 unsigned long start; 1779 unsigned long last; 1780 1781 start = prange->notifier.interval_tree.start >> PAGE_SHIFT; 1782 last = prange->notifier.interval_tree.last >> PAGE_SHIFT; 1783 1784 if (prange->start == start && prange->last == last) 1785 return; 1786 1787 pr_debug("up notifier 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n", 1788 prange->svms, prange, start, last, prange->start, 1789 prange->last); 1790 1791 if (start != 0 && last != 0) { 1792 interval_tree_remove(&prange->it_node, 
&prange->svms->objects); 1793 svm_range_remove_notifier(prange); 1794 } 1795 prange->it_node.start = prange->start; 1796 prange->it_node.last = prange->last; 1797 1798 interval_tree_insert(&prange->it_node, &prange->svms->objects); 1799 svm_range_add_notifier_locked(mm, prange); 1800 } 1801 1802 static void 1803 svm_range_handle_list_op(struct svm_range_list *svms, struct svm_range *prange) 1804 { 1805 struct mm_struct *mm = prange->work_item.mm; 1806 1807 switch (prange->work_item.op) { 1808 case SVM_OP_NULL: 1809 pr_debug("NULL OP 0x%p prange 0x%p [0x%lx 0x%lx]\n", 1810 svms, prange, prange->start, prange->last); 1811 break; 1812 case SVM_OP_UNMAP_RANGE: 1813 pr_debug("remove 0x%p prange 0x%p [0x%lx 0x%lx]\n", 1814 svms, prange, prange->start, prange->last); 1815 svm_range_unlink(prange); 1816 svm_range_remove_notifier(prange); 1817 svm_range_free(prange); 1818 break; 1819 case SVM_OP_UPDATE_RANGE_NOTIFIER: 1820 pr_debug("update notifier 0x%p prange 0x%p [0x%lx 0x%lx]\n", 1821 svms, prange, prange->start, prange->last); 1822 svm_range_update_notifier_and_interval_tree(mm, prange); 1823 break; 1824 case SVM_OP_UPDATE_RANGE_NOTIFIER_AND_MAP: 1825 pr_debug("update and map 0x%p prange 0x%p [0x%lx 0x%lx]\n", 1826 svms, prange, prange->start, prange->last); 1827 svm_range_update_notifier_and_interval_tree(mm, prange); 1828 /* TODO: implement deferred validation and mapping */ 1829 break; 1830 case SVM_OP_ADD_RANGE: 1831 pr_debug("add 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms, prange, 1832 prange->start, prange->last); 1833 svm_range_add_to_svms(prange); 1834 svm_range_add_notifier_locked(mm, prange); 1835 break; 1836 case SVM_OP_ADD_RANGE_AND_MAP: 1837 pr_debug("add and map 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms, 1838 prange, prange->start, prange->last); 1839 svm_range_add_to_svms(prange); 1840 svm_range_add_notifier_locked(mm, prange); 1841 /* TODO: implement deferred validation and mapping */ 1842 break; 1843 default: 1844 WARN_ONCE(1, "Unknown prange 0x%p work op %d\n", prange, 1845 prange->work_item.op); 1846 } 1847 } 1848 1849 static void svm_range_drain_retry_fault(struct svm_range_list *svms) 1850 { 1851 struct kfd_process_device *pdd; 1852 struct amdgpu_device *adev; 1853 struct kfd_process *p; 1854 uint32_t i; 1855 1856 p = container_of(svms, struct kfd_process, svms); 1857 1858 for (i = 0; i < p->n_pdds; i++) { 1859 pdd = p->pdds[i]; 1860 if (!pdd) 1861 continue; 1862 1863 pr_debug("drain retry fault gpu %d svms %p\n", i, svms); 1864 adev = (struct amdgpu_device *)pdd->dev->kgd; 1865 1866 amdgpu_ih_wait_on_checkpoint_process(adev, &adev->irq.ih1); 1867 pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms); 1868 } 1869 } 1870 1871 static void svm_range_deferred_list_work(struct work_struct *work) 1872 { 1873 struct svm_range_list *svms; 1874 struct svm_range *prange; 1875 struct mm_struct *mm; 1876 1877 svms = container_of(work, struct svm_range_list, deferred_list_work); 1878 pr_debug("enter svms 0x%p\n", svms); 1879 1880 spin_lock(&svms->deferred_list_lock); 1881 while (!list_empty(&svms->deferred_range_list)) { 1882 prange = list_first_entry(&svms->deferred_range_list, 1883 struct svm_range, deferred_list); 1884 spin_unlock(&svms->deferred_list_lock); 1885 pr_debug("prange 0x%p [0x%lx 0x%lx] op %d\n", prange, 1886 prange->start, prange->last, prange->work_item.op); 1887 1888 /* Make sure no stale retry fault coming after range is freed */ 1889 if (prange->work_item.op == SVM_OP_UNMAP_RANGE) 1890 svm_range_drain_retry_fault(prange->svms); 1891 1892 mm = 
prange->work_item.mm; 1893 mmap_write_lock(mm); 1894 mutex_lock(&svms->lock); 1895 1896 /* Remove from deferred_list must be inside mmap write lock, 1897 * otherwise, svm_range_list_lock_and_flush_work may hold mmap 1898 * write lock, and continue because deferred_list is empty, then 1899 * deferred_list handle is blocked by mmap write lock. 1900 */ 1901 spin_lock(&svms->deferred_list_lock); 1902 list_del_init(&prange->deferred_list); 1903 spin_unlock(&svms->deferred_list_lock); 1904 1905 mutex_lock(&prange->migrate_mutex); 1906 while (!list_empty(&prange->child_list)) { 1907 struct svm_range *pchild; 1908 1909 pchild = list_first_entry(&prange->child_list, 1910 struct svm_range, child_list); 1911 pr_debug("child prange 0x%p op %d\n", pchild, 1912 pchild->work_item.op); 1913 list_del_init(&pchild->child_list); 1914 svm_range_handle_list_op(svms, pchild); 1915 } 1916 mutex_unlock(&prange->migrate_mutex); 1917 1918 svm_range_handle_list_op(svms, prange); 1919 mutex_unlock(&svms->lock); 1920 mmap_write_unlock(mm); 1921 1922 spin_lock(&svms->deferred_list_lock); 1923 } 1924 spin_unlock(&svms->deferred_list_lock); 1925 1926 pr_debug("exit svms 0x%p\n", svms); 1927 } 1928 1929 void 1930 svm_range_add_list_work(struct svm_range_list *svms, struct svm_range *prange, 1931 struct mm_struct *mm, enum svm_work_list_ops op) 1932 { 1933 spin_lock(&svms->deferred_list_lock); 1934 /* if prange is on the deferred list */ 1935 if (!list_empty(&prange->deferred_list)) { 1936 pr_debug("update exist prange 0x%p work op %d\n", prange, op); 1937 WARN_ONCE(prange->work_item.mm != mm, "unmatch mm\n"); 1938 if (op != SVM_OP_NULL && 1939 prange->work_item.op != SVM_OP_UNMAP_RANGE) 1940 prange->work_item.op = op; 1941 } else { 1942 prange->work_item.op = op; 1943 prange->work_item.mm = mm; 1944 list_add_tail(&prange->deferred_list, 1945 &prange->svms->deferred_range_list); 1946 pr_debug("add prange 0x%p [0x%lx 0x%lx] to work list op %d\n", 1947 prange, prange->start, prange->last, op); 1948 } 1949 spin_unlock(&svms->deferred_list_lock); 1950 } 1951 1952 void schedule_deferred_list_work(struct svm_range_list *svms) 1953 { 1954 spin_lock(&svms->deferred_list_lock); 1955 if (!list_empty(&svms->deferred_range_list)) 1956 schedule_work(&svms->deferred_list_work); 1957 spin_unlock(&svms->deferred_list_lock); 1958 } 1959 1960 static void 1961 svm_range_unmap_split(struct mm_struct *mm, struct svm_range *parent, 1962 struct svm_range *prange, unsigned long start, 1963 unsigned long last) 1964 { 1965 struct svm_range *head; 1966 struct svm_range *tail; 1967 1968 if (prange->work_item.op == SVM_OP_UNMAP_RANGE) { 1969 pr_debug("prange 0x%p [0x%lx 0x%lx] is already freed\n", prange, 1970 prange->start, prange->last); 1971 return; 1972 } 1973 if (start > prange->last || last < prange->start) 1974 return; 1975 1976 head = tail = prange; 1977 if (start > prange->start) 1978 svm_range_split(prange, prange->start, start - 1, &tail); 1979 if (last < tail->last) 1980 svm_range_split(tail, last + 1, tail->last, &head); 1981 1982 if (head != prange && tail != prange) { 1983 svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE); 1984 svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE); 1985 } else if (tail != prange) { 1986 svm_range_add_child(parent, mm, tail, SVM_OP_UNMAP_RANGE); 1987 } else if (head != prange) { 1988 svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE); 1989 } else if (parent != prange) { 1990 prange->work_item.op = SVM_OP_UNMAP_RANGE; 1991 } 1992 } 1993 1994 static void 1995 
svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange, 1996 unsigned long start, unsigned long last) 1997 { 1998 struct svm_range_list *svms; 1999 struct svm_range *pchild; 2000 struct kfd_process *p; 2001 unsigned long s, l; 2002 bool unmap_parent; 2003 2004 p = kfd_lookup_process_by_mm(mm); 2005 if (!p) 2006 return; 2007 svms = &p->svms; 2008 2009 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n", svms, 2010 prange, prange->start, prange->last, start, last); 2011 2012 unmap_parent = start <= prange->start && last >= prange->last; 2013 2014 list_for_each_entry(pchild, &prange->child_list, child_list) { 2015 mutex_lock_nested(&pchild->lock, 1); 2016 s = max(start, pchild->start); 2017 l = min(last, pchild->last); 2018 if (l >= s) 2019 svm_range_unmap_from_gpus(pchild, s, l); 2020 svm_range_unmap_split(mm, prange, pchild, start, last); 2021 mutex_unlock(&pchild->lock); 2022 } 2023 s = max(start, prange->start); 2024 l = min(last, prange->last); 2025 if (l >= s) 2026 svm_range_unmap_from_gpus(prange, s, l); 2027 svm_range_unmap_split(mm, prange, prange, start, last); 2028 2029 if (unmap_parent) 2030 svm_range_add_list_work(svms, prange, mm, SVM_OP_UNMAP_RANGE); 2031 else 2032 svm_range_add_list_work(svms, prange, mm, 2033 SVM_OP_UPDATE_RANGE_NOTIFIER); 2034 schedule_deferred_list_work(svms); 2035 2036 kfd_unref_process(p); 2037 } 2038 2039 /** 2040 * svm_range_cpu_invalidate_pagetables - interval notifier callback 2041 * 2042 * If event is MMU_NOTIFY_UNMAP, this is from CPU unmap range, otherwise, it 2043 * is from migration, or CPU page invalidation callback. 2044 * 2045 * For unmap event, unmap range from GPUs, remove prange from svms in a delayed 2046 * work thread, and split prange if only part of prange is unmapped. 2047 * 2048 * For invalidation event, if GPU retry fault is not enabled, evict the queues, 2049 * then schedule svm_range_restore_work to update GPU mapping and resume queues. 2050 * If GPU retry fault is enabled, unmap the svm range from GPU, retry fault will 2051 * update GPU mapping to recover. 2052 * 2053 * Context: mmap lock, notifier_invalidate_start lock are held 2054 * for invalidate event, prange lock is held if this is from migration 2055 */ 2056 static bool 2057 svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni, 2058 const struct mmu_notifier_range *range, 2059 unsigned long cur_seq) 2060 { 2061 struct svm_range *prange; 2062 unsigned long start; 2063 unsigned long last; 2064 2065 if (range->event == MMU_NOTIFY_RELEASE) 2066 return true; 2067 2068 start = mni->interval_tree.start; 2069 last = mni->interval_tree.last; 2070 start = (start > range->start ? start : range->start) >> PAGE_SHIFT; 2071 last = (last < (range->end - 1) ? 
last : range->end - 1) >> PAGE_SHIFT; 2072 pr_debug("[0x%lx 0x%lx] range[0x%lx 0x%lx] notifier[0x%lx 0x%lx] %d\n", 2073 start, last, range->start >> PAGE_SHIFT, 2074 (range->end - 1) >> PAGE_SHIFT, 2075 mni->interval_tree.start >> PAGE_SHIFT, 2076 mni->interval_tree.last >> PAGE_SHIFT, range->event); 2077 2078 prange = container_of(mni, struct svm_range, notifier); 2079 2080 svm_range_lock(prange); 2081 mmu_interval_set_seq(mni, cur_seq); 2082 2083 switch (range->event) { 2084 case MMU_NOTIFY_UNMAP: 2085 svm_range_unmap_from_cpu(mni->mm, prange, start, last); 2086 break; 2087 default: 2088 svm_range_evict(prange, mni->mm, start, last); 2089 break; 2090 } 2091 2092 svm_range_unlock(prange); 2093 2094 return true; 2095 } 2096 2097 /** 2098 * svm_range_from_addr - find svm range from fault address 2099 * @svms: svm range list header 2100 * @addr: address to search range interval tree, in pages 2101 * @parent: parent range if range is on child list 2102 * 2103 * Context: The caller must hold svms->lock 2104 * 2105 * Return: the svm_range found or NULL 2106 */ 2107 struct svm_range * 2108 svm_range_from_addr(struct svm_range_list *svms, unsigned long addr, 2109 struct svm_range **parent) 2110 { 2111 struct interval_tree_node *node; 2112 struct svm_range *prange; 2113 struct svm_range *pchild; 2114 2115 node = interval_tree_iter_first(&svms->objects, addr, addr); 2116 if (!node) 2117 return NULL; 2118 2119 prange = container_of(node, struct svm_range, it_node); 2120 pr_debug("address 0x%lx prange [0x%lx 0x%lx] node [0x%lx 0x%lx]\n", 2121 addr, prange->start, prange->last, node->start, node->last); 2122 2123 if (addr >= prange->start && addr <= prange->last) { 2124 if (parent) 2125 *parent = prange; 2126 return prange; 2127 } 2128 list_for_each_entry(pchild, &prange->child_list, child_list) 2129 if (addr >= pchild->start && addr <= pchild->last) { 2130 pr_debug("found address 0x%lx pchild [0x%lx 0x%lx]\n", 2131 addr, pchild->start, pchild->last); 2132 if (parent) 2133 *parent = prange; 2134 return pchild; 2135 } 2136 2137 return NULL; 2138 } 2139 2140 /* svm_range_best_restore_location - decide the best fault restore location 2141 * @prange: svm range structure 2142 * @adev: the GPU on which vm fault happened 2143 * 2144 * This is only called when xnack is on, to decide the best location to restore 2145 * the range mapping after GPU vm fault. Caller uses the best location to do 2146 * migration if actual loc is not best location, then update GPU page table 2147 * mapping to the best location. 2148 * 2149 * If vm fault gpu is range preferred loc, the best_loc is preferred loc. 2150 * If vm fault gpu idx is on range ACCESSIBLE bitmap, best_loc is vm fault gpu 2151 * If vm fault gpu idx is on range ACCESSIBLE_IN_PLACE bitmap, then 2152 * if range actual loc is cpu, best_loc is cpu 2153 * if vm fault gpu is on xgmi same hive of range actual loc gpu, best_loc is 2154 * range actual loc. 2155 * Otherwise, GPU no access, best_loc is -1. 
2156 * 2157 * Return: 2158 * -1 means vm fault GPU no access 2159 * 0 for CPU or GPU id 2160 */ 2161 static int32_t 2162 svm_range_best_restore_location(struct svm_range *prange, 2163 struct amdgpu_device *adev, 2164 int32_t *gpuidx) 2165 { 2166 struct amdgpu_device *bo_adev; 2167 struct kfd_process *p; 2168 uint32_t gpuid; 2169 int r; 2170 2171 p = container_of(prange->svms, struct kfd_process, svms); 2172 2173 r = kfd_process_gpuid_from_kgd(p, adev, &gpuid, gpuidx); 2174 if (r < 0) { 2175 pr_debug("failed to get gpuid from kgd\n"); 2176 return -1; 2177 } 2178 2179 if (prange->preferred_loc == gpuid) 2180 return prange->preferred_loc; 2181 2182 if (test_bit(*gpuidx, prange->bitmap_access)) 2183 return gpuid; 2184 2185 if (test_bit(*gpuidx, prange->bitmap_aip)) { 2186 if (!prange->actual_loc) 2187 return 0; 2188 2189 bo_adev = svm_range_get_adev_by_id(prange, prange->actual_loc); 2190 if (amdgpu_xgmi_same_hive(adev, bo_adev)) 2191 return prange->actual_loc; 2192 else 2193 return 0; 2194 } 2195 2196 return -1; 2197 } 2198 static int 2199 svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr, 2200 unsigned long *start, unsigned long *last) 2201 { 2202 struct vm_area_struct *vma; 2203 struct interval_tree_node *node; 2204 unsigned long start_limit, end_limit; 2205 2206 vma = find_vma(p->mm, addr << PAGE_SHIFT); 2207 if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) { 2208 pr_debug("VMA does not exist in address [0x%llx]\n", addr); 2209 return -EFAULT; 2210 } 2211 start_limit = max(vma->vm_start >> PAGE_SHIFT, 2212 (unsigned long)ALIGN_DOWN(addr, 2UL << 8)); 2213 end_limit = min(vma->vm_end >> PAGE_SHIFT, 2214 (unsigned long)ALIGN(addr + 1, 2UL << 8)); 2215 /* First range that starts after the fault address */ 2216 node = interval_tree_iter_first(&p->svms.objects, addr + 1, ULONG_MAX); 2217 if (node) { 2218 end_limit = min(end_limit, node->start); 2219 /* Last range that ends before the fault address */ 2220 node = container_of(rb_prev(&node->rb), 2221 struct interval_tree_node, rb); 2222 } else { 2223 /* Last range must end before addr because 2224 * there was no range after addr 2225 */ 2226 node = container_of(rb_last(&p->svms.objects.rb_root), 2227 struct interval_tree_node, rb); 2228 } 2229 if (node) { 2230 if (node->last >= addr) { 2231 WARN(1, "Overlap with prev node and page fault addr\n"); 2232 return -EFAULT; 2233 } 2234 start_limit = max(start_limit, node->last + 1); 2235 } 2236 2237 *start = start_limit; 2238 *last = end_limit - 1; 2239 2240 pr_debug("vma start: 0x%lx start: 0x%lx vma end: 0x%lx last: 0x%lx\n", 2241 vma->vm_start >> PAGE_SHIFT, *start, 2242 vma->vm_end >> PAGE_SHIFT, *last); 2243 2244 return 0; 2245 2246 } 2247 static struct 2248 svm_range *svm_range_create_unregistered_range(struct amdgpu_device *adev, 2249 struct kfd_process *p, 2250 struct mm_struct *mm, 2251 int64_t addr) 2252 { 2253 struct svm_range *prange = NULL; 2254 unsigned long start, last; 2255 uint32_t gpuid, gpuidx; 2256 2257 if (svm_range_get_range_boundaries(p, addr, &start, &last)) 2258 return NULL; 2259 2260 prange = svm_range_new(&p->svms, start, last); 2261 if (!prange) { 2262 pr_debug("Failed to create prange in address [0x%llx]\n", addr); 2263 return NULL; 2264 } 2265 if (kfd_process_gpuid_from_kgd(p, adev, &gpuid, &gpuidx)) { 2266 pr_debug("failed to get gpuid from kgd\n"); 2267 svm_range_free(prange); 2268 return NULL; 2269 } 2270 2271 svm_range_add_to_svms(prange); 2272 svm_range_add_notifier_locked(mm, prange); 2273 2274 return prange; 2275 } 2276 2277 /* 
svm_range_skip_recover - decide if prange can be recovered
 * @prange: svm range structure
 *
 * The GPU VM retry fault handler skips recovering the range in these cases:
 * 1. prange is on the deferred list to be removed after unmap; the fault is
 *    stale and the deferred list work drains it before freeing the prange.
 * 2. prange is on the deferred list waiting for its interval notifier to be
 *    added after a split, or
 * 3. prange is a child range split from a parent prange; it is recovered
 *    later, after the interval notifier is added.
 *
 * Return: true to skip recover, false to recover
 */
static bool svm_range_skip_recover(struct svm_range *prange)
{
	struct svm_range_list *svms = prange->svms;

	spin_lock(&svms->deferred_list_lock);
	if (list_empty(&prange->deferred_list) &&
	    list_empty(&prange->child_list)) {
		spin_unlock(&svms->deferred_list_lock);
		return false;
	}
	spin_unlock(&svms->deferred_list_lock);

	if (prange->work_item.op == SVM_OP_UNMAP_RANGE) {
		pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] unmapped\n",
			 svms, prange, prange->start, prange->last);
		return true;
	}
	if (prange->work_item.op == SVM_OP_ADD_RANGE_AND_MAP ||
	    prange->work_item.op == SVM_OP_ADD_RANGE) {
		pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] not added yet\n",
			 svms, prange, prange->start, prange->last);
		return true;
	}
	return false;
}

int
svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
			uint64_t addr)
{
	struct mm_struct *mm = NULL;
	struct svm_range_list *svms;
	struct svm_range *prange;
	struct kfd_process *p;
	uint64_t timestamp;
	int32_t best_loc, gpuidx;
	bool write_locked = false;
	int r = 0;

	p = kfd_lookup_process_by_pasid(pasid);
	if (!p) {
		pr_debug("kfd process not found, pasid 0x%x\n", pasid);
		return -ESRCH;
	}
	if (!p->xnack_enabled) {
		pr_debug("XNACK not enabled for pasid 0x%x\n", pasid);
		/* drop the process reference taken by the pasid lookup */
		r = -EFAULT;
		goto out;
	}
	svms = &p->svms;

	pr_debug("restoring svms 0x%p fault address 0x%llx\n", svms, addr);

	mm = get_task_mm(p->lead_thread);
	if (!mm) {
		pr_debug("svms 0x%p failed to get mm\n", svms);
		r = -ESRCH;
		goto out;
	}

	mmap_read_lock(mm);
retry_write_locked:
	mutex_lock(&svms->lock);
	prange = svm_range_from_addr(svms, addr, NULL);
	if (!prange) {
		pr_debug("failed to find prange svms 0x%p address [0x%llx]\n",
			 svms, addr);
		if (!write_locked) {
			/* Need the write lock to create new range with MMU notifier.
2357 * Also flush pending deferred work to make sure the interval 2358 * tree is up to date before we add a new range 2359 */ 2360 mutex_unlock(&svms->lock); 2361 mmap_read_unlock(mm); 2362 mmap_write_lock(mm); 2363 write_locked = true; 2364 goto retry_write_locked; 2365 } 2366 prange = svm_range_create_unregistered_range(adev, p, mm, addr); 2367 if (!prange) { 2368 pr_debug("failed to create unregistered range svms 0x%p address [0x%llx]\n", 2369 svms, addr); 2370 mmap_write_downgrade(mm); 2371 r = -EFAULT; 2372 goto out_unlock_svms; 2373 } 2374 } 2375 if (write_locked) 2376 mmap_write_downgrade(mm); 2377 2378 mutex_lock(&prange->migrate_mutex); 2379 2380 if (svm_range_skip_recover(prange)) { 2381 amdgpu_gmc_filter_faults_remove(adev, addr, pasid); 2382 goto out_unlock_range; 2383 } 2384 2385 timestamp = ktime_to_us(ktime_get()) - prange->validate_timestamp; 2386 /* skip duplicate vm fault on different pages of same range */ 2387 if (timestamp < AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING) { 2388 pr_debug("svms 0x%p [0x%lx %lx] already restored\n", 2389 svms, prange->start, prange->last); 2390 goto out_unlock_range; 2391 } 2392 2393 best_loc = svm_range_best_restore_location(prange, adev, &gpuidx); 2394 if (best_loc == -1) { 2395 pr_debug("svms %p failed get best restore loc [0x%lx 0x%lx]\n", 2396 svms, prange->start, prange->last); 2397 r = -EACCES; 2398 goto out_unlock_range; 2399 } 2400 2401 pr_debug("svms %p [0x%lx 0x%lx] best restore 0x%x, actual loc 0x%x\n", 2402 svms, prange->start, prange->last, best_loc, 2403 prange->actual_loc); 2404 2405 if (prange->actual_loc != best_loc) { 2406 if (best_loc) { 2407 r = svm_migrate_to_vram(prange, best_loc, mm); 2408 if (r) { 2409 pr_debug("svm_migrate_to_vram failed (%d) at %llx, falling back to system memory\n", 2410 r, addr); 2411 /* Fallback to system memory if migration to 2412 * VRAM failed 2413 */ 2414 if (prange->actual_loc) 2415 r = svm_migrate_vram_to_ram(prange, mm); 2416 else 2417 r = 0; 2418 } 2419 } else { 2420 r = svm_migrate_vram_to_ram(prange, mm); 2421 } 2422 if (r) { 2423 pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n", 2424 r, svms, prange->start, prange->last); 2425 goto out_unlock_range; 2426 } 2427 } 2428 2429 r = svm_range_validate_and_map(mm, prange, gpuidx, false, false); 2430 if (r) 2431 pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n", 2432 r, svms, prange->start, prange->last); 2433 2434 out_unlock_range: 2435 mutex_unlock(&prange->migrate_mutex); 2436 out_unlock_svms: 2437 mutex_unlock(&svms->lock); 2438 mmap_read_unlock(mm); 2439 mmput(mm); 2440 out: 2441 kfd_unref_process(p); 2442 2443 if (r == -EAGAIN) { 2444 pr_debug("recover vm fault later\n"); 2445 amdgpu_gmc_filter_faults_remove(adev, addr, pasid); 2446 r = 0; 2447 } 2448 return r; 2449 } 2450 2451 void svm_range_list_fini(struct kfd_process *p) 2452 { 2453 struct svm_range *prange; 2454 struct svm_range *next; 2455 2456 pr_debug("pasid 0x%x svms 0x%p\n", p->pasid, &p->svms); 2457 2458 /* Ensure list work is finished before process is destroyed */ 2459 flush_work(&p->svms.deferred_list_work); 2460 2461 list_for_each_entry_safe(prange, next, &p->svms.list, list) { 2462 svm_range_unlink(prange); 2463 svm_range_remove_notifier(prange); 2464 svm_range_free(prange); 2465 } 2466 2467 mutex_destroy(&p->svms.lock); 2468 2469 pr_debug("pasid 0x%x svms 0x%p done\n", p->pasid, &p->svms); 2470 } 2471 2472 int svm_range_list_init(struct kfd_process *p) 2473 { 2474 struct svm_range_list *svms = &p->svms; 2475 2476 svms->objects = RB_ROOT_CACHED; 2477 
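	/* svms->objects is the interval tree of registered ranges, protected
	 * by svms->lock. The deferred list and its work item apply range
	 * add/remove/update operations requested from MMU notifier context,
	 * and restore_work revalidates evicted ranges and resumes the user
	 * queues.
	 */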
	mutex_init(&svms->lock);
	INIT_LIST_HEAD(&svms->list);
	atomic_set(&svms->evicted_ranges, 0);
	INIT_DELAYED_WORK(&svms->restore_work, svm_range_restore_work);
	INIT_WORK(&svms->deferred_list_work, svm_range_deferred_list_work);
	INIT_LIST_HEAD(&svms->deferred_range_list);
	spin_lock_init(&svms->deferred_list_lock);

	return 0;
}

/**
 * svm_range_is_valid - check if virtual address range is valid
 * @mm: current process mm_struct
 * @start: range start address, in pages
 * @size: range size, in pages
 *
 * A virtual address range is valid if it is entirely covered by one or more
 * VMAs and none of them is a device VMA.
 *
 * Context: Process context
 *
 * Return:
 * true - valid svm range
 * false - invalid svm range
 */
static bool
svm_range_is_valid(struct mm_struct *mm, uint64_t start, uint64_t size)
{
	const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
	struct vm_area_struct *vma;
	unsigned long end;

	start <<= PAGE_SHIFT;
	end = start + (size << PAGE_SHIFT);

	do {
		vma = find_vma(mm, start);
		if (!vma || start < vma->vm_start ||
		    (vma->vm_flags & device_vma))
			return false;
		start = min(end, vma->vm_end);
	} while (start < end);

	return true;
}

/**
 * svm_range_add - add svm range and handle overlap
 * @p: the process to add the range to
 * @start: range start address, in pages
 * @size: range size, in pages
 * @nattr: number of attributes
 * @attrs: array of attributes
 * @update_list: output, the ranges that need to validate and update GPU mapping
 * @insert_list: output, the ranges that need to be inserted into svms
 * @remove_list: output, the ranges that are replaced and need to be removed
 *               from svms
 *
 * Check if the virtual address range overlaps registered ranges, split the
 * overlapped ranges, and copy and adjust the page addresses and vram nodes of
 * the old and new ranges.
 *
 * Context: Process context, caller must hold svms->lock
 *
 * Return:
 * 0 - OK, otherwise error code
 */
static int
svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size,
	      uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs,
	      struct list_head *update_list, struct list_head *insert_list,
	      struct list_head *remove_list)
{
	uint64_t last = start + size - 1UL;
	struct svm_range_list *svms;
	struct svm_range new = {0};
	struct svm_range *prange;
	unsigned long left = 0;
	int r = 0;

	pr_debug("svms 0x%p [0x%llx 0x%llx]\n", &p->svms, start, last);

	svm_range_apply_attrs(p, &new, nattr, attrs);

	svms = &p->svms;

	r = svm_range_handle_overlap(svms, &new, start, last, update_list,
				     insert_list, remove_list, &left);
	if (r)
		return r;

	if (left) {
		prange = svm_range_new(svms, last - left + 1, last);
		if (!prange)
			return -ENOMEM;
		list_add(&prange->insert_list, insert_list);
		list_add(&prange->update_list, update_list);
	}

	return 0;
}

/* svm_range_best_prefetch_location - decide the best prefetch location
 * @prange: svm range structure
 *
 * For xnack off:
 * If the range maps to a single GPU, the best actual location is the prefetch
 * location, which can be CPU or GPU.
 *
 * If the range maps to multiple GPUs, then only with an mGPU connection on
 * the same XGMI hive could the best actual location be the prefetch_loc GPU.
 * If the mGPU connection is PCIe, the best actual location is always CPU,
 * because a GPU cannot access the VRAM of other GPUs, assuming PCIe small BAR
 * (large BAR support is not upstream).
 *
 * For xnack on:
 * The best actual location is the prefetch location. If the mGPU connection
 * is on the same XGMI hive, the range maps to multiple GPUs. Otherwise, the
 * range only maps to the actual location GPU, and a VM fault from another GPU
 * accessing it will trigger migration.
 *
 * Context: Process context
 *
 * Return:
 * 0 for CPU or GPU id
 */
static uint32_t
svm_range_best_prefetch_location(struct svm_range *prange)
{
	DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
	uint32_t best_loc = prange->prefetch_loc;
	struct kfd_process_device *pdd;
	struct amdgpu_device *bo_adev;
	struct amdgpu_device *adev;
	struct kfd_process *p;
	uint32_t gpuidx;

	p = container_of(prange->svms, struct kfd_process, svms);

	/* xnack on */
	if (p->xnack_enabled)
		goto out;

	/* xnack off */
	if (!best_loc || best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED)
		goto out;

	bo_adev = svm_range_get_adev_by_id(prange, best_loc);
	if (!bo_adev) {
		WARN_ONCE(1, "failed to get device by id 0x%x\n", best_loc);
		best_loc = 0;
		goto out;
	}
	bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
		  MAX_GPU_INSTANCE);

	for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
		pdd = kfd_process_device_from_gpuidx(p, gpuidx);
		if (!pdd) {
			pr_debug("failed to get device by idx 0x%x\n", gpuidx);
			continue;
		}
		adev = (struct amdgpu_device *)pdd->dev->kgd;

		if (adev == bo_adev)
			continue;

		if (!amdgpu_xgmi_same_hive(adev, bo_adev)) {
			best_loc = 0;
			break;
		}
	}

out:
	pr_debug("xnack %d svms 0x%p [0x%lx 0x%lx] best loc 0x%x\n",
		 p->xnack_enabled, &p->svms, prange->start, prange->last,
		 best_loc);

	return best_loc;
}

/* FIXME: This is a workaround for a page locking bug when some pages are
 * invalid during migration to VRAM
 */
void svm_range_prefault(struct svm_range *prange, struct mm_struct *mm)
{
	struct hmm_range *hmm_range;
	int r;

	if (prange->validated_once)
		return;

	r = amdgpu_hmm_range_get_pages(&prange->notifier, mm, NULL,
				       prange->start << PAGE_SHIFT,
				       prange->npages, &hmm_range,
				       false, true);
	if (!r) {
		amdgpu_hmm_range_get_pages_done(hmm_range);
		prange->validated_once = true;
	}
}

/* svm_range_trigger_migration - start page migration if prefetch loc changed
 * @mm: current process mm_struct
 * @prange: svm range structure
 * @migrated: output, true if migration is triggered
 *
 * If the range prefetch_loc is a GPU and the actual loc is cpu 0, migrate the
 * range from ram to vram.
 * If the range prefetch_loc is cpu 0 and the actual loc is a GPU, migrate the
 * range from vram to ram.
 *
 * If GPU VM fault retry is not enabled, migration interacts with the MMU
 * notifier and the restore work:
 * 1. migrate_vma_setup invalidates pages, the MMU notifier callback
 *    svm_range_evict stops all queues and schedules the restore work
 * 2. svm_range_restore_work waits for migration to finish by
 *    a. svm_range_validate_vram taking prange->migrate_mutex
 *    b. svm_range_validate_ram HMM get pages waiting for the CPU fault
 *       handler to return
 * 3.
restore work update mappings of GPU, resume all queues. 2692 * 2693 * Context: Process context 2694 * 2695 * Return: 2696 * 0 - OK, otherwise - error code of migration 2697 */ 2698 static int 2699 svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange, 2700 bool *migrated) 2701 { 2702 uint32_t best_loc; 2703 int r = 0; 2704 2705 *migrated = false; 2706 best_loc = svm_range_best_prefetch_location(prange); 2707 2708 if (best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED || 2709 best_loc == prange->actual_loc) 2710 return 0; 2711 2712 /* 2713 * Prefetch to GPU without host access flag, set actual_loc to gpu, then 2714 * validate on gpu and map to gpus will be handled afterwards. 2715 */ 2716 if (best_loc && !prange->actual_loc && 2717 !(prange->flags & KFD_IOCTL_SVM_FLAG_HOST_ACCESS)) { 2718 prange->actual_loc = best_loc; 2719 return 0; 2720 } 2721 2722 if (!best_loc) { 2723 r = svm_migrate_vram_to_ram(prange, mm); 2724 *migrated = !r; 2725 return r; 2726 } 2727 2728 r = svm_migrate_to_vram(prange, best_loc, mm); 2729 *migrated = !r; 2730 2731 return r; 2732 } 2733 2734 int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence) 2735 { 2736 if (!fence) 2737 return -EINVAL; 2738 2739 if (dma_fence_is_signaled(&fence->base)) 2740 return 0; 2741 2742 if (fence->svm_bo) { 2743 WRITE_ONCE(fence->svm_bo->evicting, 1); 2744 schedule_work(&fence->svm_bo->eviction_work); 2745 } 2746 2747 return 0; 2748 } 2749 2750 static void svm_range_evict_svm_bo_worker(struct work_struct *work) 2751 { 2752 struct svm_range_bo *svm_bo; 2753 struct kfd_process *p; 2754 struct mm_struct *mm; 2755 2756 svm_bo = container_of(work, struct svm_range_bo, eviction_work); 2757 if (!svm_bo_ref_unless_zero(svm_bo)) 2758 return; /* svm_bo was freed while eviction was pending */ 2759 2760 /* svm_range_bo_release destroys this worker thread. So during 2761 * the lifetime of this thread, kfd_process and mm will be valid. 
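 * The worker migrates every range still backed by this svm_bo back to
 * system memory, detaches the range from the BO and finally signals the
 * eviction fence.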
2762 */ 2763 p = container_of(svm_bo->svms, struct kfd_process, svms); 2764 mm = p->mm; 2765 if (!mm) 2766 return; 2767 2768 mmap_read_lock(mm); 2769 spin_lock(&svm_bo->list_lock); 2770 while (!list_empty(&svm_bo->range_list)) { 2771 struct svm_range *prange = 2772 list_first_entry(&svm_bo->range_list, 2773 struct svm_range, svm_bo_list); 2774 list_del_init(&prange->svm_bo_list); 2775 spin_unlock(&svm_bo->list_lock); 2776 2777 pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms, 2778 prange->start, prange->last); 2779 2780 mutex_lock(&prange->migrate_mutex); 2781 svm_migrate_vram_to_ram(prange, svm_bo->eviction_fence->mm); 2782 2783 mutex_lock(&prange->lock); 2784 prange->svm_bo = NULL; 2785 mutex_unlock(&prange->lock); 2786 2787 mutex_unlock(&prange->migrate_mutex); 2788 2789 spin_lock(&svm_bo->list_lock); 2790 } 2791 spin_unlock(&svm_bo->list_lock); 2792 mmap_read_unlock(mm); 2793 2794 dma_fence_signal(&svm_bo->eviction_fence->base); 2795 /* This is the last reference to svm_bo, after svm_range_vram_node_free 2796 * has been called in svm_migrate_vram_to_ram 2797 */ 2798 WARN_ONCE(kref_read(&svm_bo->kref) != 1, "This was not the last reference\n"); 2799 svm_range_bo_unref(svm_bo); 2800 } 2801 2802 static int 2803 svm_range_set_attr(struct kfd_process *p, uint64_t start, uint64_t size, 2804 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs) 2805 { 2806 struct amdkfd_process_info *process_info = p->kgd_process_info; 2807 struct mm_struct *mm = current->mm; 2808 struct list_head update_list; 2809 struct list_head insert_list; 2810 struct list_head remove_list; 2811 struct svm_range_list *svms; 2812 struct svm_range *prange; 2813 struct svm_range *next; 2814 int r = 0; 2815 2816 pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] pages 0x%llx\n", 2817 p->pasid, &p->svms, start, start + size - 1, size); 2818 2819 r = svm_range_check_attr(p, nattr, attrs); 2820 if (r) 2821 return r; 2822 2823 svms = &p->svms; 2824 2825 mutex_lock(&process_info->lock); 2826 2827 svm_range_list_lock_and_flush_work(svms, mm); 2828 2829 if (!svm_range_is_valid(mm, start, size)) { 2830 pr_debug("invalid range\n"); 2831 r = -EFAULT; 2832 mmap_write_unlock(mm); 2833 goto out; 2834 } 2835 2836 mutex_lock(&svms->lock); 2837 2838 /* Add new range and split existing ranges as needed */ 2839 r = svm_range_add(p, start, size, nattr, attrs, &update_list, 2840 &insert_list, &remove_list); 2841 if (r) { 2842 mutex_unlock(&svms->lock); 2843 mmap_write_unlock(mm); 2844 goto out; 2845 } 2846 /* Apply changes as a transaction */ 2847 list_for_each_entry_safe(prange, next, &insert_list, insert_list) { 2848 svm_range_add_to_svms(prange); 2849 svm_range_add_notifier_locked(mm, prange); 2850 } 2851 list_for_each_entry(prange, &update_list, update_list) { 2852 svm_range_apply_attrs(p, prange, nattr, attrs); 2853 /* TODO: unmap ranges from GPU that lost access */ 2854 } 2855 list_for_each_entry_safe(prange, next, &remove_list, 2856 remove_list) { 2857 pr_debug("unlink old 0x%p prange 0x%p [0x%lx 0x%lx]\n", 2858 prange->svms, prange, prange->start, 2859 prange->last); 2860 svm_range_unlink(prange); 2861 svm_range_remove_notifier(prange); 2862 svm_range_free(prange); 2863 } 2864 2865 mmap_write_downgrade(mm); 2866 /* Trigger migrations and revalidate and map to GPUs as needed. If 2867 * this fails we may be left with partially completed actions. There 2868 * is no clean way of rolling back to the previous state in such a 2869 * case because the rollback wouldn't be guaranteed to work either. 
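	 * Ranges that were migrated while XNACK is off are mapped later by the
	 * restore worker; all other ranges are validated and mapped to the
	 * GPUs here.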
2870 */ 2871 list_for_each_entry(prange, &update_list, update_list) { 2872 bool migrated; 2873 2874 mutex_lock(&prange->migrate_mutex); 2875 2876 r = svm_range_trigger_migration(mm, prange, &migrated); 2877 if (r) 2878 goto out_unlock_range; 2879 2880 if (migrated && !p->xnack_enabled) { 2881 pr_debug("restore_work will update mappings of GPUs\n"); 2882 mutex_unlock(&prange->migrate_mutex); 2883 continue; 2884 } 2885 2886 r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE, 2887 true, true); 2888 if (r) 2889 pr_debug("failed %d to map svm range\n", r); 2890 2891 out_unlock_range: 2892 mutex_unlock(&prange->migrate_mutex); 2893 if (r) 2894 break; 2895 } 2896 2897 svm_range_debug_dump(svms); 2898 2899 mutex_unlock(&svms->lock); 2900 mmap_read_unlock(mm); 2901 out: 2902 mutex_unlock(&process_info->lock); 2903 2904 pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] done, r=%d\n", p->pasid, 2905 &p->svms, start, start + size - 1, r); 2906 2907 return r; 2908 } 2909 2910 static int 2911 svm_range_get_attr(struct kfd_process *p, uint64_t start, uint64_t size, 2912 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs) 2913 { 2914 DECLARE_BITMAP(bitmap_access, MAX_GPU_INSTANCE); 2915 DECLARE_BITMAP(bitmap_aip, MAX_GPU_INSTANCE); 2916 bool get_preferred_loc = false; 2917 bool get_prefetch_loc = false; 2918 bool get_granularity = false; 2919 bool get_accessible = false; 2920 bool get_flags = false; 2921 uint64_t last = start + size - 1UL; 2922 struct mm_struct *mm = current->mm; 2923 uint8_t granularity = 0xff; 2924 struct interval_tree_node *node; 2925 struct svm_range_list *svms; 2926 struct svm_range *prange; 2927 uint32_t prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 2928 uint32_t location = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 2929 uint32_t flags = 0xffffffff; 2930 int gpuidx; 2931 uint32_t i; 2932 2933 pr_debug("svms 0x%p [0x%llx 0x%llx] nattr 0x%x\n", &p->svms, start, 2934 start + size - 1, nattr); 2935 2936 mmap_read_lock(mm); 2937 if (!svm_range_is_valid(mm, start, size)) { 2938 pr_debug("invalid range\n"); 2939 mmap_read_unlock(mm); 2940 return -EINVAL; 2941 } 2942 mmap_read_unlock(mm); 2943 2944 for (i = 0; i < nattr; i++) { 2945 switch (attrs[i].type) { 2946 case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: 2947 get_preferred_loc = true; 2948 break; 2949 case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: 2950 get_prefetch_loc = true; 2951 break; 2952 case KFD_IOCTL_SVM_ATTR_ACCESS: 2953 get_accessible = true; 2954 break; 2955 case KFD_IOCTL_SVM_ATTR_SET_FLAGS: 2956 get_flags = true; 2957 break; 2958 case KFD_IOCTL_SVM_ATTR_GRANULARITY: 2959 get_granularity = true; 2960 break; 2961 case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: 2962 case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: 2963 case KFD_IOCTL_SVM_ATTR_NO_ACCESS: 2964 fallthrough; 2965 default: 2966 pr_debug("get invalid attr type 0x%x\n", attrs[i].type); 2967 return -EINVAL; 2968 } 2969 } 2970 2971 svms = &p->svms; 2972 2973 mutex_lock(&svms->lock); 2974 2975 node = interval_tree_iter_first(&svms->objects, start, last); 2976 if (!node) { 2977 pr_debug("range attrs not found return default values\n"); 2978 svm_range_set_default_attributes(&location, &prefetch_loc, 2979 &granularity, &flags); 2980 if (p->xnack_enabled) 2981 bitmap_fill(bitmap_access, MAX_GPU_INSTANCE); 2982 else 2983 bitmap_zero(bitmap_access, MAX_GPU_INSTANCE); 2984 bitmap_zero(bitmap_aip, MAX_GPU_INSTANCE); 2985 goto fill_values; 2986 } 2987 bitmap_fill(bitmap_access, MAX_GPU_INSTANCE); 2988 bitmap_fill(bitmap_aip, MAX_GPU_INSTANCE); 2989 2990 while (node) { 2991 struct interval_tree_node *next; 2992 
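		/* Attribute values are aggregated over all ranges overlapping
		 * [start, last]: preferred and prefetch locations are only
		 * reported if they are identical in every range, the access
		 * bitmaps and flags are AND-ed, and the smallest granularity
		 * is returned.
		 */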
		prange = container_of(node, struct svm_range, it_node);
		next = interval_tree_iter_next(node, start, last);

		if (get_preferred_loc) {
			if (prange->preferred_loc ==
					KFD_IOCTL_SVM_LOCATION_UNDEFINED ||
			    (location != KFD_IOCTL_SVM_LOCATION_UNDEFINED &&
			     location != prange->preferred_loc)) {
				location = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
				get_preferred_loc = false;
			} else {
				location = prange->preferred_loc;
			}
		}
		if (get_prefetch_loc) {
			if (prange->prefetch_loc ==
					KFD_IOCTL_SVM_LOCATION_UNDEFINED ||
			    (prefetch_loc != KFD_IOCTL_SVM_LOCATION_UNDEFINED &&
			     prefetch_loc != prange->prefetch_loc)) {
				prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
				get_prefetch_loc = false;
			} else {
				prefetch_loc = prange->prefetch_loc;
			}
		}
		if (get_accessible) {
			bitmap_and(bitmap_access, bitmap_access,
				   prange->bitmap_access, MAX_GPU_INSTANCE);
			bitmap_and(bitmap_aip, bitmap_aip,
				   prange->bitmap_aip, MAX_GPU_INSTANCE);
		}
		if (get_flags)
			flags &= prange->flags;

		if (get_granularity && prange->granularity < granularity)
			granularity = prange->granularity;

		node = next;
	}
fill_values:
	mutex_unlock(&svms->lock);

	for (i = 0; i < nattr; i++) {
		switch (attrs[i].type) {
		case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
			attrs[i].value = location;
			break;
		case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
			attrs[i].value = prefetch_loc;
			break;
		case KFD_IOCTL_SVM_ATTR_ACCESS:
			gpuidx = kfd_process_gpuidx_from_gpuid(p,
							       attrs[i].value);
			if (gpuidx < 0) {
				pr_debug("invalid gpuid %x\n", attrs[i].value);
				return -EINVAL;
			}
			if (test_bit(gpuidx, bitmap_access))
				attrs[i].type = KFD_IOCTL_SVM_ATTR_ACCESS;
			else if (test_bit(gpuidx, bitmap_aip))
				attrs[i].type =
					KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE;
			else
				attrs[i].type = KFD_IOCTL_SVM_ATTR_NO_ACCESS;
			break;
		case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
			attrs[i].value = flags;
			break;
		case KFD_IOCTL_SVM_ATTR_GRANULARITY:
			attrs[i].value = (uint32_t)granularity;
			break;
		}
	}

	return 0;
}

int
svm_ioctl(struct kfd_process *p, enum kfd_ioctl_svm_op op, uint64_t start,
	  uint64_t size, uint32_t nattrs, struct kfd_ioctl_svm_attribute *attrs)
{
	int r;

	start >>= PAGE_SHIFT;
	size >>= PAGE_SHIFT;

	switch (op) {
	case KFD_IOCTL_SVM_OP_SET_ATTR:
		r = svm_range_set_attr(p, start, size, nattrs, attrs);
		break;
	case KFD_IOCTL_SVM_OP_GET_ATTR:
		r = svm_range_get_attr(p, start, size, nattrs, attrs);
		break;
	default:
		r = -EINVAL;
		break;
	}

	return r;
}
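
/*
 * Illustrative sketch (not part of the driver): user space reaches
 * svm_ioctl() through the KFD SVM ioctl of the uAPI (AMDKFD_IOC_SVM with
 * struct kfd_ioctl_svm_args from include/uapi/linux/kfd_ioctl.h). Assuming
 * that interface, prefetching a buffer to one GPU looks roughly like:
 *
 *	struct kfd_ioctl_svm_args *args;
 *	size_t sz = sizeof(*args) + sizeof(struct kfd_ioctl_svm_attribute);
 *
 *	args = calloc(1, sz);
 *	args->start_addr = (uint64_t)buf;	// page aligned, in bytes
 *	args->size = buf_size;			// page aligned, in bytes
 *	args->op = KFD_IOCTL_SVM_OP_SET_ATTR;
 *	args->nattr = 1;
 *	args->attrs[0].type = KFD_IOCTL_SVM_ATTR_PREFETCH_LOC;
 *	args->attrs[0].value = gpu_id;		// or 0 for system memory
 *	ret = ioctl(kfd_fd, AMDKFD_IOC_SVM, args);
 *
 * svm_ioctl() converts start/size from bytes to pages and dispatches to
 * svm_range_set_attr() or svm_range_get_attr() accordingly.
 */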