1 // SPDX-License-Identifier: GPL-2.0 OR MIT 2 /* 3 * Copyright 2020-2021 Advanced Micro Devices, Inc. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be included in 13 * all copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 21 * OTHER DEALINGS IN THE SOFTWARE. 22 */ 23 24 #include <linux/types.h> 25 #include <linux/sched/task.h> 26 #include "amdgpu_sync.h" 27 #include "amdgpu_object.h" 28 #include "amdgpu_vm.h" 29 #include "amdgpu_mn.h" 30 #include "amdgpu.h" 31 #include "amdgpu_xgmi.h" 32 #include "kfd_priv.h" 33 #include "kfd_svm.h" 34 #include "kfd_migrate.h" 35 36 #ifdef dev_fmt 37 #undef dev_fmt 38 #endif 39 #define dev_fmt(fmt) "kfd_svm: %s: " fmt, __func__ 40 41 #define AMDGPU_SVM_RANGE_RESTORE_DELAY_MS 1 42 43 /* Long enough to ensure no retry fault comes after svm range is restored and 44 * page table is updated. 45 */ 46 #define AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING 2000 47 48 static void svm_range_evict_svm_bo_worker(struct work_struct *work); 49 static bool 50 svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni, 51 const struct mmu_notifier_range *range, 52 unsigned long cur_seq); 53 static int 54 svm_range_check_vm(struct kfd_process *p, uint64_t start, uint64_t last, 55 uint64_t *bo_s, uint64_t *bo_l); 56 static const struct mmu_interval_notifier_ops svm_range_mn_ops = { 57 .invalidate = svm_range_cpu_invalidate_pagetables, 58 }; 59 60 /** 61 * svm_range_unlink - unlink svm_range from lists and interval tree 62 * @prange: svm range structure to be removed 63 * 64 * Remove the svm_range from the svms and svm_bo lists and the svms 65 * interval tree. 
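 * The it_node is removed from the interval tree only when both
 * it_node.start and it_node.last are non-zero, which guards against
 * ranges that were never inserted into the tree.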
66 * 67 * Context: The caller must hold svms->lock 68 */ 69 static void svm_range_unlink(struct svm_range *prange) 70 { 71 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, 72 prange, prange->start, prange->last); 73 74 if (prange->svm_bo) { 75 spin_lock(&prange->svm_bo->list_lock); 76 list_del(&prange->svm_bo_list); 77 spin_unlock(&prange->svm_bo->list_lock); 78 } 79 80 list_del(&prange->list); 81 if (prange->it_node.start != 0 && prange->it_node.last != 0) 82 interval_tree_remove(&prange->it_node, &prange->svms->objects); 83 } 84 85 static void 86 svm_range_add_notifier_locked(struct mm_struct *mm, struct svm_range *prange) 87 { 88 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, 89 prange, prange->start, prange->last); 90 91 mmu_interval_notifier_insert_locked(&prange->notifier, mm, 92 prange->start << PAGE_SHIFT, 93 prange->npages << PAGE_SHIFT, 94 &svm_range_mn_ops); 95 } 96 97 /** 98 * svm_range_add_to_svms - add svm range to svms 99 * @prange: svm range structure to be added 100 * 101 * Add the svm range to svms interval tree and link list 102 * 103 * Context: The caller must hold svms->lock 104 */ 105 static void svm_range_add_to_svms(struct svm_range *prange) 106 { 107 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, 108 prange, prange->start, prange->last); 109 110 list_add_tail(&prange->list, &prange->svms->list); 111 prange->it_node.start = prange->start; 112 prange->it_node.last = prange->last; 113 interval_tree_insert(&prange->it_node, &prange->svms->objects); 114 } 115 116 static void svm_range_remove_notifier(struct svm_range *prange) 117 { 118 pr_debug("remove notifier svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", 119 prange->svms, prange, 120 prange->notifier.interval_tree.start >> PAGE_SHIFT, 121 prange->notifier.interval_tree.last >> PAGE_SHIFT); 122 123 if (prange->notifier.interval_tree.start != 0 && 124 prange->notifier.interval_tree.last != 0) 125 mmu_interval_notifier_remove(&prange->notifier); 126 } 127 128 static bool 129 svm_is_valid_dma_mapping_addr(struct device *dev, dma_addr_t dma_addr) 130 { 131 return dma_addr && !dma_mapping_error(dev, dma_addr) && 132 !(dma_addr & SVM_RANGE_VRAM_DOMAIN); 133 } 134 135 static int 136 svm_range_dma_map_dev(struct amdgpu_device *adev, struct svm_range *prange, 137 unsigned long offset, unsigned long npages, 138 unsigned long *hmm_pfns, uint32_t gpuidx) 139 { 140 enum dma_data_direction dir = DMA_BIDIRECTIONAL; 141 dma_addr_t *addr = prange->dma_addr[gpuidx]; 142 struct device *dev = adev->dev; 143 struct page *page; 144 int i, r; 145 146 if (!addr) { 147 addr = kvmalloc_array(prange->npages, sizeof(*addr), 148 GFP_KERNEL | __GFP_ZERO); 149 if (!addr) 150 return -ENOMEM; 151 prange->dma_addr[gpuidx] = addr; 152 } 153 154 addr += offset; 155 for (i = 0; i < npages; i++) { 156 if (svm_is_valid_dma_mapping_addr(dev, addr[i])) 157 dma_unmap_page(dev, addr[i], PAGE_SIZE, dir); 158 159 page = hmm_pfn_to_page(hmm_pfns[i]); 160 if (is_zone_device_page(page)) { 161 struct amdgpu_device *bo_adev = 162 amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev); 163 164 addr[i] = (hmm_pfns[i] << PAGE_SHIFT) + 165 bo_adev->vm_manager.vram_base_offset - 166 bo_adev->kfd.dev->pgmap.range.start; 167 addr[i] |= SVM_RANGE_VRAM_DOMAIN; 168 pr_debug_ratelimited("vram address: 0x%llx\n", addr[i]); 169 continue; 170 } 171 addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir); 172 r = dma_mapping_error(dev, addr[i]); 173 if (r) { 174 dev_err(dev, "failed %d dma_map_page\n", r); 175 return r; 176 } 177 
pr_debug_ratelimited("dma mapping 0x%llx for page addr 0x%lx\n", 178 addr[i] >> PAGE_SHIFT, page_to_pfn(page)); 179 } 180 return 0; 181 } 182 183 static int 184 svm_range_dma_map(struct svm_range *prange, unsigned long *bitmap, 185 unsigned long offset, unsigned long npages, 186 unsigned long *hmm_pfns) 187 { 188 struct kfd_process *p; 189 uint32_t gpuidx; 190 int r; 191 192 p = container_of(prange->svms, struct kfd_process, svms); 193 194 for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) { 195 struct kfd_process_device *pdd; 196 197 pr_debug("mapping to gpu idx 0x%x\n", gpuidx); 198 pdd = kfd_process_device_from_gpuidx(p, gpuidx); 199 if (!pdd) { 200 pr_debug("failed to find device idx %d\n", gpuidx); 201 return -EINVAL; 202 } 203 204 r = svm_range_dma_map_dev(pdd->dev->adev, prange, offset, npages, 205 hmm_pfns, gpuidx); 206 if (r) 207 break; 208 } 209 210 return r; 211 } 212 213 void svm_range_dma_unmap(struct device *dev, dma_addr_t *dma_addr, 214 unsigned long offset, unsigned long npages) 215 { 216 enum dma_data_direction dir = DMA_BIDIRECTIONAL; 217 int i; 218 219 if (!dma_addr) 220 return; 221 222 for (i = offset; i < offset + npages; i++) { 223 if (!svm_is_valid_dma_mapping_addr(dev, dma_addr[i])) 224 continue; 225 pr_debug_ratelimited("unmap 0x%llx\n", dma_addr[i] >> PAGE_SHIFT); 226 dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir); 227 dma_addr[i] = 0; 228 } 229 } 230 231 void svm_range_free_dma_mappings(struct svm_range *prange) 232 { 233 struct kfd_process_device *pdd; 234 dma_addr_t *dma_addr; 235 struct device *dev; 236 struct kfd_process *p; 237 uint32_t gpuidx; 238 239 p = container_of(prange->svms, struct kfd_process, svms); 240 241 for (gpuidx = 0; gpuidx < MAX_GPU_INSTANCE; gpuidx++) { 242 dma_addr = prange->dma_addr[gpuidx]; 243 if (!dma_addr) 244 continue; 245 246 pdd = kfd_process_device_from_gpuidx(p, gpuidx); 247 if (!pdd) { 248 pr_debug("failed to find device idx %d\n", gpuidx); 249 continue; 250 } 251 dev = &pdd->dev->pdev->dev; 252 svm_range_dma_unmap(dev, dma_addr, 0, prange->npages); 253 kvfree(dma_addr); 254 prange->dma_addr[gpuidx] = NULL; 255 } 256 } 257 258 static void svm_range_free(struct svm_range *prange) 259 { 260 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, prange, 261 prange->start, prange->last); 262 263 svm_range_vram_node_free(prange); 264 svm_range_free_dma_mappings(prange); 265 mutex_destroy(&prange->lock); 266 mutex_destroy(&prange->migrate_mutex); 267 kfree(prange); 268 } 269 270 static void 271 svm_range_set_default_attributes(int32_t *location, int32_t *prefetch_loc, 272 uint8_t *granularity, uint32_t *flags) 273 { 274 *location = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 275 *prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 276 *granularity = 9; 277 *flags = 278 KFD_IOCTL_SVM_FLAG_HOST_ACCESS | KFD_IOCTL_SVM_FLAG_COHERENT; 279 } 280 281 static struct 282 svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start, 283 uint64_t last) 284 { 285 uint64_t size = last - start + 1; 286 struct svm_range *prange; 287 struct kfd_process *p; 288 289 prange = kzalloc(sizeof(*prange), GFP_KERNEL); 290 if (!prange) 291 return NULL; 292 prange->npages = size; 293 prange->svms = svms; 294 prange->start = start; 295 prange->last = last; 296 INIT_LIST_HEAD(&prange->list); 297 INIT_LIST_HEAD(&prange->update_list); 298 INIT_LIST_HEAD(&prange->remove_list); 299 INIT_LIST_HEAD(&prange->insert_list); 300 INIT_LIST_HEAD(&prange->svm_bo_list); 301 INIT_LIST_HEAD(&prange->deferred_list); 302 INIT_LIST_HEAD(&prange->child_list); 303 
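	/* 'invalid' counts CPU page-table invalidations that the restore
	 * worker still has to process; 'validate_timestamp' is refreshed
	 * after each successful svm_range_validate_and_map().
	 */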
atomic_set(&prange->invalid, 0); 304 prange->validate_timestamp = 0; 305 mutex_init(&prange->migrate_mutex); 306 mutex_init(&prange->lock); 307 308 p = container_of(svms, struct kfd_process, svms); 309 if (p->xnack_enabled) 310 bitmap_copy(prange->bitmap_access, svms->bitmap_supported, 311 MAX_GPU_INSTANCE); 312 313 svm_range_set_default_attributes(&prange->preferred_loc, 314 &prange->prefetch_loc, 315 &prange->granularity, &prange->flags); 316 317 pr_debug("svms 0x%p [0x%llx 0x%llx]\n", svms, start, last); 318 319 return prange; 320 } 321 322 static bool svm_bo_ref_unless_zero(struct svm_range_bo *svm_bo) 323 { 324 if (!svm_bo || !kref_get_unless_zero(&svm_bo->kref)) 325 return false; 326 327 return true; 328 } 329 330 static void svm_range_bo_release(struct kref *kref) 331 { 332 struct svm_range_bo *svm_bo; 333 334 svm_bo = container_of(kref, struct svm_range_bo, kref); 335 spin_lock(&svm_bo->list_lock); 336 while (!list_empty(&svm_bo->range_list)) { 337 struct svm_range *prange = 338 list_first_entry(&svm_bo->range_list, 339 struct svm_range, svm_bo_list); 340 /* list_del_init tells a concurrent svm_range_vram_node_new when 341 * it's safe to reuse the svm_bo pointer and svm_bo_list head. 342 */ 343 list_del_init(&prange->svm_bo_list); 344 spin_unlock(&svm_bo->list_lock); 345 346 pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms, 347 prange->start, prange->last); 348 mutex_lock(&prange->lock); 349 prange->svm_bo = NULL; 350 mutex_unlock(&prange->lock); 351 352 spin_lock(&svm_bo->list_lock); 353 } 354 spin_unlock(&svm_bo->list_lock); 355 if (!dma_fence_is_signaled(&svm_bo->eviction_fence->base)) { 356 /* We're not in the eviction worker. 357 * Signal the fence and synchronize with any 358 * pending eviction work. 359 */ 360 dma_fence_signal(&svm_bo->eviction_fence->base); 361 cancel_work_sync(&svm_bo->eviction_work); 362 } 363 dma_fence_put(&svm_bo->eviction_fence->base); 364 amdgpu_bo_unref(&svm_bo->bo); 365 kfree(svm_bo); 366 } 367 368 void svm_range_bo_unref(struct svm_range_bo *svm_bo) 369 { 370 if (!svm_bo) 371 return; 372 373 kref_put(&svm_bo->kref, svm_range_bo_release); 374 } 375 376 static bool 377 svm_range_validate_svm_bo(struct amdgpu_device *adev, struct svm_range *prange) 378 { 379 struct amdgpu_device *bo_adev; 380 381 mutex_lock(&prange->lock); 382 if (!prange->svm_bo) { 383 mutex_unlock(&prange->lock); 384 return false; 385 } 386 if (prange->ttm_res) { 387 /* We still have a reference, all is well */ 388 mutex_unlock(&prange->lock); 389 return true; 390 } 391 if (svm_bo_ref_unless_zero(prange->svm_bo)) { 392 /* 393 * Migrate from GPU to GPU, remove range from source bo_adev 394 * svm_bo range list, and return false to allocate svm_bo from 395 * destination adev. 
396 */ 397 bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev); 398 if (bo_adev != adev) { 399 mutex_unlock(&prange->lock); 400 401 spin_lock(&prange->svm_bo->list_lock); 402 list_del_init(&prange->svm_bo_list); 403 spin_unlock(&prange->svm_bo->list_lock); 404 405 svm_range_bo_unref(prange->svm_bo); 406 return false; 407 } 408 if (READ_ONCE(prange->svm_bo->evicting)) { 409 struct dma_fence *f; 410 struct svm_range_bo *svm_bo; 411 /* The BO is getting evicted, 412 * we need to get a new one 413 */ 414 mutex_unlock(&prange->lock); 415 svm_bo = prange->svm_bo; 416 f = dma_fence_get(&svm_bo->eviction_fence->base); 417 svm_range_bo_unref(prange->svm_bo); 418 /* wait for the fence to avoid long spin-loop 419 * at list_empty_careful 420 */ 421 dma_fence_wait(f, false); 422 dma_fence_put(f); 423 } else { 424 /* The BO was still around and we got 425 * a new reference to it 426 */ 427 mutex_unlock(&prange->lock); 428 pr_debug("reuse old bo svms 0x%p [0x%lx 0x%lx]\n", 429 prange->svms, prange->start, prange->last); 430 431 prange->ttm_res = prange->svm_bo->bo->tbo.resource; 432 return true; 433 } 434 435 } else { 436 mutex_unlock(&prange->lock); 437 } 438 439 /* We need a new svm_bo. Spin-loop to wait for concurrent 440 * svm_range_bo_release to finish removing this range from 441 * its range list. After this, it is safe to reuse the 442 * svm_bo pointer and svm_bo_list head. 443 */ 444 while (!list_empty_careful(&prange->svm_bo_list)) 445 ; 446 447 return false; 448 } 449 450 static struct svm_range_bo *svm_range_bo_new(void) 451 { 452 struct svm_range_bo *svm_bo; 453 454 svm_bo = kzalloc(sizeof(*svm_bo), GFP_KERNEL); 455 if (!svm_bo) 456 return NULL; 457 458 kref_init(&svm_bo->kref); 459 INIT_LIST_HEAD(&svm_bo->range_list); 460 spin_lock_init(&svm_bo->list_lock); 461 462 return svm_bo; 463 } 464 465 int 466 svm_range_vram_node_new(struct amdgpu_device *adev, struct svm_range *prange, 467 bool clear) 468 { 469 struct amdgpu_bo_param bp; 470 struct svm_range_bo *svm_bo; 471 struct amdgpu_bo_user *ubo; 472 struct amdgpu_bo *bo; 473 struct kfd_process *p; 474 struct mm_struct *mm; 475 int r; 476 477 p = container_of(prange->svms, struct kfd_process, svms); 478 pr_debug("pasid: %x svms 0x%p [0x%lx 0x%lx]\n", p->pasid, prange->svms, 479 prange->start, prange->last); 480 481 if (svm_range_validate_svm_bo(adev, prange)) 482 return 0; 483 484 svm_bo = svm_range_bo_new(); 485 if (!svm_bo) { 486 pr_debug("failed to alloc svm bo\n"); 487 return -ENOMEM; 488 } 489 mm = get_task_mm(p->lead_thread); 490 if (!mm) { 491 pr_debug("failed to get mm\n"); 492 kfree(svm_bo); 493 return -ESRCH; 494 } 495 svm_bo->svms = prange->svms; 496 svm_bo->eviction_fence = 497 amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1), 498 mm, 499 svm_bo); 500 mmput(mm); 501 INIT_WORK(&svm_bo->eviction_work, svm_range_evict_svm_bo_worker); 502 svm_bo->evicting = 0; 503 memset(&bp, 0, sizeof(bp)); 504 bp.size = prange->npages * PAGE_SIZE; 505 bp.byte_align = PAGE_SIZE; 506 bp.domain = AMDGPU_GEM_DOMAIN_VRAM; 507 bp.flags = AMDGPU_GEM_CREATE_NO_CPU_ACCESS; 508 bp.flags |= clear ? 
AMDGPU_GEM_CREATE_VRAM_CLEARED : 0; 509 bp.flags |= AMDGPU_AMDKFD_CREATE_SVM_BO; 510 bp.type = ttm_bo_type_device; 511 bp.resv = NULL; 512 513 r = amdgpu_bo_create_user(adev, &bp, &ubo); 514 if (r) { 515 pr_debug("failed %d to create bo\n", r); 516 goto create_bo_failed; 517 } 518 bo = &ubo->bo; 519 r = amdgpu_bo_reserve(bo, true); 520 if (r) { 521 pr_debug("failed %d to reserve bo\n", r); 522 goto reserve_bo_failed; 523 } 524 525 r = dma_resv_reserve_shared(bo->tbo.base.resv, 1); 526 if (r) { 527 pr_debug("failed %d to reserve bo\n", r); 528 amdgpu_bo_unreserve(bo); 529 goto reserve_bo_failed; 530 } 531 amdgpu_bo_fence(bo, &svm_bo->eviction_fence->base, true); 532 533 amdgpu_bo_unreserve(bo); 534 535 svm_bo->bo = bo; 536 prange->svm_bo = svm_bo; 537 prange->ttm_res = bo->tbo.resource; 538 prange->offset = 0; 539 540 spin_lock(&svm_bo->list_lock); 541 list_add(&prange->svm_bo_list, &svm_bo->range_list); 542 spin_unlock(&svm_bo->list_lock); 543 544 return 0; 545 546 reserve_bo_failed: 547 amdgpu_bo_unref(&bo); 548 create_bo_failed: 549 dma_fence_put(&svm_bo->eviction_fence->base); 550 kfree(svm_bo); 551 prange->ttm_res = NULL; 552 553 return r; 554 } 555 556 void svm_range_vram_node_free(struct svm_range *prange) 557 { 558 svm_range_bo_unref(prange->svm_bo); 559 prange->ttm_res = NULL; 560 } 561 562 struct amdgpu_device * 563 svm_range_get_adev_by_id(struct svm_range *prange, uint32_t gpu_id) 564 { 565 struct kfd_process_device *pdd; 566 struct kfd_process *p; 567 int32_t gpu_idx; 568 569 p = container_of(prange->svms, struct kfd_process, svms); 570 571 gpu_idx = kfd_process_gpuidx_from_gpuid(p, gpu_id); 572 if (gpu_idx < 0) { 573 pr_debug("failed to get device by id 0x%x\n", gpu_id); 574 return NULL; 575 } 576 pdd = kfd_process_device_from_gpuidx(p, gpu_idx); 577 if (!pdd) { 578 pr_debug("failed to get device by idx 0x%x\n", gpu_idx); 579 return NULL; 580 } 581 582 return pdd->dev->adev; 583 } 584 585 struct kfd_process_device * 586 svm_range_get_pdd_by_adev(struct svm_range *prange, struct amdgpu_device *adev) 587 { 588 struct kfd_process *p; 589 int32_t gpu_idx, gpuid; 590 int r; 591 592 p = container_of(prange->svms, struct kfd_process, svms); 593 594 r = kfd_process_gpuid_from_adev(p, adev, &gpuid, &gpu_idx); 595 if (r) { 596 pr_debug("failed to get device id by adev %p\n", adev); 597 return NULL; 598 } 599 600 return kfd_process_device_from_gpuidx(p, gpu_idx); 601 } 602 603 static int svm_range_bo_validate(void *param, struct amdgpu_bo *bo) 604 { 605 struct ttm_operation_ctx ctx = { false, false }; 606 607 amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_VRAM); 608 609 return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); 610 } 611 612 static int 613 svm_range_check_attr(struct kfd_process *p, 614 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs) 615 { 616 uint32_t i; 617 618 for (i = 0; i < nattr; i++) { 619 uint32_t val = attrs[i].value; 620 int gpuidx = MAX_GPU_INSTANCE; 621 622 switch (attrs[i].type) { 623 case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: 624 if (val != KFD_IOCTL_SVM_LOCATION_SYSMEM && 625 val != KFD_IOCTL_SVM_LOCATION_UNDEFINED) 626 gpuidx = kfd_process_gpuidx_from_gpuid(p, val); 627 break; 628 case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: 629 if (val != KFD_IOCTL_SVM_LOCATION_SYSMEM) 630 gpuidx = kfd_process_gpuidx_from_gpuid(p, val); 631 break; 632 case KFD_IOCTL_SVM_ATTR_ACCESS: 633 case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: 634 case KFD_IOCTL_SVM_ATTR_NO_ACCESS: 635 gpuidx = kfd_process_gpuidx_from_gpuid(p, val); 636 break; 637 case KFD_IOCTL_SVM_ATTR_SET_FLAGS: 638 
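			/* SET_FLAGS, CLR_FLAGS and GRANULARITY do not name a
			 * GPU, so gpuidx stays at MAX_GPU_INSTANCE and the
			 * per-GPU checks below are skipped.
			 */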
break; 639 case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: 640 break; 641 case KFD_IOCTL_SVM_ATTR_GRANULARITY: 642 break; 643 default: 644 pr_debug("unknown attr type 0x%x\n", attrs[i].type); 645 return -EINVAL; 646 } 647 648 if (gpuidx < 0) { 649 pr_debug("no GPU 0x%x found\n", val); 650 return -EINVAL; 651 } else if (gpuidx < MAX_GPU_INSTANCE && 652 !test_bit(gpuidx, p->svms.bitmap_supported)) { 653 pr_debug("GPU 0x%x not supported\n", val); 654 return -EINVAL; 655 } 656 } 657 658 return 0; 659 } 660 661 static void 662 svm_range_apply_attrs(struct kfd_process *p, struct svm_range *prange, 663 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs) 664 { 665 uint32_t i; 666 int gpuidx; 667 668 for (i = 0; i < nattr; i++) { 669 switch (attrs[i].type) { 670 case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: 671 prange->preferred_loc = attrs[i].value; 672 break; 673 case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: 674 prange->prefetch_loc = attrs[i].value; 675 break; 676 case KFD_IOCTL_SVM_ATTR_ACCESS: 677 case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: 678 case KFD_IOCTL_SVM_ATTR_NO_ACCESS: 679 gpuidx = kfd_process_gpuidx_from_gpuid(p, 680 attrs[i].value); 681 if (attrs[i].type == KFD_IOCTL_SVM_ATTR_NO_ACCESS) { 682 bitmap_clear(prange->bitmap_access, gpuidx, 1); 683 bitmap_clear(prange->bitmap_aip, gpuidx, 1); 684 } else if (attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS) { 685 bitmap_set(prange->bitmap_access, gpuidx, 1); 686 bitmap_clear(prange->bitmap_aip, gpuidx, 1); 687 } else { 688 bitmap_clear(prange->bitmap_access, gpuidx, 1); 689 bitmap_set(prange->bitmap_aip, gpuidx, 1); 690 } 691 break; 692 case KFD_IOCTL_SVM_ATTR_SET_FLAGS: 693 prange->flags |= attrs[i].value; 694 break; 695 case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: 696 prange->flags &= ~attrs[i].value; 697 break; 698 case KFD_IOCTL_SVM_ATTR_GRANULARITY: 699 prange->granularity = attrs[i].value; 700 break; 701 default: 702 WARN_ONCE(1, "svm_range_check_attrs wasn't called?"); 703 } 704 } 705 } 706 707 /** 708 * svm_range_debug_dump - print all range information from svms 709 * @svms: svm range list header 710 * 711 * debug output svm range start, end, prefetch location from svms 712 * interval tree and link list 713 * 714 * Context: The caller must hold svms->lock 715 */ 716 static void svm_range_debug_dump(struct svm_range_list *svms) 717 { 718 struct interval_tree_node *node; 719 struct svm_range *prange; 720 721 pr_debug("dump svms 0x%p list\n", svms); 722 pr_debug("range\tstart\tpage\tend\t\tlocation\n"); 723 724 list_for_each_entry(prange, &svms->list, list) { 725 pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n", 726 prange, prange->start, prange->npages, 727 prange->start + prange->npages - 1, 728 prange->actual_loc); 729 } 730 731 pr_debug("dump svms 0x%p interval tree\n", svms); 732 pr_debug("range\tstart\tpage\tend\t\tlocation\n"); 733 node = interval_tree_iter_first(&svms->objects, 0, ~0ULL); 734 while (node) { 735 prange = container_of(node, struct svm_range, it_node); 736 pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n", 737 prange, prange->start, prange->npages, 738 prange->start + prange->npages - 1, 739 prange->actual_loc); 740 node = interval_tree_iter_next(node, 0, ~0ULL); 741 } 742 } 743 744 static bool 745 svm_range_is_same_attrs(struct svm_range *old, struct svm_range *new) 746 { 747 return (old->prefetch_loc == new->prefetch_loc && 748 old->flags == new->flags && 749 old->granularity == new->granularity); 750 } 751 752 static int 753 svm_range_split_array(void *ppnew, void *ppold, size_t size, 754 uint64_t old_start, uint64_t old_n, 755 uint64_t new_start, uint64_t 
		      new_n)
{
	unsigned char *new, *old, *pold;
	uint64_t d;

	if (!ppold)
		return 0;
	pold = *(unsigned char **)ppold;
	if (!pold)
		return 0;

	new = kvmalloc_array(new_n, size, GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	d = (new_start - old_start) * size;
	memcpy(new, pold + d, new_n * size);

	old = kvmalloc_array(old_n, size, GFP_KERNEL);
	if (!old) {
		kvfree(new);
		return -ENOMEM;
	}

	d = (new_start == old_start) ? new_n * size : 0;
	memcpy(old, pold + d, old_n * size);

	kvfree(pold);
	*(void **)ppold = old;
	*(void **)ppnew = new;

	return 0;
}

static int
svm_range_split_pages(struct svm_range *new, struct svm_range *old,
		      uint64_t start, uint64_t last)
{
	uint64_t npages = last - start + 1;
	int i, r;

	for (i = 0; i < MAX_GPU_INSTANCE; i++) {
		r = svm_range_split_array(&new->dma_addr[i], &old->dma_addr[i],
					  sizeof(*old->dma_addr[i]), old->start,
					  npages, new->start, new->npages);
		if (r)
			return r;
	}

	return 0;
}

static int
svm_range_split_nodes(struct svm_range *new, struct svm_range *old,
		      uint64_t start, uint64_t last)
{
	uint64_t npages = last - start + 1;

	pr_debug("svms 0x%p new prange 0x%p start 0x%lx [0x%llx 0x%llx]\n",
		 new->svms, new, new->start, start, last);

	if (new->start == old->start) {
		new->offset = old->offset;
		old->offset += new->npages;
	} else {
		new->offset = old->offset + npages;
	}

	new->svm_bo = svm_range_bo_ref(old->svm_bo);
	new->ttm_res = old->ttm_res;

	spin_lock(&new->svm_bo->list_lock);
	list_add(&new->svm_bo_list, &new->svm_bo->range_list);
	spin_unlock(&new->svm_bo->list_lock);

	return 0;
}

/**
 * svm_range_split_adjust - split range and adjust
 *
 * @new: new range
 * @old: the old range
 * @start: the old range's new start address, in pages
 * @last: the old range's new last address, in pages
 *
 * Copy system memory dma_addr or vram ttm_res from the old range to the new
 * range, from new_start up to size new->npages. The remaining old range runs
 * from start to last.
 *
 * Return:
 * 0 - OK, -ENOMEM - out of memory
 */
static int
svm_range_split_adjust(struct svm_range *new, struct svm_range *old,
		       uint64_t start, uint64_t last)
{
	int r;

	pr_debug("svms 0x%p new 0x%lx old [0x%lx 0x%lx] => [0x%llx 0x%llx]\n",
		 new->svms, new->start, old->start, old->last, start, last);

	if (new->start < old->start ||
	    new->last > old->last) {
		WARN_ONCE(1, "invalid new range start or last\n");
		return -EINVAL;
	}

	r = svm_range_split_pages(new, old, start, last);
	if (r)
		return r;

	if (old->actual_loc && old->ttm_res) {
		r = svm_range_split_nodes(new, old, start, last);
		if (r)
			return r;
	}

	old->npages = last - start + 1;
	old->start = start;
	old->last = last;
	new->flags = old->flags;
	new->preferred_loc = old->preferred_loc;
	new->prefetch_loc = old->prefetch_loc;
	new->actual_loc = old->actual_loc;
	new->granularity = old->granularity;
	bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE);
	bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE);

	return 0;
}

/**
 * svm_range_split - split a range into two ranges
 *
 * @prange: the svm range to split
 * @start: the remaining range start address in pages
 * @last: the remaining range last address in pages
 * @new: the result new range generated
 *
 * Two cases only:
 * case 1: if start == prange->start
 *         prange ==> prange[start, last]
 *         new range [last + 1, prange->last]
 *
 * case 2: if last == prange->last
 *         prange ==> prange[start, last]
 *         new range [prange->start, start - 1]
 *
 * Return:
 * 0 - OK, -ENOMEM - out of memory, -EINVAL - invalid start, last
 */
static int
svm_range_split(struct svm_range *prange, uint64_t start, uint64_t last,
		struct svm_range **new)
{
	uint64_t old_start = prange->start;
	uint64_t old_last = prange->last;
	struct svm_range_list *svms;
	int r = 0;

	pr_debug("svms 0x%p [0x%llx 0x%llx] to [0x%llx 0x%llx]\n", prange->svms,
		 old_start, old_last, start, last);

	if (old_start != start && old_last != last)
		return -EINVAL;
	if (start < old_start || last > old_last)
		return -EINVAL;

	svms = prange->svms;
	if (old_start == start)
		*new = svm_range_new(svms, last + 1, old_last);
	else
		*new = svm_range_new(svms, old_start, start - 1);
	if (!*new)
		return -ENOMEM;

	r = svm_range_split_adjust(*new, prange, start, last);
	if (r) {
		pr_debug("failed %d split [0x%llx 0x%llx] to [0x%llx 0x%llx]\n",
			 r, old_start, old_last, start, last);
		svm_range_free(*new);
		*new = NULL;
	}

	return r;
}

static int
svm_range_split_tail(struct svm_range *prange, struct svm_range *new,
		     uint64_t new_last, struct list_head *insert_list)
{
	struct svm_range *tail;
	int r = svm_range_split(prange, prange->start, new_last, &tail);

	if (!r)
		list_add(&tail->insert_list, insert_list);
	return r;
}

static int
svm_range_split_head(struct svm_range *prange, struct svm_range *new,
		     uint64_t new_start, struct list_head *insert_list)
{
	struct svm_range *head;
	int r = svm_range_split(prange, new_start, prange->last, &head);

	if (!r)
		list_add(&head->insert_list, insert_list);
	return r;
}

static void
svm_range_add_child(struct svm_range *prange, struct mm_struct *mm,
		    struct svm_range *pchild, enum svm_work_list_ops op)
{
	pr_debug("add child 0x%p [0x%lx 0x%lx] to prange 0x%p child list %d\n",
		 pchild, pchild->start, pchild->last, prange, op);

	pchild->work_item.mm = mm;
	pchild->work_item.op = op;
	list_add_tail(&pchild->child_list, &prange->child_list);
}

/**
 * svm_range_split_by_granularity - collect ranges within granularity boundary
 *
 * @p: the process with svms list
 * @mm: mm structure
 * @addr: the vm fault address in pages, to split the prange
 * @parent: parent range if prange is from child list
 * @prange: prange to split
 *
 * Trims @prange to be a single aligned block of prange->granularity if
 * possible. The head and tail are added to the child_list in @parent.
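 *
 * For example, with the default granularity of 9 the block size is
 * 1UL << 9 = 512 pages (2 MB assuming 4 KB pages), so only the 512-page
 * aligned block containing @addr is kept in @prange.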
 *
 * Context: caller must hold mmap_read_lock and prange->lock
 *
 * Return:
 * 0 - OK, otherwise error code
 */
int
svm_range_split_by_granularity(struct kfd_process *p, struct mm_struct *mm,
			       unsigned long addr, struct svm_range *parent,
			       struct svm_range *prange)
{
	struct svm_range *head, *tail;
	unsigned long start, last, size;
	int r;

	/* Align the split range's start and size to the granularity size, so a
	 * single PTE covers the whole range; this reduces the number of PTEs
	 * updated and the L1 TLB space used for translation.
	 */
	size = 1UL << prange->granularity;
	start = ALIGN_DOWN(addr, size);
	last = ALIGN(addr + 1, size) - 1;

	pr_debug("svms 0x%p split [0x%lx 0x%lx] to [0x%lx 0x%lx] size 0x%lx\n",
		 prange->svms, prange->start, prange->last, start, last, size);

	if (start > prange->start) {
		r = svm_range_split(prange, start, prange->last, &head);
		if (r)
			return r;
		svm_range_add_child(parent, mm, head, SVM_OP_ADD_RANGE);
	}

	if (last < prange->last) {
		r = svm_range_split(prange, prange->start, last, &tail);
		if (r)
			return r;
		svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE);
	}

	/* xnack on, update mapping on GPUs with ACCESS_IN_PLACE */
	if (p->xnack_enabled && prange->work_item.op == SVM_OP_ADD_RANGE) {
		prange->work_item.op = SVM_OP_ADD_RANGE_AND_MAP;
		pr_debug("change prange 0x%p [0x%lx 0x%lx] op %d\n",
			 prange, prange->start, prange->last,
			 SVM_OP_ADD_RANGE_AND_MAP);
	}
	return 0;
}

static uint64_t
svm_range_get_pte_flags(struct amdgpu_device *adev, struct svm_range *prange,
			int domain)
{
	struct amdgpu_device *bo_adev;
	uint32_t flags = prange->flags;
	uint32_t mapping_flags = 0;
	uint64_t pte_flags;
	bool snoop = (domain != SVM_RANGE_VRAM_DOMAIN);
	bool coherent = flags & KFD_IOCTL_SVM_FLAG_COHERENT;

	if (domain == SVM_RANGE_VRAM_DOMAIN)
		bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev);

	switch (KFD_GC_VERSION(adev->kfd.dev)) {
	case IP_VERSION(9, 4, 1):
		if (domain == SVM_RANGE_VRAM_DOMAIN) {
			if (bo_adev == adev) {
				mapping_flags |= coherent ?
					AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW;
			} else {
				mapping_flags |= coherent ?
					AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
				if (amdgpu_xgmi_same_hive(adev, bo_adev))
					snoop = true;
			}
		} else {
			mapping_flags |= coherent ?
				AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
		}
		break;
	case IP_VERSION(9, 4, 2):
		if (domain == SVM_RANGE_VRAM_DOMAIN) {
			if (bo_adev == adev) {
				mapping_flags |= coherent ?
					AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW;
				if (adev->gmc.xgmi.connected_to_cpu)
					snoop = true;
			} else {
				mapping_flags |= coherent ?
					AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
				if (amdgpu_xgmi_same_hive(adev, bo_adev))
					snoop = true;
			}
		} else {
			mapping_flags |= coherent ?
				AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
		}
		break;
	default:
		mapping_flags |= coherent ?
1091 AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; 1092 } 1093 1094 mapping_flags |= AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE; 1095 1096 if (flags & KFD_IOCTL_SVM_FLAG_GPU_RO) 1097 mapping_flags &= ~AMDGPU_VM_PAGE_WRITEABLE; 1098 if (flags & KFD_IOCTL_SVM_FLAG_GPU_EXEC) 1099 mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE; 1100 1101 pte_flags = AMDGPU_PTE_VALID; 1102 pte_flags |= (domain == SVM_RANGE_VRAM_DOMAIN) ? 0 : AMDGPU_PTE_SYSTEM; 1103 pte_flags |= snoop ? AMDGPU_PTE_SNOOPED : 0; 1104 1105 pte_flags |= amdgpu_gem_va_map_flags(adev, mapping_flags); 1106 return pte_flags; 1107 } 1108 1109 static int 1110 svm_range_unmap_from_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm, 1111 uint64_t start, uint64_t last, 1112 struct dma_fence **fence) 1113 { 1114 uint64_t init_pte_value = 0; 1115 1116 pr_debug("[0x%llx 0x%llx]\n", start, last); 1117 1118 return amdgpu_vm_bo_update_mapping(adev, adev, vm, false, true, NULL, 1119 start, last, init_pte_value, 0, 1120 NULL, NULL, fence, NULL); 1121 } 1122 1123 static int 1124 svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start, 1125 unsigned long last) 1126 { 1127 DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE); 1128 struct kfd_process_device *pdd; 1129 struct dma_fence *fence = NULL; 1130 struct kfd_process *p; 1131 uint32_t gpuidx; 1132 int r = 0; 1133 1134 bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip, 1135 MAX_GPU_INSTANCE); 1136 p = container_of(prange->svms, struct kfd_process, svms); 1137 1138 for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) { 1139 pr_debug("unmap from gpu idx 0x%x\n", gpuidx); 1140 pdd = kfd_process_device_from_gpuidx(p, gpuidx); 1141 if (!pdd) { 1142 pr_debug("failed to find device idx %d\n", gpuidx); 1143 return -EINVAL; 1144 } 1145 1146 r = svm_range_unmap_from_gpu(pdd->dev->adev, 1147 drm_priv_to_vm(pdd->drm_priv), 1148 start, last, &fence); 1149 if (r) 1150 break; 1151 1152 if (fence) { 1153 r = dma_fence_wait(fence, false); 1154 dma_fence_put(fence); 1155 fence = NULL; 1156 if (r) 1157 break; 1158 } 1159 amdgpu_amdkfd_flush_gpu_tlb_pasid(pdd->dev->adev, 1160 p->pasid, TLB_FLUSH_HEAVYWEIGHT); 1161 } 1162 1163 return r; 1164 } 1165 1166 static int 1167 svm_range_map_to_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm, 1168 struct svm_range *prange, unsigned long offset, 1169 unsigned long npages, bool readonly, dma_addr_t *dma_addr, 1170 struct amdgpu_device *bo_adev, struct dma_fence **fence) 1171 { 1172 struct amdgpu_bo_va bo_va; 1173 bool table_freed = false; 1174 uint64_t pte_flags; 1175 unsigned long last_start; 1176 int last_domain; 1177 int r = 0; 1178 int64_t i, j; 1179 1180 last_start = prange->start + offset; 1181 1182 pr_debug("svms 0x%p [0x%lx 0x%lx] readonly %d\n", prange->svms, 1183 last_start, last_start + npages - 1, readonly); 1184 1185 if (prange->svm_bo && prange->ttm_res) 1186 bo_va.is_xgmi = amdgpu_xgmi_same_hive(adev, bo_adev); 1187 1188 for (i = offset; i < offset + npages; i++) { 1189 last_domain = dma_addr[i] & SVM_RANGE_VRAM_DOMAIN; 1190 dma_addr[i] &= ~SVM_RANGE_VRAM_DOMAIN; 1191 1192 /* Collect all pages in the same address range and memory domain 1193 * that can be mapped with a single call to update mapping. 1194 */ 1195 if (i < offset + npages - 1 && 1196 last_domain == (dma_addr[i + 1] & SVM_RANGE_VRAM_DOMAIN)) 1197 continue; 1198 1199 pr_debug("Mapping range [0x%lx 0x%llx] on domain: %s\n", 1200 last_start, prange->start + i, last_domain ? 
"GPU" : "CPU"); 1201 1202 pte_flags = svm_range_get_pte_flags(adev, prange, last_domain); 1203 if (readonly) 1204 pte_flags &= ~AMDGPU_PTE_WRITEABLE; 1205 1206 pr_debug("svms 0x%p map [0x%lx 0x%llx] vram %d PTE 0x%llx\n", 1207 prange->svms, last_start, prange->start + i, 1208 (last_domain == SVM_RANGE_VRAM_DOMAIN) ? 1 : 0, 1209 pte_flags); 1210 1211 r = amdgpu_vm_bo_update_mapping(adev, bo_adev, vm, false, false, 1212 NULL, last_start, 1213 prange->start + i, pte_flags, 1214 last_start - prange->start, 1215 NULL, dma_addr, 1216 &vm->last_update, 1217 &table_freed); 1218 1219 for (j = last_start - prange->start; j <= i; j++) 1220 dma_addr[j] |= last_domain; 1221 1222 if (r) { 1223 pr_debug("failed %d to map to gpu 0x%lx\n", r, prange->start); 1224 goto out; 1225 } 1226 last_start = prange->start + i + 1; 1227 } 1228 1229 r = amdgpu_vm_update_pdes(adev, vm, false); 1230 if (r) { 1231 pr_debug("failed %d to update directories 0x%lx\n", r, 1232 prange->start); 1233 goto out; 1234 } 1235 1236 if (fence) 1237 *fence = dma_fence_get(vm->last_update); 1238 1239 if (table_freed) { 1240 struct kfd_process *p; 1241 1242 p = container_of(prange->svms, struct kfd_process, svms); 1243 amdgpu_amdkfd_flush_gpu_tlb_pasid(adev, p->pasid, TLB_FLUSH_LEGACY); 1244 } 1245 out: 1246 return r; 1247 } 1248 1249 static int 1250 svm_range_map_to_gpus(struct svm_range *prange, unsigned long offset, 1251 unsigned long npages, bool readonly, 1252 unsigned long *bitmap, bool wait) 1253 { 1254 struct kfd_process_device *pdd; 1255 struct amdgpu_device *bo_adev; 1256 struct kfd_process *p; 1257 struct dma_fence *fence = NULL; 1258 uint32_t gpuidx; 1259 int r = 0; 1260 1261 if (prange->svm_bo && prange->ttm_res) 1262 bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev); 1263 else 1264 bo_adev = NULL; 1265 1266 p = container_of(prange->svms, struct kfd_process, svms); 1267 for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) { 1268 pr_debug("mapping to gpu idx 0x%x\n", gpuidx); 1269 pdd = kfd_process_device_from_gpuidx(p, gpuidx); 1270 if (!pdd) { 1271 pr_debug("failed to find device idx %d\n", gpuidx); 1272 return -EINVAL; 1273 } 1274 1275 pdd = kfd_bind_process_to_device(pdd->dev, p); 1276 if (IS_ERR(pdd)) 1277 return -EINVAL; 1278 1279 if (bo_adev && pdd->dev->adev != bo_adev && 1280 !amdgpu_xgmi_same_hive(pdd->dev->adev, bo_adev)) { 1281 pr_debug("cannot map to device idx %d\n", gpuidx); 1282 continue; 1283 } 1284 1285 r = svm_range_map_to_gpu(pdd->dev->adev, drm_priv_to_vm(pdd->drm_priv), 1286 prange, offset, npages, readonly, 1287 prange->dma_addr[gpuidx], 1288 bo_adev, wait ? 
					     &fence : NULL);
		if (r)
			break;

		if (fence) {
			r = dma_fence_wait(fence, false);
			dma_fence_put(fence);
			fence = NULL;
			if (r) {
				pr_debug("failed %d to dma fence wait\n", r);
				break;
			}
		}
	}

	return r;
}

struct svm_validate_context {
	struct kfd_process *process;
	struct svm_range *prange;
	bool intr;
	unsigned long bitmap[MAX_GPU_INSTANCE];
	struct ttm_validate_buffer tv[MAX_GPU_INSTANCE];
	struct list_head validate_list;
	struct ww_acquire_ctx ticket;
};

static int svm_range_reserve_bos(struct svm_validate_context *ctx)
{
	struct kfd_process_device *pdd;
	struct amdgpu_vm *vm;
	uint32_t gpuidx;
	int r;

	INIT_LIST_HEAD(&ctx->validate_list);
	for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) {
		pdd = kfd_process_device_from_gpuidx(ctx->process, gpuidx);
		if (!pdd) {
			pr_debug("failed to find device idx %d\n", gpuidx);
			return -EINVAL;
		}
		vm = drm_priv_to_vm(pdd->drm_priv);

		ctx->tv[gpuidx].bo = &vm->root.bo->tbo;
		ctx->tv[gpuidx].num_shared = 4;
		list_add(&ctx->tv[gpuidx].head, &ctx->validate_list);
	}

	r = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->validate_list,
				   ctx->intr, NULL);
	if (r) {
		pr_debug("failed %d to reserve bo\n", r);
		return r;
	}

	for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) {
		pdd = kfd_process_device_from_gpuidx(ctx->process, gpuidx);
		if (!pdd) {
			pr_debug("failed to find device idx %d\n", gpuidx);
			r = -EINVAL;
			goto unreserve_out;
		}

		r = amdgpu_vm_validate_pt_bos(pdd->dev->adev,
					      drm_priv_to_vm(pdd->drm_priv),
					      svm_range_bo_validate, NULL);
		if (r) {
			pr_debug("failed %d validate pt bos\n", r);
			goto unreserve_out;
		}
	}

	return 0;

unreserve_out:
	ttm_eu_backoff_reservation(&ctx->ticket, &ctx->validate_list);
	return r;
}

static void svm_range_unreserve_bos(struct svm_validate_context *ctx)
{
	ttm_eu_backoff_reservation(&ctx->ticket, &ctx->validate_list);
}

static void *kfd_svm_page_owner(struct kfd_process *p, int32_t gpuidx)
{
	struct kfd_process_device *pdd;

	pdd = kfd_process_device_from_gpuidx(p, gpuidx);

	return SVM_ADEV_PGMAP_OWNER(pdd->dev->adev);
}

/*
 * Validation+GPU mapping with concurrent invalidation (MMU notifiers)
 *
 * To prevent concurrent destruction or change of range attributes, the
 * svm_read_lock must be held. The caller must not hold the svm_write_lock
 * because that would block concurrent evictions and lead to deadlocks. To
 * serialize concurrent migrations or validations of the same range, the
 * prange->migrate_mutex must be held.
 *
 * For VRAM ranges, the SVM BO must be allocated and valid (protected by its
 * eviction fence).
 *
 * The following sequence ensures race-free validation and GPU mapping:
 *
 * 1. Reserve page table (and SVM BO if range is in VRAM)
 * 2. hmm_range_fault to get page addresses (if system memory)
 * 3. DMA-map pages (if system memory)
 * 4-a. Take notifier lock
 * 4-b. Check that pages are still valid (mmu_interval_read_retry)
 * 4-c. Check that the range was not split or otherwise invalidated
 * 4-d. Update GPU page table
 * 4-e. Release notifier lock
 * 5. Release page table (and SVM BO) reservation
 */
static int svm_range_validate_and_map(struct mm_struct *mm,
				      struct svm_range *prange,
				      int32_t gpuidx, bool intr, bool wait)
{
	struct svm_validate_context ctx;
	unsigned long start, end, addr;
	struct kfd_process *p;
	void *owner;
	int32_t idx;
	int r = 0;

	ctx.process = container_of(prange->svms, struct kfd_process, svms);
	ctx.prange = prange;
	ctx.intr = intr;

	if (gpuidx < MAX_GPU_INSTANCE) {
		bitmap_zero(ctx.bitmap, MAX_GPU_INSTANCE);
		bitmap_set(ctx.bitmap, gpuidx, 1);
	} else if (ctx.process->xnack_enabled) {
		bitmap_copy(ctx.bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE);

		/* If prefetching the range to a GPU, or if a GPU retry fault
		 * migrates the range to a GPU that has the ACCESS attribute
		 * for the range, create the mapping on that GPU.
		 */
		if (prange->actual_loc) {
			gpuidx = kfd_process_gpuidx_from_gpuid(ctx.process,
							prange->actual_loc);
			if (gpuidx < 0) {
				WARN_ONCE(1, "failed get device by id 0x%x\n",
					  prange->actual_loc);
				return -EINVAL;
			}
			if (test_bit(gpuidx, prange->bitmap_access))
				bitmap_set(ctx.bitmap, gpuidx, 1);
		}
	} else {
		bitmap_or(ctx.bitmap, prange->bitmap_access,
			  prange->bitmap_aip, MAX_GPU_INSTANCE);
	}

	if (bitmap_empty(ctx.bitmap, MAX_GPU_INSTANCE))
		return 0;

	if (prange->actual_loc && !prange->ttm_res) {
		/* This should never happen. actual_loc gets set by
		 * svm_migrate_ram_to_vram after allocating a BO.
		 */
		WARN_ONCE(1, "VRAM BO missing during validation\n");
		return -EINVAL;
	}

	svm_range_reserve_bos(&ctx);

	p = container_of(prange->svms, struct kfd_process, svms);
	owner = kfd_svm_page_owner(p, find_first_bit(ctx.bitmap,
						MAX_GPU_INSTANCE));
	for_each_set_bit(idx, ctx.bitmap, MAX_GPU_INSTANCE) {
		if (kfd_svm_page_owner(p, idx) != owner) {
			owner = NULL;
			break;
		}
	}

	start = prange->start << PAGE_SHIFT;
	end = (prange->last + 1) << PAGE_SHIFT;
	for (addr = start; addr < end && !r; ) {
		struct hmm_range *hmm_range;
		struct vm_area_struct *vma;
		unsigned long next;
		unsigned long offset;
		unsigned long npages;
		bool readonly;

		vma = find_vma(mm, addr);
		if (!vma || addr < vma->vm_start) {
			r = -EFAULT;
			goto unreserve_out;
		}
		readonly = !(vma->vm_flags & VM_WRITE);

		next = min(vma->vm_end, end);
		npages = (next - addr) >> PAGE_SHIFT;
		WRITE_ONCE(p->svms.faulting_task, current);
		r = amdgpu_hmm_range_get_pages(&prange->notifier, mm, NULL,
					       addr, npages, &hmm_range,
					       readonly, true, owner);
		WRITE_ONCE(p->svms.faulting_task, NULL);
		if (r) {
			pr_debug("failed %d to get svm range pages\n", r);
			goto unreserve_out;
		}

		offset = (addr - start) >> PAGE_SHIFT;
		r = svm_range_dma_map(prange, ctx.bitmap, offset, npages,
				      hmm_range->hmm_pfns);
		if (r) {
			pr_debug("failed %d to dma map range\n", r);
			goto unreserve_out;
		}

		svm_range_lock(prange);
		if (amdgpu_hmm_range_get_pages_done(hmm_range)) {
			pr_debug("hmm update the range, need validate again\n");
			r = -EAGAIN;
			goto unlock_out;
		}
		if (!list_empty(&prange->child_list)) {
			pr_debug("range split by unmap in parallel, validate again\n");
			r = -EAGAIN;
			goto unlock_out;
		}

		r = svm_range_map_to_gpus(prange, offset, npages, readonly,
					  ctx.bitmap, wait);

unlock_out:
		svm_range_unlock(prange);

		addr = next;
	}

	if (addr == end)
		prange->validated_once = true;

unreserve_out:
	svm_range_unreserve_bos(&ctx);

	if (!r)
		prange->validate_timestamp = ktime_to_us(ktime_get());

	return r;
}

/**
 * svm_range_list_lock_and_flush_work - flush pending deferred work
 *
 * @svms: the svm range list
 * @mm: the mm structure
 *
 * Context: Returns with mmap write lock held, pending deferred work flushed
 *
 */
void
svm_range_list_lock_and_flush_work(struct svm_range_list *svms,
				   struct mm_struct *mm)
{
retry_flush_work:
	flush_work(&svms->deferred_list_work);
	mmap_write_lock(mm);

	if (list_empty(&svms->deferred_range_list))
		return;
	mmap_write_unlock(mm);
	pr_debug("retry flush\n");
	goto retry_flush_work;
}

static void svm_range_restore_work(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);
	struct amdkfd_process_info *process_info;
	struct svm_range_list *svms;
	struct svm_range *prange;
	struct kfd_process *p;
	struct mm_struct *mm;
	int evicted_ranges;
	int invalid;
	int r;

	svms = container_of(dwork, struct svm_range_list, restore_work);
	evicted_ranges = atomic_read(&svms->evicted_ranges);
	if (!evicted_ranges)
		return;

	pr_debug("restore svm ranges\n");

	/* kfd_process_notifier_release destroys this worker thread. So during
	 * the lifetime of this thread, kfd_process and mm will be valid.
	 */
	p = container_of(svms, struct kfd_process, svms);
	process_info = p->kgd_process_info;
	mm = p->mm;
	if (!mm)
		return;

	mutex_lock(&process_info->lock);
	svm_range_list_lock_and_flush_work(svms, mm);
	mutex_lock(&svms->lock);

	evicted_ranges = atomic_read(&svms->evicted_ranges);

	list_for_each_entry(prange, &svms->list, list) {
		invalid = atomic_read(&prange->invalid);
		if (!invalid)
			continue;

		pr_debug("restoring svms 0x%p prange 0x%p [0x%lx %lx] inv %d\n",
			 prange->svms, prange, prange->start, prange->last,
			 invalid);

		/*
		 * If the range is migrating, wait for the migration to finish.
		 */
		mutex_lock(&prange->migrate_mutex);

		r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE,
					       false, true);
		if (r)
			pr_debug("failed %d to map 0x%lx to gpus\n", r,
				 prange->start);

		mutex_unlock(&prange->migrate_mutex);
		if (r)
			goto out_reschedule;

		if (atomic_cmpxchg(&prange->invalid, invalid, 0) != invalid)
			goto out_reschedule;
	}

	if (atomic_cmpxchg(&svms->evicted_ranges, evicted_ranges, 0) !=
	    evicted_ranges)
		goto out_reschedule;

	evicted_ranges = 0;

	r = kgd2kfd_resume_mm(mm);
	if (r) {
		/* No recovery from this failure. Probably the CP is
		 * hanging. No point trying again.
		 */
		pr_debug("failed %d to resume KFD\n", r);
	}

	pr_debug("restore svm ranges successfully\n");

out_reschedule:
	mutex_unlock(&svms->lock);
	mmap_write_unlock(mm);
	mutex_unlock(&process_info->lock);

	/* If validation failed, reschedule another attempt */
	if (evicted_ranges) {
		pr_debug("reschedule to restore svm range\n");
		schedule_delayed_work(&svms->restore_work,
			msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
	}
}

/**
 * svm_range_evict - evict svm range
 *
 * Stop all queues of the process to ensure the GPU doesn't access the memory,
 * then return to let the CPU evict the buffer and proceed with the CPU page
 * table update.
 *
 * No lock is needed to synchronize the CPU page table invalidation with GPU
 * execution. If an invalidation happens while the restore work is running,
 * the restore work restarts to make sure the latest CPU page mapping is
 * picked up before the queues are started again.
 */
static int
svm_range_evict(struct svm_range *prange, struct mm_struct *mm,
		unsigned long start, unsigned long last)
{
	struct svm_range_list *svms = prange->svms;
	struct svm_range *pchild;
	struct kfd_process *p;
	int r = 0;

	p = container_of(svms, struct kfd_process, svms);

	pr_debug("invalidate svms 0x%p prange [0x%lx 0x%lx] [0x%lx 0x%lx]\n",
		 svms, prange->start, prange->last, start, last);

	if (!p->xnack_enabled) {
		int evicted_ranges;

		list_for_each_entry(pchild, &prange->child_list, child_list) {
			mutex_lock_nested(&pchild->lock, 1);
			if (pchild->start <= last && pchild->last >= start) {
				pr_debug("increment pchild invalid [0x%lx 0x%lx]\n",
					 pchild->start, pchild->last);
				atomic_inc(&pchild->invalid);
			}
			mutex_unlock(&pchild->lock);
		}

		if (prange->start <= last && prange->last >= start)
			atomic_inc(&prange->invalid);

		evicted_ranges = atomic_inc_return(&svms->evicted_ranges);
		if (evicted_ranges != 1)
			return r;

		pr_debug("evicting svms 0x%p range [0x%lx 0x%lx]\n",
			 prange->svms, prange->start, prange->last);

		/* First eviction, stop the queues */
		r = kgd2kfd_quiesce_mm(mm);
		if (r)
			pr_debug("failed to quiesce KFD\n");

		pr_debug("schedule to restore svm %p ranges\n", svms);
		schedule_delayed_work(&svms->restore_work,
			msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
	} else {
		unsigned long s, l;

		pr_debug("invalidate unmap svms 0x%p [0x%lx 0x%lx] from GPUs\n",
			 prange->svms, start, last);
		list_for_each_entry(pchild, &prange->child_list, child_list) {
			mutex_lock_nested(&pchild->lock, 1);
			s = max(start, pchild->start);
			l = min(last, pchild->last);
			if (l >= s)
				svm_range_unmap_from_gpus(pchild, s, l);
			mutex_unlock(&pchild->lock);
		}
		s = max(start, prange->start);
		l = min(last, prange->last);
		if (l >= s)
			svm_range_unmap_from_gpus(prange, s, l);
	}

	return r;
}

static struct svm_range *svm_range_clone(struct svm_range *old)
{
	struct svm_range *new;

	new = svm_range_new(old->svms, old->start, old->last);
	if (!new)
		return NULL;

	if (old->svm_bo) {
		new->ttm_res = old->ttm_res;
		new->offset = old->offset;
		new->svm_bo = svm_range_bo_ref(old->svm_bo);
		spin_lock(&new->svm_bo->list_lock);
		list_add(&new->svm_bo_list, &new->svm_bo->range_list);
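		/* The clone shares the original VRAM BO: svm_range_bo_ref()
		 * above took a reference and the clone is linked on the BO's
		 * range_list, mirroring what svm_range_split_nodes() does.
		 */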
		spin_unlock(&new->svm_bo->list_lock);
	}
	new->flags = old->flags;
	new->preferred_loc = old->preferred_loc;
	new->prefetch_loc = old->prefetch_loc;
	new->actual_loc = old->actual_loc;
	new->granularity = old->granularity;
	bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE);
	bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE);

	return new;
}

/**
 * svm_range_handle_overlap - split overlap ranges
 * @svms: svm range list header
 * @new: range added with these attributes
 * @start: range added start address, in pages
 * @last: range last address, in pages
 * @update_list: output, the ranges whose attributes are updated. For set_attr,
 *               these will be validated and mapped to GPUs. For unmap, these
 *               will be removed and unmapped from GPUs.
 * @insert_list: output, the ranges to be inserted into svms, attributes are
 *               not changed. For set_attr, these will be added into svms.
 * @remove_list: output, the ranges to be removed from svms
 * @left: the remaining range after overlap. For set_attr, this will be added
 *        as a new range.
 *
 * There are five overlap cases in total.
 *
 * This function handles overlap of an address interval with existing
 * struct svm_ranges for applying new attributes. This may require
 * splitting existing struct svm_ranges. All changes should be applied to
 * the range_list and interval tree transactionally. If any split operation
 * fails, the entire update fails. Therefore the existing overlapping
 * svm_ranges are cloned and the original svm_ranges left unchanged. If the
 * transaction succeeds, the modified clones are added and the originals
 * freed. Otherwise the clones are removed and the old svm_ranges remain.
1785 * 1786 * Context: The caller must hold svms->lock 1787 */ 1788 static int 1789 svm_range_handle_overlap(struct svm_range_list *svms, struct svm_range *new, 1790 unsigned long start, unsigned long last, 1791 struct list_head *update_list, 1792 struct list_head *insert_list, 1793 struct list_head *remove_list, 1794 unsigned long *left) 1795 { 1796 struct interval_tree_node *node; 1797 struct svm_range *prange; 1798 struct svm_range *tmp; 1799 int r = 0; 1800 1801 INIT_LIST_HEAD(update_list); 1802 INIT_LIST_HEAD(insert_list); 1803 INIT_LIST_HEAD(remove_list); 1804 1805 node = interval_tree_iter_first(&svms->objects, start, last); 1806 while (node) { 1807 struct interval_tree_node *next; 1808 struct svm_range *old; 1809 unsigned long next_start; 1810 1811 pr_debug("found overlap node [0x%lx 0x%lx]\n", node->start, 1812 node->last); 1813 1814 old = container_of(node, struct svm_range, it_node); 1815 next = interval_tree_iter_next(node, start, last); 1816 next_start = min(node->last, last) + 1; 1817 1818 if (node->start < start || node->last > last) { 1819 /* node intersects the updated range, clone+split it */ 1820 prange = svm_range_clone(old); 1821 if (!prange) { 1822 r = -ENOMEM; 1823 goto out; 1824 } 1825 1826 list_add(&old->remove_list, remove_list); 1827 list_add(&prange->insert_list, insert_list); 1828 1829 if (node->start < start) { 1830 pr_debug("change old range start\n"); 1831 r = svm_range_split_head(prange, new, start, 1832 insert_list); 1833 if (r) 1834 goto out; 1835 } 1836 if (node->last > last) { 1837 pr_debug("change old range last\n"); 1838 r = svm_range_split_tail(prange, new, last, 1839 insert_list); 1840 if (r) 1841 goto out; 1842 } 1843 } else { 1844 /* The node is contained within start..last, 1845 * just update it 1846 */ 1847 prange = old; 1848 } 1849 1850 if (!svm_range_is_same_attrs(prange, new)) 1851 list_add(&prange->update_list, update_list); 1852 1853 /* insert a new node if needed */ 1854 if (node->start > start) { 1855 prange = svm_range_new(prange->svms, start, 1856 node->start - 1); 1857 if (!prange) { 1858 r = -ENOMEM; 1859 goto out; 1860 } 1861 1862 list_add(&prange->insert_list, insert_list); 1863 list_add(&prange->update_list, update_list); 1864 } 1865 1866 node = next; 1867 start = next_start; 1868 } 1869 1870 if (left && start <= last) 1871 *left = last - start + 1; 1872 1873 out: 1874 if (r) 1875 list_for_each_entry_safe(prange, tmp, insert_list, insert_list) 1876 svm_range_free(prange); 1877 1878 return r; 1879 } 1880 1881 static void 1882 svm_range_update_notifier_and_interval_tree(struct mm_struct *mm, 1883 struct svm_range *prange) 1884 { 1885 unsigned long start; 1886 unsigned long last; 1887 1888 start = prange->notifier.interval_tree.start >> PAGE_SHIFT; 1889 last = prange->notifier.interval_tree.last >> PAGE_SHIFT; 1890 1891 if (prange->start == start && prange->last == last) 1892 return; 1893 1894 pr_debug("up notifier 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n", 1895 prange->svms, prange, start, last, prange->start, 1896 prange->last); 1897 1898 if (start != 0 && last != 0) { 1899 interval_tree_remove(&prange->it_node, &prange->svms->objects); 1900 svm_range_remove_notifier(prange); 1901 } 1902 prange->it_node.start = prange->start; 1903 prange->it_node.last = prange->last; 1904 1905 interval_tree_insert(&prange->it_node, &prange->svms->objects); 1906 svm_range_add_notifier_locked(mm, prange); 1907 } 1908 1909 static void 1910 svm_range_handle_list_op(struct svm_range_list *svms, struct svm_range *prange) 1911 { 1912 struct mm_struct 
*mm = prange->work_item.mm; 1913 1914 switch (prange->work_item.op) { 1915 case SVM_OP_NULL: 1916 pr_debug("NULL OP 0x%p prange 0x%p [0x%lx 0x%lx]\n", 1917 svms, prange, prange->start, prange->last); 1918 break; 1919 case SVM_OP_UNMAP_RANGE: 1920 pr_debug("remove 0x%p prange 0x%p [0x%lx 0x%lx]\n", 1921 svms, prange, prange->start, prange->last); 1922 svm_range_unlink(prange); 1923 svm_range_remove_notifier(prange); 1924 svm_range_free(prange); 1925 break; 1926 case SVM_OP_UPDATE_RANGE_NOTIFIER: 1927 pr_debug("update notifier 0x%p prange 0x%p [0x%lx 0x%lx]\n", 1928 svms, prange, prange->start, prange->last); 1929 svm_range_update_notifier_and_interval_tree(mm, prange); 1930 break; 1931 case SVM_OP_UPDATE_RANGE_NOTIFIER_AND_MAP: 1932 pr_debug("update and map 0x%p prange 0x%p [0x%lx 0x%lx]\n", 1933 svms, prange, prange->start, prange->last); 1934 svm_range_update_notifier_and_interval_tree(mm, prange); 1935 /* TODO: implement deferred validation and mapping */ 1936 break; 1937 case SVM_OP_ADD_RANGE: 1938 pr_debug("add 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms, prange, 1939 prange->start, prange->last); 1940 svm_range_add_to_svms(prange); 1941 svm_range_add_notifier_locked(mm, prange); 1942 break; 1943 case SVM_OP_ADD_RANGE_AND_MAP: 1944 pr_debug("add and map 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms, 1945 prange, prange->start, prange->last); 1946 svm_range_add_to_svms(prange); 1947 svm_range_add_notifier_locked(mm, prange); 1948 /* TODO: implement deferred validation and mapping */ 1949 break; 1950 default: 1951 WARN_ONCE(1, "Unknown prange 0x%p work op %d\n", prange, 1952 prange->work_item.op); 1953 } 1954 } 1955 1956 static void svm_range_drain_retry_fault(struct svm_range_list *svms) 1957 { 1958 struct kfd_process_device *pdd; 1959 struct kfd_process *p; 1960 uint32_t i; 1961 1962 p = container_of(svms, struct kfd_process, svms); 1963 1964 for_each_set_bit(i, svms->bitmap_supported, p->n_pdds) { 1965 pdd = p->pdds[i]; 1966 if (!pdd) 1967 continue; 1968 1969 pr_debug("drain retry fault gpu %d svms %p\n", i, svms); 1970 1971 amdgpu_ih_wait_on_checkpoint_process(pdd->dev->adev, 1972 &pdd->dev->adev->irq.ih1); 1973 pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms); 1974 } 1975 } 1976 1977 static void svm_range_deferred_list_work(struct work_struct *work) 1978 { 1979 struct svm_range_list *svms; 1980 struct svm_range *prange; 1981 struct mm_struct *mm; 1982 1983 svms = container_of(work, struct svm_range_list, deferred_list_work); 1984 pr_debug("enter svms 0x%p\n", svms); 1985 1986 spin_lock(&svms->deferred_list_lock); 1987 while (!list_empty(&svms->deferred_range_list)) { 1988 prange = list_first_entry(&svms->deferred_range_list, 1989 struct svm_range, deferred_list); 1990 spin_unlock(&svms->deferred_list_lock); 1991 pr_debug("prange 0x%p [0x%lx 0x%lx] op %d\n", prange, 1992 prange->start, prange->last, prange->work_item.op); 1993 1994 mm = prange->work_item.mm; 1995 retry: 1996 mmap_write_lock(mm); 1997 mutex_lock(&svms->lock); 1998 1999 /* Checking for the need to drain retry faults must be in 2000 * mmap write lock to serialize with munmap notifiers. 2001 * 2002 * Remove from deferred_list must be inside mmap write lock, 2003 * otherwise, svm_range_list_lock_and_flush_work may hold mmap 2004 * write lock, and continue because deferred_list is empty, then 2005 * deferred_list handle is blocked by mmap write lock. 
2006 */ 2007 spin_lock(&svms->deferred_list_lock); 2008 if (unlikely(svms->drain_pagefaults)) { 2009 svms->drain_pagefaults = false; 2010 spin_unlock(&svms->deferred_list_lock); 2011 mutex_unlock(&svms->lock); 2012 mmap_write_unlock(mm); 2013 svm_range_drain_retry_fault(svms); 2014 goto retry; 2015 } 2016 list_del_init(&prange->deferred_list); 2017 spin_unlock(&svms->deferred_list_lock); 2018 2019 mutex_lock(&prange->migrate_mutex); 2020 while (!list_empty(&prange->child_list)) { 2021 struct svm_range *pchild; 2022 2023 pchild = list_first_entry(&prange->child_list, 2024 struct svm_range, child_list); 2025 pr_debug("child prange 0x%p op %d\n", pchild, 2026 pchild->work_item.op); 2027 list_del_init(&pchild->child_list); 2028 svm_range_handle_list_op(svms, pchild); 2029 } 2030 mutex_unlock(&prange->migrate_mutex); 2031 2032 svm_range_handle_list_op(svms, prange); 2033 mutex_unlock(&svms->lock); 2034 mmap_write_unlock(mm); 2035 2036 spin_lock(&svms->deferred_list_lock); 2037 } 2038 spin_unlock(&svms->deferred_list_lock); 2039 2040 pr_debug("exit svms 0x%p\n", svms); 2041 } 2042 2043 void 2044 svm_range_add_list_work(struct svm_range_list *svms, struct svm_range *prange, 2045 struct mm_struct *mm, enum svm_work_list_ops op) 2046 { 2047 spin_lock(&svms->deferred_list_lock); 2048 /* Make sure pending page faults are drained in the deferred worker 2049 * before the range is freed to avoid straggler interrupts on 2050 * unmapped memory causing "phantom faults". 2051 */ 2052 if (op == SVM_OP_UNMAP_RANGE) 2053 svms->drain_pagefaults = true; 2054 /* if prange is on the deferred list */ 2055 if (!list_empty(&prange->deferred_list)) { 2056 pr_debug("update exist prange 0x%p work op %d\n", prange, op); 2057 WARN_ONCE(prange->work_item.mm != mm, "unmatch mm\n"); 2058 if (op != SVM_OP_NULL && 2059 prange->work_item.op != SVM_OP_UNMAP_RANGE) 2060 prange->work_item.op = op; 2061 } else { 2062 prange->work_item.op = op; 2063 prange->work_item.mm = mm; 2064 list_add_tail(&prange->deferred_list, 2065 &prange->svms->deferred_range_list); 2066 pr_debug("add prange 0x%p [0x%lx 0x%lx] to work list op %d\n", 2067 prange, prange->start, prange->last, op); 2068 } 2069 spin_unlock(&svms->deferred_list_lock); 2070 } 2071 2072 void schedule_deferred_list_work(struct svm_range_list *svms) 2073 { 2074 spin_lock(&svms->deferred_list_lock); 2075 if (!list_empty(&svms->deferred_range_list)) 2076 schedule_work(&svms->deferred_list_work); 2077 spin_unlock(&svms->deferred_list_lock); 2078 } 2079 2080 static void 2081 svm_range_unmap_split(struct mm_struct *mm, struct svm_range *parent, 2082 struct svm_range *prange, unsigned long start, 2083 unsigned long last) 2084 { 2085 struct svm_range *head; 2086 struct svm_range *tail; 2087 2088 if (prange->work_item.op == SVM_OP_UNMAP_RANGE) { 2089 pr_debug("prange 0x%p [0x%lx 0x%lx] is already freed\n", prange, 2090 prange->start, prange->last); 2091 return; 2092 } 2093 if (start > prange->last || last < prange->start) 2094 return; 2095 2096 head = tail = prange; 2097 if (start > prange->start) 2098 svm_range_split(prange, prange->start, start - 1, &tail); 2099 if (last < tail->last) 2100 svm_range_split(tail, last + 1, tail->last, &head); 2101 2102 if (head != prange && tail != prange) { 2103 svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE); 2104 svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE); 2105 } else if (tail != prange) { 2106 svm_range_add_child(parent, mm, tail, SVM_OP_UNMAP_RANGE); 2107 } else if (head != prange) { 2108 svm_range_add_child(parent, mm, head, 
SVM_OP_UNMAP_RANGE); 2109 } else if (parent != prange) { 2110 prange->work_item.op = SVM_OP_UNMAP_RANGE; 2111 } 2112 } 2113 2114 static void 2115 svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange, 2116 unsigned long start, unsigned long last) 2117 { 2118 struct svm_range_list *svms; 2119 struct svm_range *pchild; 2120 struct kfd_process *p; 2121 unsigned long s, l; 2122 bool unmap_parent; 2123 2124 p = kfd_lookup_process_by_mm(mm); 2125 if (!p) 2126 return; 2127 svms = &p->svms; 2128 2129 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n", svms, 2130 prange, prange->start, prange->last, start, last); 2131 2132 unmap_parent = start <= prange->start && last >= prange->last; 2133 2134 list_for_each_entry(pchild, &prange->child_list, child_list) { 2135 mutex_lock_nested(&pchild->lock, 1); 2136 s = max(start, pchild->start); 2137 l = min(last, pchild->last); 2138 if (l >= s) 2139 svm_range_unmap_from_gpus(pchild, s, l); 2140 svm_range_unmap_split(mm, prange, pchild, start, last); 2141 mutex_unlock(&pchild->lock); 2142 } 2143 s = max(start, prange->start); 2144 l = min(last, prange->last); 2145 if (l >= s) 2146 svm_range_unmap_from_gpus(prange, s, l); 2147 svm_range_unmap_split(mm, prange, prange, start, last); 2148 2149 if (unmap_parent) 2150 svm_range_add_list_work(svms, prange, mm, SVM_OP_UNMAP_RANGE); 2151 else 2152 svm_range_add_list_work(svms, prange, mm, 2153 SVM_OP_UPDATE_RANGE_NOTIFIER); 2154 schedule_deferred_list_work(svms); 2155 2156 kfd_unref_process(p); 2157 } 2158 2159 /** 2160 * svm_range_cpu_invalidate_pagetables - interval notifier callback 2161 * 2162 * If event is MMU_NOTIFY_UNMAP, this is from CPU unmap range, otherwise, it 2163 * is from migration, or CPU page invalidation callback. 2164 * 2165 * For unmap event, unmap range from GPUs, remove prange from svms in a delayed 2166 * work thread, and split prange if only part of prange is unmapped. 2167 * 2168 * For invalidation event, if GPU retry fault is not enabled, evict the queues, 2169 * then schedule svm_range_restore_work to update GPU mapping and resume queues. 2170 * If GPU retry fault is enabled, unmap the svm range from GPU, retry fault will 2171 * update GPU mapping to recover. 2172 * 2173 * Context: mmap lock, notifier_invalidate_start lock are held 2174 * for invalidate event, prange lock is held if this is from migration 2175 */ 2176 static bool 2177 svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni, 2178 const struct mmu_notifier_range *range, 2179 unsigned long cur_seq) 2180 { 2181 struct svm_range *prange; 2182 unsigned long start; 2183 unsigned long last; 2184 2185 if (range->event == MMU_NOTIFY_RELEASE) 2186 return true; 2187 2188 start = mni->interval_tree.start; 2189 last = mni->interval_tree.last; 2190 start = (start > range->start ? start : range->start) >> PAGE_SHIFT; 2191 last = (last < (range->end - 1) ? 
last : range->end - 1) >> PAGE_SHIFT; 2192 pr_debug("[0x%lx 0x%lx] range[0x%lx 0x%lx] notifier[0x%lx 0x%lx] %d\n", 2193 start, last, range->start >> PAGE_SHIFT, 2194 (range->end - 1) >> PAGE_SHIFT, 2195 mni->interval_tree.start >> PAGE_SHIFT, 2196 mni->interval_tree.last >> PAGE_SHIFT, range->event); 2197 2198 prange = container_of(mni, struct svm_range, notifier); 2199 2200 svm_range_lock(prange); 2201 mmu_interval_set_seq(mni, cur_seq); 2202 2203 switch (range->event) { 2204 case MMU_NOTIFY_UNMAP: 2205 svm_range_unmap_from_cpu(mni->mm, prange, start, last); 2206 break; 2207 default: 2208 svm_range_evict(prange, mni->mm, start, last); 2209 break; 2210 } 2211 2212 svm_range_unlock(prange); 2213 2214 return true; 2215 } 2216 2217 /** 2218 * svm_range_from_addr - find svm range from fault address 2219 * @svms: svm range list header 2220 * @addr: address to search range interval tree, in pages 2221 * @parent: parent range if range is on child list 2222 * 2223 * Context: The caller must hold svms->lock 2224 * 2225 * Return: the svm_range found or NULL 2226 */ 2227 struct svm_range * 2228 svm_range_from_addr(struct svm_range_list *svms, unsigned long addr, 2229 struct svm_range **parent) 2230 { 2231 struct interval_tree_node *node; 2232 struct svm_range *prange; 2233 struct svm_range *pchild; 2234 2235 node = interval_tree_iter_first(&svms->objects, addr, addr); 2236 if (!node) 2237 return NULL; 2238 2239 prange = container_of(node, struct svm_range, it_node); 2240 pr_debug("address 0x%lx prange [0x%lx 0x%lx] node [0x%lx 0x%lx]\n", 2241 addr, prange->start, prange->last, node->start, node->last); 2242 2243 if (addr >= prange->start && addr <= prange->last) { 2244 if (parent) 2245 *parent = prange; 2246 return prange; 2247 } 2248 list_for_each_entry(pchild, &prange->child_list, child_list) 2249 if (addr >= pchild->start && addr <= pchild->last) { 2250 pr_debug("found address 0x%lx pchild [0x%lx 0x%lx]\n", 2251 addr, pchild->start, pchild->last); 2252 if (parent) 2253 *parent = prange; 2254 return pchild; 2255 } 2256 2257 return NULL; 2258 } 2259 2260 /* svm_range_best_restore_location - decide the best fault restore location 2261 * @prange: svm range structure 2262 * @adev: the GPU on which vm fault happened 2263 * 2264 * This is only called when xnack is on, to decide the best location to restore 2265 * the range mapping after GPU vm fault. Caller uses the best location to do 2266 * migration if actual loc is not best location, then update GPU page table 2267 * mapping to the best location. 2268 * 2269 * If the preferred loc is accessible by faulting GPU, use preferred loc. 2270 * If vm fault gpu idx is on range ACCESSIBLE bitmap, best_loc is vm fault gpu 2271 * If vm fault gpu idx is on range ACCESSIBLE_IN_PLACE bitmap, then 2272 * if range actual loc is cpu, best_loc is cpu 2273 * if vm fault gpu is on xgmi same hive of range actual loc gpu, best_loc is 2274 * range actual loc. 2275 * Otherwise, GPU no access, best_loc is -1. 
2276 * 2277 * Return: 2278 * -1 means vm fault GPU no access 2279 * 0 for CPU or GPU id 2280 */ 2281 static int32_t 2282 svm_range_best_restore_location(struct svm_range *prange, 2283 struct amdgpu_device *adev, 2284 int32_t *gpuidx) 2285 { 2286 struct amdgpu_device *bo_adev, *preferred_adev; 2287 struct kfd_process *p; 2288 uint32_t gpuid; 2289 int r; 2290 2291 p = container_of(prange->svms, struct kfd_process, svms); 2292 2293 r = kfd_process_gpuid_from_adev(p, adev, &gpuid, gpuidx); 2294 if (r < 0) { 2295 pr_debug("failed to get gpuid from kgd\n"); 2296 return -1; 2297 } 2298 2299 if (prange->preferred_loc == gpuid || 2300 prange->preferred_loc == KFD_IOCTL_SVM_LOCATION_SYSMEM) { 2301 return prange->preferred_loc; 2302 } else if (prange->preferred_loc != KFD_IOCTL_SVM_LOCATION_UNDEFINED) { 2303 preferred_adev = svm_range_get_adev_by_id(prange, 2304 prange->preferred_loc); 2305 if (amdgpu_xgmi_same_hive(adev, preferred_adev)) 2306 return prange->preferred_loc; 2307 /* fall through */ 2308 } 2309 2310 if (test_bit(*gpuidx, prange->bitmap_access)) 2311 return gpuid; 2312 2313 if (test_bit(*gpuidx, prange->bitmap_aip)) { 2314 if (!prange->actual_loc) 2315 return 0; 2316 2317 bo_adev = svm_range_get_adev_by_id(prange, prange->actual_loc); 2318 if (amdgpu_xgmi_same_hive(adev, bo_adev)) 2319 return prange->actual_loc; 2320 else 2321 return 0; 2322 } 2323 2324 return -1; 2325 } 2326 2327 static int 2328 svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr, 2329 unsigned long *start, unsigned long *last, 2330 bool *is_heap_stack) 2331 { 2332 struct vm_area_struct *vma; 2333 struct interval_tree_node *node; 2334 unsigned long start_limit, end_limit; 2335 2336 vma = find_vma(p->mm, addr << PAGE_SHIFT); 2337 if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) { 2338 pr_debug("VMA does not exist in address [0x%llx]\n", addr); 2339 return -EFAULT; 2340 } 2341 2342 *is_heap_stack = (vma->vm_start <= vma->vm_mm->brk && 2343 vma->vm_end >= vma->vm_mm->start_brk) || 2344 (vma->vm_start <= vma->vm_mm->start_stack && 2345 vma->vm_end >= vma->vm_mm->start_stack); 2346 2347 start_limit = max(vma->vm_start >> PAGE_SHIFT, 2348 (unsigned long)ALIGN_DOWN(addr, 2UL << 8)); 2349 end_limit = min(vma->vm_end >> PAGE_SHIFT, 2350 (unsigned long)ALIGN(addr + 1, 2UL << 8)); 2351 /* First range that starts after the fault address */ 2352 node = interval_tree_iter_first(&p->svms.objects, addr + 1, ULONG_MAX); 2353 if (node) { 2354 end_limit = min(end_limit, node->start); 2355 /* Last range that ends before the fault address */ 2356 node = container_of(rb_prev(&node->rb), 2357 struct interval_tree_node, rb); 2358 } else { 2359 /* Last range must end before addr because 2360 * there was no range after addr 2361 */ 2362 node = container_of(rb_last(&p->svms.objects.rb_root), 2363 struct interval_tree_node, rb); 2364 } 2365 if (node) { 2366 if (node->last >= addr) { 2367 WARN(1, "Overlap with prev node and page fault addr\n"); 2368 return -EFAULT; 2369 } 2370 start_limit = max(start_limit, node->last + 1); 2371 } 2372 2373 *start = start_limit; 2374 *last = end_limit - 1; 2375 2376 pr_debug("vma [0x%lx 0x%lx] range [0x%lx 0x%lx] is_heap_stack %d\n", 2377 vma->vm_start >> PAGE_SHIFT, vma->vm_end >> PAGE_SHIFT, 2378 *start, *last, *is_heap_stack); 2379 2380 return 0; 2381 } 2382 2383 static int 2384 svm_range_check_vm_userptr(struct kfd_process *p, uint64_t start, uint64_t last, 2385 uint64_t *bo_s, uint64_t *bo_l) 2386 { 2387 struct amdgpu_bo_va_mapping *mapping; 2388 struct interval_tree_node *node; 2389 struct 
amdgpu_bo *bo = NULL; 2390 unsigned long userptr; 2391 uint32_t i; 2392 int r; 2393 2394 for (i = 0; i < p->n_pdds; i++) { 2395 struct amdgpu_vm *vm; 2396 2397 if (!p->pdds[i]->drm_priv) 2398 continue; 2399 2400 vm = drm_priv_to_vm(p->pdds[i]->drm_priv); 2401 r = amdgpu_bo_reserve(vm->root.bo, false); 2402 if (r) 2403 return r; 2404 2405 /* Check userptr by searching entire vm->va interval tree */ 2406 node = interval_tree_iter_first(&vm->va, 0, ~0ULL); 2407 while (node) { 2408 mapping = container_of((struct rb_node *)node, 2409 struct amdgpu_bo_va_mapping, rb); 2410 bo = mapping->bo_va->base.bo; 2411 2412 if (!amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm, 2413 start << PAGE_SHIFT, 2414 last << PAGE_SHIFT, 2415 &userptr)) { 2416 node = interval_tree_iter_next(node, 0, ~0ULL); 2417 continue; 2418 } 2419 2420 pr_debug("[0x%llx 0x%llx] already userptr mapped\n", 2421 start, last); 2422 if (bo_s && bo_l) { 2423 *bo_s = userptr >> PAGE_SHIFT; 2424 *bo_l = *bo_s + bo->tbo.ttm->num_pages - 1; 2425 } 2426 amdgpu_bo_unreserve(vm->root.bo); 2427 return -EADDRINUSE; 2428 } 2429 amdgpu_bo_unreserve(vm->root.bo); 2430 } 2431 return 0; 2432 } 2433 2434 static struct 2435 svm_range *svm_range_create_unregistered_range(struct amdgpu_device *adev, 2436 struct kfd_process *p, 2437 struct mm_struct *mm, 2438 int64_t addr) 2439 { 2440 struct svm_range *prange = NULL; 2441 unsigned long start, last; 2442 uint32_t gpuid, gpuidx; 2443 bool is_heap_stack; 2444 uint64_t bo_s = 0; 2445 uint64_t bo_l = 0; 2446 int r; 2447 2448 if (svm_range_get_range_boundaries(p, addr, &start, &last, 2449 &is_heap_stack)) 2450 return NULL; 2451 2452 r = svm_range_check_vm(p, start, last, &bo_s, &bo_l); 2453 if (r != -EADDRINUSE) 2454 r = svm_range_check_vm_userptr(p, start, last, &bo_s, &bo_l); 2455 2456 if (r == -EADDRINUSE) { 2457 if (addr >= bo_s && addr <= bo_l) 2458 return NULL; 2459 2460 /* Create one page svm range if 2MB range overlapping */ 2461 start = addr; 2462 last = addr; 2463 } 2464 2465 prange = svm_range_new(&p->svms, start, last); 2466 if (!prange) { 2467 pr_debug("Failed to create prange in address [0x%llx]\n", addr); 2468 return NULL; 2469 } 2470 if (kfd_process_gpuid_from_adev(p, adev, &gpuid, &gpuidx)) { 2471 pr_debug("failed to get gpuid from kgd\n"); 2472 svm_range_free(prange); 2473 return NULL; 2474 } 2475 2476 if (is_heap_stack) 2477 prange->preferred_loc = KFD_IOCTL_SVM_LOCATION_SYSMEM; 2478 2479 svm_range_add_to_svms(prange); 2480 svm_range_add_notifier_locked(mm, prange); 2481 2482 return prange; 2483 } 2484 2485 /* svm_range_skip_recover - decide if prange can be recovered 2486 * @prange: svm range structure 2487 * 2488 * GPU vm retry fault handle skip recover the range for cases: 2489 * 1. prange is on deferred list to be removed after unmap, it is stale fault, 2490 * deferred list work will drain the stale fault before free the prange. 2491 * 2. prange is on deferred list to add interval notifier after split, or 2492 * 3. prange is child range, it is split from parent prange, recover later 2493 * after interval notifier is added. 
2494 *
2495 * Return: true to skip recovery, false to recover
2496 */
2497 static bool svm_range_skip_recover(struct svm_range *prange)
2498 {
2499 struct svm_range_list *svms = prange->svms;
2500
2501 spin_lock(&svms->deferred_list_lock);
2502 if (list_empty(&prange->deferred_list) &&
2503 list_empty(&prange->child_list)) {
2504 spin_unlock(&svms->deferred_list_lock);
2505 return false;
2506 }
2507 spin_unlock(&svms->deferred_list_lock);
2508
2509 if (prange->work_item.op == SVM_OP_UNMAP_RANGE) {
2510 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] unmapped\n",
2511 svms, prange, prange->start, prange->last);
2512 return true;
2513 }
2514 if (prange->work_item.op == SVM_OP_ADD_RANGE_AND_MAP ||
2515 prange->work_item.op == SVM_OP_ADD_RANGE) {
2516 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] not added yet\n",
2517 svms, prange, prange->start, prange->last);
2518 return true;
2519 }
2520 return false;
2521 }
2522
2523 static void
2524 svm_range_count_fault(struct amdgpu_device *adev, struct kfd_process *p,
2525 int32_t gpuidx)
2526 {
2527 struct kfd_process_device *pdd;
2528
2529 /* fault is on different page of same range
2530 * or fault is skipped to recover later
2531 * or fault is on invalid virtual address
2532 */
2533 if (gpuidx == MAX_GPU_INSTANCE) {
2534 uint32_t gpuid;
2535 int r;
2536
2537 r = kfd_process_gpuid_from_adev(p, adev, &gpuid, &gpuidx);
2538 if (r < 0)
2539 return;
2540 }
2541
2542 /* fault is recovered
2543 * or fault cannot be recovered because the GPU has no access to the range
2544 */
2545 pdd = kfd_process_device_from_gpuidx(p, gpuidx);
2546 if (pdd)
2547 WRITE_ONCE(pdd->faults, pdd->faults + 1);
2548 }
2549
2550 static bool
2551 svm_fault_allowed(struct mm_struct *mm, uint64_t addr, bool write_fault)
2552 {
2553 unsigned long requested = VM_READ;
2554 struct vm_area_struct *vma;
2555
2556 if (write_fault)
2557 requested |= VM_WRITE;
2558
2559 vma = find_vma(mm, addr << PAGE_SHIFT);
2560 if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) {
2561 pr_debug("address 0x%llx VMA is removed\n", addr);
2562 return true;
2563 }
2564
2565 pr_debug("requested 0x%lx, vma permission flags 0x%lx\n", requested,
2566 vma->vm_flags);
2567 return (vma->vm_flags & requested) == requested;
2568 }
2569
2570 int
2571 svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
2572 uint64_t addr, bool write_fault)
2573 {
2574 struct mm_struct *mm = NULL;
2575 struct svm_range_list *svms;
2576 struct svm_range *prange;
2577 struct kfd_process *p;
2578 uint64_t timestamp;
2579 int32_t best_loc;
2580 int32_t gpuidx = MAX_GPU_INSTANCE;
2581 bool write_locked = false;
2582 int r = 0;
2583
2584 if (!KFD_IS_SVM_API_SUPPORTED(adev->kfd.dev)) {
2585 pr_debug("device does not support SVM\n");
2586 return -EFAULT;
2587 }
2588
2589 p = kfd_lookup_process_by_pasid(pasid);
2590 if (!p) {
2591 pr_debug("kfd process not found, pasid 0x%x\n", pasid);
2592 return -ESRCH;
2593 }
2594 if (!p->xnack_enabled) {
2595 pr_debug("XNACK not enabled for pasid 0x%x\n", pasid);
2596 r = -EFAULT;
2597 goto out;
2598 }
2599 svms = &p->svms;
2600
2601 pr_debug("restoring svms 0x%p fault address 0x%llx\n", svms, addr);
2602
2603 mm = get_task_mm(p->lead_thread);
2604 if (!mm) {
2605 pr_debug("svms 0x%p failed to get mm\n", svms);
2606 r = -ESRCH;
2607 goto out;
2608 }
2609
2610 mmap_read_lock(mm);
2611 retry_write_locked:
2612 mutex_lock(&svms->lock);
2613 prange = svm_range_from_addr(svms, addr, NULL);
2614 if (!prange) {
2615 pr_debug("failed to find prange svms 0x%p address [0x%llx]\n",
2616 svms, addr);
2617 if
(!write_locked) { 2618 /* Need the write lock to create new range with MMU notifier. 2619 * Also flush pending deferred work to make sure the interval 2620 * tree is up to date before we add a new range 2621 */ 2622 mutex_unlock(&svms->lock); 2623 mmap_read_unlock(mm); 2624 mmap_write_lock(mm); 2625 write_locked = true; 2626 goto retry_write_locked; 2627 } 2628 prange = svm_range_create_unregistered_range(adev, p, mm, addr); 2629 if (!prange) { 2630 pr_debug("failed to create unregistered range svms 0x%p address [0x%llx]\n", 2631 svms, addr); 2632 mmap_write_downgrade(mm); 2633 r = -EFAULT; 2634 goto out_unlock_svms; 2635 } 2636 } 2637 if (write_locked) 2638 mmap_write_downgrade(mm); 2639 2640 mutex_lock(&prange->migrate_mutex); 2641 2642 if (svm_range_skip_recover(prange)) { 2643 amdgpu_gmc_filter_faults_remove(adev, addr, pasid); 2644 goto out_unlock_range; 2645 } 2646 2647 timestamp = ktime_to_us(ktime_get()) - prange->validate_timestamp; 2648 /* skip duplicate vm fault on different pages of same range */ 2649 if (timestamp < AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING) { 2650 pr_debug("svms 0x%p [0x%lx %lx] already restored\n", 2651 svms, prange->start, prange->last); 2652 goto out_unlock_range; 2653 } 2654 2655 if (!svm_fault_allowed(mm, addr, write_fault)) { 2656 pr_debug("fault addr 0x%llx no %s permission\n", addr, 2657 write_fault ? "write" : "read"); 2658 r = -EPERM; 2659 goto out_unlock_range; 2660 } 2661 2662 best_loc = svm_range_best_restore_location(prange, adev, &gpuidx); 2663 if (best_loc == -1) { 2664 pr_debug("svms %p failed get best restore loc [0x%lx 0x%lx]\n", 2665 svms, prange->start, prange->last); 2666 r = -EACCES; 2667 goto out_unlock_range; 2668 } 2669 2670 pr_debug("svms %p [0x%lx 0x%lx] best restore 0x%x, actual loc 0x%x\n", 2671 svms, prange->start, prange->last, best_loc, 2672 prange->actual_loc); 2673 2674 if (prange->actual_loc != best_loc) { 2675 if (best_loc) { 2676 r = svm_migrate_to_vram(prange, best_loc, mm); 2677 if (r) { 2678 pr_debug("svm_migrate_to_vram failed (%d) at %llx, falling back to system memory\n", 2679 r, addr); 2680 /* Fallback to system memory if migration to 2681 * VRAM failed 2682 */ 2683 if (prange->actual_loc) 2684 r = svm_migrate_vram_to_ram(prange, mm); 2685 else 2686 r = 0; 2687 } 2688 } else { 2689 r = svm_migrate_vram_to_ram(prange, mm); 2690 } 2691 if (r) { 2692 pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n", 2693 r, svms, prange->start, prange->last); 2694 goto out_unlock_range; 2695 } 2696 } 2697 2698 r = svm_range_validate_and_map(mm, prange, gpuidx, false, false); 2699 if (r) 2700 pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n", 2701 r, svms, prange->start, prange->last); 2702 2703 out_unlock_range: 2704 mutex_unlock(&prange->migrate_mutex); 2705 out_unlock_svms: 2706 mutex_unlock(&svms->lock); 2707 mmap_read_unlock(mm); 2708 2709 svm_range_count_fault(adev, p, gpuidx); 2710 2711 mmput(mm); 2712 out: 2713 kfd_unref_process(p); 2714 2715 if (r == -EAGAIN) { 2716 pr_debug("recover vm fault later\n"); 2717 amdgpu_gmc_filter_faults_remove(adev, addr, pasid); 2718 r = 0; 2719 } 2720 return r; 2721 } 2722 2723 void svm_range_list_fini(struct kfd_process *p) 2724 { 2725 struct svm_range *prange; 2726 struct svm_range *next; 2727 2728 pr_debug("pasid 0x%x svms 0x%p\n", p->pasid, &p->svms); 2729 2730 /* Ensure list work is finished before process is destroyed */ 2731 flush_work(&p->svms.deferred_list_work); 2732 2733 list_for_each_entry_safe(prange, next, &p->svms.list, list) { 2734 svm_range_unlink(prange); 2735 
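/* Remove the MMU interval notifier before freeing the range so that no
 * CPU invalidation callback can run against freed memory.
 */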
svm_range_remove_notifier(prange); 2736 svm_range_free(prange); 2737 } 2738 2739 mutex_destroy(&p->svms.lock); 2740 2741 pr_debug("pasid 0x%x svms 0x%p done\n", p->pasid, &p->svms); 2742 } 2743 2744 int svm_range_list_init(struct kfd_process *p) 2745 { 2746 struct svm_range_list *svms = &p->svms; 2747 int i; 2748 2749 svms->objects = RB_ROOT_CACHED; 2750 mutex_init(&svms->lock); 2751 INIT_LIST_HEAD(&svms->list); 2752 atomic_set(&svms->evicted_ranges, 0); 2753 INIT_DELAYED_WORK(&svms->restore_work, svm_range_restore_work); 2754 INIT_WORK(&svms->deferred_list_work, svm_range_deferred_list_work); 2755 INIT_LIST_HEAD(&svms->deferred_range_list); 2756 spin_lock_init(&svms->deferred_list_lock); 2757 2758 for (i = 0; i < p->n_pdds; i++) 2759 if (KFD_IS_SVM_API_SUPPORTED(p->pdds[i]->dev)) 2760 bitmap_set(svms->bitmap_supported, i, 1); 2761 2762 return 0; 2763 } 2764 2765 /** 2766 * svm_range_check_vm - check if virtual address range mapped already 2767 * @p: current kfd_process 2768 * @start: range start address, in pages 2769 * @last: range last address, in pages 2770 * @bo_s: mapping start address in pages if address range already mapped 2771 * @bo_l: mapping last address in pages if address range already mapped 2772 * 2773 * The purpose is to avoid virtual address ranges already allocated by 2774 * kfd_ioctl_alloc_memory_of_gpu ioctl. 2775 * It looks for each pdd in the kfd_process. 2776 * 2777 * Context: Process context 2778 * 2779 * Return 0 - OK, if the range is not mapped. 2780 * Otherwise error code: 2781 * -EADDRINUSE - if address is mapped already by kfd_ioctl_alloc_memory_of_gpu 2782 * -ERESTARTSYS - A wait for the buffer to become unreserved was interrupted by 2783 * a signal. Release all buffer reservations and return to user-space. 2784 */ 2785 static int 2786 svm_range_check_vm(struct kfd_process *p, uint64_t start, uint64_t last, 2787 uint64_t *bo_s, uint64_t *bo_l) 2788 { 2789 struct amdgpu_bo_va_mapping *mapping; 2790 struct interval_tree_node *node; 2791 uint32_t i; 2792 int r; 2793 2794 for (i = 0; i < p->n_pdds; i++) { 2795 struct amdgpu_vm *vm; 2796 2797 if (!p->pdds[i]->drm_priv) 2798 continue; 2799 2800 vm = drm_priv_to_vm(p->pdds[i]->drm_priv); 2801 r = amdgpu_bo_reserve(vm->root.bo, false); 2802 if (r) 2803 return r; 2804 2805 node = interval_tree_iter_first(&vm->va, start, last); 2806 if (node) { 2807 pr_debug("range [0x%llx 0x%llx] already TTM mapped\n", 2808 start, last); 2809 mapping = container_of((struct rb_node *)node, 2810 struct amdgpu_bo_va_mapping, rb); 2811 if (bo_s && bo_l) { 2812 *bo_s = mapping->start; 2813 *bo_l = mapping->last; 2814 } 2815 amdgpu_bo_unreserve(vm->root.bo); 2816 return -EADDRINUSE; 2817 } 2818 amdgpu_bo_unreserve(vm->root.bo); 2819 } 2820 2821 return 0; 2822 } 2823 2824 /** 2825 * svm_range_is_valid - check if virtual address range is valid 2826 * @p: current kfd_process 2827 * @start: range start address, in pages 2828 * @size: range size, in pages 2829 * 2830 * Valid virtual address range means it belongs to one or more VMAs 2831 * 2832 * Context: Process context 2833 * 2834 * Return: 2835 * 0 - OK, otherwise error code 2836 */ 2837 static int 2838 svm_range_is_valid(struct kfd_process *p, uint64_t start, uint64_t size) 2839 { 2840 const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP; 2841 struct vm_area_struct *vma; 2842 unsigned long end; 2843 unsigned long start_unchg = start; 2844 2845 start <<= PAGE_SHIFT; 2846 end = start + (size << PAGE_SHIFT); 2847 do { 2848 vma = find_vma(p->mm, start); 2849 if (!vma || start < 
vma->vm_start ||
2850 (vma->vm_flags & device_vma))
2851 return -EFAULT;
2852 start = min(end, vma->vm_end);
2853 } while (start < end);
2854
2855 return svm_range_check_vm(p, start_unchg, (end - 1) >> PAGE_SHIFT, NULL,
2856 NULL);
2857 }
2858
2859 /**
2860 * svm_range_add - add svm range and handle overlap
2861 * @p: the process to add the range to
2862 * @start: page size aligned
2863 * @size: page size aligned
2864 * @nattr: number of attributes
2865 * @attrs: array of attributes
2866 * @update_list: output, the ranges that need validation and GPU mapping updates
2867 * @insert_list: output, the ranges that need to be inserted into svms
2868 * @remove_list: output, the ranges that are replaced and need to be removed from svms
2869 *
2870 * Check if the virtual address range overlaps with already registered ranges,
2871 * split the overlapped ranges, and copy and adjust the page addresses and vram
2872 * nodes in the old and new ranges.
2873 *
2874 * Context: Process context, caller must hold svms->lock
2875 *
2876 * Return:
2877 * 0 - OK, otherwise error code
2878 */
2879 static int
2880 svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size,
2881 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs,
2882 struct list_head *update_list, struct list_head *insert_list,
2883 struct list_head *remove_list)
2884 {
2885 uint64_t last = start + size - 1UL;
2886 struct svm_range_list *svms;
2887 struct svm_range new = {0};
2888 struct svm_range *prange;
2889 unsigned long left = 0;
2890 int r = 0;
2891
2892 pr_debug("svms 0x%p [0x%llx 0x%llx]\n", &p->svms, start, last);
2893
2894 svm_range_apply_attrs(p, &new, nattr, attrs);
2895
2896 svms = &p->svms;
2897
2898 r = svm_range_handle_overlap(svms, &new, start, last, update_list,
2899 insert_list, remove_list, &left);
2900 if (r)
2901 return r;
2902
2903 if (left) {
2904 prange = svm_range_new(svms, last - left + 1, last);
2905 list_add(&prange->insert_list, insert_list);
2906 list_add(&prange->update_list, update_list);
2907 }
2908
2909 return 0;
2910 }
2911
2912 /**
2913 * svm_range_best_prefetch_location - decide the best prefetch location
2914 * @prange: svm range structure
2915 *
2916 * For xnack off:
2917 * If the range maps to a single GPU, the best prefetch location is prefetch_loc,
2918 * which can be CPU or GPU.
2919 *
2920 * If the range is ACCESS or ACCESS_IN_PLACE by mGPUs, the best prefetch location
2921 * is the prefetch_loc GPU only if the mGPUs are connected in the same XGMI hive;
2922 * otherwise the best prefetch location is always CPU, because a GPU cannot have a
2923 * coherent mapping of another GPU's VRAM even with a large-BAR PCIe connection.
2924 *
2925 * For xnack on:
2926 * If the range is not ACCESS_IN_PLACE by mGPUs, the best prefetch location is
2927 * prefetch_loc; access from other GPUs will generate a vm fault and trigger migration.
2928 *
2929 * If the range is ACCESS_IN_PLACE by mGPUs, the best prefetch location is the
2930 * prefetch_loc GPU only if the mGPUs are connected in the same XGMI hive; otherwise
2931 * the best prefetch location is always CPU.
2932 *
2933 * Context: Process context
2934 *
2935 * Return:
2936 * 0 for CPU, otherwise the GPU id
2937 */
2938 static uint32_t
2939 svm_range_best_prefetch_location(struct svm_range *prange)
2940 {
2941 DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
2942 uint32_t best_loc = prange->prefetch_loc;
2943 struct kfd_process_device *pdd;
2944 struct amdgpu_device *bo_adev;
2945 struct kfd_process *p;
2946 uint32_t gpuidx;
2947
2948 p = container_of(prange->svms, struct kfd_process, svms);
2949
2950 if (!best_loc || best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED)
2951 goto out;
2952
2953 bo_adev = svm_range_get_adev_by_id(prange, best_loc);
2954 if (!bo_adev) {
2955 WARN_ONCE(1, "failed to get device by id 0x%x\n", best_loc);
2956 best_loc = 0;
2957 goto out;
2958 }
2959
2960 if (p->xnack_enabled)
2961 bitmap_copy(bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE);
2962 else
2963 bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
2964 MAX_GPU_INSTANCE);
2965
2966 for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
2967 pdd = kfd_process_device_from_gpuidx(p, gpuidx);
2968 if (!pdd) {
2969 pr_debug("failed to get device by idx 0x%x\n", gpuidx);
2970 continue;
2971 }
2972
2973 if (pdd->dev->adev == bo_adev)
2974 continue;
2975
2976 if (!amdgpu_xgmi_same_hive(pdd->dev->adev, bo_adev)) {
2977 best_loc = 0;
2978 break;
2979 }
2980 }
2981
2982 out:
2983 pr_debug("xnack %d svms 0x%p [0x%lx 0x%lx] best loc 0x%x\n",
2984 p->xnack_enabled, &p->svms, prange->start, prange->last,
2985 best_loc);
2986
2987 return best_loc;
2988 }
2989
2990 /* FIXME: This is a workaround for a page locking bug when some pages are
2991 * invalid during migration to VRAM
2992 */
2993 void svm_range_prefault(struct svm_range *prange, struct mm_struct *mm,
2994 void *owner)
2995 {
2996 struct hmm_range *hmm_range;
2997 int r;
2998
2999 if (prange->validated_once)
3000 return;
3001
3002 r = amdgpu_hmm_range_get_pages(&prange->notifier, mm, NULL,
3003 prange->start << PAGE_SHIFT,
3004 prange->npages, &hmm_range,
3005 false, true, owner);
3006 if (!r) {
3007 amdgpu_hmm_range_get_pages_done(hmm_range);
3008 prange->validated_once = true;
3009 }
3010 }
3011
3012 /* svm_range_trigger_migration - start page migration if prefetch loc changed
3013 * @mm: current process mm_struct
3014 * @prange: svm range structure
3015 * @migrated: output, true if migration is triggered
3016 *
3017 * If the range prefetch_loc is a GPU and the actual loc is cpu 0, migrate the
3018 * range from ram to vram.
3019 * If the range prefetch_loc is cpu 0 and the actual loc is a GPU, migrate the
3020 * range from vram to ram.
3021 *
3022 * If GPU vm fault retry is not enabled, migration interacts with the MMU
3023 * notifier and the restore work:
3024 * 1. migrate_vma_setup invalidates pages, the MMU notifier callback
3025 * svm_range_evict stops all queues and schedules the restore work
3026 * 2. svm_range_restore_work waits for the migration to finish:
3027 * a. svm_range_validate_vram takes prange->migrate_mutex
3028 * b. svm_range_validate_ram HMM get pages waits for the CPU fault handler to return
3029 * 3. the restore work updates the GPU mappings and resumes all queues.
3030 * 3031 * Context: Process context 3032 * 3033 * Return: 3034 * 0 - OK, otherwise - error code of migration 3035 */ 3036 static int 3037 svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange, 3038 bool *migrated) 3039 { 3040 uint32_t best_loc; 3041 int r = 0; 3042 3043 *migrated = false; 3044 best_loc = svm_range_best_prefetch_location(prange); 3045 3046 if (best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED || 3047 best_loc == prange->actual_loc) 3048 return 0; 3049 3050 if (!best_loc) { 3051 r = svm_migrate_vram_to_ram(prange, mm); 3052 *migrated = !r; 3053 return r; 3054 } 3055 3056 r = svm_migrate_to_vram(prange, best_loc, mm); 3057 *migrated = !r; 3058 3059 return r; 3060 } 3061 3062 int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence) 3063 { 3064 if (!fence) 3065 return -EINVAL; 3066 3067 if (dma_fence_is_signaled(&fence->base)) 3068 return 0; 3069 3070 if (fence->svm_bo) { 3071 WRITE_ONCE(fence->svm_bo->evicting, 1); 3072 schedule_work(&fence->svm_bo->eviction_work); 3073 } 3074 3075 return 0; 3076 } 3077 3078 static void svm_range_evict_svm_bo_worker(struct work_struct *work) 3079 { 3080 struct svm_range_bo *svm_bo; 3081 struct kfd_process *p; 3082 struct mm_struct *mm; 3083 3084 svm_bo = container_of(work, struct svm_range_bo, eviction_work); 3085 if (!svm_bo_ref_unless_zero(svm_bo)) 3086 return; /* svm_bo was freed while eviction was pending */ 3087 3088 /* svm_range_bo_release destroys this worker thread. So during 3089 * the lifetime of this thread, kfd_process and mm will be valid. 3090 */ 3091 p = container_of(svm_bo->svms, struct kfd_process, svms); 3092 mm = p->mm; 3093 if (!mm) 3094 return; 3095 3096 mmap_read_lock(mm); 3097 spin_lock(&svm_bo->list_lock); 3098 while (!list_empty(&svm_bo->range_list)) { 3099 struct svm_range *prange = 3100 list_first_entry(&svm_bo->range_list, 3101 struct svm_range, svm_bo_list); 3102 int retries = 3; 3103 3104 list_del_init(&prange->svm_bo_list); 3105 spin_unlock(&svm_bo->list_lock); 3106 3107 pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms, 3108 prange->start, prange->last); 3109 3110 mutex_lock(&prange->migrate_mutex); 3111 do { 3112 svm_migrate_vram_to_ram(prange, 3113 svm_bo->eviction_fence->mm); 3114 } while (prange->actual_loc && --retries); 3115 WARN(prange->actual_loc, "Migration failed during eviction"); 3116 3117 mutex_lock(&prange->lock); 3118 prange->svm_bo = NULL; 3119 mutex_unlock(&prange->lock); 3120 3121 mutex_unlock(&prange->migrate_mutex); 3122 3123 spin_lock(&svm_bo->list_lock); 3124 } 3125 spin_unlock(&svm_bo->list_lock); 3126 mmap_read_unlock(mm); 3127 3128 dma_fence_signal(&svm_bo->eviction_fence->base); 3129 /* This is the last reference to svm_bo, after svm_range_vram_node_free 3130 * has been called in svm_migrate_vram_to_ram 3131 */ 3132 WARN_ONCE(kref_read(&svm_bo->kref) != 1, "This was not the last reference\n"); 3133 svm_range_bo_unref(svm_bo); 3134 } 3135 3136 static int 3137 svm_range_set_attr(struct kfd_process *p, uint64_t start, uint64_t size, 3138 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs) 3139 { 3140 struct amdkfd_process_info *process_info = p->kgd_process_info; 3141 struct mm_struct *mm = current->mm; 3142 struct list_head update_list; 3143 struct list_head insert_list; 3144 struct list_head remove_list; 3145 struct svm_range_list *svms; 3146 struct svm_range *prange; 3147 struct svm_range *next; 3148 int r = 0; 3149 3150 pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] pages 0x%llx\n", 3151 p->pasid, &p->svms, start, start + size - 1, size); 3152 3153 
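/* High-level flow below: validate the attributes, take process_info->lock,
 * the mmap write lock and svms->lock, apply the insert/update/remove lists
 * computed by svm_range_add() as a transaction, downgrade to the mmap read
 * lock, then trigger migrations and update GPU mappings for each range on
 * the update list.
 */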
r = svm_range_check_attr(p, nattr, attrs); 3154 if (r) 3155 return r; 3156 3157 svms = &p->svms; 3158 3159 mutex_lock(&process_info->lock); 3160 3161 svm_range_list_lock_and_flush_work(svms, mm); 3162 3163 r = svm_range_is_valid(p, start, size); 3164 if (r) { 3165 pr_debug("invalid range r=%d\n", r); 3166 mmap_write_unlock(mm); 3167 goto out; 3168 } 3169 3170 mutex_lock(&svms->lock); 3171 3172 /* Add new range and split existing ranges as needed */ 3173 r = svm_range_add(p, start, size, nattr, attrs, &update_list, 3174 &insert_list, &remove_list); 3175 if (r) { 3176 mutex_unlock(&svms->lock); 3177 mmap_write_unlock(mm); 3178 goto out; 3179 } 3180 /* Apply changes as a transaction */ 3181 list_for_each_entry_safe(prange, next, &insert_list, insert_list) { 3182 svm_range_add_to_svms(prange); 3183 svm_range_add_notifier_locked(mm, prange); 3184 } 3185 list_for_each_entry(prange, &update_list, update_list) { 3186 svm_range_apply_attrs(p, prange, nattr, attrs); 3187 /* TODO: unmap ranges from GPU that lost access */ 3188 } 3189 list_for_each_entry_safe(prange, next, &remove_list, 3190 remove_list) { 3191 pr_debug("unlink old 0x%p prange 0x%p [0x%lx 0x%lx]\n", 3192 prange->svms, prange, prange->start, 3193 prange->last); 3194 svm_range_unlink(prange); 3195 svm_range_remove_notifier(prange); 3196 svm_range_free(prange); 3197 } 3198 3199 mmap_write_downgrade(mm); 3200 /* Trigger migrations and revalidate and map to GPUs as needed. If 3201 * this fails we may be left with partially completed actions. There 3202 * is no clean way of rolling back to the previous state in such a 3203 * case because the rollback wouldn't be guaranteed to work either. 3204 */ 3205 list_for_each_entry(prange, &update_list, update_list) { 3206 bool migrated; 3207 3208 mutex_lock(&prange->migrate_mutex); 3209 3210 r = svm_range_trigger_migration(mm, prange, &migrated); 3211 if (r) 3212 goto out_unlock_range; 3213 3214 if (migrated && !p->xnack_enabled) { 3215 pr_debug("restore_work will update mappings of GPUs\n"); 3216 mutex_unlock(&prange->migrate_mutex); 3217 continue; 3218 } 3219 3220 r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE, 3221 true, true); 3222 if (r) 3223 pr_debug("failed %d to map svm range\n", r); 3224 3225 out_unlock_range: 3226 mutex_unlock(&prange->migrate_mutex); 3227 if (r) 3228 break; 3229 } 3230 3231 svm_range_debug_dump(svms); 3232 3233 mutex_unlock(&svms->lock); 3234 mmap_read_unlock(mm); 3235 out: 3236 mutex_unlock(&process_info->lock); 3237 3238 pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] done, r=%d\n", p->pasid, 3239 &p->svms, start, start + size - 1, r); 3240 3241 return r; 3242 } 3243 3244 static int 3245 svm_range_get_attr(struct kfd_process *p, uint64_t start, uint64_t size, 3246 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs) 3247 { 3248 DECLARE_BITMAP(bitmap_access, MAX_GPU_INSTANCE); 3249 DECLARE_BITMAP(bitmap_aip, MAX_GPU_INSTANCE); 3250 bool get_preferred_loc = false; 3251 bool get_prefetch_loc = false; 3252 bool get_granularity = false; 3253 bool get_accessible = false; 3254 bool get_flags = false; 3255 uint64_t last = start + size - 1UL; 3256 struct mm_struct *mm = current->mm; 3257 uint8_t granularity = 0xff; 3258 struct interval_tree_node *node; 3259 struct svm_range_list *svms; 3260 struct svm_range *prange; 3261 uint32_t prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 3262 uint32_t location = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 3263 uint32_t flags_and = 0xffffffff; 3264 uint32_t flags_or = 0; 3265 int gpuidx; 3266 uint32_t i; 3267 int r = 0; 3268 3269 
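/* Attribute values are aggregated over every range overlapping [start, last]:
 * preferred and prefetch locations collapse to UNDEFINED when ranges disagree
 * or are themselves undefined, accessibility bitmaps are AND-ed together,
 * flags are AND-ed for SET_FLAGS and OR-ed (reported inverted) for CLR_FLAGS,
 * and the smallest granularity found is reported.
 */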
pr_debug("svms 0x%p [0x%llx 0x%llx] nattr 0x%x\n", &p->svms, start, 3270 start + size - 1, nattr); 3271 3272 /* Flush pending deferred work to avoid racing with deferred actions from 3273 * previous memory map changes (e.g. munmap). Concurrent memory map changes 3274 * can still race with get_attr because we don't hold the mmap lock. But that 3275 * would be a race condition in the application anyway, and undefined 3276 * behaviour is acceptable in that case. 3277 */ 3278 flush_work(&p->svms.deferred_list_work); 3279 3280 mmap_read_lock(mm); 3281 r = svm_range_is_valid(p, start, size); 3282 mmap_read_unlock(mm); 3283 if (r) { 3284 pr_debug("invalid range r=%d\n", r); 3285 return r; 3286 } 3287 3288 for (i = 0; i < nattr; i++) { 3289 switch (attrs[i].type) { 3290 case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: 3291 get_preferred_loc = true; 3292 break; 3293 case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: 3294 get_prefetch_loc = true; 3295 break; 3296 case KFD_IOCTL_SVM_ATTR_ACCESS: 3297 get_accessible = true; 3298 break; 3299 case KFD_IOCTL_SVM_ATTR_SET_FLAGS: 3300 case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: 3301 get_flags = true; 3302 break; 3303 case KFD_IOCTL_SVM_ATTR_GRANULARITY: 3304 get_granularity = true; 3305 break; 3306 case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: 3307 case KFD_IOCTL_SVM_ATTR_NO_ACCESS: 3308 fallthrough; 3309 default: 3310 pr_debug("get invalid attr type 0x%x\n", attrs[i].type); 3311 return -EINVAL; 3312 } 3313 } 3314 3315 svms = &p->svms; 3316 3317 mutex_lock(&svms->lock); 3318 3319 node = interval_tree_iter_first(&svms->objects, start, last); 3320 if (!node) { 3321 pr_debug("range attrs not found return default values\n"); 3322 svm_range_set_default_attributes(&location, &prefetch_loc, 3323 &granularity, &flags_and); 3324 flags_or = flags_and; 3325 if (p->xnack_enabled) 3326 bitmap_copy(bitmap_access, svms->bitmap_supported, 3327 MAX_GPU_INSTANCE); 3328 else 3329 bitmap_zero(bitmap_access, MAX_GPU_INSTANCE); 3330 bitmap_zero(bitmap_aip, MAX_GPU_INSTANCE); 3331 goto fill_values; 3332 } 3333 bitmap_copy(bitmap_access, svms->bitmap_supported, MAX_GPU_INSTANCE); 3334 bitmap_copy(bitmap_aip, svms->bitmap_supported, MAX_GPU_INSTANCE); 3335 3336 while (node) { 3337 struct interval_tree_node *next; 3338 3339 prange = container_of(node, struct svm_range, it_node); 3340 next = interval_tree_iter_next(node, start, last); 3341 3342 if (get_preferred_loc) { 3343 if (prange->preferred_loc == 3344 KFD_IOCTL_SVM_LOCATION_UNDEFINED || 3345 (location != KFD_IOCTL_SVM_LOCATION_UNDEFINED && 3346 location != prange->preferred_loc)) { 3347 location = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 3348 get_preferred_loc = false; 3349 } else { 3350 location = prange->preferred_loc; 3351 } 3352 } 3353 if (get_prefetch_loc) { 3354 if (prange->prefetch_loc == 3355 KFD_IOCTL_SVM_LOCATION_UNDEFINED || 3356 (prefetch_loc != KFD_IOCTL_SVM_LOCATION_UNDEFINED && 3357 prefetch_loc != prange->prefetch_loc)) { 3358 prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 3359 get_prefetch_loc = false; 3360 } else { 3361 prefetch_loc = prange->prefetch_loc; 3362 } 3363 } 3364 if (get_accessible) { 3365 bitmap_and(bitmap_access, bitmap_access, 3366 prange->bitmap_access, MAX_GPU_INSTANCE); 3367 bitmap_and(bitmap_aip, bitmap_aip, 3368 prange->bitmap_aip, MAX_GPU_INSTANCE); 3369 } 3370 if (get_flags) { 3371 flags_and &= prange->flags; 3372 flags_or |= prange->flags; 3373 } 3374 3375 if (get_granularity && prange->granularity < granularity) 3376 granularity = prange->granularity; 3377 3378 node = next; 3379 } 3380 fill_values: 3381 
mutex_unlock(&svms->lock);
3382
3383 for (i = 0; i < nattr; i++) {
3384 switch (attrs[i].type) {
3385 case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
3386 attrs[i].value = location;
3387 break;
3388 case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
3389 attrs[i].value = prefetch_loc;
3390 break;
3391 case KFD_IOCTL_SVM_ATTR_ACCESS:
3392 gpuidx = kfd_process_gpuidx_from_gpuid(p,
3393 attrs[i].value);
3394 if (gpuidx < 0) {
3395 pr_debug("invalid gpuid %x\n", attrs[i].value);
3396 return -EINVAL;
3397 }
3398 if (test_bit(gpuidx, bitmap_access))
3399 attrs[i].type = KFD_IOCTL_SVM_ATTR_ACCESS;
3400 else if (test_bit(gpuidx, bitmap_aip))
3401 attrs[i].type =
3402 KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE;
3403 else
3404 attrs[i].type = KFD_IOCTL_SVM_ATTR_NO_ACCESS;
3405 break;
3406 case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
3407 attrs[i].value = flags_and;
3408 break;
3409 case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
3410 attrs[i].value = ~flags_or;
3411 break;
3412 case KFD_IOCTL_SVM_ATTR_GRANULARITY:
3413 attrs[i].value = (uint32_t)granularity;
3414 break;
3415 }
3416 }
3417
3418 return 0;
3419 }
3420
3421 int
3422 svm_ioctl(struct kfd_process *p, enum kfd_ioctl_svm_op op, uint64_t start,
3423 uint64_t size, uint32_t nattrs, struct kfd_ioctl_svm_attribute *attrs)
3424 {
3425 int r;
3426
3427 start >>= PAGE_SHIFT;
3428 size >>= PAGE_SHIFT;
3429
3430 switch (op) {
3431 case KFD_IOCTL_SVM_OP_SET_ATTR:
3432 r = svm_range_set_attr(p, start, size, nattrs, attrs);
3433 break;
3434 case KFD_IOCTL_SVM_OP_GET_ATTR:
3435 r = svm_range_get_attr(p, start, size, nattrs, attrs);
3436 break;
3437 default:
3438 r = -EINVAL;
3439 break;
3440 }
3441
3442 return r;
3443 }
3444
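/* Illustrative sketch only, not part of this file: how svm_ioctl() is expected
 * to be reached from the AMDKFD_IOC_SVM ioctl. The handler name and the
 * kfd_ioctl_svm_args layout below are assumptions based on the uAPI in
 * include/uapi/linux/kfd_ioctl.h; start_addr and size are passed in bytes and
 * converted to page numbers inside svm_ioctl().
 *
 *	static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p,
 *				 void *data)
 *	{
 *		struct kfd_ioctl_svm_args *args = data;
 *
 *		return svm_ioctl(p, args->op, args->start_addr, args->size,
 *				 args->nattr, args->attrs);
 *	}
 */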