/*
 * Copyright 2019 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * based on nouveau_prime.c
 *
 * Authors: Alex Deucher
 */

/**
 * DOC: PRIME Buffer Sharing
 *
 * The following callback implementations are used for :ref:`sharing GEM buffer
 * objects between different devices via PRIME <prime_buffer_sharing>`.
32 */ 33 34 #include "amdgpu.h" 35 #include "amdgpu_display.h" 36 #include "amdgpu_gem.h" 37 #include "amdgpu_dma_buf.h" 38 #include "amdgpu_xgmi.h" 39 #include "amdgpu_vm.h" 40 #include "amdgpu_ttm.h" 41 #include <drm/amdgpu_drm.h> 42 #include <drm/ttm/ttm_tt.h> 43 #include <linux/dma-buf.h> 44 #include <linux/dma-fence-array.h> 45 #include <linux/pci-p2pdma.h> 46 47 static const struct dma_buf_attach_ops amdgpu_dma_buf_attach_ops; 48 49 /** 50 * dma_buf_attach_adev - Helper to get adev of an attachment 51 * 52 * @attach: attachment 53 * 54 * Returns: 55 * A struct amdgpu_device * if the attaching device is an amdgpu device or 56 * partition, NULL otherwise. 57 */ 58 static struct amdgpu_device *dma_buf_attach_adev(struct dma_buf_attachment *attach) 59 { 60 if (attach->importer_ops == &amdgpu_dma_buf_attach_ops) { 61 struct drm_gem_object *obj = attach->importer_priv; 62 struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj); 63 64 return amdgpu_ttm_adev(bo->tbo.bdev); 65 } 66 67 return NULL; 68 } 69 70 /** 71 * amdgpu_dma_buf_attach - &dma_buf_ops.attach implementation 72 * 73 * @dmabuf: DMA-buf where we attach to 74 * @attach: attachment to add 75 * 76 * Add the attachment as user to the exported DMA-buf. 77 */ 78 static int amdgpu_dma_buf_attach(struct dma_buf *dmabuf, 79 struct dma_buf_attachment *attach) 80 { 81 struct amdgpu_device *attach_adev = dma_buf_attach_adev(attach); 82 struct drm_gem_object *obj = dmabuf->priv; 83 struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj); 84 struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); 85 int r; 86 87 /* 88 * Disable peer-to-peer access for DCC-enabled VRAM surfaces on GFX12+. 89 * Such buffers cannot be safely accessed over P2P due to device-local 90 * compression metadata. Fallback to system-memory path instead. 
91 * Device supports GFX12 (GC 12.x or newer) 92 * BO was created with the AMDGPU_GEM_CREATE_GFX12_DCC flag 93 * 94 */ 95 if (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(12, 0, 0) && 96 bo->flags & AMDGPU_GEM_CREATE_GFX12_DCC) 97 attach->peer2peer = false; 98 99 if (!amdgpu_dmabuf_is_xgmi_accessible(attach_adev, bo) && 100 pci_p2pdma_distance(adev->pdev, attach->dev, false) < 0) 101 attach->peer2peer = false; 102 103 r = dma_resv_lock(bo->tbo.base.resv, NULL); 104 if (r) 105 return r; 106 107 amdgpu_vm_bo_update_shared(bo); 108 109 dma_resv_unlock(bo->tbo.base.resv); 110 111 return 0; 112 } 113 114 /** 115 * amdgpu_dma_buf_pin - &dma_buf_ops.pin implementation 116 * 117 * @attach: attachment to pin down 118 * 119 * Pin the BO which is backing the DMA-buf so that it can't move any more. 120 */ 121 static int amdgpu_dma_buf_pin(struct dma_buf_attachment *attach) 122 { 123 struct dma_buf *dmabuf = attach->dmabuf; 124 struct amdgpu_bo *bo = gem_to_amdgpu_bo(dmabuf->priv); 125 u32 domains = bo->allowed_domains; 126 127 dma_resv_assert_held(dmabuf->resv); 128 129 /* Try pinning into VRAM to allow P2P with RDMA NICs without ODP 130 * support if all attachments can do P2P. If any attachment can't do 131 * P2P just pin into GTT instead. 132 * 133 * To avoid with conflicting pinnings between GPUs and RDMA when move 134 * notifiers are disabled, only allow pinning in VRAM when move 135 * notiers are enabled. 136 */ 137 list_for_each_entry(attach, &dmabuf->attachments, node) 138 if (!attach->peer2peer) 139 domains &= ~AMDGPU_GEM_DOMAIN_VRAM; 140 141 if (domains & AMDGPU_GEM_DOMAIN_VRAM) 142 bo->flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; 143 144 if (WARN_ON(!domains)) 145 return -EINVAL; 146 147 return amdgpu_bo_pin(bo, domains); 148 } 149 150 /** 151 * amdgpu_dma_buf_unpin - &dma_buf_ops.unpin implementation 152 * 153 * @attach: attachment to unpin 154 * 155 * Unpin a previously pinned BO to make it movable again. 
156 */ 157 static void amdgpu_dma_buf_unpin(struct dma_buf_attachment *attach) 158 { 159 struct drm_gem_object *obj = attach->dmabuf->priv; 160 struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj); 161 162 amdgpu_bo_unpin(bo); 163 } 164 165 /** 166 * amdgpu_dma_buf_map - &dma_buf_ops.map_dma_buf implementation 167 * @attach: DMA-buf attachment 168 * @dir: DMA direction 169 * 170 * Makes sure that the shared DMA buffer can be accessed by the target device. 171 * For now, simply pins it to the GTT domain, where it should be accessible by 172 * all DMA devices. 173 * 174 * Returns: 175 * sg_table filled with the DMA addresses to use or ERR_PRT with negative error 176 * code. 177 */ 178 static struct sg_table *amdgpu_dma_buf_map(struct dma_buf_attachment *attach, 179 enum dma_data_direction dir) 180 { 181 struct dma_buf *dma_buf = attach->dmabuf; 182 struct drm_gem_object *obj = dma_buf->priv; 183 struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj); 184 struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); 185 struct sg_table *sgt; 186 long r; 187 188 if (!bo->tbo.pin_count) { 189 /* move buffer into GTT or VRAM */ 190 struct ttm_operation_ctx ctx = { false, false }; 191 unsigned int domains = AMDGPU_GEM_DOMAIN_GTT; 192 193 if (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM && 194 attach->peer2peer) { 195 bo->flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; 196 domains |= AMDGPU_GEM_DOMAIN_VRAM; 197 } 198 amdgpu_bo_placement_from_domain(bo, domains); 199 r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); 200 if (r) 201 return ERR_PTR(r); 202 } 203 204 switch (bo->tbo.resource->mem_type) { 205 case TTM_PL_TT: 206 sgt = drm_prime_pages_to_sg(obj->dev, 207 bo->tbo.ttm->pages, 208 bo->tbo.ttm->num_pages); 209 if (IS_ERR(sgt)) 210 return sgt; 211 212 if (dma_map_sgtable(attach->dev, sgt, dir, 213 DMA_ATTR_SKIP_CPU_SYNC)) 214 goto error_free; 215 break; 216 217 case TTM_PL_VRAM: 218 /* XGMI-accessible memory should never be DMA-mapped */ 219 if 
(WARN_ON(amdgpu_dmabuf_is_xgmi_accessible( 220 dma_buf_attach_adev(attach), bo))) 221 return ERR_PTR(-EINVAL); 222 223 r = amdgpu_vram_mgr_alloc_sgt(adev, bo->tbo.resource, 0, 224 bo->tbo.base.size, attach->dev, 225 dir, &sgt); 226 if (r) 227 return ERR_PTR(r); 228 break; 229 230 case AMDGPU_PL_MMIO_REMAP: 231 r = amdgpu_ttm_mmio_remap_alloc_sgt(adev, bo->tbo.resource, 232 attach->dev, dir, &sgt); 233 if (r) 234 return ERR_PTR(r); 235 break; 236 237 default: 238 return ERR_PTR(-EINVAL); 239 } 240 241 return sgt; 242 243 error_free: 244 sg_free_table(sgt); 245 kfree(sgt); 246 return ERR_PTR(-EBUSY); 247 } 248 249 /** 250 * amdgpu_dma_buf_unmap - &dma_buf_ops.unmap_dma_buf implementation 251 * @attach: DMA-buf attachment 252 * @sgt: sg_table to unmap 253 * @dir: DMA direction 254 * 255 * This is called when a shared DMA buffer no longer needs to be accessible by 256 * another device. For now, simply unpins the buffer from GTT. 257 */ 258 static void amdgpu_dma_buf_unmap(struct dma_buf_attachment *attach, 259 struct sg_table *sgt, 260 enum dma_data_direction dir) 261 { 262 struct drm_gem_object *obj = attach->dmabuf->priv; 263 struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj); 264 265 if (bo->tbo.resource && 266 bo->tbo.resource->mem_type == AMDGPU_PL_MMIO_REMAP) { 267 amdgpu_ttm_mmio_remap_free_sgt(attach->dev, dir, sgt); 268 return; 269 } 270 271 if (sg_page(sgt->sgl)) { 272 dma_unmap_sgtable(attach->dev, sgt, dir, 0); 273 sg_free_table(sgt); 274 kfree(sgt); 275 } else { 276 amdgpu_vram_mgr_free_sgt(attach->dev, dir, sgt); 277 } 278 } 279 280 /** 281 * amdgpu_dma_buf_begin_cpu_access - &dma_buf_ops.begin_cpu_access implementation 282 * @dma_buf: Shared DMA buffer 283 * @direction: Direction of DMA transfer 284 * 285 * This is called before CPU access to the shared DMA buffer's memory. If it's 286 * a read access, the buffer is moved to the GTT domain if possible, for optimal 287 * CPU read performance. 
288 * 289 * Returns: 290 * 0 on success or a negative error code on failure. 291 */ 292 static int amdgpu_dma_buf_begin_cpu_access(struct dma_buf *dma_buf, 293 enum dma_data_direction direction) 294 { 295 struct amdgpu_bo *bo = gem_to_amdgpu_bo(dma_buf->priv); 296 struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); 297 struct ttm_operation_ctx ctx = { true, false }; 298 u32 domain = amdgpu_display_supported_domains(adev, bo->flags); 299 int ret; 300 bool reads = (direction == DMA_BIDIRECTIONAL || 301 direction == DMA_FROM_DEVICE); 302 303 if (!reads || !(domain & AMDGPU_GEM_DOMAIN_GTT)) 304 return 0; 305 306 /* move to gtt */ 307 ret = amdgpu_bo_reserve(bo, false); 308 if (unlikely(ret != 0)) 309 return ret; 310 311 if (!bo->tbo.pin_count && 312 (bo->allowed_domains & AMDGPU_GEM_DOMAIN_GTT)) { 313 amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_GTT); 314 ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); 315 } 316 317 amdgpu_bo_unreserve(bo); 318 return ret; 319 } 320 321 static int amdgpu_dma_buf_vmap(struct dma_buf *dma_buf, struct iosys_map *map) 322 { 323 struct drm_gem_object *obj = dma_buf->priv; 324 struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj); 325 int ret; 326 327 /* 328 * Pin to keep buffer in place while it's vmap'ed. The actual 329 * domain is not that important as long as it's mapable. Using 330 * GTT and VRAM should be compatible with most use cases. 
331 */ 332 ret = amdgpu_bo_pin(bo, AMDGPU_GEM_DOMAIN_GTT | AMDGPU_GEM_DOMAIN_VRAM); 333 if (ret) 334 return ret; 335 ret = drm_gem_dmabuf_vmap(dma_buf, map); 336 if (ret) 337 amdgpu_bo_unpin(bo); 338 339 return ret; 340 } 341 342 static void amdgpu_dma_buf_vunmap(struct dma_buf *dma_buf, struct iosys_map *map) 343 { 344 struct drm_gem_object *obj = dma_buf->priv; 345 struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj); 346 347 drm_gem_dmabuf_vunmap(dma_buf, map); 348 amdgpu_bo_unpin(bo); 349 } 350 351 const struct dma_buf_ops amdgpu_dmabuf_ops = { 352 .attach = amdgpu_dma_buf_attach, 353 .pin = amdgpu_dma_buf_pin, 354 .unpin = amdgpu_dma_buf_unpin, 355 .map_dma_buf = amdgpu_dma_buf_map, 356 .unmap_dma_buf = amdgpu_dma_buf_unmap, 357 .release = drm_gem_dmabuf_release, 358 .begin_cpu_access = amdgpu_dma_buf_begin_cpu_access, 359 .mmap = drm_gem_dmabuf_mmap, 360 .vmap = amdgpu_dma_buf_vmap, 361 .vunmap = amdgpu_dma_buf_vunmap, 362 }; 363 364 /** 365 * amdgpu_gem_prime_export - &drm_driver.gem_prime_export implementation 366 * @gobj: GEM BO 367 * @flags: Flags such as DRM_CLOEXEC and DRM_RDWR. 368 * 369 * The main work is done by the &drm_gem_prime_export helper. 370 * 371 * Returns: 372 * Shared DMA buffer representing the GEM BO from the given device. 
373 */ 374 struct dma_buf *amdgpu_gem_prime_export(struct drm_gem_object *gobj, 375 int flags) 376 { 377 struct amdgpu_bo *bo = gem_to_amdgpu_bo(gobj); 378 struct dma_buf *buf; 379 struct ttm_operation_ctx ctx = { 380 .interruptible = true, 381 .no_wait_gpu = true, 382 /* We opt to avoid OOM on system pages allocations */ 383 .gfp_retry_mayfail = true, 384 .allow_res_evict = false, 385 }; 386 int ret; 387 388 if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm) || 389 bo->flags & AMDGPU_GEM_CREATE_VM_ALWAYS_VALID) 390 return ERR_PTR(-EPERM); 391 392 ret = ttm_bo_setup_export(&bo->tbo, &ctx); 393 if (ret) 394 return ERR_PTR(ret); 395 396 buf = drm_gem_prime_export(gobj, flags); 397 if (!IS_ERR(buf)) 398 buf->ops = &amdgpu_dmabuf_ops; 399 400 return buf; 401 } 402 403 /** 404 * amdgpu_dma_buf_create_obj - create BO for DMA-buf import 405 * 406 * @dev: DRM device 407 * @dma_buf: DMA-buf 408 * 409 * Creates an empty SG BO for DMA-buf import. 410 * 411 * Returns: 412 * A new GEM BO of the given DRM device, representing the memory 413 * described by the given DMA-buf attachment and scatter/gather table. 
414 */ 415 static struct drm_gem_object * 416 amdgpu_dma_buf_create_obj(struct drm_device *dev, struct dma_buf *dma_buf) 417 { 418 struct dma_resv *resv = dma_buf->resv; 419 struct amdgpu_device *adev = drm_to_adev(dev); 420 struct drm_gem_object *gobj; 421 struct amdgpu_bo *bo; 422 uint64_t flags = 0; 423 int ret; 424 425 dma_resv_lock(resv, NULL); 426 427 if (dma_buf->ops == &amdgpu_dmabuf_ops) { 428 struct amdgpu_bo *other = gem_to_amdgpu_bo(dma_buf->priv); 429 430 flags |= other->flags & (AMDGPU_GEM_CREATE_CPU_GTT_USWC | 431 AMDGPU_GEM_CREATE_COHERENT | 432 AMDGPU_GEM_CREATE_EXT_COHERENT | 433 AMDGPU_GEM_CREATE_UNCACHED); 434 } 435 436 ret = amdgpu_gem_object_create(adev, dma_buf->size, PAGE_SIZE, 437 AMDGPU_GEM_DOMAIN_CPU, flags, 438 ttm_bo_type_sg, resv, &gobj, 0); 439 if (ret) 440 goto error; 441 442 bo = gem_to_amdgpu_bo(gobj); 443 bo->allowed_domains = AMDGPU_GEM_DOMAIN_GTT; 444 bo->preferred_domains = AMDGPU_GEM_DOMAIN_GTT; 445 446 dma_resv_unlock(resv); 447 return gobj; 448 449 error: 450 dma_resv_unlock(resv); 451 return ERR_PTR(ret); 452 } 453 454 /** 455 * amdgpu_dma_buf_move_notify - &attach.invalidate_mappings implementation 456 * 457 * @attach: the DMA-buf attachment 458 * 459 * Invalidate the DMA-buf attachment, making sure that the we re-create the 460 * mapping before the next use. 461 */ 462 static void 463 amdgpu_dma_buf_move_notify(struct dma_buf_attachment *attach) 464 { 465 struct drm_gem_object *obj = attach->importer_priv; 466 struct ww_acquire_ctx *ticket = dma_resv_locking_ctx(obj->resv); 467 struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj); 468 struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); 469 struct ttm_operation_ctx ctx = { false, false }; 470 struct ttm_placement placement = {}; 471 struct amdgpu_vm_bo_base *bo_base; 472 int r; 473 474 /* FIXME: This should be after the "if", but needs a fix to make sure 475 * DMABuf imports are initialized in the right VM list. 
476 */ 477 amdgpu_vm_bo_invalidate(bo, false); 478 if (!bo->tbo.resource || bo->tbo.resource->mem_type == TTM_PL_SYSTEM) 479 return; 480 481 r = ttm_bo_validate(&bo->tbo, &placement, &ctx); 482 if (r) { 483 DRM_ERROR("Failed to invalidate DMA-buf import (%d))\n", r); 484 return; 485 } 486 487 for (bo_base = bo->vm_bo; bo_base; bo_base = bo_base->next) { 488 struct amdgpu_vm *vm = bo_base->vm; 489 struct dma_resv *resv = vm->root.bo->tbo.base.resv; 490 491 if (ticket) { 492 /* When we get an error here it means that somebody 493 * else is holding the VM lock and updating page tables 494 * So we can just continue here. 495 */ 496 r = dma_resv_lock(resv, ticket); 497 if (r) 498 continue; 499 500 } else { 501 /* TODO: This is more problematic and we actually need 502 * to allow page tables updates without holding the 503 * lock. 504 */ 505 if (!dma_resv_trylock(resv)) 506 continue; 507 } 508 509 /* Reserve fences for two SDMA page table updates */ 510 r = dma_resv_reserve_fences(resv, 2); 511 if (!r) 512 r = amdgpu_vm_clear_freed(adev, vm, NULL); 513 514 /* Don't pass 'ticket' to amdgpu_vm_handle_moved: we want the clear=true 515 * path to be used otherwise we might update the PT of another process 516 * while it's using the BO. 517 * With clear=true, amdgpu_vm_bo_update will sync to command submission 518 * from the same VM. 519 */ 520 if (!r) 521 r = amdgpu_vm_handle_moved(adev, vm, NULL); 522 523 if (r && r != -EBUSY) 524 DRM_ERROR("Failed to invalidate VM page tables (%d))\n", 525 r); 526 527 dma_resv_unlock(resv); 528 } 529 } 530 531 static const struct dma_buf_attach_ops amdgpu_dma_buf_attach_ops = { 532 .allow_peer2peer = true, 533 .invalidate_mappings = amdgpu_dma_buf_move_notify 534 }; 535 536 /** 537 * amdgpu_gem_prime_import - &drm_driver.gem_prime_import implementation 538 * @dev: DRM device 539 * @dma_buf: Shared DMA buffer 540 * 541 * Import a dma_buf into a the driver and potentially create a new GEM object. 
542 * 543 * Returns: 544 * GEM BO representing the shared DMA buffer for the given device. 545 */ 546 struct drm_gem_object *amdgpu_gem_prime_import(struct drm_device *dev, 547 struct dma_buf *dma_buf) 548 { 549 struct dma_buf_attachment *attach; 550 struct drm_gem_object *obj; 551 552 if (dma_buf->ops == &amdgpu_dmabuf_ops) { 553 obj = dma_buf->priv; 554 if (obj->dev == dev) { 555 /* 556 * Importing dmabuf exported from out own gem increases 557 * refcount on gem itself instead of f_count of dmabuf. 558 */ 559 drm_gem_object_get(obj); 560 return obj; 561 } 562 } 563 564 obj = amdgpu_dma_buf_create_obj(dev, dma_buf); 565 if (IS_ERR(obj)) 566 return obj; 567 568 attach = dma_buf_dynamic_attach(dma_buf, dev->dev, 569 &amdgpu_dma_buf_attach_ops, obj); 570 if (IS_ERR(attach)) { 571 drm_gem_object_put(obj); 572 return ERR_CAST(attach); 573 } 574 575 get_dma_buf(dma_buf); 576 obj->import_attach = attach; 577 return obj; 578 } 579 580 /** 581 * amdgpu_dmabuf_is_xgmi_accessible - Check if xgmi available for P2P transfer 582 * 583 * @adev: amdgpu_device pointer of the importer 584 * @bo: amdgpu buffer object 585 * 586 * Returns: 587 * True if dmabuf accessible over xgmi, false otherwise. 588 */ 589 bool amdgpu_dmabuf_is_xgmi_accessible(struct amdgpu_device *adev, 590 struct amdgpu_bo *bo) 591 { 592 struct drm_gem_object *obj = &bo->tbo.base; 593 struct drm_gem_object *gobj; 594 595 if (!adev) 596 return false; 597 598 if (drm_gem_is_imported(obj)) { 599 struct dma_buf *dma_buf = obj->import_attach->dmabuf; 600 601 if (dma_buf->ops != &amdgpu_dmabuf_ops) 602 /* No XGMI with non AMD GPUs */ 603 return false; 604 605 gobj = dma_buf->priv; 606 bo = gem_to_amdgpu_bo(gobj); 607 } 608 609 if (amdgpu_xgmi_same_hive(adev, amdgpu_ttm_adev(bo->tbo.bdev)) && 610 (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM)) 611 return true; 612 613 return false; 614 } 615