/*
 * Copyright 2019 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * based on nouveau_prime.c
 *
 * Authors: Alex Deucher
 */

/**
 * DOC: PRIME Buffer Sharing
 *
 * The following callback implementations are used for :ref:`sharing GEM buffer
 * objects between different devices via PRIME <prime_buffer_sharing>`.
 */

#include "amdgpu.h"
#include "amdgpu_display.h"
#include "amdgpu_gem.h"
#include "amdgpu_dma_buf.h"
#include "amdgpu_xgmi.h"
#include "amdgpu_vm.h"
#include "amdgpu_ttm.h"
#include <drm/amdgpu_drm.h>
#include <drm/ttm/ttm_tt.h>
#include <linux/dma-buf.h>
#include <linux/dma-fence-array.h>
#include <linux/pci-p2pdma.h>

static const struct dma_buf_attach_ops amdgpu_dma_buf_attach_ops;

/**
 * dma_buf_attach_adev - Helper to get adev of an attachment
 *
 * @attach: attachment
 *
 * Returns:
 * A struct amdgpu_device * if the attaching device is an amdgpu device or
 * partition, NULL otherwise.
 */
static struct amdgpu_device *dma_buf_attach_adev(struct dma_buf_attachment *attach)
{
        if (attach->importer_ops == &amdgpu_dma_buf_attach_ops) {
                struct drm_gem_object *obj = attach->importer_priv;
                struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj);

                return amdgpu_ttm_adev(bo->tbo.bdev);
        }

        return NULL;
}

/**
 * amdgpu_dma_buf_attach - &dma_buf_ops.attach implementation
 *
 * @dmabuf: DMA-buf where we attach to
 * @attach: attachment to add
 *
 * Add the attachment as a user to the exported DMA-buf.
 */
static int amdgpu_dma_buf_attach(struct dma_buf *dmabuf,
                                 struct dma_buf_attachment *attach)
{
        struct amdgpu_device *attach_adev = dma_buf_attach_adev(attach);
        struct drm_gem_object *obj = dmabuf->priv;
        struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj);
        struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
        int r;

        /*
         * Disable peer-to-peer access for DCC-enabled VRAM surfaces on GFX12+.
         * Such buffers cannot be safely accessed over P2P due to device-local
         * compression metadata. Fall back to the system-memory path instead
         * when:
         * - the device supports GFX12 (GC 12.x or newer), and
         * - the BO was created with the AMDGPU_GEM_CREATE_GFX12_DCC flag.
         */
        if (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(12, 0, 0) &&
            bo->flags & AMDGPU_GEM_CREATE_GFX12_DCC)
                attach->peer2peer = false;

        if (!amdgpu_dmabuf_is_xgmi_accessible(attach_adev, bo) &&
            pci_p2pdma_distance(adev->pdev, attach->dev, false) < 0)
                attach->peer2peer = false;

        r = dma_resv_lock(bo->tbo.base.resv, NULL);
        if (r)
                return r;

        amdgpu_vm_bo_update_shared(bo);

        dma_resv_unlock(bo->tbo.base.resv);

        return 0;
}
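
/*
 * Example (illustrative sketch, importer side): amdgpu_dma_buf_attach() runs
 * when another device attaches to a BO exported by this driver, for instance
 * through the dynamic-attachment call used by amdgpu_gem_prime_import()
 * further below. "importer_dev" and "gobj" are placeholders for the
 * importer's device and GEM object:
 *
 *      attach = dma_buf_dynamic_attach(dma_buf, importer_dev,
 *                                      &amdgpu_dma_buf_attach_ops, gobj);
 *      if (IS_ERR(attach))
 *              return ERR_CAST(attach);
 *
 * The peer2peer flag computed above is later consulted by the pin and map
 * callbacks to decide whether the buffer may stay in VRAM.
 */
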
/**
 * amdgpu_dma_buf_pin - &dma_buf_ops.pin implementation
 *
 * @attach: attachment to pin down
 *
 * Pin the BO which is backing the DMA-buf so that it can't move any more.
 */
static int amdgpu_dma_buf_pin(struct dma_buf_attachment *attach)
{
        struct dma_buf *dmabuf = attach->dmabuf;
        struct amdgpu_bo *bo = gem_to_amdgpu_bo(dmabuf->priv);
        u32 domains = bo->allowed_domains;

        dma_resv_assert_held(dmabuf->resv);

        /* Try pinning into VRAM to allow P2P with RDMA NICs without ODP
         * support if all attachments can do P2P. If any attachment can't do
         * P2P just pin into GTT instead.
         *
         * To avoid conflicting pinnings between GPUs and RDMA when move
         * notifiers are disabled, only allow pinning in VRAM when move
         * notifiers are enabled.
         */
        if (!IS_ENABLED(CONFIG_DMABUF_MOVE_NOTIFY)) {
                domains &= ~AMDGPU_GEM_DOMAIN_VRAM;
        } else {
                list_for_each_entry(attach, &dmabuf->attachments, node)
                        if (!attach->peer2peer)
                                domains &= ~AMDGPU_GEM_DOMAIN_VRAM;
        }

        if (domains & AMDGPU_GEM_DOMAIN_VRAM)
                bo->flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;

        if (WARN_ON(!domains))
                return -EINVAL;

        return amdgpu_bo_pin(bo, domains);
}

/**
 * amdgpu_dma_buf_unpin - &dma_buf_ops.unpin implementation
 *
 * @attach: attachment to unpin
 *
 * Unpin a previously pinned BO to make it movable again.
 */
static void amdgpu_dma_buf_unpin(struct dma_buf_attachment *attach)
{
        struct drm_gem_object *obj = attach->dmabuf->priv;
        struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj);

        amdgpu_bo_unpin(bo);
}
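
/*
 * Example (illustrative sketch): the pin/unpin callbacks are reached through
 * the dma-buf core rather than called directly. An importer that cannot
 * handle invalidations (e.g. an RDMA driver without ODP, as mentioned above)
 * would pin the buffer while holding the reservation lock, roughly:
 *
 *      dma_resv_lock(dmabuf->resv, NULL);
 *      ret = dma_buf_pin(attach);      (ends up in amdgpu_dma_buf_pin())
 *      dma_resv_unlock(dmabuf->resv);
 *      ...
 *      dma_resv_lock(dmabuf->resv, NULL);
 *      dma_buf_unpin(attach);          (ends up in amdgpu_dma_buf_unpin())
 *      dma_resv_unlock(dmabuf->resv);
 *
 * "dmabuf", "attach" and "ret" are placeholders for the caller's variables.
 */
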
/**
 * amdgpu_dma_buf_map - &dma_buf_ops.map_dma_buf implementation
 * @attach: DMA-buf attachment
 * @dir: DMA direction
 *
 * Makes sure that the shared DMA buffer can be accessed by the target device.
 * Moves the buffer into GTT, or into VRAM if peer-to-peer access is possible,
 * and creates an sg_table with the DMA addresses for the attaching device.
 *
 * Returns:
 * sg_table filled with the DMA addresses to use or ERR_PTR with a negative
 * error code.
 */
static struct sg_table *amdgpu_dma_buf_map(struct dma_buf_attachment *attach,
                                           enum dma_data_direction dir)
{
        struct dma_buf *dma_buf = attach->dmabuf;
        struct drm_gem_object *obj = dma_buf->priv;
        struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj);
        struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
        struct sg_table *sgt;
        long r;

        if (!bo->tbo.pin_count) {
                /* move buffer into GTT or VRAM */
                struct ttm_operation_ctx ctx = { false, false };
                unsigned int domains = AMDGPU_GEM_DOMAIN_GTT;

                if (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM &&
                    attach->peer2peer) {
                        bo->flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
                        domains |= AMDGPU_GEM_DOMAIN_VRAM;
                }
                amdgpu_bo_placement_from_domain(bo, domains);
                r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
                if (r)
                        return ERR_PTR(r);
        }

        switch (bo->tbo.resource->mem_type) {
        case TTM_PL_TT:
                sgt = drm_prime_pages_to_sg(obj->dev,
                                            bo->tbo.ttm->pages,
                                            bo->tbo.ttm->num_pages);
                if (IS_ERR(sgt))
                        return sgt;

                if (dma_map_sgtable(attach->dev, sgt, dir,
                                    DMA_ATTR_SKIP_CPU_SYNC))
                        goto error_free;
                break;

        case TTM_PL_VRAM:
                /* XGMI-accessible memory should never be DMA-mapped */
                if (WARN_ON(amdgpu_dmabuf_is_xgmi_accessible(
                                dma_buf_attach_adev(attach), bo)))
                        return ERR_PTR(-EINVAL);

                r = amdgpu_vram_mgr_alloc_sgt(adev, bo->tbo.resource, 0,
                                              bo->tbo.base.size, attach->dev,
                                              dir, &sgt);
                if (r)
                        return ERR_PTR(r);
                break;

        case AMDGPU_PL_MMIO_REMAP:
                r = amdgpu_ttm_mmio_remap_alloc_sgt(adev, bo->tbo.resource,
                                                    attach->dev, dir, &sgt);
                if (r)
                        return ERR_PTR(r);
                break;

        default:
                return ERR_PTR(-EINVAL);
        }

        return sgt;

error_free:
        sg_free_table(sgt);
        kfree(sgt);
        return ERR_PTR(-EBUSY);
}

/**
 * amdgpu_dma_buf_unmap - &dma_buf_ops.unmap_dma_buf implementation
 * @attach: DMA-buf attachment
 * @sgt: sg_table to unmap
 * @dir: DMA direction
 *
 * This is called when a shared DMA buffer no longer needs to be accessible by
 * another device. Tears down the DMA mapping and frees the sg_table created by
 * amdgpu_dma_buf_map().
 */
static void amdgpu_dma_buf_unmap(struct dma_buf_attachment *attach,
                                 struct sg_table *sgt,
                                 enum dma_data_direction dir)
{
        struct drm_gem_object *obj = attach->dmabuf->priv;
        struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj);

        if (bo->tbo.resource &&
            bo->tbo.resource->mem_type == AMDGPU_PL_MMIO_REMAP) {
                amdgpu_ttm_mmio_remap_free_sgt(attach->dev, dir, sgt);
                return;
        }

        if (sg_page(sgt->sgl)) {
                dma_unmap_sgtable(attach->dev, sgt, dir, 0);
                sg_free_table(sgt);
                kfree(sgt);
        } else {
                amdgpu_vram_mgr_free_sgt(attach->dev, dir, sgt);
        }
}
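
/*
 * Example (illustrative sketch, importer side): the map/unmap callbacks are
 * driven by the dma-buf core while the reservation lock is held. A simplified
 * one-shot access could look like this, where "dma_buf", "attach" and "sgt"
 * are the caller's variables:
 *
 *      dma_resv_lock(dma_buf->resv, NULL);
 *      sgt = dma_buf_map_attachment(attach, DMA_BIDIRECTIONAL);
 *      if (IS_ERR(sgt)) {
 *              dma_resv_unlock(dma_buf->resv);
 *              return PTR_ERR(sgt);
 *      }
 *      (program the device with the DMA addresses in sgt)
 *      dma_buf_unmap_attachment(attach, sgt, DMA_BIDIRECTIONAL);
 *      dma_resv_unlock(dma_buf->resv);
 *
 * A real dynamic importer would normally cache the mapping and tear it down
 * from its move_notify handler instead.
 */
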
/**
 * amdgpu_dma_buf_begin_cpu_access - &dma_buf_ops.begin_cpu_access implementation
 * @dma_buf: Shared DMA buffer
 * @direction: Direction of DMA transfer
 *
 * This is called before CPU access to the shared DMA buffer's memory. If it's
 * a read access, the buffer is moved to the GTT domain if possible, for optimal
 * CPU read performance.
 *
 * Returns:
 * 0 on success or a negative error code on failure.
 */
static int amdgpu_dma_buf_begin_cpu_access(struct dma_buf *dma_buf,
                                           enum dma_data_direction direction)
{
        struct amdgpu_bo *bo = gem_to_amdgpu_bo(dma_buf->priv);
        struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
        struct ttm_operation_ctx ctx = { true, false };
        u32 domain = amdgpu_display_supported_domains(adev, bo->flags);
        int ret;
        bool reads = (direction == DMA_BIDIRECTIONAL ||
                      direction == DMA_FROM_DEVICE);

        if (!reads || !(domain & AMDGPU_GEM_DOMAIN_GTT))
                return 0;

        /* move to gtt */
        ret = amdgpu_bo_reserve(bo, false);
        if (unlikely(ret != 0))
                return ret;

        if (!bo->tbo.pin_count &&
            (bo->allowed_domains & AMDGPU_GEM_DOMAIN_GTT)) {
                amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_GTT);
                ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
        }

        amdgpu_bo_unreserve(bo);
        return ret;
}

static int amdgpu_dma_buf_vmap(struct dma_buf *dma_buf, struct iosys_map *map)
{
        struct drm_gem_object *obj = dma_buf->priv;
        struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj);
        int ret;

        /*
         * Pin to keep the buffer in place while it's vmap'ed. The actual
         * domain is not that important as long as it's mappable. Using
         * GTT and VRAM should be compatible with most use cases.
         */
        ret = amdgpu_bo_pin(bo, AMDGPU_GEM_DOMAIN_GTT | AMDGPU_GEM_DOMAIN_VRAM);
        if (ret)
                return ret;
        ret = drm_gem_dmabuf_vmap(dma_buf, map);
        if (ret)
                amdgpu_bo_unpin(bo);

        return ret;
}

static void amdgpu_dma_buf_vunmap(struct dma_buf *dma_buf, struct iosys_map *map)
{
        struct drm_gem_object *obj = dma_buf->priv;
        struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj);

        drm_gem_dmabuf_vunmap(dma_buf, map);
        amdgpu_bo_unpin(bo);
}

const struct dma_buf_ops amdgpu_dmabuf_ops = {
        .attach = amdgpu_dma_buf_attach,
        .pin = amdgpu_dma_buf_pin,
        .unpin = amdgpu_dma_buf_unpin,
        .map_dma_buf = amdgpu_dma_buf_map,
        .unmap_dma_buf = amdgpu_dma_buf_unmap,
        .release = drm_gem_dmabuf_release,
        .begin_cpu_access = amdgpu_dma_buf_begin_cpu_access,
        .mmap = drm_gem_dmabuf_mmap,
        .vmap = amdgpu_dma_buf_vmap,
        .vunmap = amdgpu_dma_buf_vunmap,
};

/**
 * amdgpu_gem_prime_export - &drm_driver.gem_prime_export implementation
 * @gobj: GEM BO
 * @flags: Flags such as DRM_CLOEXEC and DRM_RDWR.
 *
 * The main work is done by the &drm_gem_prime_export helper.
 *
 * Returns:
 * Shared DMA buffer representing the GEM BO from the given device.
 */
struct dma_buf *amdgpu_gem_prime_export(struct drm_gem_object *gobj,
                                        int flags)
{
        struct amdgpu_bo *bo = gem_to_amdgpu_bo(gobj);
        struct dma_buf *buf;
        struct ttm_operation_ctx ctx = {
                .interruptible = true,
                .no_wait_gpu = true,
                /* We opt to avoid OOM on system page allocations */
                .gfp_retry_mayfail = true,
                .allow_res_evict = false,
        };
        int ret;

        if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm) ||
            bo->flags & AMDGPU_GEM_CREATE_VM_ALWAYS_VALID)
                return ERR_PTR(-EPERM);

        ret = ttm_bo_setup_export(&bo->tbo, &ctx);
        if (ret)
                return ERR_PTR(ret);

        buf = drm_gem_prime_export(gobj, flags);
        if (!IS_ERR(buf))
                buf->ops = &amdgpu_dmabuf_ops;

        return buf;
}
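
/*
 * Example (illustrative sketch, userspace side): amdgpu_gem_prime_export() is
 * normally reached through the generic PRIME ioctl, e.g. with libdrm, where
 * "drm_fd", "bo_handle" and "dmabuf_fd" are the application's variables:
 *
 *      struct drm_prime_handle args = {
 *              .handle = bo_handle,
 *              .flags = DRM_CLOEXEC | DRM_RDWR,
 *      };
 *
 *      if (drmIoctl(drm_fd, DRM_IOCTL_PRIME_HANDLE_TO_FD, &args))
 *              return -errno;
 *      dmabuf_fd = args.fd;
 */
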
/**
 * amdgpu_dma_buf_create_obj - create BO for DMA-buf import
 *
 * @dev: DRM device
 * @dma_buf: DMA-buf
 *
 * Creates an empty SG BO for DMA-buf import.
 *
 * Returns:
 * A new GEM BO of the given DRM device, representing the memory
 * described by the given DMA-buf.
 */
static struct drm_gem_object *
amdgpu_dma_buf_create_obj(struct drm_device *dev, struct dma_buf *dma_buf)
{
        struct dma_resv *resv = dma_buf->resv;
        struct amdgpu_device *adev = drm_to_adev(dev);
        struct drm_gem_object *gobj;
        struct amdgpu_bo *bo;
        uint64_t flags = 0;
        int ret;

        dma_resv_lock(resv, NULL);

        if (dma_buf->ops == &amdgpu_dmabuf_ops) {
                struct amdgpu_bo *other = gem_to_amdgpu_bo(dma_buf->priv);

                flags |= other->flags & (AMDGPU_GEM_CREATE_CPU_GTT_USWC |
                                         AMDGPU_GEM_CREATE_COHERENT |
                                         AMDGPU_GEM_CREATE_EXT_COHERENT |
                                         AMDGPU_GEM_CREATE_UNCACHED);
        }

        ret = amdgpu_gem_object_create(adev, dma_buf->size, PAGE_SIZE,
                                       AMDGPU_GEM_DOMAIN_CPU, flags,
                                       ttm_bo_type_sg, resv, &gobj, 0);
        if (ret)
                goto error;

        bo = gem_to_amdgpu_bo(gobj);
        bo->allowed_domains = AMDGPU_GEM_DOMAIN_GTT;
        bo->preferred_domains = AMDGPU_GEM_DOMAIN_GTT;

        dma_resv_unlock(resv);
        return gobj;

error:
        dma_resv_unlock(resv);
        return ERR_PTR(ret);
}

/**
 * amdgpu_dma_buf_move_notify - &attach.move_notify implementation
 *
 * @attach: the DMA-buf attachment
 *
 * Invalidate the DMA-buf attachment, making sure that we re-create the
 * mapping before the next use.
 */
static void
amdgpu_dma_buf_move_notify(struct dma_buf_attachment *attach)
{
        struct drm_gem_object *obj = attach->importer_priv;
        struct ww_acquire_ctx *ticket = dma_resv_locking_ctx(obj->resv);
        struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj);
        struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
        struct ttm_operation_ctx ctx = { false, false };
        struct ttm_placement placement = {};
        struct amdgpu_vm_bo_base *bo_base;
        int r;

        /* FIXME: This should be after the "if", but needs a fix to make sure
         * DMABuf imports are initialized in the right VM list.
         */
        amdgpu_vm_bo_invalidate(bo, false);
        if (!bo->tbo.resource || bo->tbo.resource->mem_type == TTM_PL_SYSTEM)
                return;

        r = ttm_bo_validate(&bo->tbo, &placement, &ctx);
        if (r) {
                DRM_ERROR("Failed to invalidate DMA-buf import (%d)\n", r);
                return;
        }

        for (bo_base = bo->vm_bo; bo_base; bo_base = bo_base->next) {
                struct amdgpu_vm *vm = bo_base->vm;
                struct dma_resv *resv = vm->root.bo->tbo.base.resv;

                if (ticket) {
                        /* When we get an error here it means that somebody
                         * else is holding the VM lock and updating page tables.
                         * So we can just continue here.
                         */
                        r = dma_resv_lock(resv, ticket);
                        if (r)
                                continue;

                } else {
                        /* TODO: This is more problematic and we actually need
                         * to allow page table updates without holding the
                         * lock.
                         */
                        if (!dma_resv_trylock(resv))
                                continue;
                }

                /* Reserve fences for two SDMA page table updates */
                r = dma_resv_reserve_fences(resv, 2);
                if (!r)
                        r = amdgpu_vm_clear_freed(adev, vm, NULL);
                if (!r)
                        r = amdgpu_vm_handle_moved(adev, vm, ticket);

                if (r && r != -EBUSY)
                        DRM_ERROR("Failed to invalidate VM page tables (%d)\n",
                                  r);

                dma_resv_unlock(resv);
        }
}

static const struct dma_buf_attach_ops amdgpu_dma_buf_attach_ops = {
        .allow_peer2peer = true,
        .move_notify = amdgpu_dma_buf_move_notify
};
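
/*
 * Example (illustrative sketch, exporter side): amdgpu_dma_buf_move_notify()
 * is never called directly. Before moving an exported BO the exporter tells
 * the dma-buf core about it while holding the reservation lock, roughly:
 *
 *      dma_resv_lock(bo->tbo.base.resv, NULL);
 *      dma_buf_move_notify(bo->tbo.base.dma_buf);
 *      (actually move the buffer)
 *      dma_resv_unlock(bo->tbo.base.resv);
 *
 * In amdgpu this is done from the TTM move-notify path, so every importer
 * gets a chance to drop its cached mapping before the backing pages change.
 */
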
/**
 * amdgpu_gem_prime_import - &drm_driver.gem_prime_import implementation
 * @dev: DRM device
 * @dma_buf: Shared DMA buffer
 *
 * Import a dma_buf into the driver and potentially create a new GEM object.
 *
 * Returns:
 * GEM BO representing the shared DMA buffer for the given device.
 */
struct drm_gem_object *amdgpu_gem_prime_import(struct drm_device *dev,
                                               struct dma_buf *dma_buf)
{
        struct dma_buf_attachment *attach;
        struct drm_gem_object *obj;

        if (dma_buf->ops == &amdgpu_dmabuf_ops) {
                obj = dma_buf->priv;
                if (obj->dev == dev) {
                        /*
                         * Importing a dmabuf exported from our own gem
                         * increases the refcount on the gem itself instead of
                         * the f_count of the dmabuf.
                         */
                        drm_gem_object_get(obj);
                        return obj;
                }
        }

        obj = amdgpu_dma_buf_create_obj(dev, dma_buf);
        if (IS_ERR(obj))
                return obj;

        attach = dma_buf_dynamic_attach(dma_buf, dev->dev,
                                        &amdgpu_dma_buf_attach_ops, obj);
        if (IS_ERR(attach)) {
                drm_gem_object_put(obj);
                return ERR_CAST(attach);
        }

        get_dma_buf(dma_buf);
        obj->import_attach = attach;
        return obj;
}

/**
 * amdgpu_dmabuf_is_xgmi_accessible - Check if XGMI is available for P2P transfer
 *
 * @adev: amdgpu_device pointer of the importer
 * @bo: amdgpu buffer object
 *
 * Returns:
 * True if the DMA-buf is accessible over XGMI, false otherwise.
 */
bool amdgpu_dmabuf_is_xgmi_accessible(struct amdgpu_device *adev,
                                      struct amdgpu_bo *bo)
{
        struct drm_gem_object *obj = &bo->tbo.base;
        struct drm_gem_object *gobj;

        if (!adev)
                return false;

        if (drm_gem_is_imported(obj)) {
                struct dma_buf *dma_buf = obj->import_attach->dmabuf;

                if (dma_buf->ops != &amdgpu_dmabuf_ops)
                        /* No XGMI with non-AMD GPUs */
                        return false;

                gobj = dma_buf->priv;
                bo = gem_to_amdgpu_bo(gobj);
        }

        if (amdgpu_xgmi_same_hive(adev, amdgpu_ttm_adev(bo->tbo.bdev)) &&
            (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM))
                return true;

        return false;
}
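
/*
 * Example (illustrative sketch, userspace side): amdgpu_gem_prime_import() is
 * normally reached through the generic PRIME ioctl that turns a DMA-buf fd
 * back into a GEM handle, e.g. with libdrm, where "drm_fd", "dmabuf_fd" and
 * "bo_handle" are the application's variables:
 *
 *      struct drm_prime_handle args = {
 *              .fd = dmabuf_fd,
 *      };
 *
 *      if (drmIoctl(drm_fd, DRM_IOCTL_PRIME_FD_TO_HANDLE, &args))
 *              return -errno;
 *      bo_handle = args.handle;
 */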