1 // SPDX-License-Identifier: GPL-2.0-only 2 /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES 3 */ 4 #include <linux/iommufd.h> 5 #include <linux/slab.h> 6 #include <linux/iommu.h> 7 #include <uapi/linux/iommufd.h> 8 #include "../iommu-priv.h" 9 10 #include "io_pagetable.h" 11 #include "iommufd_private.h" 12 13 static bool allow_unsafe_interrupts; 14 module_param(allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR); 15 MODULE_PARM_DESC( 16 allow_unsafe_interrupts, 17 "Allow IOMMUFD to bind to devices even if the platform cannot isolate " 18 "the MSI interrupt window. Enabling this is a security weakness."); 19 20 static void iommufd_group_release(struct kref *kref) 21 { 22 struct iommufd_group *igroup = 23 container_of(kref, struct iommufd_group, ref); 24 25 WARN_ON(igroup->hwpt || !list_empty(&igroup->device_list)); 26 27 xa_cmpxchg(&igroup->ictx->groups, iommu_group_id(igroup->group), igroup, 28 NULL, GFP_KERNEL); 29 iommu_group_put(igroup->group); 30 mutex_destroy(&igroup->lock); 31 kfree(igroup); 32 } 33 34 static void iommufd_put_group(struct iommufd_group *group) 35 { 36 kref_put(&group->ref, iommufd_group_release); 37 } 38 39 static bool iommufd_group_try_get(struct iommufd_group *igroup, 40 struct iommu_group *group) 41 { 42 if (!igroup) 43 return false; 44 /* 45 * group ID's cannot be re-used until the group is put back which does 46 * not happen if we could get an igroup pointer under the xa_lock. 47 */ 48 if (WARN_ON(igroup->group != group)) 49 return false; 50 return kref_get_unless_zero(&igroup->ref); 51 } 52 53 /* 54 * iommufd needs to store some more data for each iommu_group, we keep a 55 * parallel xarray indexed by iommu_group id to hold this instead of putting it 56 * in the core structure. To keep things simple the iommufd_group memory is 57 * unique within the iommufd_ctx. This makes it easy to check there are no 58 * memory leaks. 59 */ 60 static struct iommufd_group *iommufd_get_group(struct iommufd_ctx *ictx, 61 struct device *dev) 62 { 63 struct iommufd_group *new_igroup; 64 struct iommufd_group *cur_igroup; 65 struct iommufd_group *igroup; 66 struct iommu_group *group; 67 unsigned int id; 68 69 group = iommu_group_get(dev); 70 if (!group) 71 return ERR_PTR(-ENODEV); 72 73 id = iommu_group_id(group); 74 75 xa_lock(&ictx->groups); 76 igroup = xa_load(&ictx->groups, id); 77 if (iommufd_group_try_get(igroup, group)) { 78 xa_unlock(&ictx->groups); 79 iommu_group_put(group); 80 return igroup; 81 } 82 xa_unlock(&ictx->groups); 83 84 new_igroup = kzalloc(sizeof(*new_igroup), GFP_KERNEL); 85 if (!new_igroup) { 86 iommu_group_put(group); 87 return ERR_PTR(-ENOMEM); 88 } 89 90 kref_init(&new_igroup->ref); 91 mutex_init(&new_igroup->lock); 92 INIT_LIST_HEAD(&new_igroup->device_list); 93 new_igroup->sw_msi_start = PHYS_ADDR_MAX; 94 /* group reference moves into new_igroup */ 95 new_igroup->group = group; 96 97 /* 98 * The ictx is not additionally refcounted here becase all objects using 99 * an igroup must put it before their destroy completes. 100 */ 101 new_igroup->ictx = ictx; 102 103 /* 104 * We dropped the lock so igroup is invalid. NULL is a safe and likely 105 * value to assume for the xa_cmpxchg algorithm. 106 */ 107 cur_igroup = NULL; 108 xa_lock(&ictx->groups); 109 while (true) { 110 igroup = __xa_cmpxchg(&ictx->groups, id, cur_igroup, new_igroup, 111 GFP_KERNEL); 112 if (xa_is_err(igroup)) { 113 xa_unlock(&ictx->groups); 114 iommufd_put_group(new_igroup); 115 return ERR_PTR(xa_err(igroup)); 116 } 117 118 /* new_group was successfully installed */ 119 if (cur_igroup == igroup) { 120 xa_unlock(&ictx->groups); 121 return new_igroup; 122 } 123 124 /* Check again if the current group is any good */ 125 if (iommufd_group_try_get(igroup, group)) { 126 xa_unlock(&ictx->groups); 127 iommufd_put_group(new_igroup); 128 return igroup; 129 } 130 cur_igroup = igroup; 131 } 132 } 133 134 void iommufd_device_destroy(struct iommufd_object *obj) 135 { 136 struct iommufd_device *idev = 137 container_of(obj, struct iommufd_device, obj); 138 139 iommu_device_release_dma_owner(idev->dev); 140 iommufd_put_group(idev->igroup); 141 if (!iommufd_selftest_is_mock_dev(idev->dev)) 142 iommufd_ctx_put(idev->ictx); 143 } 144 145 /** 146 * iommufd_device_bind - Bind a physical device to an iommu fd 147 * @ictx: iommufd file descriptor 148 * @dev: Pointer to a physical device struct 149 * @id: Output ID number to return to userspace for this device 150 * 151 * A successful bind establishes an ownership over the device and returns 152 * struct iommufd_device pointer, otherwise returns error pointer. 153 * 154 * A driver using this API must set driver_managed_dma and must not touch 155 * the device until this routine succeeds and establishes ownership. 156 * 157 * Binding a PCI device places the entire RID under iommufd control. 158 * 159 * The caller must undo this with iommufd_device_unbind() 160 */ 161 struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, 162 struct device *dev, u32 *id) 163 { 164 struct iommufd_device *idev; 165 struct iommufd_group *igroup; 166 int rc; 167 168 /* 169 * iommufd always sets IOMMU_CACHE because we offer no way for userspace 170 * to restore cache coherency. 171 */ 172 if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY)) 173 return ERR_PTR(-EINVAL); 174 175 igroup = iommufd_get_group(ictx, dev); 176 if (IS_ERR(igroup)) 177 return ERR_CAST(igroup); 178 179 /* 180 * For historical compat with VFIO the insecure interrupt path is 181 * allowed if the module parameter is set. Secure/Isolated means that a 182 * MemWr operation from the device (eg a simple DMA) cannot trigger an 183 * interrupt outside this iommufd context. 184 */ 185 if (!iommufd_selftest_is_mock_dev(dev) && 186 !iommu_group_has_isolated_msi(igroup->group)) { 187 if (!allow_unsafe_interrupts) { 188 rc = -EPERM; 189 goto out_group_put; 190 } 191 192 dev_warn( 193 dev, 194 "MSI interrupts are not secure, they cannot be isolated by the platform. " 195 "Check that platform features like interrupt remapping are enabled. " 196 "Use the \"allow_unsafe_interrupts\" module parameter to override\n"); 197 } 198 199 rc = iommu_device_claim_dma_owner(dev, ictx); 200 if (rc) 201 goto out_group_put; 202 203 idev = iommufd_object_alloc(ictx, idev, IOMMUFD_OBJ_DEVICE); 204 if (IS_ERR(idev)) { 205 rc = PTR_ERR(idev); 206 goto out_release_owner; 207 } 208 idev->ictx = ictx; 209 if (!iommufd_selftest_is_mock_dev(dev)) 210 iommufd_ctx_get(ictx); 211 idev->dev = dev; 212 idev->enforce_cache_coherency = 213 device_iommu_capable(dev, IOMMU_CAP_ENFORCE_CACHE_COHERENCY); 214 /* The calling driver is a user until iommufd_device_unbind() */ 215 refcount_inc(&idev->obj.users); 216 /* igroup refcount moves into iommufd_device */ 217 idev->igroup = igroup; 218 219 /* 220 * If the caller fails after this success it must call 221 * iommufd_unbind_device() which is safe since we hold this refcount. 222 * This also means the device is a leaf in the graph and no other object 223 * can take a reference on it. 224 */ 225 iommufd_object_finalize(ictx, &idev->obj); 226 *id = idev->obj.id; 227 return idev; 228 229 out_release_owner: 230 iommu_device_release_dma_owner(dev); 231 out_group_put: 232 iommufd_put_group(igroup); 233 return ERR_PTR(rc); 234 } 235 EXPORT_SYMBOL_NS_GPL(iommufd_device_bind, IOMMUFD); 236 237 /** 238 * iommufd_ctx_has_group - True if any device within the group is bound 239 * to the ictx 240 * @ictx: iommufd file descriptor 241 * @group: Pointer to a physical iommu_group struct 242 * 243 * True if any device within the group has been bound to this ictx, ex. via 244 * iommufd_device_bind(), therefore implying ictx ownership of the group. 245 */ 246 bool iommufd_ctx_has_group(struct iommufd_ctx *ictx, struct iommu_group *group) 247 { 248 struct iommufd_object *obj; 249 unsigned long index; 250 251 if (!ictx || !group) 252 return false; 253 254 xa_lock(&ictx->objects); 255 xa_for_each(&ictx->objects, index, obj) { 256 if (obj->type == IOMMUFD_OBJ_DEVICE && 257 container_of(obj, struct iommufd_device, obj) 258 ->igroup->group == group) { 259 xa_unlock(&ictx->objects); 260 return true; 261 } 262 } 263 xa_unlock(&ictx->objects); 264 return false; 265 } 266 EXPORT_SYMBOL_NS_GPL(iommufd_ctx_has_group, IOMMUFD); 267 268 /** 269 * iommufd_device_unbind - Undo iommufd_device_bind() 270 * @idev: Device returned by iommufd_device_bind() 271 * 272 * Release the device from iommufd control. The DMA ownership will return back 273 * to unowned with DMA controlled by the DMA API. This invalidates the 274 * iommufd_device pointer, other APIs that consume it must not be called 275 * concurrently. 276 */ 277 void iommufd_device_unbind(struct iommufd_device *idev) 278 { 279 iommufd_object_destroy_user(idev->ictx, &idev->obj); 280 } 281 EXPORT_SYMBOL_NS_GPL(iommufd_device_unbind, IOMMUFD); 282 283 struct iommufd_ctx *iommufd_device_to_ictx(struct iommufd_device *idev) 284 { 285 return idev->ictx; 286 } 287 EXPORT_SYMBOL_NS_GPL(iommufd_device_to_ictx, IOMMUFD); 288 289 u32 iommufd_device_to_id(struct iommufd_device *idev) 290 { 291 return idev->obj.id; 292 } 293 EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, IOMMUFD); 294 295 static int iommufd_group_setup_msi(struct iommufd_group *igroup, 296 struct iommufd_hwpt_paging *hwpt_paging) 297 { 298 phys_addr_t sw_msi_start = igroup->sw_msi_start; 299 int rc; 300 301 /* 302 * If the IOMMU driver gives a IOMMU_RESV_SW_MSI then it is asking us to 303 * call iommu_get_msi_cookie() on its behalf. This is necessary to setup 304 * the MSI window so iommu_dma_prepare_msi() can install pages into our 305 * domain after request_irq(). If it is not done interrupts will not 306 * work on this domain. 307 * 308 * FIXME: This is conceptually broken for iommufd since we want to allow 309 * userspace to change the domains, eg switch from an identity IOAS to a 310 * DMA IOAS. There is currently no way to create a MSI window that 311 * matches what the IRQ layer actually expects in a newly created 312 * domain. 313 */ 314 if (sw_msi_start != PHYS_ADDR_MAX && !hwpt_paging->msi_cookie) { 315 rc = iommu_get_msi_cookie(hwpt_paging->common.domain, 316 sw_msi_start); 317 if (rc) 318 return rc; 319 320 /* 321 * iommu_get_msi_cookie() can only be called once per domain, 322 * it returns -EBUSY on later calls. 323 */ 324 hwpt_paging->msi_cookie = true; 325 } 326 return 0; 327 } 328 329 static int iommufd_hwpt_paging_attach(struct iommufd_hwpt_paging *hwpt_paging, 330 struct iommufd_device *idev) 331 { 332 int rc; 333 334 lockdep_assert_held(&idev->igroup->lock); 335 336 rc = iopt_table_enforce_dev_resv_regions(&hwpt_paging->ioas->iopt, 337 idev->dev, 338 &idev->igroup->sw_msi_start); 339 if (rc) 340 return rc; 341 342 if (list_empty(&idev->igroup->device_list)) { 343 rc = iommufd_group_setup_msi(idev->igroup, hwpt_paging); 344 if (rc) { 345 iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, 346 idev->dev); 347 return rc; 348 } 349 } 350 return 0; 351 } 352 353 int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, 354 struct iommufd_device *idev) 355 { 356 int rc; 357 358 mutex_lock(&idev->igroup->lock); 359 360 if (idev->igroup->hwpt != NULL && idev->igroup->hwpt != hwpt) { 361 rc = -EINVAL; 362 goto err_unlock; 363 } 364 365 if (hwpt_is_paging(hwpt)) { 366 rc = iommufd_hwpt_paging_attach(to_hwpt_paging(hwpt), idev); 367 if (rc) 368 goto err_unlock; 369 } 370 371 /* 372 * Only attach to the group once for the first device that is in the 373 * group. All the other devices will follow this attachment. The user 374 * should attach every device individually to the hwpt as the per-device 375 * reserved regions are only updated during individual device 376 * attachment. 377 */ 378 if (list_empty(&idev->igroup->device_list)) { 379 rc = iommu_attach_group(hwpt->domain, idev->igroup->group); 380 if (rc) 381 goto err_unresv; 382 idev->igroup->hwpt = hwpt; 383 } 384 refcount_inc(&hwpt->obj.users); 385 list_add_tail(&idev->group_item, &idev->igroup->device_list); 386 mutex_unlock(&idev->igroup->lock); 387 return 0; 388 err_unresv: 389 if (hwpt_is_paging(hwpt)) 390 iopt_remove_reserved_iova(&to_hwpt_paging(hwpt)->ioas->iopt, 391 idev->dev); 392 err_unlock: 393 mutex_unlock(&idev->igroup->lock); 394 return rc; 395 } 396 397 struct iommufd_hw_pagetable * 398 iommufd_hw_pagetable_detach(struct iommufd_device *idev) 399 { 400 struct iommufd_hw_pagetable *hwpt = idev->igroup->hwpt; 401 402 mutex_lock(&idev->igroup->lock); 403 list_del(&idev->group_item); 404 if (list_empty(&idev->igroup->device_list)) { 405 iommu_detach_group(hwpt->domain, idev->igroup->group); 406 idev->igroup->hwpt = NULL; 407 } 408 if (hwpt_is_paging(hwpt)) 409 iopt_remove_reserved_iova(&to_hwpt_paging(hwpt)->ioas->iopt, 410 idev->dev); 411 mutex_unlock(&idev->igroup->lock); 412 413 /* Caller must destroy hwpt */ 414 return hwpt; 415 } 416 417 static struct iommufd_hw_pagetable * 418 iommufd_device_do_attach(struct iommufd_device *idev, 419 struct iommufd_hw_pagetable *hwpt) 420 { 421 int rc; 422 423 rc = iommufd_hw_pagetable_attach(hwpt, idev); 424 if (rc) 425 return ERR_PTR(rc); 426 return NULL; 427 } 428 429 static void 430 iommufd_group_remove_reserved_iova(struct iommufd_group *igroup, 431 struct iommufd_hwpt_paging *hwpt_paging) 432 { 433 struct iommufd_device *cur; 434 435 lockdep_assert_held(&igroup->lock); 436 437 list_for_each_entry(cur, &igroup->device_list, group_item) 438 iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, cur->dev); 439 } 440 441 static int 442 iommufd_group_do_replace_paging(struct iommufd_group *igroup, 443 struct iommufd_hwpt_paging *hwpt_paging) 444 { 445 struct iommufd_hw_pagetable *old_hwpt = igroup->hwpt; 446 struct iommufd_device *cur; 447 int rc; 448 449 lockdep_assert_held(&igroup->lock); 450 451 if (!hwpt_is_paging(old_hwpt) || 452 hwpt_paging->ioas != to_hwpt_paging(old_hwpt)->ioas) { 453 list_for_each_entry(cur, &igroup->device_list, group_item) { 454 rc = iopt_table_enforce_dev_resv_regions( 455 &hwpt_paging->ioas->iopt, cur->dev, NULL); 456 if (rc) 457 goto err_unresv; 458 } 459 } 460 461 rc = iommufd_group_setup_msi(igroup, hwpt_paging); 462 if (rc) 463 goto err_unresv; 464 return 0; 465 466 err_unresv: 467 iommufd_group_remove_reserved_iova(igroup, hwpt_paging); 468 return rc; 469 } 470 471 static struct iommufd_hw_pagetable * 472 iommufd_device_do_replace(struct iommufd_device *idev, 473 struct iommufd_hw_pagetable *hwpt) 474 { 475 struct iommufd_group *igroup = idev->igroup; 476 struct iommufd_hw_pagetable *old_hwpt; 477 unsigned int num_devices; 478 int rc; 479 480 mutex_lock(&idev->igroup->lock); 481 482 if (igroup->hwpt == NULL) { 483 rc = -EINVAL; 484 goto err_unlock; 485 } 486 487 if (hwpt == igroup->hwpt) { 488 mutex_unlock(&idev->igroup->lock); 489 return NULL; 490 } 491 492 old_hwpt = igroup->hwpt; 493 if (hwpt_is_paging(hwpt)) { 494 rc = iommufd_group_do_replace_paging(igroup, 495 to_hwpt_paging(hwpt)); 496 if (rc) 497 goto err_unlock; 498 } 499 500 rc = iommu_group_replace_domain(igroup->group, hwpt->domain); 501 if (rc) 502 goto err_unresv; 503 504 if (hwpt_is_paging(old_hwpt) && 505 (!hwpt_is_paging(hwpt) || 506 to_hwpt_paging(hwpt)->ioas != to_hwpt_paging(old_hwpt)->ioas)) 507 iommufd_group_remove_reserved_iova(igroup, 508 to_hwpt_paging(old_hwpt)); 509 510 igroup->hwpt = hwpt; 511 512 num_devices = list_count_nodes(&igroup->device_list); 513 /* 514 * Move the refcounts held by the device_list to the new hwpt. Retain a 515 * refcount for this thread as the caller will free it. 516 */ 517 refcount_add(num_devices, &hwpt->obj.users); 518 if (num_devices > 1) 519 WARN_ON(refcount_sub_and_test(num_devices - 1, 520 &old_hwpt->obj.users)); 521 mutex_unlock(&idev->igroup->lock); 522 523 /* Caller must destroy old_hwpt */ 524 return old_hwpt; 525 err_unresv: 526 if (hwpt_is_paging(hwpt)) 527 iommufd_group_remove_reserved_iova(igroup, 528 to_hwpt_paging(old_hwpt)); 529 err_unlock: 530 mutex_unlock(&idev->igroup->lock); 531 return ERR_PTR(rc); 532 } 533 534 typedef struct iommufd_hw_pagetable *(*attach_fn)( 535 struct iommufd_device *idev, struct iommufd_hw_pagetable *hwpt); 536 537 /* 538 * When automatically managing the domains we search for a compatible domain in 539 * the iopt and if one is found use it, otherwise create a new domain. 540 * Automatic domain selection will never pick a manually created domain. 541 */ 542 static struct iommufd_hw_pagetable * 543 iommufd_device_auto_get_domain(struct iommufd_device *idev, 544 struct iommufd_ioas *ioas, u32 *pt_id, 545 attach_fn do_attach) 546 { 547 /* 548 * iommufd_hw_pagetable_attach() is called by 549 * iommufd_hw_pagetable_alloc() in immediate attachment mode, same as 550 * iommufd_device_do_attach(). So if we are in this mode then we prefer 551 * to use the immediate_attach path as it supports drivers that can't 552 * directly allocate a domain. 553 */ 554 bool immediate_attach = do_attach == iommufd_device_do_attach; 555 struct iommufd_hw_pagetable *destroy_hwpt; 556 struct iommufd_hwpt_paging *hwpt_paging; 557 struct iommufd_hw_pagetable *hwpt; 558 559 /* 560 * There is no differentiation when domains are allocated, so any domain 561 * that is willing to attach to the device is interchangeable with any 562 * other. 563 */ 564 mutex_lock(&ioas->mutex); 565 list_for_each_entry(hwpt_paging, &ioas->hwpt_list, hwpt_item) { 566 if (!hwpt_paging->auto_domain) 567 continue; 568 569 hwpt = &hwpt_paging->common; 570 if (!iommufd_lock_obj(&hwpt->obj)) 571 continue; 572 destroy_hwpt = (*do_attach)(idev, hwpt); 573 if (IS_ERR(destroy_hwpt)) { 574 iommufd_put_object(idev->ictx, &hwpt->obj); 575 /* 576 * -EINVAL means the domain is incompatible with the 577 * device. Other error codes should propagate to 578 * userspace as failure. Success means the domain is 579 * attached. 580 */ 581 if (PTR_ERR(destroy_hwpt) == -EINVAL) 582 continue; 583 goto out_unlock; 584 } 585 *pt_id = hwpt->obj.id; 586 iommufd_put_object(idev->ictx, &hwpt->obj); 587 goto out_unlock; 588 } 589 590 hwpt_paging = iommufd_hwpt_paging_alloc(idev->ictx, ioas, idev, 0, 591 immediate_attach, NULL); 592 if (IS_ERR(hwpt_paging)) { 593 destroy_hwpt = ERR_CAST(hwpt_paging); 594 goto out_unlock; 595 } 596 hwpt = &hwpt_paging->common; 597 598 if (!immediate_attach) { 599 destroy_hwpt = (*do_attach)(idev, hwpt); 600 if (IS_ERR(destroy_hwpt)) 601 goto out_abort; 602 } else { 603 destroy_hwpt = NULL; 604 } 605 606 hwpt_paging->auto_domain = true; 607 *pt_id = hwpt->obj.id; 608 609 iommufd_object_finalize(idev->ictx, &hwpt->obj); 610 mutex_unlock(&ioas->mutex); 611 return destroy_hwpt; 612 613 out_abort: 614 iommufd_object_abort_and_destroy(idev->ictx, &hwpt->obj); 615 out_unlock: 616 mutex_unlock(&ioas->mutex); 617 return destroy_hwpt; 618 } 619 620 static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id, 621 attach_fn do_attach) 622 { 623 struct iommufd_hw_pagetable *destroy_hwpt; 624 struct iommufd_object *pt_obj; 625 626 pt_obj = iommufd_get_object(idev->ictx, *pt_id, IOMMUFD_OBJ_ANY); 627 if (IS_ERR(pt_obj)) 628 return PTR_ERR(pt_obj); 629 630 switch (pt_obj->type) { 631 case IOMMUFD_OBJ_HWPT_NESTED: 632 case IOMMUFD_OBJ_HWPT_PAGING: { 633 struct iommufd_hw_pagetable *hwpt = 634 container_of(pt_obj, struct iommufd_hw_pagetable, obj); 635 636 destroy_hwpt = (*do_attach)(idev, hwpt); 637 if (IS_ERR(destroy_hwpt)) 638 goto out_put_pt_obj; 639 break; 640 } 641 case IOMMUFD_OBJ_IOAS: { 642 struct iommufd_ioas *ioas = 643 container_of(pt_obj, struct iommufd_ioas, obj); 644 645 destroy_hwpt = iommufd_device_auto_get_domain(idev, ioas, pt_id, 646 do_attach); 647 if (IS_ERR(destroy_hwpt)) 648 goto out_put_pt_obj; 649 break; 650 } 651 default: 652 destroy_hwpt = ERR_PTR(-EINVAL); 653 goto out_put_pt_obj; 654 } 655 iommufd_put_object(idev->ictx, pt_obj); 656 657 /* This destruction has to be after we unlock everything */ 658 if (destroy_hwpt) 659 iommufd_hw_pagetable_put(idev->ictx, destroy_hwpt); 660 return 0; 661 662 out_put_pt_obj: 663 iommufd_put_object(idev->ictx, pt_obj); 664 return PTR_ERR(destroy_hwpt); 665 } 666 667 /** 668 * iommufd_device_attach - Connect a device to an iommu_domain 669 * @idev: device to attach 670 * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HWPT_PAGING 671 * Output the IOMMUFD_OBJ_HWPT_PAGING ID 672 * 673 * This connects the device to an iommu_domain, either automatically or manually 674 * selected. Once this completes the device could do DMA. 675 * 676 * The caller should return the resulting pt_id back to userspace. 677 * This function is undone by calling iommufd_device_detach(). 678 */ 679 int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id) 680 { 681 int rc; 682 683 rc = iommufd_device_change_pt(idev, pt_id, &iommufd_device_do_attach); 684 if (rc) 685 return rc; 686 687 /* 688 * Pairs with iommufd_device_detach() - catches caller bugs attempting 689 * to destroy a device with an attachment. 690 */ 691 refcount_inc(&idev->obj.users); 692 return 0; 693 } 694 EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, IOMMUFD); 695 696 /** 697 * iommufd_device_replace - Change the device's iommu_domain 698 * @idev: device to change 699 * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HWPT_PAGING 700 * Output the IOMMUFD_OBJ_HWPT_PAGING ID 701 * 702 * This is the same as:: 703 * 704 * iommufd_device_detach(); 705 * iommufd_device_attach(); 706 * 707 * If it fails then no change is made to the attachment. The iommu driver may 708 * implement this so there is no disruption in translation. This can only be 709 * called if iommufd_device_attach() has already succeeded. 710 */ 711 int iommufd_device_replace(struct iommufd_device *idev, u32 *pt_id) 712 { 713 return iommufd_device_change_pt(idev, pt_id, 714 &iommufd_device_do_replace); 715 } 716 EXPORT_SYMBOL_NS_GPL(iommufd_device_replace, IOMMUFD); 717 718 /** 719 * iommufd_device_detach - Disconnect a device to an iommu_domain 720 * @idev: device to detach 721 * 722 * Undo iommufd_device_attach(). This disconnects the idev from the previously 723 * attached pt_id. The device returns back to a blocked DMA translation. 724 */ 725 void iommufd_device_detach(struct iommufd_device *idev) 726 { 727 struct iommufd_hw_pagetable *hwpt; 728 729 hwpt = iommufd_hw_pagetable_detach(idev); 730 iommufd_hw_pagetable_put(idev->ictx, hwpt); 731 refcount_dec(&idev->obj.users); 732 } 733 EXPORT_SYMBOL_NS_GPL(iommufd_device_detach, IOMMUFD); 734 735 /* 736 * On success, it will refcount_inc() at a valid new_ioas and refcount_dec() at 737 * a valid cur_ioas (access->ioas). A caller passing in a valid new_ioas should 738 * call iommufd_put_object() if it does an iommufd_get_object() for a new_ioas. 739 */ 740 static int iommufd_access_change_ioas(struct iommufd_access *access, 741 struct iommufd_ioas *new_ioas) 742 { 743 u32 iopt_access_list_id = access->iopt_access_list_id; 744 struct iommufd_ioas *cur_ioas = access->ioas; 745 int rc; 746 747 lockdep_assert_held(&access->ioas_lock); 748 749 /* We are racing with a concurrent detach, bail */ 750 if (cur_ioas != access->ioas_unpin) 751 return -EBUSY; 752 753 if (cur_ioas == new_ioas) 754 return 0; 755 756 /* 757 * Set ioas to NULL to block any further iommufd_access_pin_pages(). 758 * iommufd_access_unpin_pages() can continue using access->ioas_unpin. 759 */ 760 access->ioas = NULL; 761 762 if (new_ioas) { 763 rc = iopt_add_access(&new_ioas->iopt, access); 764 if (rc) { 765 access->ioas = cur_ioas; 766 return rc; 767 } 768 refcount_inc(&new_ioas->obj.users); 769 } 770 771 if (cur_ioas) { 772 if (access->ops->unmap) { 773 mutex_unlock(&access->ioas_lock); 774 access->ops->unmap(access->data, 0, ULONG_MAX); 775 mutex_lock(&access->ioas_lock); 776 } 777 iopt_remove_access(&cur_ioas->iopt, access, iopt_access_list_id); 778 refcount_dec(&cur_ioas->obj.users); 779 } 780 781 access->ioas = new_ioas; 782 access->ioas_unpin = new_ioas; 783 784 return 0; 785 } 786 787 static int iommufd_access_change_ioas_id(struct iommufd_access *access, u32 id) 788 { 789 struct iommufd_ioas *ioas = iommufd_get_ioas(access->ictx, id); 790 int rc; 791 792 if (IS_ERR(ioas)) 793 return PTR_ERR(ioas); 794 rc = iommufd_access_change_ioas(access, ioas); 795 iommufd_put_object(access->ictx, &ioas->obj); 796 return rc; 797 } 798 799 void iommufd_access_destroy_object(struct iommufd_object *obj) 800 { 801 struct iommufd_access *access = 802 container_of(obj, struct iommufd_access, obj); 803 804 mutex_lock(&access->ioas_lock); 805 if (access->ioas) 806 WARN_ON(iommufd_access_change_ioas(access, NULL)); 807 mutex_unlock(&access->ioas_lock); 808 iommufd_ctx_put(access->ictx); 809 } 810 811 /** 812 * iommufd_access_create - Create an iommufd_access 813 * @ictx: iommufd file descriptor 814 * @ops: Driver's ops to associate with the access 815 * @data: Opaque data to pass into ops functions 816 * @id: Output ID number to return to userspace for this access 817 * 818 * An iommufd_access allows a driver to read/write to the IOAS without using 819 * DMA. The underlying CPU memory can be accessed using the 820 * iommufd_access_pin_pages() or iommufd_access_rw() functions. 821 * 822 * The provided ops are required to use iommufd_access_pin_pages(). 823 */ 824 struct iommufd_access * 825 iommufd_access_create(struct iommufd_ctx *ictx, 826 const struct iommufd_access_ops *ops, void *data, u32 *id) 827 { 828 struct iommufd_access *access; 829 830 /* 831 * There is no uAPI for the access object, but to keep things symmetric 832 * use the object infrastructure anyhow. 833 */ 834 access = iommufd_object_alloc(ictx, access, IOMMUFD_OBJ_ACCESS); 835 if (IS_ERR(access)) 836 return access; 837 838 access->data = data; 839 access->ops = ops; 840 841 if (ops->needs_pin_pages) 842 access->iova_alignment = PAGE_SIZE; 843 else 844 access->iova_alignment = 1; 845 846 /* The calling driver is a user until iommufd_access_destroy() */ 847 refcount_inc(&access->obj.users); 848 access->ictx = ictx; 849 iommufd_ctx_get(ictx); 850 iommufd_object_finalize(ictx, &access->obj); 851 *id = access->obj.id; 852 mutex_init(&access->ioas_lock); 853 return access; 854 } 855 EXPORT_SYMBOL_NS_GPL(iommufd_access_create, IOMMUFD); 856 857 /** 858 * iommufd_access_destroy - Destroy an iommufd_access 859 * @access: The access to destroy 860 * 861 * The caller must stop using the access before destroying it. 862 */ 863 void iommufd_access_destroy(struct iommufd_access *access) 864 { 865 iommufd_object_destroy_user(access->ictx, &access->obj); 866 } 867 EXPORT_SYMBOL_NS_GPL(iommufd_access_destroy, IOMMUFD); 868 869 void iommufd_access_detach(struct iommufd_access *access) 870 { 871 mutex_lock(&access->ioas_lock); 872 if (WARN_ON(!access->ioas)) { 873 mutex_unlock(&access->ioas_lock); 874 return; 875 } 876 WARN_ON(iommufd_access_change_ioas(access, NULL)); 877 mutex_unlock(&access->ioas_lock); 878 } 879 EXPORT_SYMBOL_NS_GPL(iommufd_access_detach, IOMMUFD); 880 881 int iommufd_access_attach(struct iommufd_access *access, u32 ioas_id) 882 { 883 int rc; 884 885 mutex_lock(&access->ioas_lock); 886 if (WARN_ON(access->ioas)) { 887 mutex_unlock(&access->ioas_lock); 888 return -EINVAL; 889 } 890 891 rc = iommufd_access_change_ioas_id(access, ioas_id); 892 mutex_unlock(&access->ioas_lock); 893 return rc; 894 } 895 EXPORT_SYMBOL_NS_GPL(iommufd_access_attach, IOMMUFD); 896 897 int iommufd_access_replace(struct iommufd_access *access, u32 ioas_id) 898 { 899 int rc; 900 901 mutex_lock(&access->ioas_lock); 902 if (!access->ioas) { 903 mutex_unlock(&access->ioas_lock); 904 return -ENOENT; 905 } 906 rc = iommufd_access_change_ioas_id(access, ioas_id); 907 mutex_unlock(&access->ioas_lock); 908 return rc; 909 } 910 EXPORT_SYMBOL_NS_GPL(iommufd_access_replace, IOMMUFD); 911 912 /** 913 * iommufd_access_notify_unmap - Notify users of an iopt to stop using it 914 * @iopt: iopt to work on 915 * @iova: Starting iova in the iopt 916 * @length: Number of bytes 917 * 918 * After this function returns there should be no users attached to the pages 919 * linked to this iopt that intersect with iova,length. Anyone that has attached 920 * a user through iopt_access_pages() needs to detach it through 921 * iommufd_access_unpin_pages() before this function returns. 922 * 923 * iommufd_access_destroy() will wait for any outstanding unmap callback to 924 * complete. Once iommufd_access_destroy() no unmap ops are running or will 925 * run in the future. Due to this a driver must not create locking that prevents 926 * unmap to complete while iommufd_access_destroy() is running. 927 */ 928 void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova, 929 unsigned long length) 930 { 931 struct iommufd_ioas *ioas = 932 container_of(iopt, struct iommufd_ioas, iopt); 933 struct iommufd_access *access; 934 unsigned long index; 935 936 xa_lock(&ioas->iopt.access_list); 937 xa_for_each(&ioas->iopt.access_list, index, access) { 938 if (!iommufd_lock_obj(&access->obj)) 939 continue; 940 xa_unlock(&ioas->iopt.access_list); 941 942 access->ops->unmap(access->data, iova, length); 943 944 iommufd_put_object(access->ictx, &access->obj); 945 xa_lock(&ioas->iopt.access_list); 946 } 947 xa_unlock(&ioas->iopt.access_list); 948 } 949 950 /** 951 * iommufd_access_unpin_pages() - Undo iommufd_access_pin_pages 952 * @access: IOAS access to act on 953 * @iova: Starting IOVA 954 * @length: Number of bytes to access 955 * 956 * Return the struct page's. The caller must stop accessing them before calling 957 * this. The iova/length must exactly match the one provided to access_pages. 958 */ 959 void iommufd_access_unpin_pages(struct iommufd_access *access, 960 unsigned long iova, unsigned long length) 961 { 962 struct iopt_area_contig_iter iter; 963 struct io_pagetable *iopt; 964 unsigned long last_iova; 965 struct iopt_area *area; 966 967 if (WARN_ON(!length) || 968 WARN_ON(check_add_overflow(iova, length - 1, &last_iova))) 969 return; 970 971 mutex_lock(&access->ioas_lock); 972 /* 973 * The driver must be doing something wrong if it calls this before an 974 * iommufd_access_attach() or after an iommufd_access_detach(). 975 */ 976 if (WARN_ON(!access->ioas_unpin)) { 977 mutex_unlock(&access->ioas_lock); 978 return; 979 } 980 iopt = &access->ioas_unpin->iopt; 981 982 down_read(&iopt->iova_rwsem); 983 iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) 984 iopt_area_remove_access( 985 area, iopt_area_iova_to_index(area, iter.cur_iova), 986 iopt_area_iova_to_index( 987 area, 988 min(last_iova, iopt_area_last_iova(area)))); 989 WARN_ON(!iopt_area_contig_done(&iter)); 990 up_read(&iopt->iova_rwsem); 991 mutex_unlock(&access->ioas_lock); 992 } 993 EXPORT_SYMBOL_NS_GPL(iommufd_access_unpin_pages, IOMMUFD); 994 995 static bool iopt_area_contig_is_aligned(struct iopt_area_contig_iter *iter) 996 { 997 if (iopt_area_start_byte(iter->area, iter->cur_iova) % PAGE_SIZE) 998 return false; 999 1000 if (!iopt_area_contig_done(iter) && 1001 (iopt_area_start_byte(iter->area, iopt_area_last_iova(iter->area)) % 1002 PAGE_SIZE) != (PAGE_SIZE - 1)) 1003 return false; 1004 return true; 1005 } 1006 1007 static bool check_area_prot(struct iopt_area *area, unsigned int flags) 1008 { 1009 if (flags & IOMMUFD_ACCESS_RW_WRITE) 1010 return area->iommu_prot & IOMMU_WRITE; 1011 return area->iommu_prot & IOMMU_READ; 1012 } 1013 1014 /** 1015 * iommufd_access_pin_pages() - Return a list of pages under the iova 1016 * @access: IOAS access to act on 1017 * @iova: Starting IOVA 1018 * @length: Number of bytes to access 1019 * @out_pages: Output page list 1020 * @flags: IOPMMUFD_ACCESS_RW_* flags 1021 * 1022 * Reads @length bytes starting at iova and returns the struct page * pointers. 1023 * These can be kmap'd by the caller for CPU access. 1024 * 1025 * The caller must perform iommufd_access_unpin_pages() when done to balance 1026 * this. 1027 * 1028 * This API always requires a page aligned iova. This happens naturally if the 1029 * ioas alignment is >= PAGE_SIZE and the iova is PAGE_SIZE aligned. However 1030 * smaller alignments have corner cases where this API can fail on otherwise 1031 * aligned iova. 1032 */ 1033 int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova, 1034 unsigned long length, struct page **out_pages, 1035 unsigned int flags) 1036 { 1037 struct iopt_area_contig_iter iter; 1038 struct io_pagetable *iopt; 1039 unsigned long last_iova; 1040 struct iopt_area *area; 1041 int rc; 1042 1043 /* Driver's ops don't support pin_pages */ 1044 if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && 1045 WARN_ON(access->iova_alignment != PAGE_SIZE || !access->ops->unmap)) 1046 return -EINVAL; 1047 1048 if (!length) 1049 return -EINVAL; 1050 if (check_add_overflow(iova, length - 1, &last_iova)) 1051 return -EOVERFLOW; 1052 1053 mutex_lock(&access->ioas_lock); 1054 if (!access->ioas) { 1055 mutex_unlock(&access->ioas_lock); 1056 return -ENOENT; 1057 } 1058 iopt = &access->ioas->iopt; 1059 1060 down_read(&iopt->iova_rwsem); 1061 iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) { 1062 unsigned long last = min(last_iova, iopt_area_last_iova(area)); 1063 unsigned long last_index = iopt_area_iova_to_index(area, last); 1064 unsigned long index = 1065 iopt_area_iova_to_index(area, iter.cur_iova); 1066 1067 if (area->prevent_access || 1068 !iopt_area_contig_is_aligned(&iter)) { 1069 rc = -EINVAL; 1070 goto err_remove; 1071 } 1072 1073 if (!check_area_prot(area, flags)) { 1074 rc = -EPERM; 1075 goto err_remove; 1076 } 1077 1078 rc = iopt_area_add_access(area, index, last_index, out_pages, 1079 flags); 1080 if (rc) 1081 goto err_remove; 1082 out_pages += last_index - index + 1; 1083 } 1084 if (!iopt_area_contig_done(&iter)) { 1085 rc = -ENOENT; 1086 goto err_remove; 1087 } 1088 1089 up_read(&iopt->iova_rwsem); 1090 mutex_unlock(&access->ioas_lock); 1091 return 0; 1092 1093 err_remove: 1094 if (iova < iter.cur_iova) { 1095 last_iova = iter.cur_iova - 1; 1096 iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) 1097 iopt_area_remove_access( 1098 area, 1099 iopt_area_iova_to_index(area, iter.cur_iova), 1100 iopt_area_iova_to_index( 1101 area, min(last_iova, 1102 iopt_area_last_iova(area)))); 1103 } 1104 up_read(&iopt->iova_rwsem); 1105 mutex_unlock(&access->ioas_lock); 1106 return rc; 1107 } 1108 EXPORT_SYMBOL_NS_GPL(iommufd_access_pin_pages, IOMMUFD); 1109 1110 /** 1111 * iommufd_access_rw - Read or write data under the iova 1112 * @access: IOAS access to act on 1113 * @iova: Starting IOVA 1114 * @data: Kernel buffer to copy to/from 1115 * @length: Number of bytes to access 1116 * @flags: IOMMUFD_ACCESS_RW_* flags 1117 * 1118 * Copy kernel to/from data into the range given by IOVA/length. If flags 1119 * indicates IOMMUFD_ACCESS_RW_KTHREAD then a large copy can be optimized 1120 * by changing it into copy_to/from_user(). 1121 */ 1122 int iommufd_access_rw(struct iommufd_access *access, unsigned long iova, 1123 void *data, size_t length, unsigned int flags) 1124 { 1125 struct iopt_area_contig_iter iter; 1126 struct io_pagetable *iopt; 1127 struct iopt_area *area; 1128 unsigned long last_iova; 1129 int rc; 1130 1131 if (!length) 1132 return -EINVAL; 1133 if (check_add_overflow(iova, length - 1, &last_iova)) 1134 return -EOVERFLOW; 1135 1136 mutex_lock(&access->ioas_lock); 1137 if (!access->ioas) { 1138 mutex_unlock(&access->ioas_lock); 1139 return -ENOENT; 1140 } 1141 iopt = &access->ioas->iopt; 1142 1143 down_read(&iopt->iova_rwsem); 1144 iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) { 1145 unsigned long last = min(last_iova, iopt_area_last_iova(area)); 1146 unsigned long bytes = (last - iter.cur_iova) + 1; 1147 1148 if (area->prevent_access) { 1149 rc = -EINVAL; 1150 goto err_out; 1151 } 1152 1153 if (!check_area_prot(area, flags)) { 1154 rc = -EPERM; 1155 goto err_out; 1156 } 1157 1158 rc = iopt_pages_rw_access( 1159 area->pages, iopt_area_start_byte(area, iter.cur_iova), 1160 data, bytes, flags); 1161 if (rc) 1162 goto err_out; 1163 data += bytes; 1164 } 1165 if (!iopt_area_contig_done(&iter)) 1166 rc = -ENOENT; 1167 err_out: 1168 up_read(&iopt->iova_rwsem); 1169 mutex_unlock(&access->ioas_lock); 1170 return rc; 1171 } 1172 EXPORT_SYMBOL_NS_GPL(iommufd_access_rw, IOMMUFD); 1173 1174 int iommufd_get_hw_info(struct iommufd_ucmd *ucmd) 1175 { 1176 struct iommu_hw_info *cmd = ucmd->cmd; 1177 void __user *user_ptr = u64_to_user_ptr(cmd->data_uptr); 1178 const struct iommu_ops *ops; 1179 struct iommufd_device *idev; 1180 unsigned int data_len; 1181 unsigned int copy_len; 1182 void *data; 1183 int rc; 1184 1185 if (cmd->flags || cmd->__reserved) 1186 return -EOPNOTSUPP; 1187 1188 idev = iommufd_get_device(ucmd, cmd->dev_id); 1189 if (IS_ERR(idev)) 1190 return PTR_ERR(idev); 1191 1192 ops = dev_iommu_ops(idev->dev); 1193 if (ops->hw_info) { 1194 data = ops->hw_info(idev->dev, &data_len, &cmd->out_data_type); 1195 if (IS_ERR(data)) { 1196 rc = PTR_ERR(data); 1197 goto out_put; 1198 } 1199 1200 /* 1201 * drivers that have hw_info callback should have a unique 1202 * iommu_hw_info_type. 1203 */ 1204 if (WARN_ON_ONCE(cmd->out_data_type == 1205 IOMMU_HW_INFO_TYPE_NONE)) { 1206 rc = -ENODEV; 1207 goto out_free; 1208 } 1209 } else { 1210 cmd->out_data_type = IOMMU_HW_INFO_TYPE_NONE; 1211 data_len = 0; 1212 data = NULL; 1213 } 1214 1215 copy_len = min(cmd->data_len, data_len); 1216 if (copy_to_user(user_ptr, data, copy_len)) { 1217 rc = -EFAULT; 1218 goto out_free; 1219 } 1220 1221 /* 1222 * Zero the trailing bytes if the user buffer is bigger than the 1223 * data size kernel actually has. 1224 */ 1225 if (copy_len < cmd->data_len) { 1226 if (clear_user(user_ptr + copy_len, cmd->data_len - copy_len)) { 1227 rc = -EFAULT; 1228 goto out_free; 1229 } 1230 } 1231 1232 /* 1233 * We return the length the kernel supports so userspace may know what 1234 * the kernel capability is. It could be larger than the input buffer. 1235 */ 1236 cmd->data_len = data_len; 1237 1238 cmd->out_capabilities = 0; 1239 if (device_iommu_capable(idev->dev, IOMMU_CAP_DIRTY_TRACKING)) 1240 cmd->out_capabilities |= IOMMU_HW_CAP_DIRTY_TRACKING; 1241 1242 rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); 1243 out_free: 1244 kfree(data); 1245 out_put: 1246 iommufd_put_object(ucmd->ictx, &idev->obj); 1247 return rc; 1248 } 1249