1 // SPDX-License-Identifier: GPL-2.0 2 /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. 3 * 4 * The io_pagetable is the top of datastructure that maps IOVA's to PFNs. The 5 * PFNs can be placed into an iommu_domain, or returned to the caller as a page 6 * list for access by an in-kernel user. 7 * 8 * The datastructure uses the iopt_pages to optimize the storage of the PFNs 9 * between the domains and xarray. 10 */ 11 #include <linux/iommufd.h> 12 #include <linux/lockdep.h> 13 #include <linux/iommu.h> 14 #include <linux/sched/mm.h> 15 #include <linux/err.h> 16 #include <linux/slab.h> 17 #include <linux/errno.h> 18 19 #include "io_pagetable.h" 20 #include "double_span.h" 21 22 struct iopt_pages_list { 23 struct iopt_pages *pages; 24 struct iopt_area *area; 25 struct list_head next; 26 unsigned long start_byte; 27 unsigned long length; 28 }; 29 30 struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter, 31 struct io_pagetable *iopt, 32 unsigned long iova, 33 unsigned long last_iova) 34 { 35 lockdep_assert_held(&iopt->iova_rwsem); 36 37 iter->cur_iova = iova; 38 iter->last_iova = last_iova; 39 iter->area = iopt_area_iter_first(iopt, iova, iova); 40 if (!iter->area) 41 return NULL; 42 if (!iter->area->pages) { 43 iter->area = NULL; 44 return NULL; 45 } 46 return iter->area; 47 } 48 49 struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter) 50 { 51 unsigned long last_iova; 52 53 if (!iter->area) 54 return NULL; 55 last_iova = iopt_area_last_iova(iter->area); 56 if (iter->last_iova <= last_iova) 57 return NULL; 58 59 iter->cur_iova = last_iova + 1; 60 iter->area = iopt_area_iter_next(iter->area, iter->cur_iova, 61 iter->last_iova); 62 if (!iter->area) 63 return NULL; 64 if (iter->cur_iova != iopt_area_iova(iter->area) || 65 !iter->area->pages) { 66 iter->area = NULL; 67 return NULL; 68 } 69 return iter->area; 70 } 71 72 static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span, 73 unsigned long length, 74 unsigned long iova_alignment, 75 unsigned long page_offset) 76 { 77 if (span->is_used || span->last_hole - span->start_hole < length - 1) 78 return false; 79 80 span->start_hole = ALIGN(span->start_hole, iova_alignment) | 81 page_offset; 82 if (span->start_hole > span->last_hole || 83 span->last_hole - span->start_hole < length - 1) 84 return false; 85 return true; 86 } 87 88 static bool __alloc_iova_check_used(struct interval_tree_span_iter *span, 89 unsigned long length, 90 unsigned long iova_alignment, 91 unsigned long page_offset) 92 { 93 if (span->is_hole || span->last_used - span->start_used < length - 1) 94 return false; 95 96 span->start_used = ALIGN(span->start_used, iova_alignment) | 97 page_offset; 98 if (span->start_used > span->last_used || 99 span->last_used - span->start_used < length - 1) 100 return false; 101 return true; 102 } 103 104 /* 105 * Automatically find a block of IOVA that is not being used and not reserved. 106 * Does not return a 0 IOVA even if it is valid. 107 */ 108 static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova, 109 unsigned long uptr, unsigned long length) 110 { 111 unsigned long page_offset = uptr % PAGE_SIZE; 112 struct interval_tree_double_span_iter used_span; 113 struct interval_tree_span_iter allowed_span; 114 unsigned long iova_alignment; 115 116 lockdep_assert_held(&iopt->iova_rwsem); 117 118 /* Protect roundup_pow-of_two() from overflow */ 119 if (length == 0 || length >= ULONG_MAX / 2) 120 return -EOVERFLOW; 121 122 /* 123 * Keep alignment present in the uptr when building the IOVA, this 124 * increases the chance we can map a THP. 125 */ 126 if (!uptr) 127 iova_alignment = roundup_pow_of_two(length); 128 else 129 iova_alignment = min_t(unsigned long, 130 roundup_pow_of_two(length), 131 1UL << __ffs64(uptr)); 132 133 if (iova_alignment < iopt->iova_alignment) 134 return -EINVAL; 135 136 interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree, 137 PAGE_SIZE, ULONG_MAX - PAGE_SIZE) { 138 if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) { 139 allowed_span.start_used = PAGE_SIZE; 140 allowed_span.last_used = ULONG_MAX - PAGE_SIZE; 141 allowed_span.is_hole = false; 142 } 143 144 if (!__alloc_iova_check_used(&allowed_span, length, 145 iova_alignment, page_offset)) 146 continue; 147 148 interval_tree_for_each_double_span( 149 &used_span, &iopt->reserved_itree, &iopt->area_itree, 150 allowed_span.start_used, allowed_span.last_used) { 151 if (!__alloc_iova_check_hole(&used_span, length, 152 iova_alignment, 153 page_offset)) 154 continue; 155 156 *iova = used_span.start_hole; 157 return 0; 158 } 159 } 160 return -ENOSPC; 161 } 162 163 static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova, 164 unsigned long length) 165 { 166 unsigned long last; 167 168 lockdep_assert_held(&iopt->iova_rwsem); 169 170 if ((iova & (iopt->iova_alignment - 1))) 171 return -EINVAL; 172 173 if (check_add_overflow(iova, length - 1, &last)) 174 return -EOVERFLOW; 175 176 /* No reserved IOVA intersects the range */ 177 if (iopt_reserved_iter_first(iopt, iova, last)) 178 return -EINVAL; 179 180 /* Check that there is not already a mapping in the range */ 181 if (iopt_area_iter_first(iopt, iova, last)) 182 return -EEXIST; 183 return 0; 184 } 185 186 /* 187 * The area takes a slice of the pages from start_bytes to start_byte + length 188 */ 189 static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area, 190 struct iopt_pages *pages, unsigned long iova, 191 unsigned long start_byte, unsigned long length, 192 int iommu_prot) 193 { 194 lockdep_assert_held_write(&iopt->iova_rwsem); 195 196 if ((iommu_prot & IOMMU_WRITE) && !pages->writable) 197 return -EPERM; 198 199 area->iommu_prot = iommu_prot; 200 area->page_offset = start_byte % PAGE_SIZE; 201 if (area->page_offset & (iopt->iova_alignment - 1)) 202 return -EINVAL; 203 204 area->node.start = iova; 205 if (check_add_overflow(iova, length - 1, &area->node.last)) 206 return -EOVERFLOW; 207 208 area->pages_node.start = start_byte / PAGE_SIZE; 209 if (check_add_overflow(start_byte, length - 1, &area->pages_node.last)) 210 return -EOVERFLOW; 211 area->pages_node.last = area->pages_node.last / PAGE_SIZE; 212 if (WARN_ON(area->pages_node.last >= pages->npages)) 213 return -EOVERFLOW; 214 215 /* 216 * The area is inserted with a NULL pages indicating it is not fully 217 * initialized yet. 218 */ 219 area->iopt = iopt; 220 interval_tree_insert(&area->node, &iopt->area_itree); 221 return 0; 222 } 223 224 static int iopt_alloc_area_pages(struct io_pagetable *iopt, 225 struct list_head *pages_list, 226 unsigned long length, unsigned long *dst_iova, 227 int iommu_prot, unsigned int flags) 228 { 229 struct iopt_pages_list *elm; 230 unsigned long iova; 231 int rc = 0; 232 233 list_for_each_entry(elm, pages_list, next) { 234 elm->area = kzalloc(sizeof(*elm->area), GFP_KERNEL_ACCOUNT); 235 if (!elm->area) 236 return -ENOMEM; 237 } 238 239 down_write(&iopt->iova_rwsem); 240 if ((length & (iopt->iova_alignment - 1)) || !length) { 241 rc = -EINVAL; 242 goto out_unlock; 243 } 244 245 if (flags & IOPT_ALLOC_IOVA) { 246 /* Use the first entry to guess the ideal IOVA alignment */ 247 elm = list_first_entry(pages_list, struct iopt_pages_list, 248 next); 249 rc = iopt_alloc_iova( 250 iopt, dst_iova, 251 (uintptr_t)elm->pages->uptr + elm->start_byte, length); 252 if (rc) 253 goto out_unlock; 254 if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && 255 WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) { 256 rc = -EINVAL; 257 goto out_unlock; 258 } 259 } else { 260 rc = iopt_check_iova(iopt, *dst_iova, length); 261 if (rc) 262 goto out_unlock; 263 } 264 265 /* 266 * Areas are created with a NULL pages so that the IOVA space is 267 * reserved and we can unlock the iova_rwsem. 268 */ 269 iova = *dst_iova; 270 list_for_each_entry(elm, pages_list, next) { 271 rc = iopt_insert_area(iopt, elm->area, elm->pages, iova, 272 elm->start_byte, elm->length, iommu_prot); 273 if (rc) 274 goto out_unlock; 275 iova += elm->length; 276 } 277 278 out_unlock: 279 up_write(&iopt->iova_rwsem); 280 return rc; 281 } 282 283 static void iopt_abort_area(struct iopt_area *area) 284 { 285 if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) 286 WARN_ON(area->pages); 287 if (area->iopt) { 288 down_write(&area->iopt->iova_rwsem); 289 interval_tree_remove(&area->node, &area->iopt->area_itree); 290 up_write(&area->iopt->iova_rwsem); 291 } 292 kfree(area); 293 } 294 295 void iopt_free_pages_list(struct list_head *pages_list) 296 { 297 struct iopt_pages_list *elm; 298 299 while ((elm = list_first_entry_or_null(pages_list, 300 struct iopt_pages_list, next))) { 301 if (elm->area) 302 iopt_abort_area(elm->area); 303 if (elm->pages) 304 iopt_put_pages(elm->pages); 305 list_del(&elm->next); 306 kfree(elm); 307 } 308 } 309 310 static int iopt_fill_domains_pages(struct list_head *pages_list) 311 { 312 struct iopt_pages_list *undo_elm; 313 struct iopt_pages_list *elm; 314 int rc; 315 316 list_for_each_entry(elm, pages_list, next) { 317 rc = iopt_area_fill_domains(elm->area, elm->pages); 318 if (rc) 319 goto err_undo; 320 } 321 return 0; 322 323 err_undo: 324 list_for_each_entry(undo_elm, pages_list, next) { 325 if (undo_elm == elm) 326 break; 327 iopt_area_unfill_domains(undo_elm->area, undo_elm->pages); 328 } 329 return rc; 330 } 331 332 int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list, 333 unsigned long length, unsigned long *dst_iova, 334 int iommu_prot, unsigned int flags) 335 { 336 struct iopt_pages_list *elm; 337 int rc; 338 339 rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova, 340 iommu_prot, flags); 341 if (rc) 342 return rc; 343 344 down_read(&iopt->domains_rwsem); 345 rc = iopt_fill_domains_pages(pages_list); 346 if (rc) 347 goto out_unlock_domains; 348 349 down_write(&iopt->iova_rwsem); 350 list_for_each_entry(elm, pages_list, next) { 351 /* 352 * area->pages must be set inside the domains_rwsem to ensure 353 * any newly added domains will get filled. Moves the reference 354 * in from the list. 355 */ 356 elm->area->pages = elm->pages; 357 elm->pages = NULL; 358 elm->area = NULL; 359 } 360 up_write(&iopt->iova_rwsem); 361 out_unlock_domains: 362 up_read(&iopt->domains_rwsem); 363 return rc; 364 } 365 366 /** 367 * iopt_map_user_pages() - Map a user VA to an iova in the io page table 368 * @ictx: iommufd_ctx the iopt is part of 369 * @iopt: io_pagetable to act on 370 * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains 371 * the chosen iova on output. Otherwise is the iova to map to on input 372 * @uptr: User VA to map 373 * @length: Number of bytes to map 374 * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping 375 * @flags: IOPT_ALLOC_IOVA or zero 376 * 377 * iova, uptr, and length must be aligned to iova_alignment. For domain backed 378 * page tables this will pin the pages and load them into the domain at iova. 379 * For non-domain page tables this will only setup a lazy reference and the 380 * caller must use iopt_access_pages() to touch them. 381 * 382 * iopt_unmap_iova() must be called to undo this before the io_pagetable can be 383 * destroyed. 384 */ 385 int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, 386 unsigned long *iova, void __user *uptr, 387 unsigned long length, int iommu_prot, 388 unsigned int flags) 389 { 390 struct iopt_pages_list elm = {}; 391 LIST_HEAD(pages_list); 392 int rc; 393 394 elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE); 395 if (IS_ERR(elm.pages)) 396 return PTR_ERR(elm.pages); 397 if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM && 398 elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER) 399 elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM; 400 elm.start_byte = uptr - elm.pages->uptr; 401 elm.length = length; 402 list_add(&elm.next, &pages_list); 403 404 rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags); 405 if (rc) { 406 if (elm.area) 407 iopt_abort_area(elm.area); 408 if (elm.pages) 409 iopt_put_pages(elm.pages); 410 return rc; 411 } 412 return 0; 413 } 414 415 int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova, 416 unsigned long length, struct list_head *pages_list) 417 { 418 struct iopt_area_contig_iter iter; 419 unsigned long last_iova; 420 struct iopt_area *area; 421 int rc; 422 423 if (!length) 424 return -EINVAL; 425 if (check_add_overflow(iova, length - 1, &last_iova)) 426 return -EOVERFLOW; 427 428 down_read(&iopt->iova_rwsem); 429 iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) { 430 struct iopt_pages_list *elm; 431 unsigned long last = min(last_iova, iopt_area_last_iova(area)); 432 433 elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT); 434 if (!elm) { 435 rc = -ENOMEM; 436 goto err_free; 437 } 438 elm->start_byte = iopt_area_start_byte(area, iter.cur_iova); 439 elm->pages = area->pages; 440 elm->length = (last - iter.cur_iova) + 1; 441 kref_get(&elm->pages->kref); 442 list_add_tail(&elm->next, pages_list); 443 } 444 if (!iopt_area_contig_done(&iter)) { 445 rc = -ENOENT; 446 goto err_free; 447 } 448 up_read(&iopt->iova_rwsem); 449 return 0; 450 err_free: 451 up_read(&iopt->iova_rwsem); 452 iopt_free_pages_list(pages_list); 453 return rc; 454 } 455 456 static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start, 457 unsigned long last, unsigned long *unmapped) 458 { 459 struct iopt_area *area; 460 unsigned long unmapped_bytes = 0; 461 unsigned int tries = 0; 462 int rc = -ENOENT; 463 464 /* 465 * The domains_rwsem must be held in read mode any time any area->pages 466 * is NULL. This prevents domain attach/detatch from running 467 * concurrently with cleaning up the area. 468 */ 469 again: 470 down_read(&iopt->domains_rwsem); 471 down_write(&iopt->iova_rwsem); 472 while ((area = iopt_area_iter_first(iopt, start, last))) { 473 unsigned long area_last = iopt_area_last_iova(area); 474 unsigned long area_first = iopt_area_iova(area); 475 struct iopt_pages *pages; 476 477 /* Userspace should not race map/unmap's of the same area */ 478 if (!area->pages) { 479 rc = -EBUSY; 480 goto out_unlock_iova; 481 } 482 483 if (area_first < start || area_last > last) { 484 rc = -ENOENT; 485 goto out_unlock_iova; 486 } 487 488 if (area_first != start) 489 tries = 0; 490 491 /* 492 * num_accesses writers must hold the iova_rwsem too, so we can 493 * safely read it under the write side of the iovam_rwsem 494 * without the pages->mutex. 495 */ 496 if (area->num_accesses) { 497 size_t length = iopt_area_length(area); 498 499 start = area_first; 500 area->prevent_access = true; 501 up_write(&iopt->iova_rwsem); 502 up_read(&iopt->domains_rwsem); 503 504 iommufd_access_notify_unmap(iopt, area_first, length); 505 /* Something is not responding to unmap requests. */ 506 tries++; 507 if (WARN_ON(tries > 100)) 508 return -EDEADLOCK; 509 goto again; 510 } 511 512 pages = area->pages; 513 area->pages = NULL; 514 up_write(&iopt->iova_rwsem); 515 516 iopt_area_unfill_domains(area, pages); 517 iopt_abort_area(area); 518 iopt_put_pages(pages); 519 520 unmapped_bytes += area_last - area_first + 1; 521 522 down_write(&iopt->iova_rwsem); 523 } 524 if (unmapped_bytes) 525 rc = 0; 526 527 out_unlock_iova: 528 up_write(&iopt->iova_rwsem); 529 up_read(&iopt->domains_rwsem); 530 if (unmapped) 531 *unmapped = unmapped_bytes; 532 return rc; 533 } 534 535 /** 536 * iopt_unmap_iova() - Remove a range of iova 537 * @iopt: io_pagetable to act on 538 * @iova: Starting iova to unmap 539 * @length: Number of bytes to unmap 540 * @unmapped: Return number of bytes unmapped 541 * 542 * The requested range must be a superset of existing ranges. 543 * Splitting/truncating IOVA mappings is not allowed. 544 */ 545 int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova, 546 unsigned long length, unsigned long *unmapped) 547 { 548 unsigned long iova_last; 549 550 if (!length) 551 return -EINVAL; 552 553 if (check_add_overflow(iova, length - 1, &iova_last)) 554 return -EOVERFLOW; 555 556 return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped); 557 } 558 559 int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped) 560 { 561 int rc; 562 563 rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped); 564 /* If the IOVAs are empty then unmap all succeeds */ 565 if (rc == -ENOENT) 566 return 0; 567 return rc; 568 } 569 570 /* The caller must always free all the nodes in the allowed_iova rb_root. */ 571 int iopt_set_allow_iova(struct io_pagetable *iopt, 572 struct rb_root_cached *allowed_iova) 573 { 574 struct iopt_allowed *allowed; 575 576 down_write(&iopt->iova_rwsem); 577 swap(*allowed_iova, iopt->allowed_itree); 578 579 for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed; 580 allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) { 581 if (iopt_reserved_iter_first(iopt, allowed->node.start, 582 allowed->node.last)) { 583 swap(*allowed_iova, iopt->allowed_itree); 584 up_write(&iopt->iova_rwsem); 585 return -EADDRINUSE; 586 } 587 } 588 up_write(&iopt->iova_rwsem); 589 return 0; 590 } 591 592 int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start, 593 unsigned long last, void *owner) 594 { 595 struct iopt_reserved *reserved; 596 597 lockdep_assert_held_write(&iopt->iova_rwsem); 598 599 if (iopt_area_iter_first(iopt, start, last) || 600 iopt_allowed_iter_first(iopt, start, last)) 601 return -EADDRINUSE; 602 603 reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT); 604 if (!reserved) 605 return -ENOMEM; 606 reserved->node.start = start; 607 reserved->node.last = last; 608 reserved->owner = owner; 609 interval_tree_insert(&reserved->node, &iopt->reserved_itree); 610 return 0; 611 } 612 613 static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner) 614 { 615 struct iopt_reserved *reserved, *next; 616 617 lockdep_assert_held_write(&iopt->iova_rwsem); 618 619 for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved; 620 reserved = next) { 621 next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX); 622 623 if (reserved->owner == owner) { 624 interval_tree_remove(&reserved->node, 625 &iopt->reserved_itree); 626 kfree(reserved); 627 } 628 } 629 } 630 631 void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner) 632 { 633 down_write(&iopt->iova_rwsem); 634 __iopt_remove_reserved_iova(iopt, owner); 635 up_write(&iopt->iova_rwsem); 636 } 637 638 void iopt_init_table(struct io_pagetable *iopt) 639 { 640 init_rwsem(&iopt->iova_rwsem); 641 init_rwsem(&iopt->domains_rwsem); 642 iopt->area_itree = RB_ROOT_CACHED; 643 iopt->allowed_itree = RB_ROOT_CACHED; 644 iopt->reserved_itree = RB_ROOT_CACHED; 645 xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT); 646 xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC); 647 648 /* 649 * iopt's start as SW tables that can use the entire size_t IOVA space 650 * due to the use of size_t in the APIs. They have no alignment 651 * restriction. 652 */ 653 iopt->iova_alignment = 1; 654 } 655 656 void iopt_destroy_table(struct io_pagetable *iopt) 657 { 658 struct interval_tree_node *node; 659 660 if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) 661 iopt_remove_reserved_iova(iopt, NULL); 662 663 while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0, 664 ULONG_MAX))) { 665 interval_tree_remove(node, &iopt->allowed_itree); 666 kfree(container_of(node, struct iopt_allowed, node)); 667 } 668 669 WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root)); 670 WARN_ON(!xa_empty(&iopt->domains)); 671 WARN_ON(!xa_empty(&iopt->access_list)); 672 WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root)); 673 } 674 675 /** 676 * iopt_unfill_domain() - Unfill a domain with PFNs 677 * @iopt: io_pagetable to act on 678 * @domain: domain to unfill 679 * 680 * This is used when removing a domain from the iopt. Every area in the iopt 681 * will be unmapped from the domain. The domain must already be removed from the 682 * domains xarray. 683 */ 684 static void iopt_unfill_domain(struct io_pagetable *iopt, 685 struct iommu_domain *domain) 686 { 687 struct iopt_area *area; 688 689 lockdep_assert_held(&iopt->iova_rwsem); 690 lockdep_assert_held_write(&iopt->domains_rwsem); 691 692 /* 693 * Some other domain is holding all the pfns still, rapidly unmap this 694 * domain. 695 */ 696 if (iopt->next_domain_id != 0) { 697 /* Pick an arbitrary remaining domain to act as storage */ 698 struct iommu_domain *storage_domain = 699 xa_load(&iopt->domains, 0); 700 701 for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; 702 area = iopt_area_iter_next(area, 0, ULONG_MAX)) { 703 struct iopt_pages *pages = area->pages; 704 705 if (!pages) 706 continue; 707 708 mutex_lock(&pages->mutex); 709 if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) 710 WARN_ON(!area->storage_domain); 711 if (area->storage_domain == domain) 712 area->storage_domain = storage_domain; 713 mutex_unlock(&pages->mutex); 714 715 iopt_area_unmap_domain(area, domain); 716 } 717 return; 718 } 719 720 for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; 721 area = iopt_area_iter_next(area, 0, ULONG_MAX)) { 722 struct iopt_pages *pages = area->pages; 723 724 if (!pages) 725 continue; 726 727 mutex_lock(&pages->mutex); 728 interval_tree_remove(&area->pages_node, &pages->domains_itree); 729 WARN_ON(area->storage_domain != domain); 730 area->storage_domain = NULL; 731 iopt_area_unfill_domain(area, pages, domain); 732 mutex_unlock(&pages->mutex); 733 } 734 } 735 736 /** 737 * iopt_fill_domain() - Fill a domain with PFNs 738 * @iopt: io_pagetable to act on 739 * @domain: domain to fill 740 * 741 * Fill the domain with PFNs from every area in the iopt. On failure the domain 742 * is left unchanged. 743 */ 744 static int iopt_fill_domain(struct io_pagetable *iopt, 745 struct iommu_domain *domain) 746 { 747 struct iopt_area *end_area; 748 struct iopt_area *area; 749 int rc; 750 751 lockdep_assert_held(&iopt->iova_rwsem); 752 lockdep_assert_held_write(&iopt->domains_rwsem); 753 754 for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; 755 area = iopt_area_iter_next(area, 0, ULONG_MAX)) { 756 struct iopt_pages *pages = area->pages; 757 758 if (!pages) 759 continue; 760 761 mutex_lock(&pages->mutex); 762 rc = iopt_area_fill_domain(area, domain); 763 if (rc) { 764 mutex_unlock(&pages->mutex); 765 goto out_unfill; 766 } 767 if (!area->storage_domain) { 768 WARN_ON(iopt->next_domain_id != 0); 769 area->storage_domain = domain; 770 interval_tree_insert(&area->pages_node, 771 &pages->domains_itree); 772 } 773 mutex_unlock(&pages->mutex); 774 } 775 return 0; 776 777 out_unfill: 778 end_area = area; 779 for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; 780 area = iopt_area_iter_next(area, 0, ULONG_MAX)) { 781 struct iopt_pages *pages = area->pages; 782 783 if (area == end_area) 784 break; 785 if (!pages) 786 continue; 787 mutex_lock(&pages->mutex); 788 if (iopt->next_domain_id == 0) { 789 interval_tree_remove(&area->pages_node, 790 &pages->domains_itree); 791 area->storage_domain = NULL; 792 } 793 iopt_area_unfill_domain(area, pages, domain); 794 mutex_unlock(&pages->mutex); 795 } 796 return rc; 797 } 798 799 /* All existing area's conform to an increased page size */ 800 static int iopt_check_iova_alignment(struct io_pagetable *iopt, 801 unsigned long new_iova_alignment) 802 { 803 unsigned long align_mask = new_iova_alignment - 1; 804 struct iopt_area *area; 805 806 lockdep_assert_held(&iopt->iova_rwsem); 807 lockdep_assert_held(&iopt->domains_rwsem); 808 809 for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; 810 area = iopt_area_iter_next(area, 0, ULONG_MAX)) 811 if ((iopt_area_iova(area) & align_mask) || 812 (iopt_area_length(area) & align_mask) || 813 (area->page_offset & align_mask)) 814 return -EADDRINUSE; 815 816 if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) { 817 struct iommufd_access *access; 818 unsigned long index; 819 820 xa_for_each(&iopt->access_list, index, access) 821 if (WARN_ON(access->iova_alignment > 822 new_iova_alignment)) 823 return -EADDRINUSE; 824 } 825 return 0; 826 } 827 828 int iopt_table_add_domain(struct io_pagetable *iopt, 829 struct iommu_domain *domain) 830 { 831 const struct iommu_domain_geometry *geometry = &domain->geometry; 832 struct iommu_domain *iter_domain; 833 unsigned int new_iova_alignment; 834 unsigned long index; 835 int rc; 836 837 down_write(&iopt->domains_rwsem); 838 down_write(&iopt->iova_rwsem); 839 840 xa_for_each(&iopt->domains, index, iter_domain) { 841 if (WARN_ON(iter_domain == domain)) { 842 rc = -EEXIST; 843 goto out_unlock; 844 } 845 } 846 847 /* 848 * The io page size drives the iova_alignment. Internally the iopt_pages 849 * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE 850 * objects into the iommu_domain. 851 * 852 * A iommu_domain must always be able to accept PAGE_SIZE to be 853 * compatible as we can't guarantee higher contiguity. 854 */ 855 new_iova_alignment = max_t(unsigned long, 856 1UL << __ffs(domain->pgsize_bitmap), 857 iopt->iova_alignment); 858 if (new_iova_alignment > PAGE_SIZE) { 859 rc = -EINVAL; 860 goto out_unlock; 861 } 862 if (new_iova_alignment != iopt->iova_alignment) { 863 rc = iopt_check_iova_alignment(iopt, new_iova_alignment); 864 if (rc) 865 goto out_unlock; 866 } 867 868 /* No area exists that is outside the allowed domain aperture */ 869 if (geometry->aperture_start != 0) { 870 rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1, 871 domain); 872 if (rc) 873 goto out_reserved; 874 } 875 if (geometry->aperture_end != ULONG_MAX) { 876 rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1, 877 ULONG_MAX, domain); 878 if (rc) 879 goto out_reserved; 880 } 881 882 rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL); 883 if (rc) 884 goto out_reserved; 885 886 rc = iopt_fill_domain(iopt, domain); 887 if (rc) 888 goto out_release; 889 890 iopt->iova_alignment = new_iova_alignment; 891 xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL); 892 iopt->next_domain_id++; 893 up_write(&iopt->iova_rwsem); 894 up_write(&iopt->domains_rwsem); 895 return 0; 896 out_release: 897 xa_release(&iopt->domains, iopt->next_domain_id); 898 out_reserved: 899 __iopt_remove_reserved_iova(iopt, domain); 900 out_unlock: 901 up_write(&iopt->iova_rwsem); 902 up_write(&iopt->domains_rwsem); 903 return rc; 904 } 905 906 static int iopt_calculate_iova_alignment(struct io_pagetable *iopt) 907 { 908 unsigned long new_iova_alignment; 909 struct iommufd_access *access; 910 struct iommu_domain *domain; 911 unsigned long index; 912 913 lockdep_assert_held_write(&iopt->iova_rwsem); 914 lockdep_assert_held(&iopt->domains_rwsem); 915 916 /* See batch_iommu_map_small() */ 917 if (iopt->disable_large_pages) 918 new_iova_alignment = PAGE_SIZE; 919 else 920 new_iova_alignment = 1; 921 922 xa_for_each(&iopt->domains, index, domain) 923 new_iova_alignment = max_t(unsigned long, 924 1UL << __ffs(domain->pgsize_bitmap), 925 new_iova_alignment); 926 xa_for_each(&iopt->access_list, index, access) 927 new_iova_alignment = max_t(unsigned long, 928 access->iova_alignment, 929 new_iova_alignment); 930 931 if (new_iova_alignment > iopt->iova_alignment) { 932 int rc; 933 934 rc = iopt_check_iova_alignment(iopt, new_iova_alignment); 935 if (rc) 936 return rc; 937 } 938 iopt->iova_alignment = new_iova_alignment; 939 return 0; 940 } 941 942 void iopt_table_remove_domain(struct io_pagetable *iopt, 943 struct iommu_domain *domain) 944 { 945 struct iommu_domain *iter_domain = NULL; 946 unsigned long index; 947 948 down_write(&iopt->domains_rwsem); 949 down_write(&iopt->iova_rwsem); 950 951 xa_for_each(&iopt->domains, index, iter_domain) 952 if (iter_domain == domain) 953 break; 954 if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id) 955 goto out_unlock; 956 957 /* 958 * Compress the xarray to keep it linear by swapping the entry to erase 959 * with the tail entry and shrinking the tail. 960 */ 961 iopt->next_domain_id--; 962 iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id); 963 if (index != iopt->next_domain_id) 964 xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL); 965 966 iopt_unfill_domain(iopt, domain); 967 __iopt_remove_reserved_iova(iopt, domain); 968 969 WARN_ON(iopt_calculate_iova_alignment(iopt)); 970 out_unlock: 971 up_write(&iopt->iova_rwsem); 972 up_write(&iopt->domains_rwsem); 973 } 974 975 /** 976 * iopt_area_split - Split an area into two parts at iova 977 * @area: The area to split 978 * @iova: Becomes the last of a new area 979 * 980 * This splits an area into two. It is part of the VFIO compatibility to allow 981 * poking a hole in the mapping. The two areas continue to point at the same 982 * iopt_pages, just with different starting bytes. 983 */ 984 static int iopt_area_split(struct iopt_area *area, unsigned long iova) 985 { 986 unsigned long alignment = area->iopt->iova_alignment; 987 unsigned long last_iova = iopt_area_last_iova(area); 988 unsigned long start_iova = iopt_area_iova(area); 989 unsigned long new_start = iova + 1; 990 struct io_pagetable *iopt = area->iopt; 991 struct iopt_pages *pages = area->pages; 992 struct iopt_area *lhs; 993 struct iopt_area *rhs; 994 int rc; 995 996 lockdep_assert_held_write(&iopt->iova_rwsem); 997 998 if (iova == start_iova || iova == last_iova) 999 return 0; 1000 1001 if (!pages || area->prevent_access) 1002 return -EBUSY; 1003 1004 if (new_start & (alignment - 1) || 1005 iopt_area_start_byte(area, new_start) & (alignment - 1)) 1006 return -EINVAL; 1007 1008 lhs = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT); 1009 if (!lhs) 1010 return -ENOMEM; 1011 1012 rhs = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT); 1013 if (!rhs) { 1014 rc = -ENOMEM; 1015 goto err_free_lhs; 1016 } 1017 1018 mutex_lock(&pages->mutex); 1019 /* 1020 * Splitting is not permitted if an access exists, we don't track enough 1021 * information to split existing accesses. 1022 */ 1023 if (area->num_accesses) { 1024 rc = -EINVAL; 1025 goto err_unlock; 1026 } 1027 1028 /* 1029 * Splitting is not permitted if a domain could have been mapped with 1030 * huge pages. 1031 */ 1032 if (area->storage_domain && !iopt->disable_large_pages) { 1033 rc = -EINVAL; 1034 goto err_unlock; 1035 } 1036 1037 interval_tree_remove(&area->node, &iopt->area_itree); 1038 rc = iopt_insert_area(iopt, lhs, area->pages, start_iova, 1039 iopt_area_start_byte(area, start_iova), 1040 (new_start - 1) - start_iova + 1, 1041 area->iommu_prot); 1042 if (WARN_ON(rc)) 1043 goto err_insert; 1044 1045 rc = iopt_insert_area(iopt, rhs, area->pages, new_start, 1046 iopt_area_start_byte(area, new_start), 1047 last_iova - new_start + 1, area->iommu_prot); 1048 if (WARN_ON(rc)) 1049 goto err_remove_lhs; 1050 1051 lhs->storage_domain = area->storage_domain; 1052 lhs->pages = area->pages; 1053 rhs->storage_domain = area->storage_domain; 1054 rhs->pages = area->pages; 1055 kref_get(&rhs->pages->kref); 1056 kfree(area); 1057 mutex_unlock(&pages->mutex); 1058 1059 /* 1060 * No change to domains or accesses because the pages hasn't been 1061 * changed 1062 */ 1063 return 0; 1064 1065 err_remove_lhs: 1066 interval_tree_remove(&lhs->node, &iopt->area_itree); 1067 err_insert: 1068 interval_tree_insert(&area->node, &iopt->area_itree); 1069 err_unlock: 1070 mutex_unlock(&pages->mutex); 1071 kfree(rhs); 1072 err_free_lhs: 1073 kfree(lhs); 1074 return rc; 1075 } 1076 1077 int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas, 1078 size_t num_iovas) 1079 { 1080 int rc = 0; 1081 int i; 1082 1083 down_write(&iopt->iova_rwsem); 1084 for (i = 0; i < num_iovas; i++) { 1085 struct iopt_area *area; 1086 1087 area = iopt_area_iter_first(iopt, iovas[i], iovas[i]); 1088 if (!area) 1089 continue; 1090 rc = iopt_area_split(area, iovas[i]); 1091 if (rc) 1092 break; 1093 } 1094 up_write(&iopt->iova_rwsem); 1095 return rc; 1096 } 1097 1098 void iopt_enable_large_pages(struct io_pagetable *iopt) 1099 { 1100 int rc; 1101 1102 down_write(&iopt->domains_rwsem); 1103 down_write(&iopt->iova_rwsem); 1104 WRITE_ONCE(iopt->disable_large_pages, false); 1105 rc = iopt_calculate_iova_alignment(iopt); 1106 WARN_ON(rc); 1107 up_write(&iopt->iova_rwsem); 1108 up_write(&iopt->domains_rwsem); 1109 } 1110 1111 int iopt_disable_large_pages(struct io_pagetable *iopt) 1112 { 1113 int rc = 0; 1114 1115 down_write(&iopt->domains_rwsem); 1116 down_write(&iopt->iova_rwsem); 1117 if (iopt->disable_large_pages) 1118 goto out_unlock; 1119 1120 /* Won't do it if domains already have pages mapped in them */ 1121 if (!xa_empty(&iopt->domains) && 1122 !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) { 1123 rc = -EINVAL; 1124 goto out_unlock; 1125 } 1126 1127 WRITE_ONCE(iopt->disable_large_pages, true); 1128 rc = iopt_calculate_iova_alignment(iopt); 1129 if (rc) 1130 WRITE_ONCE(iopt->disable_large_pages, false); 1131 out_unlock: 1132 up_write(&iopt->iova_rwsem); 1133 up_write(&iopt->domains_rwsem); 1134 return rc; 1135 } 1136 1137 int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access) 1138 { 1139 int rc; 1140 1141 down_write(&iopt->domains_rwsem); 1142 down_write(&iopt->iova_rwsem); 1143 rc = xa_alloc(&iopt->access_list, &access->iopt_access_list_id, access, 1144 xa_limit_16b, GFP_KERNEL_ACCOUNT); 1145 if (rc) 1146 goto out_unlock; 1147 1148 rc = iopt_calculate_iova_alignment(iopt); 1149 if (rc) { 1150 xa_erase(&iopt->access_list, access->iopt_access_list_id); 1151 goto out_unlock; 1152 } 1153 1154 out_unlock: 1155 up_write(&iopt->iova_rwsem); 1156 up_write(&iopt->domains_rwsem); 1157 return rc; 1158 } 1159 1160 void iopt_remove_access(struct io_pagetable *iopt, 1161 struct iommufd_access *access, 1162 u32 iopt_access_list_id) 1163 { 1164 down_write(&iopt->domains_rwsem); 1165 down_write(&iopt->iova_rwsem); 1166 WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access); 1167 WARN_ON(iopt_calculate_iova_alignment(iopt)); 1168 up_write(&iopt->iova_rwsem); 1169 up_write(&iopt->domains_rwsem); 1170 } 1171 1172 /* Narrow the valid_iova_itree to include reserved ranges from a device. */ 1173 int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt, 1174 struct device *dev, 1175 phys_addr_t *sw_msi_start) 1176 { 1177 struct iommu_resv_region *resv; 1178 LIST_HEAD(resv_regions); 1179 unsigned int num_hw_msi = 0; 1180 unsigned int num_sw_msi = 0; 1181 int rc; 1182 1183 if (iommufd_should_fail()) 1184 return -EINVAL; 1185 1186 down_write(&iopt->iova_rwsem); 1187 /* FIXME: drivers allocate memory but there is no failure propogated */ 1188 iommu_get_resv_regions(dev, &resv_regions); 1189 1190 list_for_each_entry(resv, &resv_regions, list) { 1191 if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE) 1192 continue; 1193 1194 if (sw_msi_start && resv->type == IOMMU_RESV_MSI) 1195 num_hw_msi++; 1196 if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) { 1197 *sw_msi_start = resv->start; 1198 num_sw_msi++; 1199 } 1200 1201 rc = iopt_reserve_iova(iopt, resv->start, 1202 resv->length - 1 + resv->start, dev); 1203 if (rc) 1204 goto out_reserved; 1205 } 1206 1207 /* Drivers must offer sane combinations of regions */ 1208 if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) { 1209 rc = -EINVAL; 1210 goto out_reserved; 1211 } 1212 1213 rc = 0; 1214 goto out_free_resv; 1215 1216 out_reserved: 1217 __iopt_remove_reserved_iova(iopt, dev); 1218 out_free_resv: 1219 iommu_put_resv_regions(dev, &resv_regions); 1220 up_write(&iopt->iova_rwsem); 1221 return rc; 1222 } 1223