// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
 *
 * The io_pagetable is the top of datastructure that maps IOVA's to PFNs. The
 * PFNs can be placed into an iommu_domain, or returned to the caller as a page
 * list for access by an in-kernel user.
 *
 * The datastructure uses the iopt_pages to optimize the storage of the PFNs
 * between the domains and xarray.
 */
#include <linux/dma-buf.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/iommu.h>
#include <linux/iommufd.h>
#include <linux/lockdep.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <uapi/linux/iommufd.h>

#include "double_span.h"
#include "io_pagetable.h"

/*
 * One element of a mapping request: a slice of an iopt_pages starting at
 * start_byte for length bytes. area is filled in while the map is built.
 */
struct iopt_pages_list {
	struct iopt_pages *pages;
	struct iopt_area *area;
	struct list_head next;
	unsigned long start_byte;
	unsigned long length;
};

/*
 * Start iterating the areas intersecting [iova, last_iova]. Returns NULL if
 * no area contains iova or the first area has no pages attached (not fully
 * mapped yet / being torn down).
 */
struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
					struct io_pagetable *iopt,
					unsigned long iova,
					unsigned long last_iova)
{
	lockdep_assert_held(&iopt->iova_rwsem);

	iter->cur_iova = iova;
	iter->last_iova = last_iova;
	iter->area = iopt_area_iter_first(iopt, iova, iova);
	if (!iter->area)
		return NULL;
	if (!iter->area->pages) {
		iter->area = NULL;
		return NULL;
	}
	return iter->area;
}

/*
 * Advance to the next area. Returns NULL either when the range is exhausted
 * (normal completion) or when the next area is not exactly contiguous with
 * the previous one / has no pages (use iopt_area_contig_done() to tell the
 * two apart).
 */
struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
{
	unsigned long last_iova;

	if (!iter->area)
		return NULL;
	last_iova = iopt_area_last_iova(iter->area);
	if (iter->last_iova <= last_iova)
		return NULL;

	iter->cur_iova = last_iova + 1;
	iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
					 iter->last_iova);
	if (!iter->area)
		return NULL;
	if (iter->cur_iova != iopt_area_iova(iter->area) ||
	    !iter->area->pages) {
		iter->area = NULL;
		return NULL;
	}
	return iter->area;
}

/*
 * Check if [*start, last] can hold length bytes at the requested alignment
 * while preserving the sub-page offset. On success *start is updated to the
 * aligned candidate IOVA.
 */
static
bool __alloc_iova_check_range(unsigned long *start, unsigned long last,
			      unsigned long length,
			      unsigned long iova_alignment,
			      unsigned long page_offset)
{
	unsigned long aligned_start;

	/* ALIGN_UP() */
	if (check_add_overflow(*start, iova_alignment - 1, &aligned_start))
		return false;
	aligned_start &= ~(iova_alignment - 1);
	aligned_start |= page_offset;

	if (aligned_start >= last || last - aligned_start < length - 1)
		return false;
	*start = aligned_start;
	return true;
}

/* As above, but for a hole reported by a double-span iterator */
static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
				    unsigned long length,
				    unsigned long iova_alignment,
				    unsigned long page_offset)
{
	if (span->is_used)
		return false;
	return __alloc_iova_check_range(&span->start_hole, span->last_hole,
					length, iova_alignment, page_offset);
}

/* As above, but for a used span (an entry in the allowed ranges tree) */
static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
				    unsigned long length,
				    unsigned long iova_alignment,
				    unsigned long page_offset)
{
	if (span->is_hole)
		return false;
	return __alloc_iova_check_range(&span->start_used, span->last_used,
					length, iova_alignment, page_offset);
}

/*
 * Automatically find a block of IOVA that is not being used and not reserved.
 * Does not return a 0 IOVA even if it is valid.
 */
static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
			   unsigned long addr, unsigned long length)
{
	unsigned long page_offset = addr % PAGE_SIZE;
	struct interval_tree_double_span_iter used_span;
	struct interval_tree_span_iter allowed_span;
	unsigned long max_alignment = PAGE_SIZE;
	unsigned long iova_alignment;

	lockdep_assert_held(&iopt->iova_rwsem);

	/* Protect roundup_pow_of_two() from overflow */
	if (length == 0 || length >= ULONG_MAX / 2)
		return -EOVERFLOW;

	/*
	 * Keep alignment present in addr when building the IOVA, which
	 * increases the chance we can map a THP.
	 */
	if (!addr)
		iova_alignment = roundup_pow_of_two(length);
	else
		iova_alignment = min_t(unsigned long,
				       roundup_pow_of_two(length),
				       1UL << __ffs64(addr));

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	max_alignment = HPAGE_SIZE;
#endif
	/* Protect against ALIGN() overflow */
	if (iova_alignment >= max_alignment)
		iova_alignment = max_alignment;

	if (iova_alignment < iopt->iova_alignment)
		return -EINVAL;

	interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
				    PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
		/* An empty allowed tree means everything is allowed */
		if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
			allowed_span.start_used = PAGE_SIZE;
			allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
			allowed_span.is_hole = false;
		}

		if (!__alloc_iova_check_used(&allowed_span, length,
					     iova_alignment, page_offset))
			continue;

		/* Find a hole in both the reserved and area trees */
		interval_tree_for_each_double_span(
			&used_span, &iopt->reserved_itree, &iopt->area_itree,
			allowed_span.start_used, allowed_span.last_used) {
			if (!__alloc_iova_check_hole(&used_span, length,
						     iova_alignment,
						     page_offset))
				continue;

			*iova = used_span.start_hole;
			return 0;
		}
	}
	return -ENOSPC;
}

/*
 * Validate that a caller-chosen IOVA range is aligned, does not overflow, and
 * collides with neither reserved ranges nor existing areas.
 */
static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
			   unsigned long length)
{
	unsigned long last;

	lockdep_assert_held(&iopt->iova_rwsem);

	if ((iova & (iopt->iova_alignment - 1)))
		return -EINVAL;

	if (check_add_overflow(iova, length - 1, &last))
		return -EOVERFLOW;

	/* No reserved IOVA intersects the range */
	if (iopt_reserved_iter_first(iopt, iova, last))
		return -EINVAL;

	/* Check that there is not already a mapping in the range */
	if (iopt_area_iter_first(iopt, iova, last))
		return -EEXIST;
	return 0;
}

/*
 * The area takes a slice of the pages from start_byte to start_byte + length
 */
static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
			    struct iopt_pages *pages, unsigned long iova,
			    unsigned long start_byte, unsigned long length,
			    int iommu_prot)
{
	lockdep_assert_held_write(&iopt->iova_rwsem);

	if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
		return -EPERM;

	area->iommu_prot = iommu_prot;
	area->page_offset = start_byte % PAGE_SIZE;
	if (area->page_offset & (iopt->iova_alignment - 1))
		return -EINVAL;

	/* node indexes the area by IOVA */
	area->node.start = iova;
	if (check_add_overflow(iova, length - 1, &area->node.last))
		return -EOVERFLOW;

	/* pages_node indexes the area by page index within the iopt_pages */
	area->pages_node.start = start_byte / PAGE_SIZE;
	if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
		return -EOVERFLOW;
	area->pages_node.last = area->pages_node.last / PAGE_SIZE;
	if (WARN_ON(area->pages_node.last >= pages->npages))
		return -EOVERFLOW;

	/*
	 * The area is inserted with a NULL pages indicating it is not fully
	 * initialized yet.
	 */
	area->iopt = iopt;
	interval_tree_insert(&area->node, &iopt->area_itree);
	return 0;
}

/* Allocate a zeroed area with both interval tree nodes cleared */
static struct iopt_area *iopt_area_alloc(void)
{
	struct iopt_area *area;

	area = kzalloc_obj(*area, GFP_KERNEL_ACCOUNT);
	if (!area)
		return NULL;
	RB_CLEAR_NODE(&area->node.rb);
	RB_CLEAR_NODE(&area->pages_node.rb);
	return area;
}

/*
 * Allocate areas for every list element and insert them into the area tree
 * with NULL pages, choosing (IOPT_ALLOC_IOVA) or validating the IOVA range.
 */
static int iopt_alloc_area_pages(struct io_pagetable *iopt,
				 struct list_head *pages_list,
				 unsigned long length, unsigned long *dst_iova,
				 int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list *elm;
	unsigned long start;
	unsigned long iova;
	int rc = 0;

	list_for_each_entry(elm, pages_list, next) {
		elm->area = iopt_area_alloc();
		if (!elm->area)
			return -ENOMEM;
	}

	down_write(&iopt->iova_rwsem);
	if ((length & (iopt->iova_alignment - 1)) || !length) {
		rc = -EINVAL;
		goto out_unlock;
	}

	if (flags & IOPT_ALLOC_IOVA) {
		/* Use the first entry to guess the ideal IOVA alignment */
		elm = list_first_entry(pages_list, struct iopt_pages_list,
				       next);
		switch (elm->pages->type) {
		case IOPT_ADDRESS_USER:
			start = elm->start_byte + (uintptr_t)elm->pages->uptr;
			break;
		case IOPT_ADDRESS_FILE:
			start = elm->start_byte + elm->pages->start;
			break;
		case IOPT_ADDRESS_DMABUF:
			start = elm->start_byte + elm->pages->dmabuf.start;
			break;
		}
		rc = iopt_alloc_iova(iopt, dst_iova, start, length);
		if (rc)
			goto out_unlock;
		if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
		    WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) {
			rc = -EINVAL;
			goto out_unlock;
		}
	} else {
		rc = iopt_check_iova(iopt, *dst_iova, length);
		if (rc)
			goto out_unlock;
	}

	/*
	 * Areas are created with a NULL pages so that the IOVA space is
	 * reserved and we can unlock the iova_rwsem.
	 */
	iova = *dst_iova;
	list_for_each_entry(elm, pages_list, next) {
		rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
				      elm->start_byte, elm->length, iommu_prot);
		if (rc)
			goto out_unlock;
		iova += elm->length;
	}

out_unlock:
	up_write(&iopt->iova_rwsem);
	return rc;
}

/* Undo iopt_insert_area()/iopt_area_alloc() for a not fully initialized area */
static void iopt_abort_area(struct iopt_area *area)
{
	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
		WARN_ON(area->pages);
	/* area->iopt is only set once the area is in the area_itree */
	if (area->iopt) {
		down_write(&area->iopt->iova_rwsem);
		interval_tree_remove(&area->node, &area->iopt->area_itree);
		up_write(&area->iopt->iova_rwsem);
	}
	kfree(area);
}

/* Free every list element, aborting its area and dropping its pages ref */
void iopt_free_pages_list(struct list_head *pages_list)
{
	struct iopt_pages_list *elm;

	while ((elm = list_first_entry_or_null(pages_list,
					       struct iopt_pages_list, next))) {
		if (elm->area)
			iopt_abort_area(elm->area);
		if (elm->pages)
			iopt_put_pages(elm->pages);
		list_del(&elm->next);
		kfree(elm);
	}
}

/* Fill all attached domains for every area in the list; undoes on failure */
static int iopt_fill_domains_pages(struct list_head *pages_list)
{
	struct iopt_pages_list *undo_elm;
	struct iopt_pages_list *elm;
	int rc;

	list_for_each_entry(elm, pages_list, next) {
		rc = iopt_area_fill_domains(elm->area, elm->pages);
		if (rc)
			goto err_undo;
	}
	return 0;

err_undo:
	/* Unfill only the entries processed before the failing one */
	list_for_each_entry(undo_elm, pages_list, next) {
		if (undo_elm == elm)
			break;
		iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
	}
	return rc;
}

/*
 * Map a list of iopt_pages slices into the io_pagetable as consecutive areas
 * starting at *dst_iova (allocated if IOPT_ALLOC_IOVA is set).
 */
int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
		   unsigned long length, unsigned long *dst_iova,
		   int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list *elm;
	int rc;

	rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
				   iommu_prot, flags);
	if (rc)
		return rc;

	down_read(&iopt->domains_rwsem);
	rc = iopt_fill_domains_pages(pages_list);
	if (rc)
		goto
		     out_unlock_domains;

	down_write(&iopt->iova_rwsem);
	list_for_each_entry(elm, pages_list, next) {
		/*
		 * area->pages must be set inside the domains_rwsem to ensure
		 * any newly added domains will get filled. Moves the reference
		 * in from the list.
		 */
		elm->area->pages = elm->pages;
		elm->pages = NULL;
		elm->area = NULL;
	}
	up_write(&iopt->iova_rwsem);
out_unlock_domains:
	up_read(&iopt->domains_rwsem);
	return rc;
}

/*
 * Shared tail for the map entry points: wraps a single iopt_pages in a
 * one-element list and maps it, cleaning up on failure. Moves ownership of
 * pages into the map on success.
 */
static int iopt_map_common(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
			   struct iopt_pages *pages, unsigned long *iova,
			   unsigned long length, unsigned long start_byte,
			   int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list elm = {};
	LIST_HEAD(pages_list);
	int rc;

	elm.pages = pages;
	elm.start_byte = start_byte;
	/* The ctx's accounting mode overrides the default USER accounting */
	if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
	    elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
		elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
	elm.length = length;
	list_add(&elm.next, &pages_list);

	rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
	if (rc) {
		if (elm.area)
			iopt_abort_area(elm.area);
		if (elm.pages)
			iopt_put_pages(elm.pages);
		return rc;
	}
	return 0;
}

/**
 * iopt_map_user_pages() - Map a user VA to an iova in the io page table
 * @ictx: iommufd_ctx the iopt is part of
 * @iopt: io_pagetable to act on
 * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
 *        the chosen iova on output. Otherwise is the iova to map to on input
 * @uptr: User VA to map
 * @length: Number of bytes to map
 * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
 * @flags: IOPT_ALLOC_IOVA or zero
 *
 * iova, uptr, and length must be aligned to iova_alignment. For domain backed
 * page tables this will pin the pages and load them into the domain at iova.
 * For non-domain page tables this will only setup a lazy reference and the
 * caller must use iopt_access_pages() to touch them.
 *
 * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
 * destroyed.
 */
int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
			unsigned long *iova, void __user *uptr,
			unsigned long length, int iommu_prot,
			unsigned int flags)
{
	struct iopt_pages *pages;

	pages = iopt_alloc_user_pages(uptr, length, iommu_prot & IOMMU_WRITE);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	/* uptr - pages->uptr is the offset of uptr within the first page */
	return iopt_map_common(ictx, iopt, pages, iova, length,
			       uptr - pages->uptr, iommu_prot, flags);
}

/**
 * iopt_map_file_pages() - Like iopt_map_user_pages, but map a file.
 * @ictx: iommufd_ctx the iopt is part of
 * @iopt: io_pagetable to act on
 * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
 *        the chosen iova on output. Otherwise is the iova to map to on input
 * @fd: fdno of a file to map
 * @start: map file starting at this byte offset
 * @length: Number of bytes to map
 * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
 * @flags: IOPT_ALLOC_IOVA or zero
 */
int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
			unsigned long *iova, int fd, unsigned long start,
			unsigned long length, int iommu_prot,
			unsigned int flags)
{
	struct iopt_pages *pages;
	struct dma_buf *dmabuf;
	unsigned long start_byte;
	unsigned long last;

	if (!length)
		return -EINVAL;
	if (check_add_overflow(start, length - 1, &last))
		return -EOVERFLOW;

	start_byte = start - ALIGN_DOWN(start, PAGE_SIZE);
	/* Try the fd as a dmabuf first, fall back to a regular file below */
	if (IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
		dmabuf = dma_buf_get(fd);
	else
		dmabuf = ERR_PTR(-ENXIO);

	if (!IS_ERR(dmabuf)) {
		pages = iopt_alloc_dmabuf_pages(ictx, dmabuf, start_byte, start,
length, 506 iommu_prot & IOMMU_WRITE); 507 if (IS_ERR(pages)) { 508 dma_buf_put(dmabuf); 509 return PTR_ERR(pages); 510 } 511 } else { 512 struct file *file; 513 514 file = fget(fd); 515 if (!file) 516 return -EBADF; 517 518 pages = iopt_alloc_file_pages(file, start_byte, start, length, 519 iommu_prot & IOMMU_WRITE); 520 fput(file); 521 if (IS_ERR(pages)) 522 return PTR_ERR(pages); 523 } 524 525 return iopt_map_common(ictx, iopt, pages, iova, length, 526 start_byte, iommu_prot, flags); 527 } 528 529 struct iova_bitmap_fn_arg { 530 unsigned long flags; 531 struct io_pagetable *iopt; 532 struct iommu_domain *domain; 533 struct iommu_dirty_bitmap *dirty; 534 }; 535 536 static int __iommu_read_and_clear_dirty(struct iova_bitmap *bitmap, 537 unsigned long iova, size_t length, 538 void *opaque) 539 { 540 struct iopt_area *area; 541 struct iopt_area_contig_iter iter; 542 struct iova_bitmap_fn_arg *arg = opaque; 543 struct iommu_domain *domain = arg->domain; 544 struct iommu_dirty_bitmap *dirty = arg->dirty; 545 const struct iommu_dirty_ops *ops = domain->dirty_ops; 546 unsigned long last_iova = iova + length - 1; 547 unsigned long flags = arg->flags; 548 int ret; 549 550 iopt_for_each_contig_area(&iter, area, arg->iopt, iova, last_iova) { 551 unsigned long last = min(last_iova, iopt_area_last_iova(area)); 552 553 ret = ops->read_and_clear_dirty(domain, iter.cur_iova, 554 last - iter.cur_iova + 1, flags, 555 dirty); 556 if (ret) 557 return ret; 558 } 559 560 if (!iopt_area_contig_done(&iter)) 561 return -EINVAL; 562 return 0; 563 } 564 565 static int 566 iommu_read_and_clear_dirty(struct iommu_domain *domain, 567 struct io_pagetable *iopt, unsigned long flags, 568 struct iommu_hwpt_get_dirty_bitmap *bitmap) 569 { 570 const struct iommu_dirty_ops *ops = domain->dirty_ops; 571 struct iommu_iotlb_gather gather; 572 struct iommu_dirty_bitmap dirty; 573 struct iova_bitmap_fn_arg arg; 574 struct iova_bitmap *iter; 575 int ret = 0; 576 577 if (!ops || !ops->read_and_clear_dirty) 
578 return -EOPNOTSUPP; 579 580 iter = iova_bitmap_alloc(bitmap->iova, bitmap->length, 581 bitmap->page_size, 582 u64_to_user_ptr(bitmap->data)); 583 if (IS_ERR(iter)) 584 return -ENOMEM; 585 586 iommu_dirty_bitmap_init(&dirty, iter, &gather); 587 588 arg.flags = flags; 589 arg.iopt = iopt; 590 arg.domain = domain; 591 arg.dirty = &dirty; 592 iova_bitmap_for_each(iter, &arg, __iommu_read_and_clear_dirty); 593 594 if (!(flags & IOMMU_DIRTY_NO_CLEAR)) 595 iommu_iotlb_sync(domain, &gather); 596 597 iova_bitmap_free(iter); 598 599 return ret; 600 } 601 602 int iommufd_check_iova_range(struct io_pagetable *iopt, 603 struct iommu_hwpt_get_dirty_bitmap *bitmap) 604 { 605 size_t iommu_pgsize = iopt->iova_alignment; 606 u64 last_iova; 607 608 if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova)) 609 return -EOVERFLOW; 610 611 if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX) 612 return -EOVERFLOW; 613 614 if ((bitmap->iova & (iommu_pgsize - 1)) || 615 ((last_iova + 1) & (iommu_pgsize - 1))) 616 return -EINVAL; 617 618 if (!bitmap->page_size) 619 return -EINVAL; 620 621 if ((bitmap->iova & (bitmap->page_size - 1)) || 622 ((last_iova + 1) & (bitmap->page_size - 1))) 623 return -EINVAL; 624 625 return 0; 626 } 627 628 int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt, 629 struct iommu_domain *domain, 630 unsigned long flags, 631 struct iommu_hwpt_get_dirty_bitmap *bitmap) 632 { 633 int ret; 634 635 ret = iommufd_check_iova_range(iopt, bitmap); 636 if (ret) 637 return ret; 638 639 down_read(&iopt->iova_rwsem); 640 ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap); 641 up_read(&iopt->iova_rwsem); 642 643 return ret; 644 } 645 646 static int iopt_clear_dirty_data(struct io_pagetable *iopt, 647 struct iommu_domain *domain) 648 { 649 const struct iommu_dirty_ops *ops = domain->dirty_ops; 650 struct iommu_iotlb_gather gather; 651 struct iommu_dirty_bitmap dirty; 652 struct iopt_area *area; 653 int ret = 0; 654 655 
	lockdep_assert_held_read(&iopt->iova_rwsem);

	/* NULL bitmap: dirty bits are read and discarded, not reported */
	iommu_dirty_bitmap_init(&dirty, NULL, &gather);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		if (!area->pages)
			continue;

		ret = ops->read_and_clear_dirty(domain, iopt_area_iova(area),
						iopt_area_length(area), 0,
						&dirty);
		if (ret)
			break;
	}

	iommu_iotlb_sync(domain, &gather);
	return ret;
}

/* Enable/disable dirty tracking on the domain, snapshotting first on enable */
int iopt_set_dirty_tracking(struct io_pagetable *iopt,
			    struct iommu_domain *domain, bool enable)
{
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	int ret = 0;

	if (!ops)
		return -EOPNOTSUPP;

	down_read(&iopt->iova_rwsem);

	/* Clear dirty bits from PTEs to ensure a clean snapshot */
	if (enable) {
		ret = iopt_clear_dirty_data(iopt, domain);
		if (ret)
			goto out_unlock;
	}

	ret = ops->set_dirty_tracking(domain, enable);

out_unlock:
	up_read(&iopt->iova_rwsem);
	return ret;
}

/*
 * Build a pages_list describing the iopt_pages slices covering
 * [iova, iova + length - 1]. The whole range must be contiguously mapped or
 * -ENOENT is returned. Each list element holds a kref on its pages.
 */
int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
		   unsigned long length, struct list_head *pages_list)
{
	struct iopt_area_contig_iter iter;
	unsigned long last_iova;
	struct iopt_area *area;
	int rc;

	if (!length)
		return -EINVAL;
	if (check_add_overflow(iova, length - 1, &last_iova))
		return -EOVERFLOW;

	down_read(&iopt->iova_rwsem);
	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
		struct iopt_pages_list *elm;
		unsigned long last = min(last_iova, iopt_area_last_iova(area));

		elm = kzalloc_obj(*elm, GFP_KERNEL_ACCOUNT);
		if (!elm) {
			rc = -ENOMEM;
			goto err_free;
		}
		elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
		elm->pages = area->pages;
		elm->length = (last - iter.cur_iova) + 1;
		kref_get(&elm->pages->kref);
		list_add_tail(&elm->next, pages_list);
	}
	if (!iopt_area_contig_done(&iter)) {
		rc = -ENOENT;
		goto err_free;
	}
	up_read(&iopt->iova_rwsem);
	return 0;
err_free:
	up_read(&iopt->iova_rwsem);
	iopt_free_pages_list(pages_list);
	return rc;
}

/*
 * Unmap every area fully contained in [start, last]. Areas pinned by accesses
 * are notified and the walk restarts; locked or partially covered areas fail
 * the whole operation.
 */
static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
				 unsigned long last, unsigned long *unmapped)
{
	struct iopt_area *area;
	unsigned long unmapped_bytes = 0;
	unsigned int tries = 0;
	/* If there are no mapped entries then success */
	int rc = 0;

	/*
	 * The domains_rwsem must be held in read mode any time any area->pages
	 * is NULL. This prevents domain attach/detach from running
	 * concurrently with cleaning up the area.
	 */
again:
	down_read(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	while ((area = iopt_area_iter_first(iopt, start, last))) {
		unsigned long area_last = iopt_area_last_iova(area);
		unsigned long area_first = iopt_area_iova(area);
		struct iopt_pages *pages;

		/* Userspace should not race map/unmap's of the same area */
		if (!area->pages) {
			rc = -EBUSY;
			goto out_unlock_iova;
		}

		/* The area is locked by an object that has not been destroyed */
		if (area->num_locks) {
			rc = -EBUSY;
			goto out_unlock_iova;
		}

		/* Splitting/truncating a mapping is not permitted */
		if (area_first < start || area_last > last) {
			rc = -ENOENT;
			goto out_unlock_iova;
		}

		/* Progress was made, reset the retry counter */
		if (area_first != start)
			tries = 0;

		/*
		 * num_accesses writers must hold the iova_rwsem too, so we can
		 * safely read it under the write side of the iova_rwsem
		 * without the pages->mutex.
		 */
		if (area->num_accesses) {
			size_t length = iopt_area_length(area);

			start = area_first;
			area->prevent_access = true;
			up_write(&iopt->iova_rwsem);
			up_read(&iopt->domains_rwsem);

			iommufd_access_notify_unmap(iopt, area_first, length);
			/* Something is not responding to unmap requests. */
			tries++;
			if (WARN_ON(tries > 100)) {
				rc = -EDEADLOCK;
				goto out_unmapped;
			}
			goto again;
		}

		pages = area->pages;
		area->pages = NULL;
		up_write(&iopt->iova_rwsem);

		iopt_area_unfill_domains(area, pages);
		iopt_abort_area(area);
		iopt_put_pages(pages);

		unmapped_bytes += area_last - area_first + 1;

		down_write(&iopt->iova_rwsem);

		/*
		 * After releasing the iova_rwsem concurrent allocation could
		 * place new areas at IOVAs we have already unmapped. Keep
		 * moving the start of the search forward to ignore the area
		 * already unmapped.
		 */
		if (area_last >= last)
			break;
		start = area_last + 1;
	}

out_unlock_iova:
	up_write(&iopt->iova_rwsem);
	up_read(&iopt->domains_rwsem);
out_unmapped:
	if (unmapped)
		*unmapped = unmapped_bytes;
	return rc;
}

/**
 * iopt_unmap_iova() - Remove a range of iova
 * @iopt: io_pagetable to act on
 * @iova: Starting iova to unmap
 * @length: Number of bytes to unmap
 * @unmapped: Return number of bytes unmapped
 *
 * The requested range must be a superset of existing ranges.
 * Splitting/truncating IOVA mappings is not allowed.
 */
int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
		    unsigned long length, unsigned long *unmapped)
{
	unsigned long iova_last;

	if (!length)
		return -EINVAL;

	if (check_add_overflow(iova, length - 1, &iova_last))
		return -EOVERFLOW;

	return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
}

/* Unmap every area in the io_pagetable */
int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
{
	/* If the IOVAs are empty then unmap all succeeds */
	return iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
}

/* The caller must always free all the nodes in the allowed_iova rb_root.
 */
int iopt_set_allow_iova(struct io_pagetable *iopt,
			struct rb_root_cached *allowed_iova)
{
	struct iopt_allowed *allowed;

	down_write(&iopt->iova_rwsem);
	/* Install the new tree, keeping the old one to restore on failure */
	swap(*allowed_iova, iopt->allowed_itree);

	for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
	     allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
		if (iopt_reserved_iter_first(iopt, allowed->node.start,
					     allowed->node.last)) {
			/* An allowed range overlaps a reserved one - revert */
			swap(*allowed_iova, iopt->allowed_itree);
			up_write(&iopt->iova_rwsem);
			return -EADDRINUSE;
		}
	}
	up_write(&iopt->iova_rwsem);
	return 0;
}

/*
 * Mark [start, last] as reserved so it can never be allocated or mapped.
 * owner tags the entry for later removal via iopt_remove_reserved_iova().
 */
int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
		      unsigned long last, void *owner)
{
	struct iopt_reserved *reserved;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	if (iopt_area_iter_first(iopt, start, last) ||
	    iopt_allowed_iter_first(iopt, start, last))
		return -EADDRINUSE;

	reserved = kzalloc_obj(*reserved, GFP_KERNEL_ACCOUNT);
	if (!reserved)
		return -ENOMEM;
	reserved->node.start = start;
	reserved->node.last = last;
	reserved->owner = owner;
	interval_tree_insert(&reserved->node, &iopt->reserved_itree);
	return 0;
}

/* Remove all reserved ranges tagged with owner; iova_rwsem held by caller */
static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
{
	struct iopt_reserved *reserved, *next;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
	     reserved = next) {
		next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);

		if (reserved->owner == owner) {
			interval_tree_remove(&reserved->node,
					     &iopt->reserved_itree);
			kfree(reserved);
		}
	}
}

/* Locked wrapper around __iopt_remove_reserved_iova() */
void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
{
	down_write(&iopt->iova_rwsem);
	__iopt_remove_reserved_iova(iopt, owner);
	up_write(&iopt->iova_rwsem);
}

/* One-time initialization of an io_pagetable */
void iopt_init_table(struct io_pagetable *iopt)
{
	init_rwsem(&iopt->iova_rwsem);
	init_rwsem(&iopt->domains_rwsem);
	iopt->area_itree = RB_ROOT_CACHED;
	iopt->allowed_itree = RB_ROOT_CACHED;
	iopt->reserved_itree = RB_ROOT_CACHED;
	xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
	xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);

	/*
	 * iopt's start as SW tables that can use the entire size_t IOVA space
	 * due to the use of size_t in the APIs. They have no alignment
	 * restriction.
	 */
	iopt->iova_alignment = 1;
}

/* Tear down an io_pagetable; everything must already be unmapped/detached */
void iopt_destroy_table(struct io_pagetable *iopt)
{
	struct interval_tree_node *node;

	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
		iopt_remove_reserved_iova(iopt, NULL);

	while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
						ULONG_MAX))) {
		interval_tree_remove(node, &iopt->allowed_itree);
		kfree(container_of(node, struct iopt_allowed, node));
	}

	WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
	WARN_ON(!xa_empty(&iopt->domains));
	WARN_ON(!xa_empty(&iopt->access_list));
	WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
}

/**
 * iopt_unfill_domain() - Unfill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to unfill
 *
 * This is used when removing a domain from the iopt. Every area in the iopt
 * will be unmapped from the domain. The domain must already be removed from the
 * domains xarray.
 */
static void iopt_unfill_domain(struct io_pagetable *iopt,
			       struct iommu_domain *domain)
{
	struct iopt_area *area;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held_write(&iopt->domains_rwsem);

	/*
	 * Some other domain is holding all the pfns still, rapidly unmap this
	 * domain.
	 */
	if (iopt->next_domain_id != 0) {
		/* Pick an arbitrary remaining domain to act as storage */
		struct iommu_domain *storage_domain =
			xa_load(&iopt->domains, 0);

		for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
		     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
			struct iopt_pages *pages = area->pages;

			if (!pages)
				continue;

			mutex_lock(&pages->mutex);
			if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
				WARN_ON(!area->storage_domain);
			if (area->storage_domain == domain)
				area->storage_domain = storage_domain;
			if (iopt_is_dmabuf(pages)) {
				/* A revoked dmabuf is already unmapped */
				if (!iopt_dmabuf_revoked(pages))
					iopt_area_unmap_domain(area, domain);
				iopt_dmabuf_untrack_domain(pages, area, domain);
			}
			mutex_unlock(&pages->mutex);

			if (!iopt_is_dmabuf(pages))
				iopt_area_unmap_domain(area, domain);
		}
		return;
	}

	/* This was the last domain: fully unfill and drop the storage role */
	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (!pages)
			continue;

		mutex_lock(&pages->mutex);
		interval_tree_remove(&area->pages_node, &pages->domains_itree);
		WARN_ON(area->storage_domain != domain);
		area->storage_domain = NULL;
		iopt_area_unfill_domain(area, pages, domain);
		if (iopt_is_dmabuf(pages))
			iopt_dmabuf_untrack_domain(pages, area, domain);
		mutex_unlock(&pages->mutex);
	}
}

/**
 * iopt_fill_domain() - Fill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to fill
 *
 * Fill the domain with PFNs from every area in the iopt. On failure the domain
 * is left unchanged.
 */
static int iopt_fill_domain(struct io_pagetable *iopt,
			    struct iommu_domain *domain)
{
	struct iopt_area *end_area;
	struct iopt_area *area;
	int rc;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held_write(&iopt->domains_rwsem);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (!pages)
			continue;

		guard(mutex)(&pages->mutex);
		if (iopt_is_dmabuf(pages)) {
			rc = iopt_dmabuf_track_domain(pages, area, domain);
			if (rc)
				goto out_unfill;
		}
		rc = iopt_area_fill_domain(area, domain);
		if (rc) {
			if (iopt_is_dmabuf(pages))
				iopt_dmabuf_untrack_domain(pages, area, domain);
			goto out_unfill;
		}
		if (!area->storage_domain) {
			/* The first domain becomes the storage domain */
			WARN_ON(iopt->next_domain_id != 0);
			area->storage_domain = domain;
			interval_tree_insert(&area->pages_node,
					     &pages->domains_itree);
		}
	}
	return 0;

out_unfill:
	/* Undo only the areas processed before the failing one */
	end_area = area;
	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (area == end_area)
			break;
		if (!pages)
			continue;
		mutex_lock(&pages->mutex);
		if (iopt->next_domain_id == 0) {
			interval_tree_remove(&area->pages_node,
					     &pages->domains_itree);
			area->storage_domain = NULL;
		}
		iopt_area_unfill_domain(area, pages, domain);
		if (iopt_is_dmabuf(pages))
			iopt_dmabuf_untrack_domain(pages, area, domain);
		mutex_unlock(&pages->mutex);
	}
	return rc;
}

/* All existing area's conform to an increased page size */
static int iopt_check_iova_alignment(struct io_pagetable *iopt,
				     unsigned long new_iova_alignment)
{
	unsigned long align_mask = new_iova_alignment - 1;
	struct iopt_area *area;
1118 1119 lockdep_assert_held(&iopt->iova_rwsem); 1120 lockdep_assert_held(&iopt->domains_rwsem); 1121 1122 for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; 1123 area = iopt_area_iter_next(area, 0, ULONG_MAX)) 1124 if ((iopt_area_iova(area) & align_mask) || 1125 (iopt_area_length(area) & align_mask) || 1126 (area->page_offset & align_mask)) 1127 return -EADDRINUSE; 1128 1129 if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) { 1130 struct iommufd_access *access; 1131 unsigned long index; 1132 1133 xa_for_each(&iopt->access_list, index, access) 1134 if (WARN_ON(access->iova_alignment > 1135 new_iova_alignment)) 1136 return -EADDRINUSE; 1137 } 1138 return 0; 1139 } 1140 1141 int iopt_table_add_domain(struct io_pagetable *iopt, 1142 struct iommu_domain *domain) 1143 { 1144 const struct iommu_domain_geometry *geometry = &domain->geometry; 1145 struct iommu_domain *iter_domain; 1146 unsigned int new_iova_alignment; 1147 unsigned long index; 1148 int rc; 1149 1150 down_write(&iopt->domains_rwsem); 1151 down_write(&iopt->iova_rwsem); 1152 1153 xa_for_each(&iopt->domains, index, iter_domain) { 1154 if (WARN_ON(iter_domain == domain)) { 1155 rc = -EEXIST; 1156 goto out_unlock; 1157 } 1158 } 1159 1160 /* 1161 * The io page size drives the iova_alignment. Internally the iopt_pages 1162 * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE 1163 * objects into the iommu_domain. 1164 * 1165 * A iommu_domain must always be able to accept PAGE_SIZE to be 1166 * compatible as we can't guarantee higher contiguity. 
1167 */ 1168 new_iova_alignment = max_t(unsigned long, 1169 1UL << __ffs(domain->pgsize_bitmap), 1170 iopt->iova_alignment); 1171 if (new_iova_alignment > PAGE_SIZE) { 1172 rc = -EINVAL; 1173 goto out_unlock; 1174 } 1175 if (new_iova_alignment != iopt->iova_alignment) { 1176 rc = iopt_check_iova_alignment(iopt, new_iova_alignment); 1177 if (rc) 1178 goto out_unlock; 1179 } 1180 1181 /* No area exists that is outside the allowed domain aperture */ 1182 if (geometry->aperture_start != 0) { 1183 rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1, 1184 domain); 1185 if (rc) 1186 goto out_reserved; 1187 } 1188 if (geometry->aperture_end != ULONG_MAX) { 1189 rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1, 1190 ULONG_MAX, domain); 1191 if (rc) 1192 goto out_reserved; 1193 } 1194 1195 rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL); 1196 if (rc) 1197 goto out_reserved; 1198 1199 rc = iopt_fill_domain(iopt, domain); 1200 if (rc) 1201 goto out_release; 1202 1203 iopt->iova_alignment = new_iova_alignment; 1204 xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL); 1205 iopt->next_domain_id++; 1206 up_write(&iopt->iova_rwsem); 1207 up_write(&iopt->domains_rwsem); 1208 return 0; 1209 out_release: 1210 xa_release(&iopt->domains, iopt->next_domain_id); 1211 out_reserved: 1212 __iopt_remove_reserved_iova(iopt, domain); 1213 out_unlock: 1214 up_write(&iopt->iova_rwsem); 1215 up_write(&iopt->domains_rwsem); 1216 return rc; 1217 } 1218 1219 static int iopt_calculate_iova_alignment(struct io_pagetable *iopt) 1220 { 1221 unsigned long new_iova_alignment; 1222 struct iommufd_access *access; 1223 struct iommu_domain *domain; 1224 unsigned long index; 1225 1226 lockdep_assert_held_write(&iopt->iova_rwsem); 1227 lockdep_assert_held(&iopt->domains_rwsem); 1228 1229 /* See batch_iommu_map_small() */ 1230 if (iopt->disable_large_pages) 1231 new_iova_alignment = PAGE_SIZE; 1232 else 1233 new_iova_alignment = 1; 1234 1235 
xa_for_each(&iopt->domains, index, domain) 1236 new_iova_alignment = max_t(unsigned long, 1237 1UL << __ffs(domain->pgsize_bitmap), 1238 new_iova_alignment); 1239 xa_for_each(&iopt->access_list, index, access) 1240 new_iova_alignment = max_t(unsigned long, 1241 access->iova_alignment, 1242 new_iova_alignment); 1243 1244 if (new_iova_alignment > iopt->iova_alignment) { 1245 int rc; 1246 1247 rc = iopt_check_iova_alignment(iopt, new_iova_alignment); 1248 if (rc) 1249 return rc; 1250 } 1251 iopt->iova_alignment = new_iova_alignment; 1252 return 0; 1253 } 1254 1255 void iopt_table_remove_domain(struct io_pagetable *iopt, 1256 struct iommu_domain *domain) 1257 { 1258 struct iommu_domain *iter_domain = NULL; 1259 unsigned long index; 1260 1261 down_write(&iopt->domains_rwsem); 1262 down_write(&iopt->iova_rwsem); 1263 1264 xa_for_each(&iopt->domains, index, iter_domain) 1265 if (iter_domain == domain) 1266 break; 1267 if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id) 1268 goto out_unlock; 1269 1270 /* 1271 * Compress the xarray to keep it linear by swapping the entry to erase 1272 * with the tail entry and shrinking the tail. 1273 */ 1274 iopt->next_domain_id--; 1275 iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id); 1276 if (index != iopt->next_domain_id) 1277 xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL); 1278 1279 iopt_unfill_domain(iopt, domain); 1280 __iopt_remove_reserved_iova(iopt, domain); 1281 1282 WARN_ON(iopt_calculate_iova_alignment(iopt)); 1283 out_unlock: 1284 up_write(&iopt->iova_rwsem); 1285 up_write(&iopt->domains_rwsem); 1286 } 1287 1288 /** 1289 * iopt_area_split - Split an area into two parts at iova 1290 * @area: The area to split 1291 * @iova: Becomes the last of a new area 1292 * 1293 * This splits an area into two. It is part of the VFIO compatibility to allow 1294 * poking a hole in the mapping. The two areas continue to point at the same 1295 * iopt_pages, just with different starting bytes. 
1296 */ 1297 static int iopt_area_split(struct iopt_area *area, unsigned long iova) 1298 { 1299 unsigned long alignment = area->iopt->iova_alignment; 1300 unsigned long last_iova = iopt_area_last_iova(area); 1301 unsigned long start_iova = iopt_area_iova(area); 1302 unsigned long new_start = iova + 1; 1303 struct io_pagetable *iopt = area->iopt; 1304 struct iopt_pages *pages = area->pages; 1305 struct iopt_area *lhs; 1306 struct iopt_area *rhs; 1307 int rc; 1308 1309 lockdep_assert_held_write(&iopt->iova_rwsem); 1310 1311 if (iova == start_iova || iova == last_iova) 1312 return 0; 1313 1314 if (!pages || area->prevent_access) 1315 return -EBUSY; 1316 1317 /* Maintaining the domains_itree below is a bit complicated */ 1318 if (iopt_is_dmabuf(pages)) 1319 return -EOPNOTSUPP; 1320 1321 if (new_start & (alignment - 1) || 1322 iopt_area_start_byte(area, new_start) & (alignment - 1)) 1323 return -EINVAL; 1324 1325 lhs = iopt_area_alloc(); 1326 if (!lhs) 1327 return -ENOMEM; 1328 1329 rhs = iopt_area_alloc(); 1330 if (!rhs) { 1331 rc = -ENOMEM; 1332 goto err_free_lhs; 1333 } 1334 1335 mutex_lock(&pages->mutex); 1336 /* 1337 * Splitting is not permitted if an access exists, we don't track enough 1338 * information to split existing accesses. 1339 */ 1340 if (area->num_accesses) { 1341 rc = -EINVAL; 1342 goto err_unlock; 1343 } 1344 1345 /* 1346 * Splitting is not permitted if a domain could have been mapped with 1347 * huge pages. 
1348 */ 1349 if (area->storage_domain && !iopt->disable_large_pages) { 1350 rc = -EINVAL; 1351 goto err_unlock; 1352 } 1353 1354 interval_tree_remove(&area->node, &iopt->area_itree); 1355 rc = iopt_insert_area(iopt, lhs, area->pages, start_iova, 1356 iopt_area_start_byte(area, start_iova), 1357 (new_start - 1) - start_iova + 1, 1358 area->iommu_prot); 1359 if (WARN_ON(rc)) 1360 goto err_insert; 1361 1362 rc = iopt_insert_area(iopt, rhs, area->pages, new_start, 1363 iopt_area_start_byte(area, new_start), 1364 last_iova - new_start + 1, area->iommu_prot); 1365 if (WARN_ON(rc)) 1366 goto err_remove_lhs; 1367 1368 /* 1369 * If the original area has filled a domain, domains_itree has to be 1370 * updated. 1371 */ 1372 if (area->storage_domain) { 1373 interval_tree_remove(&area->pages_node, &pages->domains_itree); 1374 interval_tree_insert(&lhs->pages_node, &pages->domains_itree); 1375 interval_tree_insert(&rhs->pages_node, &pages->domains_itree); 1376 } 1377 1378 lhs->storage_domain = area->storage_domain; 1379 lhs->pages = area->pages; 1380 rhs->storage_domain = area->storage_domain; 1381 rhs->pages = area->pages; 1382 kref_get(&rhs->pages->kref); 1383 kfree(area); 1384 mutex_unlock(&pages->mutex); 1385 1386 /* 1387 * No change to domains or accesses because the pages hasn't been 1388 * changed 1389 */ 1390 return 0; 1391 1392 err_remove_lhs: 1393 interval_tree_remove(&lhs->node, &iopt->area_itree); 1394 err_insert: 1395 interval_tree_insert(&area->node, &iopt->area_itree); 1396 err_unlock: 1397 mutex_unlock(&pages->mutex); 1398 kfree(rhs); 1399 err_free_lhs: 1400 kfree(lhs); 1401 return rc; 1402 } 1403 1404 int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas, 1405 size_t num_iovas) 1406 { 1407 int rc = 0; 1408 int i; 1409 1410 down_write(&iopt->iova_rwsem); 1411 for (i = 0; i < num_iovas; i++) { 1412 struct iopt_area *area; 1413 1414 area = iopt_area_iter_first(iopt, iovas[i], iovas[i]); 1415 if (!area) 1416 continue; 1417 rc = iopt_area_split(area, 
iovas[i]); 1418 if (rc) 1419 break; 1420 } 1421 up_write(&iopt->iova_rwsem); 1422 return rc; 1423 } 1424 1425 void iopt_enable_large_pages(struct io_pagetable *iopt) 1426 { 1427 int rc; 1428 1429 down_write(&iopt->domains_rwsem); 1430 down_write(&iopt->iova_rwsem); 1431 WRITE_ONCE(iopt->disable_large_pages, false); 1432 rc = iopt_calculate_iova_alignment(iopt); 1433 WARN_ON(rc); 1434 up_write(&iopt->iova_rwsem); 1435 up_write(&iopt->domains_rwsem); 1436 } 1437 1438 int iopt_disable_large_pages(struct io_pagetable *iopt) 1439 { 1440 int rc = 0; 1441 1442 down_write(&iopt->domains_rwsem); 1443 down_write(&iopt->iova_rwsem); 1444 if (iopt->disable_large_pages) 1445 goto out_unlock; 1446 1447 /* Won't do it if domains already have pages mapped in them */ 1448 if (!xa_empty(&iopt->domains) && 1449 !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) { 1450 rc = -EINVAL; 1451 goto out_unlock; 1452 } 1453 1454 WRITE_ONCE(iopt->disable_large_pages, true); 1455 rc = iopt_calculate_iova_alignment(iopt); 1456 if (rc) 1457 WRITE_ONCE(iopt->disable_large_pages, false); 1458 out_unlock: 1459 up_write(&iopt->iova_rwsem); 1460 up_write(&iopt->domains_rwsem); 1461 return rc; 1462 } 1463 1464 int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access) 1465 { 1466 u32 new_id; 1467 int rc; 1468 1469 down_write(&iopt->domains_rwsem); 1470 down_write(&iopt->iova_rwsem); 1471 rc = xa_alloc(&iopt->access_list, &new_id, access, xa_limit_16b, 1472 GFP_KERNEL_ACCOUNT); 1473 1474 if (rc) 1475 goto out_unlock; 1476 1477 rc = iopt_calculate_iova_alignment(iopt); 1478 if (rc) { 1479 xa_erase(&iopt->access_list, new_id); 1480 goto out_unlock; 1481 } 1482 access->iopt_access_list_id = new_id; 1483 1484 out_unlock: 1485 up_write(&iopt->iova_rwsem); 1486 up_write(&iopt->domains_rwsem); 1487 return rc; 1488 } 1489 1490 void iopt_remove_access(struct io_pagetable *iopt, 1491 struct iommufd_access *access, u32 iopt_access_list_id) 1492 { 1493 down_write(&iopt->domains_rwsem); 1494 
down_write(&iopt->iova_rwsem); 1495 WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access); 1496 WARN_ON(iopt_calculate_iova_alignment(iopt)); 1497 up_write(&iopt->iova_rwsem); 1498 up_write(&iopt->domains_rwsem); 1499 } 1500 1501 /* Narrow the valid_iova_itree to include reserved ranges from a device. */ 1502 int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt, 1503 struct device *dev, 1504 phys_addr_t *sw_msi_start) 1505 { 1506 struct iommu_resv_region *resv; 1507 LIST_HEAD(resv_regions); 1508 unsigned int num_hw_msi = 0; 1509 unsigned int num_sw_msi = 0; 1510 int rc; 1511 1512 if (iommufd_should_fail()) 1513 return -EINVAL; 1514 1515 down_write(&iopt->iova_rwsem); 1516 /* FIXME: drivers allocate memory but there is no failure propogated */ 1517 iommu_get_resv_regions(dev, &resv_regions); 1518 1519 list_for_each_entry(resv, &resv_regions, list) { 1520 if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE) 1521 continue; 1522 1523 if (sw_msi_start && resv->type == IOMMU_RESV_MSI) 1524 num_hw_msi++; 1525 if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) { 1526 *sw_msi_start = resv->start; 1527 num_sw_msi++; 1528 } 1529 1530 rc = iopt_reserve_iova(iopt, resv->start, 1531 resv->length - 1 + resv->start, dev); 1532 if (rc) 1533 goto out_reserved; 1534 } 1535 1536 /* Drivers must offer sane combinations of regions */ 1537 if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) { 1538 rc = -EINVAL; 1539 goto out_reserved; 1540 } 1541 1542 rc = 0; 1543 goto out_free_resv; 1544 1545 out_reserved: 1546 __iopt_remove_reserved_iova(iopt, dev); 1547 out_free_resv: 1548 iommu_put_resv_regions(dev, &resv_regions); 1549 up_write(&iopt->iova_rwsem); 1550 return rc; 1551 } 1552