/*
 * VFIO: IOMMU DMA mapping support for TCE on POWER
 *
 * Copyright (C) 2013 IBM Corp. All rights reserved.
 * Author: Alexey Kardashevskiy <aik@ozlabs.ru>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio_iommu_type1.c:
 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
 * Author: Alex Williamson <alex.williamson@redhat.com>
 */

#include <linux/module.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/err.h>
#include <linux/vfio.h>
#include <linux/vmalloc.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>

#include <asm/iommu.h>
#include <asm/tce.h>
#include <asm/mmu_context.h>

#define DRIVER_VERSION	"0.1"
#define DRIVER_AUTHOR	"aik@ozlabs.ru"
#define DRIVER_DESC	"VFIO IOMMU SPAPR TCE"

static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group);

static long try_increment_locked_vm(struct mm_struct *mm, long npages)
{
	long ret = 0, locked, lock_limit;

	if (WARN_ON_ONCE(!mm))
		return -EPERM;

	if (!npages)
		return 0;

	down_write(&mm->mmap_sem);
	locked = mm->locked_vm + npages;
	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	if (locked > lock_limit && !capable(CAP_IPC_LOCK))
		ret = -ENOMEM;
	else
		mm->locked_vm += npages;

	pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
			npages << PAGE_SHIFT,
			mm->locked_vm << PAGE_SHIFT,
			rlimit(RLIMIT_MEMLOCK),
			ret ? " - exceeded" : "");

	up_write(&mm->mmap_sem);

	return ret;
}

static void decrement_locked_vm(struct mm_struct *mm, long npages)
{
	if (!mm || !npages)
		return;

	down_write(&mm->mmap_sem);
	if (WARN_ON_ONCE(npages > mm->locked_vm))
		npages = mm->locked_vm;
	mm->locked_vm -= npages;
	pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
			npages << PAGE_SHIFT,
			mm->locked_vm << PAGE_SHIFT,
			rlimit(RLIMIT_MEMLOCK));
	up_write(&mm->mmap_sem);
}

/*
 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
 *
 * This code handles mapping and unmapping of user data buffers
 * into DMA'ble space using the IOMMU.
 */

struct tce_iommu_group {
	struct list_head next;
	struct iommu_group *grp;
};

/*
 * A container needs to remember which preregistered regions it has
 * referenced so it can do proper cleanup when the userspace process exits.
 */
struct tce_iommu_prereg {
	struct list_head next;
	struct mm_iommu_table_group_mem_t *mem;
};

/*
 * The container descriptor supports only a single group per container.
 * Required by the API as the container is not supplied with the IOMMU group
 * at the moment of initialization.
 */
struct tce_container {
	struct mutex lock;
	bool enabled;
	bool v2;
	bool def_window_pending;
	unsigned long locked_pages;
	struct mm_struct *mm;
	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
	struct list_head group_list;
	struct list_head prereg_list;
};
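
/*
 * Associate the container with the mm of the calling task on the first
 * privileged operation and take a reference on it; all later ioctls must
 * come from the same mm.
 */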
static long tce_iommu_mm_set(struct tce_container *container)
{
	if (container->mm) {
		if (container->mm == current->mm)
			return 0;
		return -EPERM;
	}
	BUG_ON(!current->mm);
	container->mm = current->mm;
	atomic_inc(&container->mm->mm_count);

	return 0;
}

static long tce_iommu_prereg_free(struct tce_container *container,
		struct tce_iommu_prereg *tcemem)
{
	long ret;

	ret = mm_iommu_put(container->mm, tcemem->mem);
	if (ret)
		return ret;

	list_del(&tcemem->next);
	kfree(tcemem);

	return 0;
}

static long tce_iommu_unregister_pages(struct tce_container *container,
		__u64 vaddr, __u64 size)
{
	struct mm_iommu_table_group_mem_t *mem;
	struct tce_iommu_prereg *tcemem;
	bool found = false;

	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
		return -EINVAL;

	mem = mm_iommu_find(container->mm, vaddr, size >> PAGE_SHIFT);
	if (!mem)
		return -ENOENT;

	list_for_each_entry(tcemem, &container->prereg_list, next) {
		if (tcemem->mem == mem) {
			found = true;
			break;
		}
	}

	if (!found)
		return -ENOENT;

	return tce_iommu_prereg_free(container, tcemem);
}

static long tce_iommu_register_pages(struct tce_container *container,
		__u64 vaddr, __u64 size)
{
	long ret = 0;
	struct mm_iommu_table_group_mem_t *mem = NULL;
	struct tce_iommu_prereg *tcemem;
	unsigned long entries = size >> PAGE_SHIFT;

	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
			((vaddr + size) < vaddr))
		return -EINVAL;

	mem = mm_iommu_find(container->mm, vaddr, entries);
	if (mem) {
		list_for_each_entry(tcemem, &container->prereg_list, next) {
			if (tcemem->mem == mem)
				return -EBUSY;
		}
	}

	ret = mm_iommu_get(container->mm, vaddr, entries, &mem);
	if (ret)
		return ret;

	tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL);
	if (!tcemem) {
		mm_iommu_put(container->mm, mem);
		return -ENOMEM;
	}

	tcemem->mem = mem;
	list_add(&tcemem->next, &container->prereg_list);

	container->enabled = true;

	return 0;
}
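
/*
 * The "userspace view" is a per-table array which caches the userspace
 * address that was mapped at every TCE entry; the v2 unmap path uses it to
 * look up the preregistered region and drop its "mapped" reference.
 * The array itself is accounted against the locked_vm limit.
 */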
static long tce_iommu_userspace_view_alloc(struct iommu_table *tbl,
		struct mm_struct *mm)
{
	unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) *
			tbl->it_size, PAGE_SIZE);
	unsigned long *uas;
	long ret;

	BUG_ON(tbl->it_userspace);

	ret = try_increment_locked_vm(mm, cb >> PAGE_SHIFT);
	if (ret)
		return ret;

	uas = vzalloc(cb);
	if (!uas) {
		decrement_locked_vm(mm, cb >> PAGE_SHIFT);
		return -ENOMEM;
	}
	tbl->it_userspace = uas;

	return 0;
}

static void tce_iommu_userspace_view_free(struct iommu_table *tbl,
		struct mm_struct *mm)
{
	unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) *
			tbl->it_size, PAGE_SIZE);

	if (!tbl->it_userspace)
		return;

	vfree(tbl->it_userspace);
	tbl->it_userspace = NULL;
	decrement_locked_vm(mm, cb >> PAGE_SHIFT);
}

static bool tce_page_is_contained(struct page *page, unsigned page_shift)
{
	/*
	 * Check that the TCE table granularity is not bigger than the size of
	 * a page we just found. Otherwise the hardware can get access to
	 * a bigger memory chunk than it should.
	 */
	return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
}

static inline bool tce_groups_attached(struct tce_container *container)
{
	return !list_empty(&container->group_list);
}

static long tce_iommu_find_table(struct tce_container *container,
		phys_addr_t ioba, struct iommu_table **ptbl)
{
	long i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (tbl) {
			unsigned long entry = ioba >> tbl->it_page_shift;
			unsigned long start = tbl->it_offset;
			unsigned long end = start + tbl->it_size;

			if ((start <= entry) && (entry < end)) {
				*ptbl = tbl;
				return i;
			}
		}
	}

	return -1;
}

static int tce_iommu_find_free_table(struct tce_container *container)
{
	int i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		if (!container->tables[i])
			return i;
	}

	return -ENOSPC;
}
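
/*
 * Enable the (v1) container: account the whole 32-bit DMA window of the
 * first attached group against RLIMIT_MEMLOCK and allow map/unmap ioctls.
 */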
static int tce_iommu_enable(struct tce_container *container)
{
	int ret = 0;
	unsigned long locked;
	struct iommu_table_group *table_group;
	struct tce_iommu_group *tcegrp;

	if (container->enabled)
		return -EBUSY;

	/*
	 * When userspace pages are mapped into the IOMMU, they are effectively
	 * locked memory, so, theoretically, we need to update the accounting
	 * of locked pages on each map and unmap. For powerpc, the map/unmap
	 * paths can be very hot, though, and the accounting would kill
	 * performance, especially since it would be difficult or impossible
	 * to handle the accounting in real mode only.
	 *
	 * To address that, rather than precisely accounting every page, we
	 * instead account for a worst case on locked memory when the iommu is
	 * enabled and disabled. The worst case upper bound on locked memory
	 * is the size of the whole iommu window, which is usually relatively
	 * small (compared to total memory sizes) on POWER hardware.
	 *
	 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits;
	 * that would effectively kill the guest at random points, so it is
	 * much better to enforce the limit based on the max that the guest
	 * can map.
	 *
	 * Unfortunately at the moment it counts whole tables, no matter how
	 * much memory the guest has. I.e. for a 4GB guest and 4 IOMMU groups,
	 * each with a 2GB DMA window, 8GB will be counted here. The reason for
	 * this is that we cannot tell here the amount of RAM used by the guest
	 * as this information is only available from KVM and VFIO is
	 * KVM agnostic.
	 *
	 * So we do not allow enabling a container without a group attached
	 * as there is no way to know how much we should increment
	 * the locked_vm counter.
	 */
	if (!tce_groups_attached(container))
		return -ENODEV;

	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -ENODEV;

	if (!table_group->tce32_size)
		return -EPERM;

	ret = tce_iommu_mm_set(container);
	if (ret)
		return ret;

	locked = table_group->tce32_size >> PAGE_SHIFT;
	ret = try_increment_locked_vm(container->mm, locked);
	if (ret)
		return ret;

	container->locked_pages = locked;

	container->enabled = true;

	return ret;
}

static void tce_iommu_disable(struct tce_container *container)
{
	if (!container->enabled)
		return;

	container->enabled = false;

	BUG_ON(!container->mm);
	decrement_locked_vm(container->mm, container->locked_pages);
}

static void *tce_iommu_open(unsigned long arg)
{
	struct tce_container *container;

	if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
		pr_err("tce_vfio: Wrong IOMMU type\n");
		return ERR_PTR(-EINVAL);
	}

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return ERR_PTR(-ENOMEM);

	mutex_init(&container->lock);
	INIT_LIST_HEAD_RCU(&container->group_list);
	INIT_LIST_HEAD_RCU(&container->prereg_list);

	container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;

	return container;
}

static int tce_iommu_clear(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long pages);
static void tce_iommu_free_table(struct tce_container *container,
		struct iommu_table *tbl);

static void tce_iommu_release(void *iommu_data)
{
	struct tce_container *container = iommu_data;
	struct tce_iommu_group *tcegrp;
	long i;

	while (tce_groups_attached(container)) {
		tcegrp = list_first_entry(&container->group_list,
				struct tce_iommu_group, next);
		tce_iommu_detach_group(iommu_data, tcegrp->grp);
	}

	/*
	 * If VFIO created a table, it was not disposed of
	 * by tce_iommu_detach_group() so do it now.
	 */
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
		tce_iommu_free_table(container, tbl);
	}

	while (!list_empty(&container->prereg_list)) {
		struct tce_iommu_prereg *tcemem;

		tcemem = list_first_entry(&container->prereg_list,
				struct tce_iommu_prereg, next);
		WARN_ON_ONCE(tce_iommu_prereg_free(container, tcemem));
	}

	tce_iommu_disable(container);
	if (container->mm)
		mmdrop(container->mm);
	mutex_destroy(&container->lock);

	kfree(container);
}

static void tce_iommu_unuse_page(struct tce_container *container,
		unsigned long hpa)
{
	struct page *page;

	page = pfn_to_page(hpa >> PAGE_SHIFT);
	put_page(page);
}
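
/*
 * v2 address translation: a userspace address can only be mapped if it falls
 * into a region previously registered with VFIO_IOMMU_SPAPR_REGISTER_MEMORY;
 * the translation to a host physical address is done against that
 * preregistered (and already pinned) memory.
 */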
static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
		unsigned long tce, unsigned long size,
		unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
{
	long ret = 0;
	struct mm_iommu_table_group_mem_t *mem;

	mem = mm_iommu_lookup(container->mm, tce, size);
	if (!mem)
		return -EINVAL;

	ret = mm_iommu_ua_to_hpa(mem, tce, phpa);
	if (ret)
		return -EINVAL;

	*pmem = mem;

	return 0;
}

static void tce_iommu_unuse_page_v2(struct tce_container *container,
		struct iommu_table *tbl, unsigned long entry)
{
	struct mm_iommu_table_group_mem_t *mem = NULL;
	int ret;
	unsigned long hpa = 0;
	unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);

	if (!pua)
		return;

	ret = tce_iommu_prereg_ua_to_hpa(container, *pua, IOMMU_PAGE_SIZE(tbl),
			&hpa, &mem);
	if (ret)
		pr_debug("%s: tce %lx at #%lx was not cached, ret=%d\n",
				__func__, *pua, entry, ret);
	if (mem)
		mm_iommu_mapped_dec(mem);

	*pua = 0;
}

static int tce_iommu_clear(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long pages)
{
	unsigned long oldhpa;
	long ret;
	enum dma_data_direction direction;

	for ( ; pages; --pages, ++entry) {
		cond_resched();

		direction = DMA_NONE;
		oldhpa = 0;
		ret = iommu_tce_xchg(tbl, entry, &oldhpa, &direction);
		if (ret)
			continue;

		if (direction == DMA_NONE)
			continue;

		if (container->v2) {
			tce_iommu_unuse_page_v2(container, tbl, entry);
			continue;
		}

		tce_iommu_unuse_page(container, oldhpa);
	}

	return 0;
}

static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
{
	struct page *page = NULL;
	enum dma_data_direction direction = iommu_tce_direction(tce);

	if (get_user_pages_fast(tce & PAGE_MASK, 1,
			direction != DMA_TO_DEVICE, &page) != 1)
		return -EFAULT;

	*hpa = __pa((unsigned long) page_address(page));

	return 0;
}
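
/*
 * Fill "pages" consecutive TCE entries starting at "entry". The v1 variant
 * pins each userspace page with get_user_pages_fast() at map time and
 * releases it on unmap; the v2 variant below translates through memory that
 * userspace preregistered (and pinned) in advance and only bumps the
 * region's "mapped" counter.
 */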
static long tce_iommu_build(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long tce, unsigned long pages,
		enum dma_data_direction direction)
{
	long i, ret = 0;
	struct page *page;
	unsigned long hpa;
	enum dma_data_direction dirtmp;

	for (i = 0; i < pages; ++i) {
		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;

		ret = tce_iommu_use_page(tce, &hpa);
		if (ret)
			break;

		page = pfn_to_page(hpa >> PAGE_SHIFT);
		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
			ret = -EPERM;
			break;
		}

		hpa |= offset;
		dirtmp = direction;
		ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
		if (ret) {
			tce_iommu_unuse_page(container, hpa);
			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
					__func__, entry << tbl->it_page_shift,
					tce, ret);
			break;
		}

		if (dirtmp != DMA_NONE)
			tce_iommu_unuse_page(container, hpa);

		tce += IOMMU_PAGE_SIZE(tbl);
	}

	if (ret)
		tce_iommu_clear(container, tbl, entry, i);

	return ret;
}

static long tce_iommu_build_v2(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long tce, unsigned long pages,
		enum dma_data_direction direction)
{
	long i, ret = 0;
	struct page *page;
	unsigned long hpa;
	enum dma_data_direction dirtmp;

	if (!tbl->it_userspace) {
		ret = tce_iommu_userspace_view_alloc(tbl, container->mm);
		if (ret)
			return ret;
	}

	for (i = 0; i < pages; ++i) {
		struct mm_iommu_table_group_mem_t *mem = NULL;
		unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl,
				entry + i);

		ret = tce_iommu_prereg_ua_to_hpa(container,
				tce, IOMMU_PAGE_SIZE(tbl), &hpa, &mem);
		if (ret)
			break;

		page = pfn_to_page(hpa >> PAGE_SHIFT);
		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
			ret = -EPERM;
			break;
		}

		/* Preserve offset within IOMMU page */
		hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
		dirtmp = direction;

		/* The registered region is being unregistered */
		if (mm_iommu_mapped_inc(mem))
			break;

		ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
		if (ret) {
			/* dirtmp cannot be DMA_NONE here */
			tce_iommu_unuse_page_v2(container, tbl, entry + i);
			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
					__func__, entry << tbl->it_page_shift,
					tce, ret);
			break;
		}

		if (dirtmp != DMA_NONE)
			tce_iommu_unuse_page_v2(container, tbl, entry + i);

		*pua = tce;

		tce += IOMMU_PAGE_SIZE(tbl);
	}

	if (ret)
		tce_iommu_clear(container, tbl, entry, i);

	return ret;
}

static long tce_iommu_create_table(struct tce_container *container,
			struct iommu_table_group *table_group,
			int num,
			__u32 page_shift,
			__u64 window_size,
			__u32 levels,
			struct iommu_table **ptbl)
{
	long ret, table_size;

	table_size = table_group->ops->get_table_size(page_shift, window_size,
			levels);
	if (!table_size)
		return -EINVAL;

	ret = try_increment_locked_vm(container->mm, table_size >> PAGE_SHIFT);
	if (ret)
		return ret;

	ret = table_group->ops->create_table(table_group, num,
			page_shift, window_size, levels, ptbl);

	WARN_ON(!ret && !(*ptbl)->it_ops->free);
	WARN_ON(!ret && ((*ptbl)->it_allocated_size != table_size));

	return ret;
}

static void tce_iommu_free_table(struct tce_container *container,
		struct iommu_table *tbl)
{
	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;

	tce_iommu_userspace_view_free(tbl, container->mm);
	iommu_tce_table_put(tbl);
	decrement_locked_vm(container->mm, pages);
}
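
/*
 * Create a dynamic DMA window: allocate a TCE table via the platform ops of
 * the first attached group, program it into every attached group and report
 * the bus address chosen by the platform back to the caller.
 */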
static long tce_iommu_create_window(struct tce_container *container,
		__u32 page_shift, __u64 window_size, __u32 levels,
		__u64 *start_addr)
{
	struct tce_iommu_group *tcegrp;
	struct iommu_table_group *table_group;
	struct iommu_table *tbl = NULL;
	long ret, num;

	num = tce_iommu_find_free_table(container);
	if (num < 0)
		return num;

	/* Get the first group for ops::create_table */
	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -EFAULT;

	if (!(table_group->pgsizes & (1ULL << page_shift)))
		return -EINVAL;

	if (!table_group->ops->set_window || !table_group->ops->unset_window ||
			!table_group->ops->get_table_size ||
			!table_group->ops->create_table)
		return -EPERM;

	/* Create TCE table */
	ret = tce_iommu_create_table(container, table_group, num,
			page_shift, window_size, levels, &tbl);
	if (ret)
		return ret;

	BUG_ON(!tbl->it_ops->free);

	/*
	 * Program the table into every group.
	 * Groups have been tested for compatibility at the attach time.
	 */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		ret = table_group->ops->set_window(table_group, num, tbl);
		if (ret)
			goto unset_exit;
	}

	container->tables[num] = tbl;

	/* Return start address assigned by platform in create_table() */
	*start_addr = tbl->it_offset << tbl->it_page_shift;

	return 0;

unset_exit:
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);
		table_group->ops->unset_window(table_group, num);
	}
	tce_iommu_free_table(container, tbl);

	return ret;
}

static long tce_iommu_remove_window(struct tce_container *container,
		__u64 start_addr)
{
	struct iommu_table_group *table_group = NULL;
	struct iommu_table *tbl;
	struct tce_iommu_group *tcegrp;
	int num;

	num = tce_iommu_find_table(container, start_addr, &tbl);
	if (num < 0)
		return -EINVAL;

	BUG_ON(!tbl->it_size);

	/* Detach groups from IOMMUs */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		/*
		 * SPAPR TCE IOMMU exposes the default DMA window to
		 * the guest via dma32_window_start/size of
		 * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow
		 * the userspace to remove this window, some do not, so
		 * here we check for the platform capability.
		 */
		if (!table_group->ops || !table_group->ops->unset_window)
			return -EPERM;

		table_group->ops->unset_window(table_group, num);
	}

	/* Free table */
	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
	tce_iommu_free_table(container, tbl);
	container->tables[num] = NULL;

	return 0;
}
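
/*
 * When the container takes ownership of a DDW-capable group, creation of the
 * default 32-bit window is deferred (def_window_pending) until the first
 * DMA map/unmap or explicit TCE_CREATE; a TCE_REMOVE at bus address 0 issued
 * before that point simply cancels the pending default window.
 */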
static long tce_iommu_create_default_window(struct tce_container *container)
{
	long ret;
	__u64 start_addr = 0;
	struct tce_iommu_group *tcegrp;
	struct iommu_table_group *table_group;

	if (!container->def_window_pending)
		return 0;

	if (!tce_groups_attached(container))
		return -ENODEV;

	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -ENODEV;

	ret = tce_iommu_create_window(container, IOMMU_PAGE_SHIFT_4K,
			table_group->tce32_size, 1, &start_addr);
	WARN_ON_ONCE(!ret && start_addr);

	if (!ret)
		container->def_window_pending = false;

	return ret;
}
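
/*
 * The ioctl interface. An illustrative (not verified here) userspace
 * sequence for the v2 flavour, with "cfd" being the container fd and "gfd"
 * an IOMMU group fd, could look like:
 *
 *	ioctl(gfd, VFIO_GROUP_SET_CONTAINER, &cfd);
 *	ioctl(cfd, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_v2_IOMMU);
 *	ioctl(cfd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
 *	ioctl(cfd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
 *	ioctl(cfd, VFIO_IOMMU_MAP_DMA, &map);
 *
 * The v1 flavour replaces the memory registration step with
 * VFIO_IOMMU_ENABLE.
 */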
static long tce_iommu_ioctl(void *iommu_data,
				 unsigned int cmd, unsigned long arg)
{
	struct tce_container *container = iommu_data;
	unsigned long minsz, ddwsz;
	long ret;

	switch (cmd) {
	case VFIO_CHECK_EXTENSION:
		switch (arg) {
		case VFIO_SPAPR_TCE_IOMMU:
		case VFIO_SPAPR_TCE_v2_IOMMU:
			ret = 1;
			break;
		default:
			ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
			break;
		}

		return (ret < 0) ? 0 : ret;
	}

	/*
	 * Sanity check to prevent one userspace from manipulating
	 * another userspace mm.
	 */
	BUG_ON(!container);
	if (container->mm && container->mm != current->mm)
		return -EPERM;

	switch (cmd) {
	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
		struct vfio_iommu_spapr_tce_info info;
		struct tce_iommu_group *tcegrp;
		struct iommu_table_group *table_group;

		if (!tce_groups_attached(container))
			return -ENXIO;

		tcegrp = list_first_entry(&container->group_list,
				struct tce_iommu_group, next);
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		if (!table_group)
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
				dma32_window_size);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.dma32_window_start = table_group->tce32_start;
		info.dma32_window_size = table_group->tce32_size;
		info.flags = 0;
		memset(&info.ddw, 0, sizeof(info.ddw));

		if (table_group->max_dynamic_windows_supported &&
				container->v2) {
			info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW;
			info.ddw.pgsizes = table_group->pgsizes;
			info.ddw.max_dynamic_windows_supported =
				table_group->max_dynamic_windows_supported;
			info.ddw.levels = table_group->max_levels;
		}

		ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw);

		if (info.argsz >= ddwsz)
			minsz = ddwsz;

		if (copy_to_user((void __user *)arg, &info, minsz))
			return -EFAULT;

		return 0;
	}
	case VFIO_IOMMU_MAP_DMA: {
		struct vfio_iommu_type1_dma_map param;
		struct iommu_table *tbl = NULL;
		long num;
		enum dma_data_direction direction;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
				VFIO_DMA_MAP_FLAG_WRITE))
			return -EINVAL;

		ret = tce_iommu_create_default_window(container);
		if (ret)
			return ret;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
				(param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
			return -EINVAL;

		/* iova is checked by the IOMMU API */
		if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
				direction = DMA_BIDIRECTIONAL;
			else
				direction = DMA_TO_DEVICE;
		} else {
			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
				direction = DMA_FROM_DEVICE;
			else
				return -EINVAL;
		}

		ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
		if (ret)
			return ret;

		if (container->v2)
			ret = tce_iommu_build_v2(container, tbl,
					param.iova >> tbl->it_page_shift,
					param.vaddr,
					param.size >> tbl->it_page_shift,
					direction);
		else
			ret = tce_iommu_build(container, tbl,
					param.iova >> tbl->it_page_shift,
					param.vaddr,
					param.size >> tbl->it_page_shift,
					direction);

		iommu_flush_tce(tbl);

		return ret;
	}
	case VFIO_IOMMU_UNMAP_DMA: {
		struct vfio_iommu_type1_dma_unmap param;
		struct iommu_table *tbl = NULL;
		long num;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		ret = tce_iommu_create_default_window(container);
		if (ret)
			return ret;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		if (param.size & ~IOMMU_PAGE_MASK(tbl))
			return -EINVAL;

		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
				param.size >> tbl->it_page_shift);
		if (ret)
			return ret;

		ret = tce_iommu_clear(container, tbl,
				param.iova >> tbl->it_page_shift,
				param.size >> tbl->it_page_shift);
		iommu_flush_tce(tbl);

		return ret;
	}
	case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
		struct vfio_iommu_spapr_register_memory param;

		if (!container->v2)
			break;

		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
				size);

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		mutex_lock(&container->lock);
		ret = tce_iommu_register_pages(container, param.vaddr,
				param.size);
		mutex_unlock(&container->lock);

		return ret;
	}
	case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
		struct vfio_iommu_spapr_register_memory param;

		if (!container->v2)
			break;

		if (!container->mm)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		mutex_lock(&container->lock);
		ret = tce_iommu_unregister_pages(container, param.vaddr,
				param.size);
		mutex_unlock(&container->lock);

		return ret;
	}
	case VFIO_IOMMU_ENABLE:
		if (container->v2)
			break;

		mutex_lock(&container->lock);
		ret = tce_iommu_enable(container);
		mutex_unlock(&container->lock);
		return ret;

	case VFIO_IOMMU_DISABLE:
		if (container->v2)
			break;

		mutex_lock(&container->lock);
		tce_iommu_disable(container);
		mutex_unlock(&container->lock);
		return 0;

	case VFIO_EEH_PE_OP: {
		struct tce_iommu_group *tcegrp;

		ret = 0;
		list_for_each_entry(tcegrp, &container->group_list, next) {
			ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
					cmd, arg);
			if (ret)
				return ret;
		}
		return ret;
	}

	case VFIO_IOMMU_SPAPR_TCE_CREATE: {
		struct vfio_iommu_spapr_tce_create create;

		if (!container->v2)
			break;

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (!tce_groups_attached(container))
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
				start_addr);

		if (copy_from_user(&create, (void __user *)arg, minsz))
			return -EFAULT;

		if (create.argsz < minsz)
			return -EINVAL;

		if (create.flags)
			return -EINVAL;

		mutex_lock(&container->lock);

		ret = tce_iommu_create_default_window(container);
		if (!ret)
			ret = tce_iommu_create_window(container,
					create.page_shift,
					create.window_size, create.levels,
					&create.start_addr);

		mutex_unlock(&container->lock);

		if (!ret && copy_to_user((void __user *)arg, &create, minsz))
			ret = -EFAULT;

		return ret;
	}
	case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
		struct vfio_iommu_spapr_tce_remove remove;

		if (!container->v2)
			break;

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (!tce_groups_attached(container))
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
				start_addr);

		if (copy_from_user(&remove, (void __user *)arg, minsz))
			return -EFAULT;

		if (remove.argsz < minsz)
			return -EINVAL;

		if (remove.flags)
			return -EINVAL;

		if (container->def_window_pending && !remove.start_addr) {
			container->def_window_pending = false;
			return 0;
		}

		mutex_lock(&container->lock);

		ret = tce_iommu_remove_window(container, remove.start_addr);

		mutex_unlock(&container->lock);

		return ret;
	}
	}

	return -ENOTTY;
}
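
/*
 * Two ownership models exist. Without DDW ops, the platform-provided tables
 * are taken over as-is (iommu_take_ownership() on each table with an it_map).
 * With DDW ops, ownership of the whole group is transferred and windows are
 * (re)created and programmed through the table_group callbacks.
 */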
static void tce_iommu_release_ownership(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	int i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
		tce_iommu_userspace_view_free(tbl, container->mm);
		if (tbl->it_map)
			iommu_release_ownership(tbl);

		container->tables[i] = NULL;
	}
}

static int tce_iommu_take_ownership(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	int i, j, rc = 0;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = table_group->tables[i];

		if (!tbl || !tbl->it_map)
			continue;

		rc = iommu_take_ownership(tbl);
		if (rc) {
			for (j = 0; j < i; ++j)
				iommu_release_ownership(
						table_group->tables[j]);

			return rc;
		}
	}

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		container->tables[i] = table_group->tables[i];

	return 0;
}

static void tce_iommu_release_ownership_ddw(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	long i;

	if (!table_group->ops->unset_window) {
		WARN_ON_ONCE(1);
		return;
	}

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		table_group->ops->unset_window(table_group, i);

	table_group->ops->release_ownership(table_group);
}

static long tce_iommu_take_ownership_ddw(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	long i, ret = 0;

	if (!table_group->ops->create_table || !table_group->ops->set_window ||
			!table_group->ops->release_ownership) {
		WARN_ON_ONCE(1);
		return -EFAULT;
	}

	table_group->ops->take_ownership(table_group);

	/* Set all windows to the new group */
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		ret = table_group->ops->set_window(table_group, i, tbl);
		if (ret)
			goto release_exit;
	}

	return 0;

release_exit:
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		table_group->ops->unset_window(table_group, i);

	table_group->ops->release_ownership(table_group);

	return ret;
}
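
/*
 * Attach an IOMMU group to the container. Groups without DDW ops can only be
 * used with the v1 interface, one group per container; DDW-capable groups
 * require v2, and all groups in a container must share the same
 * create_table() implementation to be considered compatible.
 */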
static int tce_iommu_attach_group(void *iommu_data,
		struct iommu_group *iommu_group)
{
	int ret;
	struct tce_container *container = iommu_data;
	struct iommu_table_group *table_group;
	struct tce_iommu_group *tcegrp = NULL;

	mutex_lock(&container->lock);

	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
			iommu_group_id(iommu_group), iommu_group); */
	table_group = iommu_group_get_iommudata(iommu_group);
	if (!table_group) {
		ret = -ENODEV;
		goto unlock_exit;
	}

	if (tce_groups_attached(container) && (!table_group->ops ||
			!table_group->ops->take_ownership ||
			!table_group->ops->release_ownership)) {
		ret = -EBUSY;
		goto unlock_exit;
	}

	/* Check if new group has the same iommu_ops (i.e. compatible) */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		struct iommu_table_group *table_group_tmp;

		if (tcegrp->grp == iommu_group) {
			pr_warn("tce_vfio: Group %d is already attached\n",
					iommu_group_id(iommu_group));
			ret = -EBUSY;
			goto unlock_exit;
		}
		table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
		if (table_group_tmp->ops->create_table !=
				table_group->ops->create_table) {
			pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
					iommu_group_id(iommu_group),
					iommu_group_id(tcegrp->grp));
			ret = -EPERM;
			goto unlock_exit;
		}
	}

	tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
	if (!tcegrp) {
		ret = -ENOMEM;
		goto unlock_exit;
	}

	if (!table_group->ops || !table_group->ops->take_ownership ||
			!table_group->ops->release_ownership) {
		if (container->v2) {
			ret = -EPERM;
			goto unlock_exit;
		}
		ret = tce_iommu_take_ownership(container, table_group);
	} else {
		if (!container->v2) {
			ret = -EPERM;
			goto unlock_exit;
		}
		ret = tce_iommu_take_ownership_ddw(container, table_group);
		if (!tce_groups_attached(container) && !container->tables[0])
			container->def_window_pending = true;
	}

	if (!ret) {
		tcegrp->grp = iommu_group;
		list_add(&tcegrp->next, &container->group_list);
	}

unlock_exit:
	if (ret && tcegrp)
		kfree(tcegrp);

	mutex_unlock(&container->lock);

	return ret;
}

static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group)
{
	struct tce_container *container = iommu_data;
	struct iommu_table_group *table_group;
	bool found = false;
	struct tce_iommu_group *tcegrp;

	mutex_lock(&container->lock);

	list_for_each_entry(tcegrp, &container->group_list, next) {
		if (tcegrp->grp == iommu_group) {
			found = true;
			break;
		}
	}

	if (!found) {
		pr_warn("tce_vfio: detaching unattached group #%u\n",
				iommu_group_id(iommu_group));
		goto unlock_exit;
	}

	list_del(&tcegrp->next);
	kfree(tcegrp);

	table_group = iommu_group_get_iommudata(iommu_group);
	BUG_ON(!table_group);

	if (!table_group->ops || !table_group->ops->release_ownership)
		tce_iommu_release_ownership(container, table_group);
	else
		tce_iommu_release_ownership_ddw(container, table_group);

unlock_exit:
	mutex_unlock(&container->lock);
}

const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
	.name		= "iommu-vfio-powerpc",
	.owner		= THIS_MODULE,
	.open		= tce_iommu_open,
	.release	= tce_iommu_release,
	.ioctl		= tce_iommu_ioctl,
	.attach_group	= tce_iommu_attach_group,
	.detach_group	= tce_iommu_detach_group,
};

static int __init tce_iommu_init(void)
{
	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
}

static void __exit tce_iommu_cleanup(void)
{
	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
}

module_init(tce_iommu_init);
module_exit(tce_iommu_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);