1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation 4 * 5 * Rewrite, cleanup, new allocation schemes, virtual merging: 6 * Copyright (C) 2004 Olof Johansson, IBM Corporation 7 * and Ben. Herrenschmidt, IBM Corporation 8 * 9 * Dynamic DMA mapping support, bus-independent parts. 10 */ 11 12 13 #include <linux/init.h> 14 #include <linux/types.h> 15 #include <linux/slab.h> 16 #include <linux/sysfs.h> 17 #include <linux/mm.h> 18 #include <linux/spinlock.h> 19 #include <linux/string.h> 20 #include <linux/string_choices.h> 21 #include <linux/dma-mapping.h> 22 #include <linux/bitmap.h> 23 #include <linux/iommu-helper.h> 24 #include <linux/crash_dump.h> 25 #include <linux/hash.h> 26 #include <linux/fault-inject.h> 27 #include <linux/pci.h> 28 #include <linux/iommu.h> 29 #include <linux/sched.h> 30 #include <linux/debugfs.h> 31 #include <linux/vmalloc.h> 32 #include <asm/io.h> 33 #include <asm/iommu.h> 34 #include <asm/pci-bridge.h> 35 #include <asm/machdep.h> 36 #include <asm/kdump.h> 37 #include <asm/fadump.h> 38 #include <asm/vio.h> 39 #include <asm/tce.h> 40 #include <asm/mmu_context.h> 41 #include <asm/ppc-pci.h> 42 43 #define DBG(...) 44 45 #ifdef CONFIG_IOMMU_DEBUGFS 46 static int iommu_debugfs_weight_get(void *data, u64 *val) 47 { 48 struct iommu_table *tbl = data; 49 *val = bitmap_weight(tbl->it_map, tbl->it_size); 50 return 0; 51 } 52 DEFINE_DEBUGFS_ATTRIBUTE(iommu_debugfs_fops_weight, iommu_debugfs_weight_get, NULL, "%llu\n"); 53 54 static void iommu_debugfs_add(struct iommu_table *tbl) 55 { 56 char name[10]; 57 struct dentry *liobn_entry; 58 59 sprintf(name, "%08lx", tbl->it_index); 60 liobn_entry = debugfs_create_dir(name, iommu_debugfs_dir); 61 62 debugfs_create_file_unsafe("weight", 0400, liobn_entry, tbl, &iommu_debugfs_fops_weight); 63 debugfs_create_ulong("it_size", 0400, liobn_entry, &tbl->it_size); 64 debugfs_create_ulong("it_page_shift", 0400, liobn_entry, &tbl->it_page_shift); 65 debugfs_create_ulong("it_reserved_start", 0400, liobn_entry, &tbl->it_reserved_start); 66 debugfs_create_ulong("it_reserved_end", 0400, liobn_entry, &tbl->it_reserved_end); 67 debugfs_create_ulong("it_indirect_levels", 0400, liobn_entry, &tbl->it_indirect_levels); 68 debugfs_create_ulong("it_level_size", 0400, liobn_entry, &tbl->it_level_size); 69 } 70 71 static void iommu_debugfs_del(struct iommu_table *tbl) 72 { 73 char name[10]; 74 75 sprintf(name, "%08lx", tbl->it_index); 76 debugfs_lookup_and_remove(name, iommu_debugfs_dir); 77 } 78 #else 79 static void iommu_debugfs_add(struct iommu_table *tbl){} 80 static void iommu_debugfs_del(struct iommu_table *tbl){} 81 #endif 82 83 static int novmerge; 84 85 static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned int); 86 87 static int __init setup_iommu(char *str) 88 { 89 if (!strcmp(str, "novmerge")) 90 novmerge = 1; 91 else if (!strcmp(str, "vmerge")) 92 novmerge = 0; 93 return 1; 94 } 95 96 __setup("iommu=", setup_iommu); 97 98 static DEFINE_PER_CPU(unsigned int, iommu_pool_hash); 99 100 /* 101 * We precalculate the hash to avoid doing it on every allocation. 102 * 103 * The hash is important to spread CPUs across all the pools. For example, 104 * on a POWER7 with 4 way SMT we want interrupts on the primary threads and 105 * with 4 pools all primary threads would map to the same pool. 106 */ 107 static int __init setup_iommu_pool_hash(void) 108 { 109 unsigned int i; 110 111 for_each_possible_cpu(i) 112 per_cpu(iommu_pool_hash, i) = hash_32(i, IOMMU_POOL_HASHBITS); 113 114 return 0; 115 } 116 subsys_initcall(setup_iommu_pool_hash); 117 118 #ifdef CONFIG_FAIL_IOMMU 119 120 static DECLARE_FAULT_ATTR(fail_iommu); 121 122 static int __init setup_fail_iommu(char *str) 123 { 124 return setup_fault_attr(&fail_iommu, str); 125 } 126 __setup("fail_iommu=", setup_fail_iommu); 127 128 static bool should_fail_iommu(struct device *dev) 129 { 130 return dev->archdata.fail_iommu && should_fail(&fail_iommu, 1); 131 } 132 133 static int __init fail_iommu_debugfs(void) 134 { 135 struct dentry *dir = fault_create_debugfs_attr("fail_iommu", 136 NULL, &fail_iommu); 137 138 return PTR_ERR_OR_ZERO(dir); 139 } 140 late_initcall(fail_iommu_debugfs); 141 142 static ssize_t fail_iommu_show(struct device *dev, 143 struct device_attribute *attr, char *buf) 144 { 145 return sysfs_emit(buf, "%d\n", dev->archdata.fail_iommu); 146 } 147 148 static ssize_t fail_iommu_store(struct device *dev, 149 struct device_attribute *attr, const char *buf, 150 size_t count) 151 { 152 int i; 153 154 if (count > 0 && sscanf(buf, "%d", &i) > 0) 155 dev->archdata.fail_iommu = (i == 0) ? 0 : 1; 156 157 return count; 158 } 159 160 static DEVICE_ATTR_RW(fail_iommu); 161 162 static int fail_iommu_bus_notify(struct notifier_block *nb, 163 unsigned long action, void *data) 164 { 165 struct device *dev = data; 166 167 if (action == BUS_NOTIFY_ADD_DEVICE) { 168 if (device_create_file(dev, &dev_attr_fail_iommu)) 169 pr_warn("Unable to create IOMMU fault injection sysfs " 170 "entries\n"); 171 } else if (action == BUS_NOTIFY_DEL_DEVICE) { 172 device_remove_file(dev, &dev_attr_fail_iommu); 173 } 174 175 return 0; 176 } 177 178 /* 179 * PCI and VIO buses need separate notifier_block structs, since they're linked 180 * list nodes. Sharing a notifier_block would mean that any notifiers later 181 * registered for PCI buses would also get called by VIO buses and vice versa. 182 */ 183 static struct notifier_block fail_iommu_pci_bus_notifier = { 184 .notifier_call = fail_iommu_bus_notify 185 }; 186 187 #ifdef CONFIG_IBMVIO 188 static struct notifier_block fail_iommu_vio_bus_notifier = { 189 .notifier_call = fail_iommu_bus_notify 190 }; 191 #endif 192 193 static int __init fail_iommu_setup(void) 194 { 195 #ifdef CONFIG_PCI 196 bus_register_notifier(&pci_bus_type, &fail_iommu_pci_bus_notifier); 197 #endif 198 #ifdef CONFIG_IBMVIO 199 bus_register_notifier(&vio_bus_type, &fail_iommu_vio_bus_notifier); 200 #endif 201 202 return 0; 203 } 204 /* 205 * Must execute after PCI and VIO subsystem have initialised but before 206 * devices are probed. 207 */ 208 arch_initcall(fail_iommu_setup); 209 #else 210 static inline bool should_fail_iommu(struct device *dev) 211 { 212 return false; 213 } 214 #endif 215 216 static unsigned long iommu_range_alloc(struct device *dev, 217 struct iommu_table *tbl, 218 unsigned long npages, 219 unsigned long *handle, 220 unsigned long mask, 221 unsigned int align_order) 222 { 223 unsigned long n, end, start; 224 unsigned long limit; 225 int largealloc = npages > 15; 226 int pass = 0; 227 unsigned long align_mask; 228 unsigned long flags; 229 unsigned int pool_nr; 230 struct iommu_pool *pool; 231 232 align_mask = (1ull << align_order) - 1; 233 234 /* This allocator was derived from x86_64's bit string search */ 235 236 /* Sanity check */ 237 if (unlikely(npages == 0)) { 238 if (printk_ratelimit()) 239 WARN_ON(1); 240 return DMA_MAPPING_ERROR; 241 } 242 243 if (should_fail_iommu(dev)) 244 return DMA_MAPPING_ERROR; 245 246 /* 247 * We don't need to disable preemption here because any CPU can 248 * safely use any IOMMU pool. 249 */ 250 pool_nr = raw_cpu_read(iommu_pool_hash) & (tbl->nr_pools - 1); 251 252 if (largealloc) 253 pool = &(tbl->large_pool); 254 else 255 pool = &(tbl->pools[pool_nr]); 256 257 spin_lock_irqsave(&(pool->lock), flags); 258 259 again: 260 if ((pass == 0) && handle && *handle && 261 (*handle >= pool->start) && (*handle < pool->end)) 262 start = *handle; 263 else 264 start = pool->hint; 265 266 limit = pool->end; 267 268 /* The case below can happen if we have a small segment appended 269 * to a large, or when the previous alloc was at the very end of 270 * the available space. If so, go back to the initial start. 271 */ 272 if (start >= limit) 273 start = pool->start; 274 275 if (limit + tbl->it_offset > mask) { 276 limit = mask - tbl->it_offset + 1; 277 /* If we're constrained on address range, first try 278 * at the masked hint to avoid O(n) search complexity, 279 * but on second pass, start at 0 in pool 0. 280 */ 281 if ((start & mask) >= limit || pass > 0) { 282 spin_unlock(&(pool->lock)); 283 pool = &(tbl->pools[0]); 284 spin_lock(&(pool->lock)); 285 start = pool->start; 286 } else { 287 start &= mask; 288 } 289 } 290 291 n = iommu_area_alloc(tbl->it_map, limit, start, npages, tbl->it_offset, 292 dma_get_seg_boundary_nr_pages(dev, tbl->it_page_shift), 293 align_mask); 294 if (n == -1) { 295 if (likely(pass == 0)) { 296 /* First try the pool from the start */ 297 pool->hint = pool->start; 298 pass++; 299 goto again; 300 301 } else if (pass <= tbl->nr_pools) { 302 /* Now try scanning all the other pools */ 303 spin_unlock(&(pool->lock)); 304 pool_nr = (pool_nr + 1) & (tbl->nr_pools - 1); 305 pool = &tbl->pools[pool_nr]; 306 spin_lock(&(pool->lock)); 307 pool->hint = pool->start; 308 pass++; 309 goto again; 310 311 } else if (pass == tbl->nr_pools + 1) { 312 /* Last resort: try largepool */ 313 spin_unlock(&pool->lock); 314 pool = &tbl->large_pool; 315 spin_lock(&pool->lock); 316 pool->hint = pool->start; 317 pass++; 318 goto again; 319 320 } else { 321 /* Give up */ 322 spin_unlock_irqrestore(&(pool->lock), flags); 323 return DMA_MAPPING_ERROR; 324 } 325 } 326 327 end = n + npages; 328 329 /* Bump the hint to a new block for small allocs. */ 330 if (largealloc) { 331 /* Don't bump to new block to avoid fragmentation */ 332 pool->hint = end; 333 } else { 334 /* Overflow will be taken care of at the next allocation */ 335 pool->hint = (end + tbl->it_blocksize - 1) & 336 ~(tbl->it_blocksize - 1); 337 } 338 339 /* Update handle for SG allocations */ 340 if (handle) 341 *handle = end; 342 343 spin_unlock_irqrestore(&(pool->lock), flags); 344 345 return n; 346 } 347 348 static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, 349 void *page, unsigned int npages, 350 enum dma_data_direction direction, 351 unsigned long mask, unsigned int align_order, 352 unsigned long attrs) 353 { 354 unsigned long entry; 355 dma_addr_t ret = DMA_MAPPING_ERROR; 356 int build_fail; 357 358 entry = iommu_range_alloc(dev, tbl, npages, NULL, mask, align_order); 359 360 if (unlikely(entry == DMA_MAPPING_ERROR)) 361 return DMA_MAPPING_ERROR; 362 363 entry += tbl->it_offset; /* Offset into real TCE table */ 364 ret = entry << tbl->it_page_shift; /* Set the return dma address */ 365 366 /* Put the TCEs in the HW table */ 367 build_fail = tbl->it_ops->set(tbl, entry, npages, 368 (unsigned long)page & 369 IOMMU_PAGE_MASK(tbl), direction, attrs); 370 371 /* tbl->it_ops->set() only returns non-zero for transient errors. 372 * Clean up the table bitmap in this case and return 373 * DMA_MAPPING_ERROR. For all other errors the functionality is 374 * not altered. 375 */ 376 if (unlikely(build_fail)) { 377 __iommu_free(tbl, ret, npages); 378 return DMA_MAPPING_ERROR; 379 } 380 381 /* Flush/invalidate TLB caches if necessary */ 382 if (tbl->it_ops->flush) 383 tbl->it_ops->flush(tbl); 384 385 /* Make sure updates are seen by hardware */ 386 mb(); 387 388 return ret; 389 } 390 391 static bool iommu_free_check(struct iommu_table *tbl, dma_addr_t dma_addr, 392 unsigned int npages) 393 { 394 unsigned long entry, free_entry; 395 396 entry = dma_addr >> tbl->it_page_shift; 397 free_entry = entry - tbl->it_offset; 398 399 if (((free_entry + npages) > tbl->it_size) || 400 (entry < tbl->it_offset)) { 401 if (printk_ratelimit()) { 402 printk(KERN_INFO "iommu_free: invalid entry\n"); 403 printk(KERN_INFO "\tentry = 0x%lx\n", entry); 404 printk(KERN_INFO "\tdma_addr = 0x%llx\n", (u64)dma_addr); 405 printk(KERN_INFO "\tTable = 0x%llx\n", (u64)tbl); 406 printk(KERN_INFO "\tbus# = 0x%llx\n", (u64)tbl->it_busno); 407 printk(KERN_INFO "\tsize = 0x%llx\n", (u64)tbl->it_size); 408 printk(KERN_INFO "\tstartOff = 0x%llx\n", (u64)tbl->it_offset); 409 printk(KERN_INFO "\tindex = 0x%llx\n", (u64)tbl->it_index); 410 WARN_ON(1); 411 } 412 413 return false; 414 } 415 416 return true; 417 } 418 419 static struct iommu_pool *get_pool(struct iommu_table *tbl, 420 unsigned long entry) 421 { 422 struct iommu_pool *p; 423 unsigned long largepool_start = tbl->large_pool.start; 424 425 /* The large pool is the last pool at the top of the table */ 426 if (entry >= largepool_start) { 427 p = &tbl->large_pool; 428 } else { 429 unsigned int pool_nr = entry / tbl->poolsize; 430 431 BUG_ON(pool_nr > tbl->nr_pools); 432 p = &tbl->pools[pool_nr]; 433 } 434 435 return p; 436 } 437 438 static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, 439 unsigned int npages) 440 { 441 unsigned long entry, free_entry; 442 unsigned long flags; 443 struct iommu_pool *pool; 444 445 entry = dma_addr >> tbl->it_page_shift; 446 free_entry = entry - tbl->it_offset; 447 448 pool = get_pool(tbl, free_entry); 449 450 if (!iommu_free_check(tbl, dma_addr, npages)) 451 return; 452 453 tbl->it_ops->clear(tbl, entry, npages); 454 455 spin_lock_irqsave(&(pool->lock), flags); 456 bitmap_clear(tbl->it_map, free_entry, npages); 457 spin_unlock_irqrestore(&(pool->lock), flags); 458 } 459 460 static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, 461 unsigned int npages) 462 { 463 __iommu_free(tbl, dma_addr, npages); 464 465 /* Make sure TLB cache is flushed if the HW needs it. We do 466 * not do an mb() here on purpose, it is not needed on any of 467 * the current platforms. 468 */ 469 if (tbl->it_ops->flush) 470 tbl->it_ops->flush(tbl); 471 } 472 473 int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl, 474 struct scatterlist *sglist, int nelems, 475 unsigned long mask, enum dma_data_direction direction, 476 unsigned long attrs) 477 { 478 dma_addr_t dma_next = 0, dma_addr; 479 struct scatterlist *s, *outs, *segstart; 480 int outcount, incount, i, build_fail = 0; 481 unsigned int align; 482 unsigned long handle; 483 unsigned int max_seg_size; 484 485 BUG_ON(direction == DMA_NONE); 486 487 if ((nelems == 0) || !tbl) 488 return -EINVAL; 489 490 outs = s = segstart = &sglist[0]; 491 outcount = 1; 492 incount = nelems; 493 handle = 0; 494 495 /* Init first segment length for backout at failure */ 496 outs->dma_length = 0; 497 498 DBG("sg mapping %d elements:\n", nelems); 499 500 max_seg_size = dma_get_max_seg_size(dev); 501 for_each_sg(sglist, s, nelems, i) { 502 unsigned long vaddr, npages, entry, slen; 503 504 slen = s->length; 505 /* Sanity check */ 506 if (slen == 0) { 507 dma_next = 0; 508 continue; 509 } 510 /* Allocate iommu entries for that segment */ 511 vaddr = (unsigned long) sg_virt(s); 512 npages = iommu_num_pages(vaddr, slen, IOMMU_PAGE_SIZE(tbl)); 513 align = 0; 514 if (tbl->it_page_shift < PAGE_SHIFT && slen >= PAGE_SIZE && 515 (vaddr & ~PAGE_MASK) == 0) 516 align = PAGE_SHIFT - tbl->it_page_shift; 517 entry = iommu_range_alloc(dev, tbl, npages, &handle, 518 mask >> tbl->it_page_shift, align); 519 520 DBG(" - vaddr: %lx, size: %lx\n", vaddr, slen); 521 522 /* Handle failure */ 523 if (unlikely(entry == DMA_MAPPING_ERROR)) { 524 if (!(attrs & DMA_ATTR_NO_WARN) && 525 printk_ratelimit()) 526 dev_info(dev, "iommu_alloc failed, tbl %p " 527 "vaddr %lx npages %lu\n", tbl, vaddr, 528 npages); 529 goto failure; 530 } 531 532 /* Convert entry to a dma_addr_t */ 533 entry += tbl->it_offset; 534 dma_addr = entry << tbl->it_page_shift; 535 dma_addr |= (vaddr & ~IOMMU_PAGE_MASK(tbl)); 536 537 DBG(" - %lu pages, entry: %lx, dma_addr: %lx\n", 538 npages, entry, dma_addr); 539 540 /* Insert into HW table */ 541 build_fail = tbl->it_ops->set(tbl, entry, npages, 542 vaddr & IOMMU_PAGE_MASK(tbl), 543 direction, attrs); 544 if(unlikely(build_fail)) 545 goto failure; 546 547 /* If we are in an open segment, try merging */ 548 if (segstart != s) { 549 DBG(" - trying merge...\n"); 550 /* We cannot merge if: 551 * - allocated dma_addr isn't contiguous to previous allocation 552 */ 553 if (novmerge || (dma_addr != dma_next) || 554 (outs->dma_length + s->length > max_seg_size)) { 555 /* Can't merge: create a new segment */ 556 segstart = s; 557 outcount++; 558 outs = sg_next(outs); 559 DBG(" can't merge, new segment.\n"); 560 } else { 561 outs->dma_length += s->length; 562 DBG(" merged, new len: %ux\n", outs->dma_length); 563 } 564 } 565 566 if (segstart == s) { 567 /* This is a new segment, fill entries */ 568 DBG(" - filling new segment.\n"); 569 outs->dma_address = dma_addr; 570 outs->dma_length = slen; 571 } 572 573 /* Calculate next page pointer for contiguous check */ 574 dma_next = dma_addr + slen; 575 576 DBG(" - dma next is: %lx\n", dma_next); 577 } 578 579 /* Flush/invalidate TLB caches if necessary */ 580 if (tbl->it_ops->flush) 581 tbl->it_ops->flush(tbl); 582 583 DBG("mapped %d elements:\n", outcount); 584 585 /* For the sake of ppc_iommu_unmap_sg, we clear out the length in the 586 * next entry of the sglist if we didn't fill the list completely 587 */ 588 if (outcount < incount) { 589 outs = sg_next(outs); 590 outs->dma_length = 0; 591 } 592 593 /* Make sure updates are seen by hardware */ 594 mb(); 595 596 return outcount; 597 598 failure: 599 for_each_sg(sglist, s, nelems, i) { 600 if (s->dma_length != 0) { 601 unsigned long vaddr, npages; 602 603 vaddr = s->dma_address & IOMMU_PAGE_MASK(tbl); 604 npages = iommu_num_pages(s->dma_address, s->dma_length, 605 IOMMU_PAGE_SIZE(tbl)); 606 __iommu_free(tbl, vaddr, npages); 607 s->dma_length = 0; 608 } 609 if (s == outs) 610 break; 611 } 612 return -EIO; 613 } 614 615 616 void ppc_iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist, 617 int nelems, enum dma_data_direction direction, 618 unsigned long attrs) 619 { 620 struct scatterlist *sg; 621 622 BUG_ON(direction == DMA_NONE); 623 624 if (!tbl) 625 return; 626 627 sg = sglist; 628 while (nelems--) { 629 unsigned int npages; 630 dma_addr_t dma_handle = sg->dma_address; 631 632 if (sg->dma_length == 0) 633 break; 634 npages = iommu_num_pages(dma_handle, sg->dma_length, 635 IOMMU_PAGE_SIZE(tbl)); 636 __iommu_free(tbl, dma_handle, npages); 637 sg = sg_next(sg); 638 } 639 640 /* Flush/invalidate TLBs if necessary. As for iommu_free(), we 641 * do not do an mb() here, the affected platforms do not need it 642 * when freeing. 643 */ 644 if (tbl->it_ops->flush) 645 tbl->it_ops->flush(tbl); 646 } 647 648 void iommu_table_clear(struct iommu_table *tbl) 649 { 650 /* 651 * In case of firmware assisted dump system goes through clean 652 * reboot process at the time of system crash. Hence it's safe to 653 * clear the TCE entries if firmware assisted dump is active. 654 */ 655 if (!is_kdump_kernel() || is_fadump_active()) { 656 /* Clear the table in case firmware left allocations in it */ 657 tbl->it_ops->clear(tbl, tbl->it_offset, tbl->it_size); 658 return; 659 } 660 661 #ifdef CONFIG_CRASH_DUMP 662 if (tbl->it_ops->get) { 663 unsigned long index, tceval, tcecount = 0; 664 665 /* Reserve the existing mappings left by the first kernel. */ 666 for (index = 0; index < tbl->it_size; index++) { 667 tceval = tbl->it_ops->get(tbl, index + tbl->it_offset); 668 /* 669 * Freed TCE entry contains 0x7fffffffffffffff on JS20 670 */ 671 if (tceval && (tceval != 0x7fffffffffffffffUL)) { 672 __set_bit(index, tbl->it_map); 673 tcecount++; 674 } 675 } 676 677 if ((tbl->it_size - tcecount) < KDUMP_MIN_TCE_ENTRIES) { 678 printk(KERN_WARNING "TCE table is full; freeing "); 679 printk(KERN_WARNING "%d entries for the kdump boot\n", 680 KDUMP_MIN_TCE_ENTRIES); 681 for (index = tbl->it_size - KDUMP_MIN_TCE_ENTRIES; 682 index < tbl->it_size; index++) 683 __clear_bit(index, tbl->it_map); 684 } 685 } 686 #endif 687 } 688 689 void iommu_table_reserve_pages(struct iommu_table *tbl, 690 unsigned long res_start, unsigned long res_end) 691 { 692 unsigned long i; 693 694 WARN_ON_ONCE(res_end < res_start); 695 /* 696 * Reserve page 0 so it will not be used for any mappings. 697 * This avoids buggy drivers that consider page 0 to be invalid 698 * to crash the machine or even lose data. 699 */ 700 if (tbl->it_offset == 0) 701 set_bit(0, tbl->it_map); 702 703 if (res_start < tbl->it_offset) 704 res_start = tbl->it_offset; 705 706 if (res_end > (tbl->it_offset + tbl->it_size)) 707 res_end = tbl->it_offset + tbl->it_size; 708 709 /* Check if res_start..res_end is a valid range in the table */ 710 if (res_start >= res_end) { 711 tbl->it_reserved_start = tbl->it_offset; 712 tbl->it_reserved_end = tbl->it_offset; 713 return; 714 } 715 716 tbl->it_reserved_start = res_start; 717 tbl->it_reserved_end = res_end; 718 719 for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i) 720 set_bit(i - tbl->it_offset, tbl->it_map); 721 } 722 723 /* 724 * Build a iommu_table structure. This contains a bit map which 725 * is used to manage allocation of the tce space. 726 */ 727 struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid, 728 unsigned long res_start, unsigned long res_end) 729 { 730 unsigned long sz; 731 static int welcomed = 0; 732 unsigned int i; 733 struct iommu_pool *p; 734 735 BUG_ON(!tbl->it_ops); 736 737 /* number of bytes needed for the bitmap */ 738 sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long); 739 740 tbl->it_map = vzalloc_node(sz, nid); 741 if (!tbl->it_map) { 742 pr_err("%s: Can't allocate %ld bytes\n", __func__, sz); 743 return NULL; 744 } 745 746 iommu_table_reserve_pages(tbl, res_start, res_end); 747 748 /* We only split the IOMMU table if we have 1GB or more of space */ 749 if ((tbl->it_size << tbl->it_page_shift) >= (1UL * 1024 * 1024 * 1024)) 750 tbl->nr_pools = IOMMU_NR_POOLS; 751 else 752 tbl->nr_pools = 1; 753 754 /* We reserve the top 1/4 of the table for large allocations */ 755 tbl->poolsize = (tbl->it_size * 3 / 4) / tbl->nr_pools; 756 757 for (i = 0; i < tbl->nr_pools; i++) { 758 p = &tbl->pools[i]; 759 spin_lock_init(&(p->lock)); 760 p->start = tbl->poolsize * i; 761 p->hint = p->start; 762 p->end = p->start + tbl->poolsize; 763 } 764 765 p = &tbl->large_pool; 766 spin_lock_init(&(p->lock)); 767 p->start = tbl->poolsize * i; 768 p->hint = p->start; 769 p->end = tbl->it_size; 770 771 iommu_table_clear(tbl); 772 773 if (!welcomed) { 774 pr_info("IOMMU table initialized, virtual merging %s\n", 775 str_disabled_enabled(novmerge)); 776 welcomed = 1; 777 } 778 779 iommu_debugfs_add(tbl); 780 781 return tbl; 782 } 783 784 bool iommu_table_in_use(struct iommu_table *tbl) 785 { 786 unsigned long start = 0, end; 787 788 /* ignore reserved bit0 */ 789 if (tbl->it_offset == 0) 790 start = 1; 791 792 /* Simple case with no reserved MMIO32 region */ 793 if (!tbl->it_reserved_start && !tbl->it_reserved_end) 794 return find_next_bit(tbl->it_map, tbl->it_size, start) != tbl->it_size; 795 796 end = tbl->it_reserved_start - tbl->it_offset; 797 if (find_next_bit(tbl->it_map, end, start) != end) 798 return true; 799 800 start = tbl->it_reserved_end - tbl->it_offset; 801 end = tbl->it_size; 802 return find_next_bit(tbl->it_map, end, start) != end; 803 } 804 805 static void iommu_table_free(struct kref *kref) 806 { 807 struct iommu_table *tbl; 808 809 tbl = container_of(kref, struct iommu_table, it_kref); 810 811 if (tbl->it_ops->free) 812 tbl->it_ops->free(tbl); 813 814 if (!tbl->it_map) { 815 kfree(tbl); 816 return; 817 } 818 819 iommu_debugfs_del(tbl); 820 821 /* verify that table contains no entries */ 822 if (iommu_table_in_use(tbl)) 823 pr_warn("%s: Unexpected TCEs\n", __func__); 824 825 /* free bitmap */ 826 vfree(tbl->it_map); 827 828 /* free table */ 829 kfree(tbl); 830 } 831 832 struct iommu_table *iommu_tce_table_get(struct iommu_table *tbl) 833 { 834 if (kref_get_unless_zero(&tbl->it_kref)) 835 return tbl; 836 837 return NULL; 838 } 839 EXPORT_SYMBOL_GPL(iommu_tce_table_get); 840 841 int iommu_tce_table_put(struct iommu_table *tbl) 842 { 843 if (WARN_ON(!tbl)) 844 return 0; 845 846 return kref_put(&tbl->it_kref, iommu_table_free); 847 } 848 EXPORT_SYMBOL_GPL(iommu_tce_table_put); 849 850 /* Creates TCEs for a user provided buffer. The user buffer must be 851 * contiguous real kernel storage (not vmalloc). The address passed here 852 * is physical address into that page. The dma_addr_t returned will point 853 * to the same byte within the page as was passed in. 854 */ 855 dma_addr_t iommu_map_phys(struct device *dev, struct iommu_table *tbl, 856 phys_addr_t phys, size_t size, unsigned long mask, 857 enum dma_data_direction direction, 858 unsigned long attrs) 859 { 860 dma_addr_t dma_handle = DMA_MAPPING_ERROR; 861 void *vaddr; 862 unsigned long uaddr; 863 unsigned int npages, align; 864 865 BUG_ON(direction == DMA_NONE); 866 867 vaddr = phys_to_virt(phys); 868 uaddr = (unsigned long)vaddr; 869 870 if (tbl) { 871 npages = iommu_num_pages(uaddr, size, IOMMU_PAGE_SIZE(tbl)); 872 align = 0; 873 if (tbl->it_page_shift < PAGE_SHIFT && size >= PAGE_SIZE && 874 ((unsigned long)vaddr & ~PAGE_MASK) == 0) 875 align = PAGE_SHIFT - tbl->it_page_shift; 876 877 dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction, 878 mask >> tbl->it_page_shift, align, 879 attrs); 880 if (dma_handle == DMA_MAPPING_ERROR) { 881 if (!(attrs & DMA_ATTR_NO_WARN) && 882 printk_ratelimit()) { 883 dev_info(dev, "iommu_alloc failed, tbl %p " 884 "vaddr %p npages %d\n", tbl, vaddr, 885 npages); 886 } 887 } else 888 dma_handle |= (uaddr & ~IOMMU_PAGE_MASK(tbl)); 889 } 890 891 return dma_handle; 892 } 893 894 void iommu_unmap_phys(struct iommu_table *tbl, dma_addr_t dma_handle, 895 size_t size, enum dma_data_direction direction, 896 unsigned long attrs) 897 { 898 unsigned int npages; 899 900 BUG_ON(direction == DMA_NONE); 901 902 if (tbl) { 903 npages = iommu_num_pages(dma_handle, size, 904 IOMMU_PAGE_SIZE(tbl)); 905 iommu_free(tbl, dma_handle, npages); 906 } 907 } 908 909 /* Allocates a contiguous real buffer and creates mappings over it. 910 * Returns the virtual address of the buffer and sets dma_handle 911 * to the dma address (mapping) of the first page. 912 */ 913 void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl, 914 size_t size, dma_addr_t *dma_handle, 915 unsigned long mask, gfp_t flag, int node) 916 { 917 void *ret = NULL; 918 dma_addr_t mapping; 919 unsigned int order; 920 unsigned int nio_pages, io_order; 921 struct page *page; 922 int tcesize = (1 << tbl->it_page_shift); 923 924 size = PAGE_ALIGN(size); 925 order = get_order(size); 926 927 /* 928 * Client asked for way too much space. This is checked later 929 * anyway. It is easier to debug here for the drivers than in 930 * the tce tables. 931 */ 932 if (order >= IOMAP_MAX_ORDER) { 933 dev_info(dev, "iommu_alloc_consistent size too large: 0x%lx\n", 934 size); 935 return NULL; 936 } 937 938 if (!tbl) 939 return NULL; 940 941 /* Alloc enough pages (and possibly more) */ 942 page = alloc_pages_node(node, flag, order); 943 if (!page) 944 return NULL; 945 ret = page_address(page); 946 memset(ret, 0, size); 947 948 /* Set up tces to cover the allocated range */ 949 nio_pages = IOMMU_PAGE_ALIGN(size, tbl) >> tbl->it_page_shift; 950 951 io_order = get_iommu_order(size, tbl); 952 mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL, 953 mask >> tbl->it_page_shift, io_order, 0); 954 if (mapping == DMA_MAPPING_ERROR) { 955 free_pages((unsigned long)ret, order); 956 return NULL; 957 } 958 959 *dma_handle = mapping | ((u64)ret & (tcesize - 1)); 960 return ret; 961 } 962 963 void iommu_free_coherent(struct iommu_table *tbl, size_t size, 964 void *vaddr, dma_addr_t dma_handle) 965 { 966 if (tbl) { 967 unsigned int nio_pages; 968 969 size = PAGE_ALIGN(size); 970 nio_pages = IOMMU_PAGE_ALIGN(size, tbl) >> tbl->it_page_shift; 971 iommu_free(tbl, dma_handle, nio_pages); 972 size = PAGE_ALIGN(size); 973 free_pages((unsigned long)vaddr, get_order(size)); 974 } 975 } 976 977 unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir) 978 { 979 switch (dir) { 980 case DMA_BIDIRECTIONAL: 981 return TCE_PCI_READ | TCE_PCI_WRITE; 982 case DMA_FROM_DEVICE: 983 return TCE_PCI_WRITE; 984 case DMA_TO_DEVICE: 985 return TCE_PCI_READ; 986 default: 987 return 0; 988 } 989 } 990 EXPORT_SYMBOL_GPL(iommu_direction_to_tce_perm); 991 992 #ifdef CONFIG_IOMMU_API 993 994 int dev_has_iommu_table(struct device *dev, void *data) 995 { 996 struct pci_dev *pdev = to_pci_dev(dev); 997 struct pci_dev **ppdev = data; 998 999 if (!dev) 1000 return 0; 1001 1002 if (device_iommu_mapped(dev)) { 1003 *ppdev = pdev; 1004 return 1; 1005 } 1006 1007 return 0; 1008 } 1009 1010 /* 1011 * SPAPR TCE API 1012 */ 1013 static void group_release(void *iommu_data) 1014 { 1015 struct iommu_table_group *table_group = iommu_data; 1016 1017 table_group->group = NULL; 1018 } 1019 1020 void iommu_register_group(struct iommu_table_group *table_group, 1021 int pci_domain_number, unsigned long pe_num) 1022 { 1023 struct iommu_group *grp; 1024 char *name; 1025 1026 grp = iommu_group_alloc(); 1027 if (IS_ERR(grp)) { 1028 pr_warn("powerpc iommu api: cannot create new group, err=%ld\n", 1029 PTR_ERR(grp)); 1030 return; 1031 } 1032 table_group->group = grp; 1033 iommu_group_set_iommudata(grp, table_group, group_release); 1034 name = kasprintf(GFP_KERNEL, "domain%d-pe%lx", 1035 pci_domain_number, pe_num); 1036 if (!name) 1037 return; 1038 iommu_group_set_name(grp, name); 1039 kfree(name); 1040 } 1041 1042 enum dma_data_direction iommu_tce_direction(unsigned long tce) 1043 { 1044 if ((tce & TCE_PCI_READ) && (tce & TCE_PCI_WRITE)) 1045 return DMA_BIDIRECTIONAL; 1046 else if (tce & TCE_PCI_READ) 1047 return DMA_TO_DEVICE; 1048 else if (tce & TCE_PCI_WRITE) 1049 return DMA_FROM_DEVICE; 1050 else 1051 return DMA_NONE; 1052 } 1053 EXPORT_SYMBOL_GPL(iommu_tce_direction); 1054 1055 void iommu_flush_tce(struct iommu_table *tbl) 1056 { 1057 /* Flush/invalidate TLB caches if necessary */ 1058 if (tbl->it_ops->flush) 1059 tbl->it_ops->flush(tbl); 1060 1061 /* Make sure updates are seen by hardware */ 1062 mb(); 1063 } 1064 EXPORT_SYMBOL_GPL(iommu_flush_tce); 1065 1066 int iommu_tce_check_ioba(unsigned long page_shift, 1067 unsigned long offset, unsigned long size, 1068 unsigned long ioba, unsigned long npages) 1069 { 1070 unsigned long mask = (1UL << page_shift) - 1; 1071 1072 if (ioba & mask) 1073 return -EINVAL; 1074 1075 ioba >>= page_shift; 1076 if (ioba < offset) 1077 return -EINVAL; 1078 1079 if ((ioba + 1) > (offset + size)) 1080 return -EINVAL; 1081 1082 return 0; 1083 } 1084 EXPORT_SYMBOL_GPL(iommu_tce_check_ioba); 1085 1086 int iommu_tce_check_gpa(unsigned long page_shift, unsigned long gpa) 1087 { 1088 unsigned long mask = (1UL << page_shift) - 1; 1089 1090 if (gpa & mask) 1091 return -EINVAL; 1092 1093 return 0; 1094 } 1095 EXPORT_SYMBOL_GPL(iommu_tce_check_gpa); 1096 1097 long iommu_tce_xchg_no_kill(struct mm_struct *mm, 1098 struct iommu_table *tbl, 1099 unsigned long entry, unsigned long *hpa, 1100 enum dma_data_direction *direction) 1101 { 1102 long ret; 1103 unsigned long size = 0; 1104 1105 ret = tbl->it_ops->xchg_no_kill(tbl, entry, hpa, direction); 1106 if (!ret && ((*direction == DMA_FROM_DEVICE) || 1107 (*direction == DMA_BIDIRECTIONAL)) && 1108 !mm_iommu_is_devmem(mm, *hpa, tbl->it_page_shift, 1109 &size)) 1110 SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT)); 1111 1112 return ret; 1113 } 1114 EXPORT_SYMBOL_GPL(iommu_tce_xchg_no_kill); 1115 1116 void iommu_tce_kill(struct iommu_table *tbl, 1117 unsigned long entry, unsigned long pages) 1118 { 1119 if (tbl->it_ops->tce_kill) 1120 tbl->it_ops->tce_kill(tbl, entry, pages); 1121 } 1122 EXPORT_SYMBOL_GPL(iommu_tce_kill); 1123 1124 int iommu_add_device(struct iommu_table_group *table_group, struct device *dev) 1125 { 1126 /* 1127 * The sysfs entries should be populated before 1128 * binding IOMMU group. If sysfs entries isn't 1129 * ready, we simply bail. 1130 */ 1131 if (!device_is_registered(dev)) 1132 return -ENOENT; 1133 1134 if (device_iommu_mapped(dev)) { 1135 pr_debug("%s: Skipping device %s with iommu group %d\n", 1136 __func__, dev_name(dev), 1137 iommu_group_id(dev->iommu_group)); 1138 return -EBUSY; 1139 } 1140 1141 pr_debug("%s: Adding %s to iommu group %d\n", 1142 __func__, dev_name(dev), iommu_group_id(table_group->group)); 1143 /* 1144 * This is still not adding devices via the IOMMU bus notifier because 1145 * of pcibios_init() from arch/powerpc/kernel/pci_64.c which calls 1146 * pcibios_scan_phb() first (and this guy adds devices and triggers 1147 * the notifier) and only then it calls pci_bus_add_devices() which 1148 * configures DMA for buses which also creates PEs and IOMMU groups. 1149 */ 1150 return iommu_probe_device(dev); 1151 } 1152 EXPORT_SYMBOL_GPL(iommu_add_device); 1153 1154 #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) 1155 /* 1156 * A simple iommu_ops to allow less cruft in generic VFIO code. 1157 */ 1158 static int 1159 spapr_tce_platform_iommu_attach_dev(struct iommu_domain *platform_domain, 1160 struct device *dev, 1161 struct iommu_domain *old) 1162 { 1163 struct iommu_domain *domain = iommu_driver_get_domain_for_dev(dev); 1164 struct iommu_table_group *table_group; 1165 struct iommu_group *grp; 1166 1167 /* At first attach the ownership is already set */ 1168 if (!domain) 1169 return 0; 1170 1171 grp = iommu_group_get(dev); 1172 table_group = iommu_group_get_iommudata(grp); 1173 /* 1174 * The domain being set to PLATFORM from earlier 1175 * BLOCKED. The table_group ownership has to be released. 1176 */ 1177 table_group->ops->release_ownership(table_group, dev); 1178 iommu_group_put(grp); 1179 1180 return 0; 1181 } 1182 1183 static const struct iommu_domain_ops spapr_tce_platform_domain_ops = { 1184 .attach_dev = spapr_tce_platform_iommu_attach_dev, 1185 }; 1186 1187 static struct iommu_domain spapr_tce_platform_domain = { 1188 .type = IOMMU_DOMAIN_PLATFORM, 1189 .ops = &spapr_tce_platform_domain_ops, 1190 }; 1191 1192 static int 1193 spapr_tce_blocked_iommu_attach_dev(struct iommu_domain *platform_domain, 1194 struct device *dev, struct iommu_domain *old) 1195 { 1196 struct iommu_group *grp = iommu_group_get(dev); 1197 struct iommu_table_group *table_group; 1198 int ret = -EINVAL; 1199 1200 /* 1201 * FIXME: SPAPR mixes blocked and platform behaviors, the blocked domain 1202 * also sets the dma_api ops 1203 */ 1204 table_group = iommu_group_get_iommudata(grp); 1205 ret = table_group->ops->take_ownership(table_group, dev); 1206 iommu_group_put(grp); 1207 1208 return ret; 1209 } 1210 1211 static const struct iommu_domain_ops spapr_tce_blocked_domain_ops = { 1212 .attach_dev = spapr_tce_blocked_iommu_attach_dev, 1213 }; 1214 1215 static struct iommu_domain spapr_tce_blocked_domain = { 1216 .type = IOMMU_DOMAIN_BLOCKED, 1217 .ops = &spapr_tce_blocked_domain_ops, 1218 }; 1219 1220 static bool spapr_tce_iommu_capable(struct device *dev, enum iommu_cap cap) 1221 { 1222 switch (cap) { 1223 case IOMMU_CAP_CACHE_COHERENCY: 1224 return true; 1225 default: 1226 break; 1227 } 1228 1229 return false; 1230 } 1231 1232 static struct iommu_device *spapr_tce_iommu_probe_device(struct device *dev) 1233 { 1234 struct pci_dev *pdev; 1235 struct pci_controller *hose; 1236 1237 if (!dev_is_pci(dev)) 1238 return ERR_PTR(-ENODEV); 1239 1240 pdev = to_pci_dev(dev); 1241 hose = pdev->bus->sysdata; 1242 1243 return &hose->iommu; 1244 } 1245 1246 static void spapr_tce_iommu_release_device(struct device *dev) 1247 { 1248 } 1249 1250 static struct iommu_group *spapr_tce_iommu_device_group(struct device *dev) 1251 { 1252 struct pci_controller *hose; 1253 struct pci_dev *pdev; 1254 1255 pdev = to_pci_dev(dev); 1256 hose = pdev->bus->sysdata; 1257 1258 if (!hose->controller_ops.device_group) 1259 return ERR_PTR(-ENOENT); 1260 1261 return hose->controller_ops.device_group(hose, pdev); 1262 } 1263 1264 static const struct iommu_ops spapr_tce_iommu_ops = { 1265 .default_domain = &spapr_tce_platform_domain, 1266 .blocked_domain = &spapr_tce_blocked_domain, 1267 .capable = spapr_tce_iommu_capable, 1268 .probe_device = spapr_tce_iommu_probe_device, 1269 .release_device = spapr_tce_iommu_release_device, 1270 .device_group = spapr_tce_iommu_device_group, 1271 }; 1272 1273 static struct attribute *spapr_tce_iommu_attrs[] = { 1274 NULL, 1275 }; 1276 1277 static struct attribute_group spapr_tce_iommu_group = { 1278 .name = "spapr-tce-iommu", 1279 .attrs = spapr_tce_iommu_attrs, 1280 }; 1281 1282 static const struct attribute_group *spapr_tce_iommu_groups[] = { 1283 &spapr_tce_iommu_group, 1284 NULL, 1285 }; 1286 1287 void ppc_iommu_register_device(struct pci_controller *phb) 1288 { 1289 iommu_device_sysfs_add(&phb->iommu, phb->parent, 1290 spapr_tce_iommu_groups, "iommu-phb%04x", 1291 phb->global_number); 1292 iommu_device_register(&phb->iommu, &spapr_tce_iommu_ops, 1293 phb->parent); 1294 } 1295 1296 void ppc_iommu_unregister_device(struct pci_controller *phb) 1297 { 1298 iommu_device_unregister(&phb->iommu); 1299 iommu_device_sysfs_remove(&phb->iommu); 1300 } 1301 1302 /* 1303 * This registers IOMMU devices of PHBs. This needs to happen 1304 * after core_initcall(iommu_init) + postcore_initcall(pci_driver_init) and 1305 * before subsys_initcall(iommu_subsys_init). 1306 */ 1307 static int __init spapr_tce_setup_phb_iommus_initcall(void) 1308 { 1309 struct pci_controller *hose; 1310 1311 list_for_each_entry(hose, &hose_list, list_node) { 1312 ppc_iommu_register_device(hose); 1313 } 1314 return 0; 1315 } 1316 postcore_initcall_sync(spapr_tce_setup_phb_iommus_initcall); 1317 #endif 1318 1319 #endif /* CONFIG_IOMMU_API */ 1320