1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation 4 * 5 * Rewrite, cleanup, new allocation schemes, virtual merging: 6 * Copyright (C) 2004 Olof Johansson, IBM Corporation 7 * and Ben. Herrenschmidt, IBM Corporation 8 * 9 * Dynamic DMA mapping support, bus-independent parts. 10 */ 11 12 13 #include <linux/init.h> 14 #include <linux/types.h> 15 #include <linux/slab.h> 16 #include <linux/mm.h> 17 #include <linux/spinlock.h> 18 #include <linux/string.h> 19 #include <linux/string_choices.h> 20 #include <linux/dma-mapping.h> 21 #include <linux/bitmap.h> 22 #include <linux/iommu-helper.h> 23 #include <linux/crash_dump.h> 24 #include <linux/hash.h> 25 #include <linux/fault-inject.h> 26 #include <linux/pci.h> 27 #include <linux/iommu.h> 28 #include <linux/sched.h> 29 #include <linux/debugfs.h> 30 #include <linux/vmalloc.h> 31 #include <asm/io.h> 32 #include <asm/iommu.h> 33 #include <asm/pci-bridge.h> 34 #include <asm/machdep.h> 35 #include <asm/kdump.h> 36 #include <asm/fadump.h> 37 #include <asm/vio.h> 38 #include <asm/tce.h> 39 #include <asm/mmu_context.h> 40 #include <asm/ppc-pci.h> 41 42 #define DBG(...) 43 44 #ifdef CONFIG_IOMMU_DEBUGFS 45 static int iommu_debugfs_weight_get(void *data, u64 *val) 46 { 47 struct iommu_table *tbl = data; 48 *val = bitmap_weight(tbl->it_map, tbl->it_size); 49 return 0; 50 } 51 DEFINE_DEBUGFS_ATTRIBUTE(iommu_debugfs_fops_weight, iommu_debugfs_weight_get, NULL, "%llu\n"); 52 53 static void iommu_debugfs_add(struct iommu_table *tbl) 54 { 55 char name[10]; 56 struct dentry *liobn_entry; 57 58 sprintf(name, "%08lx", tbl->it_index); 59 liobn_entry = debugfs_create_dir(name, iommu_debugfs_dir); 60 61 debugfs_create_file_unsafe("weight", 0400, liobn_entry, tbl, &iommu_debugfs_fops_weight); 62 debugfs_create_ulong("it_size", 0400, liobn_entry, &tbl->it_size); 63 debugfs_create_ulong("it_page_shift", 0400, liobn_entry, &tbl->it_page_shift); 64 debugfs_create_ulong("it_reserved_start", 0400, liobn_entry, &tbl->it_reserved_start); 65 debugfs_create_ulong("it_reserved_end", 0400, liobn_entry, &tbl->it_reserved_end); 66 debugfs_create_ulong("it_indirect_levels", 0400, liobn_entry, &tbl->it_indirect_levels); 67 debugfs_create_ulong("it_level_size", 0400, liobn_entry, &tbl->it_level_size); 68 } 69 70 static void iommu_debugfs_del(struct iommu_table *tbl) 71 { 72 char name[10]; 73 74 sprintf(name, "%08lx", tbl->it_index); 75 debugfs_lookup_and_remove(name, iommu_debugfs_dir); 76 } 77 #else 78 static void iommu_debugfs_add(struct iommu_table *tbl){} 79 static void iommu_debugfs_del(struct iommu_table *tbl){} 80 #endif 81 82 static int novmerge; 83 84 static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned int); 85 86 static int __init setup_iommu(char *str) 87 { 88 if (!strcmp(str, "novmerge")) 89 novmerge = 1; 90 else if (!strcmp(str, "vmerge")) 91 novmerge = 0; 92 return 1; 93 } 94 95 __setup("iommu=", setup_iommu); 96 97 static DEFINE_PER_CPU(unsigned int, iommu_pool_hash); 98 99 /* 100 * We precalculate the hash to avoid doing it on every allocation. 101 * 102 * The hash is important to spread CPUs across all the pools. For example, 103 * on a POWER7 with 4 way SMT we want interrupts on the primary threads and 104 * with 4 pools all primary threads would map to the same pool. 105 */ 106 static int __init setup_iommu_pool_hash(void) 107 { 108 unsigned int i; 109 110 for_each_possible_cpu(i) 111 per_cpu(iommu_pool_hash, i) = hash_32(i, IOMMU_POOL_HASHBITS); 112 113 return 0; 114 } 115 subsys_initcall(setup_iommu_pool_hash); 116 117 #ifdef CONFIG_FAIL_IOMMU 118 119 static DECLARE_FAULT_ATTR(fail_iommu); 120 121 static int __init setup_fail_iommu(char *str) 122 { 123 return setup_fault_attr(&fail_iommu, str); 124 } 125 __setup("fail_iommu=", setup_fail_iommu); 126 127 static bool should_fail_iommu(struct device *dev) 128 { 129 return dev->archdata.fail_iommu && should_fail(&fail_iommu, 1); 130 } 131 132 static int __init fail_iommu_debugfs(void) 133 { 134 struct dentry *dir = fault_create_debugfs_attr("fail_iommu", 135 NULL, &fail_iommu); 136 137 return PTR_ERR_OR_ZERO(dir); 138 } 139 late_initcall(fail_iommu_debugfs); 140 141 static ssize_t fail_iommu_show(struct device *dev, 142 struct device_attribute *attr, char *buf) 143 { 144 return sprintf(buf, "%d\n", dev->archdata.fail_iommu); 145 } 146 147 static ssize_t fail_iommu_store(struct device *dev, 148 struct device_attribute *attr, const char *buf, 149 size_t count) 150 { 151 int i; 152 153 if (count > 0 && sscanf(buf, "%d", &i) > 0) 154 dev->archdata.fail_iommu = (i == 0) ? 0 : 1; 155 156 return count; 157 } 158 159 static DEVICE_ATTR_RW(fail_iommu); 160 161 static int fail_iommu_bus_notify(struct notifier_block *nb, 162 unsigned long action, void *data) 163 { 164 struct device *dev = data; 165 166 if (action == BUS_NOTIFY_ADD_DEVICE) { 167 if (device_create_file(dev, &dev_attr_fail_iommu)) 168 pr_warn("Unable to create IOMMU fault injection sysfs " 169 "entries\n"); 170 } else if (action == BUS_NOTIFY_DEL_DEVICE) { 171 device_remove_file(dev, &dev_attr_fail_iommu); 172 } 173 174 return 0; 175 } 176 177 /* 178 * PCI and VIO buses need separate notifier_block structs, since they're linked 179 * list nodes. Sharing a notifier_block would mean that any notifiers later 180 * registered for PCI buses would also get called by VIO buses and vice versa. 181 */ 182 static struct notifier_block fail_iommu_pci_bus_notifier = { 183 .notifier_call = fail_iommu_bus_notify 184 }; 185 186 #ifdef CONFIG_IBMVIO 187 static struct notifier_block fail_iommu_vio_bus_notifier = { 188 .notifier_call = fail_iommu_bus_notify 189 }; 190 #endif 191 192 static int __init fail_iommu_setup(void) 193 { 194 #ifdef CONFIG_PCI 195 bus_register_notifier(&pci_bus_type, &fail_iommu_pci_bus_notifier); 196 #endif 197 #ifdef CONFIG_IBMVIO 198 bus_register_notifier(&vio_bus_type, &fail_iommu_vio_bus_notifier); 199 #endif 200 201 return 0; 202 } 203 /* 204 * Must execute after PCI and VIO subsystem have initialised but before 205 * devices are probed. 206 */ 207 arch_initcall(fail_iommu_setup); 208 #else 209 static inline bool should_fail_iommu(struct device *dev) 210 { 211 return false; 212 } 213 #endif 214 215 static unsigned long iommu_range_alloc(struct device *dev, 216 struct iommu_table *tbl, 217 unsigned long npages, 218 unsigned long *handle, 219 unsigned long mask, 220 unsigned int align_order) 221 { 222 unsigned long n, end, start; 223 unsigned long limit; 224 int largealloc = npages > 15; 225 int pass = 0; 226 unsigned long align_mask; 227 unsigned long flags; 228 unsigned int pool_nr; 229 struct iommu_pool *pool; 230 231 align_mask = (1ull << align_order) - 1; 232 233 /* This allocator was derived from x86_64's bit string search */ 234 235 /* Sanity check */ 236 if (unlikely(npages == 0)) { 237 if (printk_ratelimit()) 238 WARN_ON(1); 239 return DMA_MAPPING_ERROR; 240 } 241 242 if (should_fail_iommu(dev)) 243 return DMA_MAPPING_ERROR; 244 245 /* 246 * We don't need to disable preemption here because any CPU can 247 * safely use any IOMMU pool. 248 */ 249 pool_nr = raw_cpu_read(iommu_pool_hash) & (tbl->nr_pools - 1); 250 251 if (largealloc) 252 pool = &(tbl->large_pool); 253 else 254 pool = &(tbl->pools[pool_nr]); 255 256 spin_lock_irqsave(&(pool->lock), flags); 257 258 again: 259 if ((pass == 0) && handle && *handle && 260 (*handle >= pool->start) && (*handle < pool->end)) 261 start = *handle; 262 else 263 start = pool->hint; 264 265 limit = pool->end; 266 267 /* The case below can happen if we have a small segment appended 268 * to a large, or when the previous alloc was at the very end of 269 * the available space. If so, go back to the initial start. 270 */ 271 if (start >= limit) 272 start = pool->start; 273 274 if (limit + tbl->it_offset > mask) { 275 limit = mask - tbl->it_offset + 1; 276 /* If we're constrained on address range, first try 277 * at the masked hint to avoid O(n) search complexity, 278 * but on second pass, start at 0 in pool 0. 279 */ 280 if ((start & mask) >= limit || pass > 0) { 281 spin_unlock(&(pool->lock)); 282 pool = &(tbl->pools[0]); 283 spin_lock(&(pool->lock)); 284 start = pool->start; 285 } else { 286 start &= mask; 287 } 288 } 289 290 n = iommu_area_alloc(tbl->it_map, limit, start, npages, tbl->it_offset, 291 dma_get_seg_boundary_nr_pages(dev, tbl->it_page_shift), 292 align_mask); 293 if (n == -1) { 294 if (likely(pass == 0)) { 295 /* First try the pool from the start */ 296 pool->hint = pool->start; 297 pass++; 298 goto again; 299 300 } else if (pass <= tbl->nr_pools) { 301 /* Now try scanning all the other pools */ 302 spin_unlock(&(pool->lock)); 303 pool_nr = (pool_nr + 1) & (tbl->nr_pools - 1); 304 pool = &tbl->pools[pool_nr]; 305 spin_lock(&(pool->lock)); 306 pool->hint = pool->start; 307 pass++; 308 goto again; 309 310 } else if (pass == tbl->nr_pools + 1) { 311 /* Last resort: try largepool */ 312 spin_unlock(&pool->lock); 313 pool = &tbl->large_pool; 314 spin_lock(&pool->lock); 315 pool->hint = pool->start; 316 pass++; 317 goto again; 318 319 } else { 320 /* Give up */ 321 spin_unlock_irqrestore(&(pool->lock), flags); 322 return DMA_MAPPING_ERROR; 323 } 324 } 325 326 end = n + npages; 327 328 /* Bump the hint to a new block for small allocs. */ 329 if (largealloc) { 330 /* Don't bump to new block to avoid fragmentation */ 331 pool->hint = end; 332 } else { 333 /* Overflow will be taken care of at the next allocation */ 334 pool->hint = (end + tbl->it_blocksize - 1) & 335 ~(tbl->it_blocksize - 1); 336 } 337 338 /* Update handle for SG allocations */ 339 if (handle) 340 *handle = end; 341 342 spin_unlock_irqrestore(&(pool->lock), flags); 343 344 return n; 345 } 346 347 static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, 348 void *page, unsigned int npages, 349 enum dma_data_direction direction, 350 unsigned long mask, unsigned int align_order, 351 unsigned long attrs) 352 { 353 unsigned long entry; 354 dma_addr_t ret = DMA_MAPPING_ERROR; 355 int build_fail; 356 357 entry = iommu_range_alloc(dev, tbl, npages, NULL, mask, align_order); 358 359 if (unlikely(entry == DMA_MAPPING_ERROR)) 360 return DMA_MAPPING_ERROR; 361 362 entry += tbl->it_offset; /* Offset into real TCE table */ 363 ret = entry << tbl->it_page_shift; /* Set the return dma address */ 364 365 /* Put the TCEs in the HW table */ 366 build_fail = tbl->it_ops->set(tbl, entry, npages, 367 (unsigned long)page & 368 IOMMU_PAGE_MASK(tbl), direction, attrs); 369 370 /* tbl->it_ops->set() only returns non-zero for transient errors. 371 * Clean up the table bitmap in this case and return 372 * DMA_MAPPING_ERROR. For all other errors the functionality is 373 * not altered. 374 */ 375 if (unlikely(build_fail)) { 376 __iommu_free(tbl, ret, npages); 377 return DMA_MAPPING_ERROR; 378 } 379 380 /* Flush/invalidate TLB caches if necessary */ 381 if (tbl->it_ops->flush) 382 tbl->it_ops->flush(tbl); 383 384 /* Make sure updates are seen by hardware */ 385 mb(); 386 387 return ret; 388 } 389 390 static bool iommu_free_check(struct iommu_table *tbl, dma_addr_t dma_addr, 391 unsigned int npages) 392 { 393 unsigned long entry, free_entry; 394 395 entry = dma_addr >> tbl->it_page_shift; 396 free_entry = entry - tbl->it_offset; 397 398 if (((free_entry + npages) > tbl->it_size) || 399 (entry < tbl->it_offset)) { 400 if (printk_ratelimit()) { 401 printk(KERN_INFO "iommu_free: invalid entry\n"); 402 printk(KERN_INFO "\tentry = 0x%lx\n", entry); 403 printk(KERN_INFO "\tdma_addr = 0x%llx\n", (u64)dma_addr); 404 printk(KERN_INFO "\tTable = 0x%llx\n", (u64)tbl); 405 printk(KERN_INFO "\tbus# = 0x%llx\n", (u64)tbl->it_busno); 406 printk(KERN_INFO "\tsize = 0x%llx\n", (u64)tbl->it_size); 407 printk(KERN_INFO "\tstartOff = 0x%llx\n", (u64)tbl->it_offset); 408 printk(KERN_INFO "\tindex = 0x%llx\n", (u64)tbl->it_index); 409 WARN_ON(1); 410 } 411 412 return false; 413 } 414 415 return true; 416 } 417 418 static struct iommu_pool *get_pool(struct iommu_table *tbl, 419 unsigned long entry) 420 { 421 struct iommu_pool *p; 422 unsigned long largepool_start = tbl->large_pool.start; 423 424 /* The large pool is the last pool at the top of the table */ 425 if (entry >= largepool_start) { 426 p = &tbl->large_pool; 427 } else { 428 unsigned int pool_nr = entry / tbl->poolsize; 429 430 BUG_ON(pool_nr > tbl->nr_pools); 431 p = &tbl->pools[pool_nr]; 432 } 433 434 return p; 435 } 436 437 static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, 438 unsigned int npages) 439 { 440 unsigned long entry, free_entry; 441 unsigned long flags; 442 struct iommu_pool *pool; 443 444 entry = dma_addr >> tbl->it_page_shift; 445 free_entry = entry - tbl->it_offset; 446 447 pool = get_pool(tbl, free_entry); 448 449 if (!iommu_free_check(tbl, dma_addr, npages)) 450 return; 451 452 tbl->it_ops->clear(tbl, entry, npages); 453 454 spin_lock_irqsave(&(pool->lock), flags); 455 bitmap_clear(tbl->it_map, free_entry, npages); 456 spin_unlock_irqrestore(&(pool->lock), flags); 457 } 458 459 static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, 460 unsigned int npages) 461 { 462 __iommu_free(tbl, dma_addr, npages); 463 464 /* Make sure TLB cache is flushed if the HW needs it. We do 465 * not do an mb() here on purpose, it is not needed on any of 466 * the current platforms. 467 */ 468 if (tbl->it_ops->flush) 469 tbl->it_ops->flush(tbl); 470 } 471 472 int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl, 473 struct scatterlist *sglist, int nelems, 474 unsigned long mask, enum dma_data_direction direction, 475 unsigned long attrs) 476 { 477 dma_addr_t dma_next = 0, dma_addr; 478 struct scatterlist *s, *outs, *segstart; 479 int outcount, incount, i, build_fail = 0; 480 unsigned int align; 481 unsigned long handle; 482 unsigned int max_seg_size; 483 484 BUG_ON(direction == DMA_NONE); 485 486 if ((nelems == 0) || !tbl) 487 return -EINVAL; 488 489 outs = s = segstart = &sglist[0]; 490 outcount = 1; 491 incount = nelems; 492 handle = 0; 493 494 /* Init first segment length for backout at failure */ 495 outs->dma_length = 0; 496 497 DBG("sg mapping %d elements:\n", nelems); 498 499 max_seg_size = dma_get_max_seg_size(dev); 500 for_each_sg(sglist, s, nelems, i) { 501 unsigned long vaddr, npages, entry, slen; 502 503 slen = s->length; 504 /* Sanity check */ 505 if (slen == 0) { 506 dma_next = 0; 507 continue; 508 } 509 /* Allocate iommu entries for that segment */ 510 vaddr = (unsigned long) sg_virt(s); 511 npages = iommu_num_pages(vaddr, slen, IOMMU_PAGE_SIZE(tbl)); 512 align = 0; 513 if (tbl->it_page_shift < PAGE_SHIFT && slen >= PAGE_SIZE && 514 (vaddr & ~PAGE_MASK) == 0) 515 align = PAGE_SHIFT - tbl->it_page_shift; 516 entry = iommu_range_alloc(dev, tbl, npages, &handle, 517 mask >> tbl->it_page_shift, align); 518 519 DBG(" - vaddr: %lx, size: %lx\n", vaddr, slen); 520 521 /* Handle failure */ 522 if (unlikely(entry == DMA_MAPPING_ERROR)) { 523 if (!(attrs & DMA_ATTR_NO_WARN) && 524 printk_ratelimit()) 525 dev_info(dev, "iommu_alloc failed, tbl %p " 526 "vaddr %lx npages %lu\n", tbl, vaddr, 527 npages); 528 goto failure; 529 } 530 531 /* Convert entry to a dma_addr_t */ 532 entry += tbl->it_offset; 533 dma_addr = entry << tbl->it_page_shift; 534 dma_addr |= (vaddr & ~IOMMU_PAGE_MASK(tbl)); 535 536 DBG(" - %lu pages, entry: %lx, dma_addr: %lx\n", 537 npages, entry, dma_addr); 538 539 /* Insert into HW table */ 540 build_fail = tbl->it_ops->set(tbl, entry, npages, 541 vaddr & IOMMU_PAGE_MASK(tbl), 542 direction, attrs); 543 if(unlikely(build_fail)) 544 goto failure; 545 546 /* If we are in an open segment, try merging */ 547 if (segstart != s) { 548 DBG(" - trying merge...\n"); 549 /* We cannot merge if: 550 * - allocated dma_addr isn't contiguous to previous allocation 551 */ 552 if (novmerge || (dma_addr != dma_next) || 553 (outs->dma_length + s->length > max_seg_size)) { 554 /* Can't merge: create a new segment */ 555 segstart = s; 556 outcount++; 557 outs = sg_next(outs); 558 DBG(" can't merge, new segment.\n"); 559 } else { 560 outs->dma_length += s->length; 561 DBG(" merged, new len: %ux\n", outs->dma_length); 562 } 563 } 564 565 if (segstart == s) { 566 /* This is a new segment, fill entries */ 567 DBG(" - filling new segment.\n"); 568 outs->dma_address = dma_addr; 569 outs->dma_length = slen; 570 } 571 572 /* Calculate next page pointer for contiguous check */ 573 dma_next = dma_addr + slen; 574 575 DBG(" - dma next is: %lx\n", dma_next); 576 } 577 578 /* Flush/invalidate TLB caches if necessary */ 579 if (tbl->it_ops->flush) 580 tbl->it_ops->flush(tbl); 581 582 DBG("mapped %d elements:\n", outcount); 583 584 /* For the sake of ppc_iommu_unmap_sg, we clear out the length in the 585 * next entry of the sglist if we didn't fill the list completely 586 */ 587 if (outcount < incount) { 588 outs = sg_next(outs); 589 outs->dma_length = 0; 590 } 591 592 /* Make sure updates are seen by hardware */ 593 mb(); 594 595 return outcount; 596 597 failure: 598 for_each_sg(sglist, s, nelems, i) { 599 if (s->dma_length != 0) { 600 unsigned long vaddr, npages; 601 602 vaddr = s->dma_address & IOMMU_PAGE_MASK(tbl); 603 npages = iommu_num_pages(s->dma_address, s->dma_length, 604 IOMMU_PAGE_SIZE(tbl)); 605 __iommu_free(tbl, vaddr, npages); 606 s->dma_length = 0; 607 } 608 if (s == outs) 609 break; 610 } 611 return -EIO; 612 } 613 614 615 void ppc_iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist, 616 int nelems, enum dma_data_direction direction, 617 unsigned long attrs) 618 { 619 struct scatterlist *sg; 620 621 BUG_ON(direction == DMA_NONE); 622 623 if (!tbl) 624 return; 625 626 sg = sglist; 627 while (nelems--) { 628 unsigned int npages; 629 dma_addr_t dma_handle = sg->dma_address; 630 631 if (sg->dma_length == 0) 632 break; 633 npages = iommu_num_pages(dma_handle, sg->dma_length, 634 IOMMU_PAGE_SIZE(tbl)); 635 __iommu_free(tbl, dma_handle, npages); 636 sg = sg_next(sg); 637 } 638 639 /* Flush/invalidate TLBs if necessary. As for iommu_free(), we 640 * do not do an mb() here, the affected platforms do not need it 641 * when freeing. 642 */ 643 if (tbl->it_ops->flush) 644 tbl->it_ops->flush(tbl); 645 } 646 647 void iommu_table_clear(struct iommu_table *tbl) 648 { 649 /* 650 * In case of firmware assisted dump system goes through clean 651 * reboot process at the time of system crash. Hence it's safe to 652 * clear the TCE entries if firmware assisted dump is active. 653 */ 654 if (!is_kdump_kernel() || is_fadump_active()) { 655 /* Clear the table in case firmware left allocations in it */ 656 tbl->it_ops->clear(tbl, tbl->it_offset, tbl->it_size); 657 return; 658 } 659 660 #ifdef CONFIG_CRASH_DUMP 661 if (tbl->it_ops->get) { 662 unsigned long index, tceval, tcecount = 0; 663 664 /* Reserve the existing mappings left by the first kernel. */ 665 for (index = 0; index < tbl->it_size; index++) { 666 tceval = tbl->it_ops->get(tbl, index + tbl->it_offset); 667 /* 668 * Freed TCE entry contains 0x7fffffffffffffff on JS20 669 */ 670 if (tceval && (tceval != 0x7fffffffffffffffUL)) { 671 __set_bit(index, tbl->it_map); 672 tcecount++; 673 } 674 } 675 676 if ((tbl->it_size - tcecount) < KDUMP_MIN_TCE_ENTRIES) { 677 printk(KERN_WARNING "TCE table is full; freeing "); 678 printk(KERN_WARNING "%d entries for the kdump boot\n", 679 KDUMP_MIN_TCE_ENTRIES); 680 for (index = tbl->it_size - KDUMP_MIN_TCE_ENTRIES; 681 index < tbl->it_size; index++) 682 __clear_bit(index, tbl->it_map); 683 } 684 } 685 #endif 686 } 687 688 void iommu_table_reserve_pages(struct iommu_table *tbl, 689 unsigned long res_start, unsigned long res_end) 690 { 691 unsigned long i; 692 693 WARN_ON_ONCE(res_end < res_start); 694 /* 695 * Reserve page 0 so it will not be used for any mappings. 696 * This avoids buggy drivers that consider page 0 to be invalid 697 * to crash the machine or even lose data. 698 */ 699 if (tbl->it_offset == 0) 700 set_bit(0, tbl->it_map); 701 702 if (res_start < tbl->it_offset) 703 res_start = tbl->it_offset; 704 705 if (res_end > (tbl->it_offset + tbl->it_size)) 706 res_end = tbl->it_offset + tbl->it_size; 707 708 /* Check if res_start..res_end is a valid range in the table */ 709 if (res_start >= res_end) { 710 tbl->it_reserved_start = tbl->it_offset; 711 tbl->it_reserved_end = tbl->it_offset; 712 return; 713 } 714 715 tbl->it_reserved_start = res_start; 716 tbl->it_reserved_end = res_end; 717 718 for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i) 719 set_bit(i - tbl->it_offset, tbl->it_map); 720 } 721 722 /* 723 * Build a iommu_table structure. This contains a bit map which 724 * is used to manage allocation of the tce space. 725 */ 726 struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid, 727 unsigned long res_start, unsigned long res_end) 728 { 729 unsigned long sz; 730 static int welcomed = 0; 731 unsigned int i; 732 struct iommu_pool *p; 733 734 BUG_ON(!tbl->it_ops); 735 736 /* number of bytes needed for the bitmap */ 737 sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long); 738 739 tbl->it_map = vzalloc_node(sz, nid); 740 if (!tbl->it_map) { 741 pr_err("%s: Can't allocate %ld bytes\n", __func__, sz); 742 return NULL; 743 } 744 745 iommu_table_reserve_pages(tbl, res_start, res_end); 746 747 /* We only split the IOMMU table if we have 1GB or more of space */ 748 if ((tbl->it_size << tbl->it_page_shift) >= (1UL * 1024 * 1024 * 1024)) 749 tbl->nr_pools = IOMMU_NR_POOLS; 750 else 751 tbl->nr_pools = 1; 752 753 /* We reserve the top 1/4 of the table for large allocations */ 754 tbl->poolsize = (tbl->it_size * 3 / 4) / tbl->nr_pools; 755 756 for (i = 0; i < tbl->nr_pools; i++) { 757 p = &tbl->pools[i]; 758 spin_lock_init(&(p->lock)); 759 p->start = tbl->poolsize * i; 760 p->hint = p->start; 761 p->end = p->start + tbl->poolsize; 762 } 763 764 p = &tbl->large_pool; 765 spin_lock_init(&(p->lock)); 766 p->start = tbl->poolsize * i; 767 p->hint = p->start; 768 p->end = tbl->it_size; 769 770 iommu_table_clear(tbl); 771 772 if (!welcomed) { 773 pr_info("IOMMU table initialized, virtual merging %s\n", 774 str_disabled_enabled(novmerge)); 775 welcomed = 1; 776 } 777 778 iommu_debugfs_add(tbl); 779 780 return tbl; 781 } 782 783 bool iommu_table_in_use(struct iommu_table *tbl) 784 { 785 unsigned long start = 0, end; 786 787 /* ignore reserved bit0 */ 788 if (tbl->it_offset == 0) 789 start = 1; 790 791 /* Simple case with no reserved MMIO32 region */ 792 if (!tbl->it_reserved_start && !tbl->it_reserved_end) 793 return find_next_bit(tbl->it_map, tbl->it_size, start) != tbl->it_size; 794 795 end = tbl->it_reserved_start - tbl->it_offset; 796 if (find_next_bit(tbl->it_map, end, start) != end) 797 return true; 798 799 start = tbl->it_reserved_end - tbl->it_offset; 800 end = tbl->it_size; 801 return find_next_bit(tbl->it_map, end, start) != end; 802 } 803 804 static void iommu_table_free(struct kref *kref) 805 { 806 struct iommu_table *tbl; 807 808 tbl = container_of(kref, struct iommu_table, it_kref); 809 810 if (tbl->it_ops->free) 811 tbl->it_ops->free(tbl); 812 813 if (!tbl->it_map) { 814 kfree(tbl); 815 return; 816 } 817 818 iommu_debugfs_del(tbl); 819 820 /* verify that table contains no entries */ 821 if (iommu_table_in_use(tbl)) 822 pr_warn("%s: Unexpected TCEs\n", __func__); 823 824 /* free bitmap */ 825 vfree(tbl->it_map); 826 827 /* free table */ 828 kfree(tbl); 829 } 830 831 struct iommu_table *iommu_tce_table_get(struct iommu_table *tbl) 832 { 833 if (kref_get_unless_zero(&tbl->it_kref)) 834 return tbl; 835 836 return NULL; 837 } 838 EXPORT_SYMBOL_GPL(iommu_tce_table_get); 839 840 int iommu_tce_table_put(struct iommu_table *tbl) 841 { 842 if (WARN_ON(!tbl)) 843 return 0; 844 845 return kref_put(&tbl->it_kref, iommu_table_free); 846 } 847 EXPORT_SYMBOL_GPL(iommu_tce_table_put); 848 849 /* Creates TCEs for a user provided buffer. The user buffer must be 850 * contiguous real kernel storage (not vmalloc). The address passed here 851 * comprises a page address and offset into that page. The dma_addr_t 852 * returned will point to the same byte within the page as was passed in. 853 */ 854 dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl, 855 struct page *page, unsigned long offset, size_t size, 856 unsigned long mask, enum dma_data_direction direction, 857 unsigned long attrs) 858 { 859 dma_addr_t dma_handle = DMA_MAPPING_ERROR; 860 void *vaddr; 861 unsigned long uaddr; 862 unsigned int npages, align; 863 864 BUG_ON(direction == DMA_NONE); 865 866 vaddr = page_address(page) + offset; 867 uaddr = (unsigned long)vaddr; 868 869 if (tbl) { 870 npages = iommu_num_pages(uaddr, size, IOMMU_PAGE_SIZE(tbl)); 871 align = 0; 872 if (tbl->it_page_shift < PAGE_SHIFT && size >= PAGE_SIZE && 873 ((unsigned long)vaddr & ~PAGE_MASK) == 0) 874 align = PAGE_SHIFT - tbl->it_page_shift; 875 876 dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction, 877 mask >> tbl->it_page_shift, align, 878 attrs); 879 if (dma_handle == DMA_MAPPING_ERROR) { 880 if (!(attrs & DMA_ATTR_NO_WARN) && 881 printk_ratelimit()) { 882 dev_info(dev, "iommu_alloc failed, tbl %p " 883 "vaddr %p npages %d\n", tbl, vaddr, 884 npages); 885 } 886 } else 887 dma_handle |= (uaddr & ~IOMMU_PAGE_MASK(tbl)); 888 } 889 890 return dma_handle; 891 } 892 893 void iommu_unmap_page(struct iommu_table *tbl, dma_addr_t dma_handle, 894 size_t size, enum dma_data_direction direction, 895 unsigned long attrs) 896 { 897 unsigned int npages; 898 899 BUG_ON(direction == DMA_NONE); 900 901 if (tbl) { 902 npages = iommu_num_pages(dma_handle, size, 903 IOMMU_PAGE_SIZE(tbl)); 904 iommu_free(tbl, dma_handle, npages); 905 } 906 } 907 908 /* Allocates a contiguous real buffer and creates mappings over it. 909 * Returns the virtual address of the buffer and sets dma_handle 910 * to the dma address (mapping) of the first page. 911 */ 912 void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl, 913 size_t size, dma_addr_t *dma_handle, 914 unsigned long mask, gfp_t flag, int node) 915 { 916 void *ret = NULL; 917 dma_addr_t mapping; 918 unsigned int order; 919 unsigned int nio_pages, io_order; 920 struct page *page; 921 int tcesize = (1 << tbl->it_page_shift); 922 923 size = PAGE_ALIGN(size); 924 order = get_order(size); 925 926 /* 927 * Client asked for way too much space. This is checked later 928 * anyway. It is easier to debug here for the drivers than in 929 * the tce tables. 930 */ 931 if (order >= IOMAP_MAX_ORDER) { 932 dev_info(dev, "iommu_alloc_consistent size too large: 0x%lx\n", 933 size); 934 return NULL; 935 } 936 937 if (!tbl) 938 return NULL; 939 940 /* Alloc enough pages (and possibly more) */ 941 page = alloc_pages_node(node, flag, order); 942 if (!page) 943 return NULL; 944 ret = page_address(page); 945 memset(ret, 0, size); 946 947 /* Set up tces to cover the allocated range */ 948 nio_pages = IOMMU_PAGE_ALIGN(size, tbl) >> tbl->it_page_shift; 949 950 io_order = get_iommu_order(size, tbl); 951 mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL, 952 mask >> tbl->it_page_shift, io_order, 0); 953 if (mapping == DMA_MAPPING_ERROR) { 954 free_pages((unsigned long)ret, order); 955 return NULL; 956 } 957 958 *dma_handle = mapping | ((u64)ret & (tcesize - 1)); 959 return ret; 960 } 961 962 void iommu_free_coherent(struct iommu_table *tbl, size_t size, 963 void *vaddr, dma_addr_t dma_handle) 964 { 965 if (tbl) { 966 unsigned int nio_pages; 967 968 size = PAGE_ALIGN(size); 969 nio_pages = IOMMU_PAGE_ALIGN(size, tbl) >> tbl->it_page_shift; 970 iommu_free(tbl, dma_handle, nio_pages); 971 size = PAGE_ALIGN(size); 972 free_pages((unsigned long)vaddr, get_order(size)); 973 } 974 } 975 976 unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir) 977 { 978 switch (dir) { 979 case DMA_BIDIRECTIONAL: 980 return TCE_PCI_READ | TCE_PCI_WRITE; 981 case DMA_FROM_DEVICE: 982 return TCE_PCI_WRITE; 983 case DMA_TO_DEVICE: 984 return TCE_PCI_READ; 985 default: 986 return 0; 987 } 988 } 989 EXPORT_SYMBOL_GPL(iommu_direction_to_tce_perm); 990 991 #ifdef CONFIG_IOMMU_API 992 993 int dev_has_iommu_table(struct device *dev, void *data) 994 { 995 struct pci_dev *pdev = to_pci_dev(dev); 996 struct pci_dev **ppdev = data; 997 998 if (!dev) 999 return 0; 1000 1001 if (device_iommu_mapped(dev)) { 1002 *ppdev = pdev; 1003 return 1; 1004 } 1005 1006 return 0; 1007 } 1008 1009 /* 1010 * SPAPR TCE API 1011 */ 1012 static void group_release(void *iommu_data) 1013 { 1014 struct iommu_table_group *table_group = iommu_data; 1015 1016 table_group->group = NULL; 1017 } 1018 1019 void iommu_register_group(struct iommu_table_group *table_group, 1020 int pci_domain_number, unsigned long pe_num) 1021 { 1022 struct iommu_group *grp; 1023 char *name; 1024 1025 grp = iommu_group_alloc(); 1026 if (IS_ERR(grp)) { 1027 pr_warn("powerpc iommu api: cannot create new group, err=%ld\n", 1028 PTR_ERR(grp)); 1029 return; 1030 } 1031 table_group->group = grp; 1032 iommu_group_set_iommudata(grp, table_group, group_release); 1033 name = kasprintf(GFP_KERNEL, "domain%d-pe%lx", 1034 pci_domain_number, pe_num); 1035 if (!name) 1036 return; 1037 iommu_group_set_name(grp, name); 1038 kfree(name); 1039 } 1040 1041 enum dma_data_direction iommu_tce_direction(unsigned long tce) 1042 { 1043 if ((tce & TCE_PCI_READ) && (tce & TCE_PCI_WRITE)) 1044 return DMA_BIDIRECTIONAL; 1045 else if (tce & TCE_PCI_READ) 1046 return DMA_TO_DEVICE; 1047 else if (tce & TCE_PCI_WRITE) 1048 return DMA_FROM_DEVICE; 1049 else 1050 return DMA_NONE; 1051 } 1052 EXPORT_SYMBOL_GPL(iommu_tce_direction); 1053 1054 void iommu_flush_tce(struct iommu_table *tbl) 1055 { 1056 /* Flush/invalidate TLB caches if necessary */ 1057 if (tbl->it_ops->flush) 1058 tbl->it_ops->flush(tbl); 1059 1060 /* Make sure updates are seen by hardware */ 1061 mb(); 1062 } 1063 EXPORT_SYMBOL_GPL(iommu_flush_tce); 1064 1065 int iommu_tce_check_ioba(unsigned long page_shift, 1066 unsigned long offset, unsigned long size, 1067 unsigned long ioba, unsigned long npages) 1068 { 1069 unsigned long mask = (1UL << page_shift) - 1; 1070 1071 if (ioba & mask) 1072 return -EINVAL; 1073 1074 ioba >>= page_shift; 1075 if (ioba < offset) 1076 return -EINVAL; 1077 1078 if ((ioba + 1) > (offset + size)) 1079 return -EINVAL; 1080 1081 return 0; 1082 } 1083 EXPORT_SYMBOL_GPL(iommu_tce_check_ioba); 1084 1085 int iommu_tce_check_gpa(unsigned long page_shift, unsigned long gpa) 1086 { 1087 unsigned long mask = (1UL << page_shift) - 1; 1088 1089 if (gpa & mask) 1090 return -EINVAL; 1091 1092 return 0; 1093 } 1094 EXPORT_SYMBOL_GPL(iommu_tce_check_gpa); 1095 1096 long iommu_tce_xchg_no_kill(struct mm_struct *mm, 1097 struct iommu_table *tbl, 1098 unsigned long entry, unsigned long *hpa, 1099 enum dma_data_direction *direction) 1100 { 1101 long ret; 1102 unsigned long size = 0; 1103 1104 ret = tbl->it_ops->xchg_no_kill(tbl, entry, hpa, direction); 1105 if (!ret && ((*direction == DMA_FROM_DEVICE) || 1106 (*direction == DMA_BIDIRECTIONAL)) && 1107 !mm_iommu_is_devmem(mm, *hpa, tbl->it_page_shift, 1108 &size)) 1109 SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT)); 1110 1111 return ret; 1112 } 1113 EXPORT_SYMBOL_GPL(iommu_tce_xchg_no_kill); 1114 1115 void iommu_tce_kill(struct iommu_table *tbl, 1116 unsigned long entry, unsigned long pages) 1117 { 1118 if (tbl->it_ops->tce_kill) 1119 tbl->it_ops->tce_kill(tbl, entry, pages); 1120 } 1121 EXPORT_SYMBOL_GPL(iommu_tce_kill); 1122 1123 int iommu_add_device(struct iommu_table_group *table_group, struct device *dev) 1124 { 1125 /* 1126 * The sysfs entries should be populated before 1127 * binding IOMMU group. If sysfs entries isn't 1128 * ready, we simply bail. 1129 */ 1130 if (!device_is_registered(dev)) 1131 return -ENOENT; 1132 1133 if (device_iommu_mapped(dev)) { 1134 pr_debug("%s: Skipping device %s with iommu group %d\n", 1135 __func__, dev_name(dev), 1136 iommu_group_id(dev->iommu_group)); 1137 return -EBUSY; 1138 } 1139 1140 pr_debug("%s: Adding %s to iommu group %d\n", 1141 __func__, dev_name(dev), iommu_group_id(table_group->group)); 1142 /* 1143 * This is still not adding devices via the IOMMU bus notifier because 1144 * of pcibios_init() from arch/powerpc/kernel/pci_64.c which calls 1145 * pcibios_scan_phb() first (and this guy adds devices and triggers 1146 * the notifier) and only then it calls pci_bus_add_devices() which 1147 * configures DMA for buses which also creates PEs and IOMMU groups. 1148 */ 1149 return iommu_probe_device(dev); 1150 } 1151 EXPORT_SYMBOL_GPL(iommu_add_device); 1152 1153 #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) 1154 /* 1155 * A simple iommu_ops to allow less cruft in generic VFIO code. 1156 */ 1157 static int 1158 spapr_tce_platform_iommu_attach_dev(struct iommu_domain *platform_domain, 1159 struct device *dev) 1160 { 1161 struct iommu_domain *domain = iommu_get_domain_for_dev(dev); 1162 struct iommu_table_group *table_group; 1163 struct iommu_group *grp; 1164 1165 /* At first attach the ownership is already set */ 1166 if (!domain) 1167 return 0; 1168 1169 grp = iommu_group_get(dev); 1170 table_group = iommu_group_get_iommudata(grp); 1171 /* 1172 * The domain being set to PLATFORM from earlier 1173 * BLOCKED. The table_group ownership has to be released. 1174 */ 1175 table_group->ops->release_ownership(table_group, dev); 1176 iommu_group_put(grp); 1177 1178 return 0; 1179 } 1180 1181 static const struct iommu_domain_ops spapr_tce_platform_domain_ops = { 1182 .attach_dev = spapr_tce_platform_iommu_attach_dev, 1183 }; 1184 1185 static struct iommu_domain spapr_tce_platform_domain = { 1186 .type = IOMMU_DOMAIN_PLATFORM, 1187 .ops = &spapr_tce_platform_domain_ops, 1188 }; 1189 1190 static int 1191 spapr_tce_blocked_iommu_attach_dev(struct iommu_domain *platform_domain, 1192 struct device *dev) 1193 { 1194 struct iommu_group *grp = iommu_group_get(dev); 1195 struct iommu_table_group *table_group; 1196 int ret = -EINVAL; 1197 1198 /* 1199 * FIXME: SPAPR mixes blocked and platform behaviors, the blocked domain 1200 * also sets the dma_api ops 1201 */ 1202 table_group = iommu_group_get_iommudata(grp); 1203 ret = table_group->ops->take_ownership(table_group, dev); 1204 iommu_group_put(grp); 1205 1206 return ret; 1207 } 1208 1209 static const struct iommu_domain_ops spapr_tce_blocked_domain_ops = { 1210 .attach_dev = spapr_tce_blocked_iommu_attach_dev, 1211 }; 1212 1213 static struct iommu_domain spapr_tce_blocked_domain = { 1214 .type = IOMMU_DOMAIN_BLOCKED, 1215 .ops = &spapr_tce_blocked_domain_ops, 1216 }; 1217 1218 static bool spapr_tce_iommu_capable(struct device *dev, enum iommu_cap cap) 1219 { 1220 switch (cap) { 1221 case IOMMU_CAP_CACHE_COHERENCY: 1222 return true; 1223 default: 1224 break; 1225 } 1226 1227 return false; 1228 } 1229 1230 static struct iommu_device *spapr_tce_iommu_probe_device(struct device *dev) 1231 { 1232 struct pci_dev *pdev; 1233 struct pci_controller *hose; 1234 1235 if (!dev_is_pci(dev)) 1236 return ERR_PTR(-ENODEV); 1237 1238 pdev = to_pci_dev(dev); 1239 hose = pdev->bus->sysdata; 1240 1241 return &hose->iommu; 1242 } 1243 1244 static void spapr_tce_iommu_release_device(struct device *dev) 1245 { 1246 } 1247 1248 static struct iommu_group *spapr_tce_iommu_device_group(struct device *dev) 1249 { 1250 struct pci_controller *hose; 1251 struct pci_dev *pdev; 1252 1253 pdev = to_pci_dev(dev); 1254 hose = pdev->bus->sysdata; 1255 1256 if (!hose->controller_ops.device_group) 1257 return ERR_PTR(-ENOENT); 1258 1259 return hose->controller_ops.device_group(hose, pdev); 1260 } 1261 1262 static const struct iommu_ops spapr_tce_iommu_ops = { 1263 .default_domain = &spapr_tce_platform_domain, 1264 .blocked_domain = &spapr_tce_blocked_domain, 1265 .capable = spapr_tce_iommu_capable, 1266 .probe_device = spapr_tce_iommu_probe_device, 1267 .release_device = spapr_tce_iommu_release_device, 1268 .device_group = spapr_tce_iommu_device_group, 1269 }; 1270 1271 static struct attribute *spapr_tce_iommu_attrs[] = { 1272 NULL, 1273 }; 1274 1275 static struct attribute_group spapr_tce_iommu_group = { 1276 .name = "spapr-tce-iommu", 1277 .attrs = spapr_tce_iommu_attrs, 1278 }; 1279 1280 static const struct attribute_group *spapr_tce_iommu_groups[] = { 1281 &spapr_tce_iommu_group, 1282 NULL, 1283 }; 1284 1285 void ppc_iommu_register_device(struct pci_controller *phb) 1286 { 1287 iommu_device_sysfs_add(&phb->iommu, phb->parent, 1288 spapr_tce_iommu_groups, "iommu-phb%04x", 1289 phb->global_number); 1290 iommu_device_register(&phb->iommu, &spapr_tce_iommu_ops, 1291 phb->parent); 1292 } 1293 1294 void ppc_iommu_unregister_device(struct pci_controller *phb) 1295 { 1296 iommu_device_unregister(&phb->iommu); 1297 iommu_device_sysfs_remove(&phb->iommu); 1298 } 1299 1300 /* 1301 * This registers IOMMU devices of PHBs. This needs to happen 1302 * after core_initcall(iommu_init) + postcore_initcall(pci_driver_init) and 1303 * before subsys_initcall(iommu_subsys_init). 1304 */ 1305 static int __init spapr_tce_setup_phb_iommus_initcall(void) 1306 { 1307 struct pci_controller *hose; 1308 1309 list_for_each_entry(hose, &hose_list, list_node) { 1310 ppc_iommu_register_device(hose); 1311 } 1312 return 0; 1313 } 1314 postcore_initcall_sync(spapr_tce_setup_phb_iommus_initcall); 1315 #endif 1316 1317 #endif /* CONFIG_IOMMU_API */ 1318