// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation
 *
 * Rewrite, cleanup, new allocation schemes, virtual merging:
 * Copyright (C) 2004 Olof Johansson, IBM Corporation
 *               and  Ben. Herrenschmidt, IBM Corporation
 *
 * Dynamic DMA mapping support, bus-independent parts.
 */


#include <linux/init.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/dma-mapping.h>
#include <linux/bitmap.h>
#include <linux/iommu-helper.h>
#include <linux/crash_dump.h>
#include <linux/hash.h>
#include <linux/fault-inject.h>
#include <linux/pci.h>
#include <linux/iommu.h>
#include <linux/sched.h>
#include <linux/debugfs.h>
#include <linux/dcache.h>
#include <asm/io.h>
#include <asm/prom.h>
#include <asm/iommu.h>
#include <asm/pci-bridge.h>
#include <asm/machdep.h>
#include <asm/kdump.h>
#include <asm/fadump.h>
#include <asm/vio.h>
#include <asm/tce.h>
#include <asm/mmu_context.h>

#define DBG(...)
41 42 #ifdef CONFIG_IOMMU_DEBUGFS 43 static int iommu_debugfs_weight_get(void *data, u64 *val) 44 { 45 struct iommu_table *tbl = data; 46 *val = bitmap_weight(tbl->it_map, tbl->it_size); 47 return 0; 48 } 49 DEFINE_DEBUGFS_ATTRIBUTE(iommu_debugfs_fops_weight, iommu_debugfs_weight_get, NULL, "%llu\n"); 50 51 static void iommu_debugfs_add(struct iommu_table *tbl) 52 { 53 char name[10]; 54 struct dentry *liobn_entry; 55 56 sprintf(name, "%08lx", tbl->it_index); 57 liobn_entry = debugfs_create_dir(name, iommu_debugfs_dir); 58 59 debugfs_create_file_unsafe("weight", 0400, liobn_entry, tbl, &iommu_debugfs_fops_weight); 60 debugfs_create_ulong("it_size", 0400, liobn_entry, &tbl->it_size); 61 debugfs_create_ulong("it_page_shift", 0400, liobn_entry, &tbl->it_page_shift); 62 debugfs_create_ulong("it_reserved_start", 0400, liobn_entry, &tbl->it_reserved_start); 63 debugfs_create_ulong("it_reserved_end", 0400, liobn_entry, &tbl->it_reserved_end); 64 debugfs_create_ulong("it_indirect_levels", 0400, liobn_entry, &tbl->it_indirect_levels); 65 debugfs_create_ulong("it_level_size", 0400, liobn_entry, &tbl->it_level_size); 66 } 67 68 static void iommu_debugfs_del(struct iommu_table *tbl) 69 { 70 char name[10]; 71 struct dentry *liobn_entry; 72 73 sprintf(name, "%08lx", tbl->it_index); 74 liobn_entry = debugfs_lookup(name, iommu_debugfs_dir); 75 if (liobn_entry) 76 debugfs_remove(liobn_entry); 77 } 78 #else 79 static void iommu_debugfs_add(struct iommu_table *tbl){} 80 static void iommu_debugfs_del(struct iommu_table *tbl){} 81 #endif 82 83 static int novmerge; 84 85 static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned int); 86 87 static int __init setup_iommu(char *str) 88 { 89 if (!strcmp(str, "novmerge")) 90 novmerge = 1; 91 else if (!strcmp(str, "vmerge")) 92 novmerge = 0; 93 return 1; 94 } 95 96 __setup("iommu=", setup_iommu); 97 98 static DEFINE_PER_CPU(unsigned int, iommu_pool_hash); 99 100 /* 101 * We precalculate the hash to avoid doing it on every allocation. 
102 * 103 * The hash is important to spread CPUs across all the pools. For example, 104 * on a POWER7 with 4 way SMT we want interrupts on the primary threads and 105 * with 4 pools all primary threads would map to the same pool. 106 */ 107 static int __init setup_iommu_pool_hash(void) 108 { 109 unsigned int i; 110 111 for_each_possible_cpu(i) 112 per_cpu(iommu_pool_hash, i) = hash_32(i, IOMMU_POOL_HASHBITS); 113 114 return 0; 115 } 116 subsys_initcall(setup_iommu_pool_hash); 117 118 #ifdef CONFIG_FAIL_IOMMU 119 120 static DECLARE_FAULT_ATTR(fail_iommu); 121 122 static int __init setup_fail_iommu(char *str) 123 { 124 return setup_fault_attr(&fail_iommu, str); 125 } 126 __setup("fail_iommu=", setup_fail_iommu); 127 128 static bool should_fail_iommu(struct device *dev) 129 { 130 return dev->archdata.fail_iommu && should_fail(&fail_iommu, 1); 131 } 132 133 static int __init fail_iommu_debugfs(void) 134 { 135 struct dentry *dir = fault_create_debugfs_attr("fail_iommu", 136 NULL, &fail_iommu); 137 138 return PTR_ERR_OR_ZERO(dir); 139 } 140 late_initcall(fail_iommu_debugfs); 141 142 static ssize_t fail_iommu_show(struct device *dev, 143 struct device_attribute *attr, char *buf) 144 { 145 return sprintf(buf, "%d\n", dev->archdata.fail_iommu); 146 } 147 148 static ssize_t fail_iommu_store(struct device *dev, 149 struct device_attribute *attr, const char *buf, 150 size_t count) 151 { 152 int i; 153 154 if (count > 0 && sscanf(buf, "%d", &i) > 0) 155 dev->archdata.fail_iommu = (i == 0) ? 
0 : 1; 156 157 return count; 158 } 159 160 static DEVICE_ATTR_RW(fail_iommu); 161 162 static int fail_iommu_bus_notify(struct notifier_block *nb, 163 unsigned long action, void *data) 164 { 165 struct device *dev = data; 166 167 if (action == BUS_NOTIFY_ADD_DEVICE) { 168 if (device_create_file(dev, &dev_attr_fail_iommu)) 169 pr_warn("Unable to create IOMMU fault injection sysfs " 170 "entries\n"); 171 } else if (action == BUS_NOTIFY_DEL_DEVICE) { 172 device_remove_file(dev, &dev_attr_fail_iommu); 173 } 174 175 return 0; 176 } 177 178 static struct notifier_block fail_iommu_bus_notifier = { 179 .notifier_call = fail_iommu_bus_notify 180 }; 181 182 static int __init fail_iommu_setup(void) 183 { 184 #ifdef CONFIG_PCI 185 bus_register_notifier(&pci_bus_type, &fail_iommu_bus_notifier); 186 #endif 187 #ifdef CONFIG_IBMVIO 188 bus_register_notifier(&vio_bus_type, &fail_iommu_bus_notifier); 189 #endif 190 191 return 0; 192 } 193 /* 194 * Must execute after PCI and VIO subsystem have initialised but before 195 * devices are probed. 
196 */ 197 arch_initcall(fail_iommu_setup); 198 #else 199 static inline bool should_fail_iommu(struct device *dev) 200 { 201 return false; 202 } 203 #endif 204 205 static unsigned long iommu_range_alloc(struct device *dev, 206 struct iommu_table *tbl, 207 unsigned long npages, 208 unsigned long *handle, 209 unsigned long mask, 210 unsigned int align_order) 211 { 212 unsigned long n, end, start; 213 unsigned long limit; 214 int largealloc = npages > 15; 215 int pass = 0; 216 unsigned long align_mask; 217 unsigned long flags; 218 unsigned int pool_nr; 219 struct iommu_pool *pool; 220 221 align_mask = (1ull << align_order) - 1; 222 223 /* This allocator was derived from x86_64's bit string search */ 224 225 /* Sanity check */ 226 if (unlikely(npages == 0)) { 227 if (printk_ratelimit()) 228 WARN_ON(1); 229 return DMA_MAPPING_ERROR; 230 } 231 232 if (should_fail_iommu(dev)) 233 return DMA_MAPPING_ERROR; 234 235 /* 236 * We don't need to disable preemption here because any CPU can 237 * safely use any IOMMU pool. 238 */ 239 pool_nr = raw_cpu_read(iommu_pool_hash) & (tbl->nr_pools - 1); 240 241 if (largealloc) 242 pool = &(tbl->large_pool); 243 else 244 pool = &(tbl->pools[pool_nr]); 245 246 spin_lock_irqsave(&(pool->lock), flags); 247 248 again: 249 if ((pass == 0) && handle && *handle && 250 (*handle >= pool->start) && (*handle < pool->end)) 251 start = *handle; 252 else 253 start = pool->hint; 254 255 limit = pool->end; 256 257 /* The case below can happen if we have a small segment appended 258 * to a large, or when the previous alloc was at the very end of 259 * the available space. If so, go back to the initial start. 260 */ 261 if (start >= limit) 262 start = pool->start; 263 264 if (limit + tbl->it_offset > mask) { 265 limit = mask - tbl->it_offset + 1; 266 /* If we're constrained on address range, first try 267 * at the masked hint to avoid O(n) search complexity, 268 * but on second pass, start at 0 in pool 0. 
269 */ 270 if ((start & mask) >= limit || pass > 0) { 271 spin_unlock(&(pool->lock)); 272 pool = &(tbl->pools[0]); 273 spin_lock(&(pool->lock)); 274 start = pool->start; 275 } else { 276 start &= mask; 277 } 278 } 279 280 n = iommu_area_alloc(tbl->it_map, limit, start, npages, tbl->it_offset, 281 dma_get_seg_boundary_nr_pages(dev, tbl->it_page_shift), 282 align_mask); 283 if (n == -1) { 284 if (likely(pass == 0)) { 285 /* First try the pool from the start */ 286 pool->hint = pool->start; 287 pass++; 288 goto again; 289 290 } else if (pass <= tbl->nr_pools) { 291 /* Now try scanning all the other pools */ 292 spin_unlock(&(pool->lock)); 293 pool_nr = (pool_nr + 1) & (tbl->nr_pools - 1); 294 pool = &tbl->pools[pool_nr]; 295 spin_lock(&(pool->lock)); 296 pool->hint = pool->start; 297 pass++; 298 goto again; 299 300 } else { 301 /* Give up */ 302 spin_unlock_irqrestore(&(pool->lock), flags); 303 return DMA_MAPPING_ERROR; 304 } 305 } 306 307 end = n + npages; 308 309 /* Bump the hint to a new block for small allocs. 
*/ 310 if (largealloc) { 311 /* Don't bump to new block to avoid fragmentation */ 312 pool->hint = end; 313 } else { 314 /* Overflow will be taken care of at the next allocation */ 315 pool->hint = (end + tbl->it_blocksize - 1) & 316 ~(tbl->it_blocksize - 1); 317 } 318 319 /* Update handle for SG allocations */ 320 if (handle) 321 *handle = end; 322 323 spin_unlock_irqrestore(&(pool->lock), flags); 324 325 return n; 326 } 327 328 static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, 329 void *page, unsigned int npages, 330 enum dma_data_direction direction, 331 unsigned long mask, unsigned int align_order, 332 unsigned long attrs) 333 { 334 unsigned long entry; 335 dma_addr_t ret = DMA_MAPPING_ERROR; 336 int build_fail; 337 338 entry = iommu_range_alloc(dev, tbl, npages, NULL, mask, align_order); 339 340 if (unlikely(entry == DMA_MAPPING_ERROR)) 341 return DMA_MAPPING_ERROR; 342 343 entry += tbl->it_offset; /* Offset into real TCE table */ 344 ret = entry << tbl->it_page_shift; /* Set the return dma address */ 345 346 /* Put the TCEs in the HW table */ 347 build_fail = tbl->it_ops->set(tbl, entry, npages, 348 (unsigned long)page & 349 IOMMU_PAGE_MASK(tbl), direction, attrs); 350 351 /* tbl->it_ops->set() only returns non-zero for transient errors. 352 * Clean up the table bitmap in this case and return 353 * DMA_MAPPING_ERROR. For all other errors the functionality is 354 * not altered. 
355 */ 356 if (unlikely(build_fail)) { 357 __iommu_free(tbl, ret, npages); 358 return DMA_MAPPING_ERROR; 359 } 360 361 /* Flush/invalidate TLB caches if necessary */ 362 if (tbl->it_ops->flush) 363 tbl->it_ops->flush(tbl); 364 365 /* Make sure updates are seen by hardware */ 366 mb(); 367 368 return ret; 369 } 370 371 static bool iommu_free_check(struct iommu_table *tbl, dma_addr_t dma_addr, 372 unsigned int npages) 373 { 374 unsigned long entry, free_entry; 375 376 entry = dma_addr >> tbl->it_page_shift; 377 free_entry = entry - tbl->it_offset; 378 379 if (((free_entry + npages) > tbl->it_size) || 380 (entry < tbl->it_offset)) { 381 if (printk_ratelimit()) { 382 printk(KERN_INFO "iommu_free: invalid entry\n"); 383 printk(KERN_INFO "\tentry = 0x%lx\n", entry); 384 printk(KERN_INFO "\tdma_addr = 0x%llx\n", (u64)dma_addr); 385 printk(KERN_INFO "\tTable = 0x%llx\n", (u64)tbl); 386 printk(KERN_INFO "\tbus# = 0x%llx\n", (u64)tbl->it_busno); 387 printk(KERN_INFO "\tsize = 0x%llx\n", (u64)tbl->it_size); 388 printk(KERN_INFO "\tstartOff = 0x%llx\n", (u64)tbl->it_offset); 389 printk(KERN_INFO "\tindex = 0x%llx\n", (u64)tbl->it_index); 390 WARN_ON(1); 391 } 392 393 return false; 394 } 395 396 return true; 397 } 398 399 static struct iommu_pool *get_pool(struct iommu_table *tbl, 400 unsigned long entry) 401 { 402 struct iommu_pool *p; 403 unsigned long largepool_start = tbl->large_pool.start; 404 405 /* The large pool is the last pool at the top of the table */ 406 if (entry >= largepool_start) { 407 p = &tbl->large_pool; 408 } else { 409 unsigned int pool_nr = entry / tbl->poolsize; 410 411 BUG_ON(pool_nr > tbl->nr_pools); 412 p = &tbl->pools[pool_nr]; 413 } 414 415 return p; 416 } 417 418 static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, 419 unsigned int npages) 420 { 421 unsigned long entry, free_entry; 422 unsigned long flags; 423 struct iommu_pool *pool; 424 425 entry = dma_addr >> tbl->it_page_shift; 426 free_entry = entry - tbl->it_offset; 427 428 
pool = get_pool(tbl, free_entry); 429 430 if (!iommu_free_check(tbl, dma_addr, npages)) 431 return; 432 433 tbl->it_ops->clear(tbl, entry, npages); 434 435 spin_lock_irqsave(&(pool->lock), flags); 436 bitmap_clear(tbl->it_map, free_entry, npages); 437 spin_unlock_irqrestore(&(pool->lock), flags); 438 } 439 440 static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, 441 unsigned int npages) 442 { 443 __iommu_free(tbl, dma_addr, npages); 444 445 /* Make sure TLB cache is flushed if the HW needs it. We do 446 * not do an mb() here on purpose, it is not needed on any of 447 * the current platforms. 448 */ 449 if (tbl->it_ops->flush) 450 tbl->it_ops->flush(tbl); 451 } 452 453 int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl, 454 struct scatterlist *sglist, int nelems, 455 unsigned long mask, enum dma_data_direction direction, 456 unsigned long attrs) 457 { 458 dma_addr_t dma_next = 0, dma_addr; 459 struct scatterlist *s, *outs, *segstart; 460 int outcount, incount, i, build_fail = 0; 461 unsigned int align; 462 unsigned long handle; 463 unsigned int max_seg_size; 464 465 BUG_ON(direction == DMA_NONE); 466 467 if ((nelems == 0) || !tbl) 468 return 0; 469 470 outs = s = segstart = &sglist[0]; 471 outcount = 1; 472 incount = nelems; 473 handle = 0; 474 475 /* Init first segment length for backout at failure */ 476 outs->dma_length = 0; 477 478 DBG("sg mapping %d elements:\n", nelems); 479 480 max_seg_size = dma_get_max_seg_size(dev); 481 for_each_sg(sglist, s, nelems, i) { 482 unsigned long vaddr, npages, entry, slen; 483 484 slen = s->length; 485 /* Sanity check */ 486 if (slen == 0) { 487 dma_next = 0; 488 continue; 489 } 490 /* Allocate iommu entries for that segment */ 491 vaddr = (unsigned long) sg_virt(s); 492 npages = iommu_num_pages(vaddr, slen, IOMMU_PAGE_SIZE(tbl)); 493 align = 0; 494 if (tbl->it_page_shift < PAGE_SHIFT && slen >= PAGE_SIZE && 495 (vaddr & ~PAGE_MASK) == 0) 496 align = PAGE_SHIFT - tbl->it_page_shift; 497 entry = 
iommu_range_alloc(dev, tbl, npages, &handle, 498 mask >> tbl->it_page_shift, align); 499 500 DBG(" - vaddr: %lx, size: %lx\n", vaddr, slen); 501 502 /* Handle failure */ 503 if (unlikely(entry == DMA_MAPPING_ERROR)) { 504 if (!(attrs & DMA_ATTR_NO_WARN) && 505 printk_ratelimit()) 506 dev_info(dev, "iommu_alloc failed, tbl %p " 507 "vaddr %lx npages %lu\n", tbl, vaddr, 508 npages); 509 goto failure; 510 } 511 512 /* Convert entry to a dma_addr_t */ 513 entry += tbl->it_offset; 514 dma_addr = entry << tbl->it_page_shift; 515 dma_addr |= (s->offset & ~IOMMU_PAGE_MASK(tbl)); 516 517 DBG(" - %lu pages, entry: %lx, dma_addr: %lx\n", 518 npages, entry, dma_addr); 519 520 /* Insert into HW table */ 521 build_fail = tbl->it_ops->set(tbl, entry, npages, 522 vaddr & IOMMU_PAGE_MASK(tbl), 523 direction, attrs); 524 if(unlikely(build_fail)) 525 goto failure; 526 527 /* If we are in an open segment, try merging */ 528 if (segstart != s) { 529 DBG(" - trying merge...\n"); 530 /* We cannot merge if: 531 * - allocated dma_addr isn't contiguous to previous allocation 532 */ 533 if (novmerge || (dma_addr != dma_next) || 534 (outs->dma_length + s->length > max_seg_size)) { 535 /* Can't merge: create a new segment */ 536 segstart = s; 537 outcount++; 538 outs = sg_next(outs); 539 DBG(" can't merge, new segment.\n"); 540 } else { 541 outs->dma_length += s->length; 542 DBG(" merged, new len: %ux\n", outs->dma_length); 543 } 544 } 545 546 if (segstart == s) { 547 /* This is a new segment, fill entries */ 548 DBG(" - filling new segment.\n"); 549 outs->dma_address = dma_addr; 550 outs->dma_length = slen; 551 } 552 553 /* Calculate next page pointer for contiguous check */ 554 dma_next = dma_addr + slen; 555 556 DBG(" - dma next is: %lx\n", dma_next); 557 } 558 559 /* Flush/invalidate TLB caches if necessary */ 560 if (tbl->it_ops->flush) 561 tbl->it_ops->flush(tbl); 562 563 DBG("mapped %d elements:\n", outcount); 564 565 /* For the sake of ppc_iommu_unmap_sg, we clear out the length in the 
566 * next entry of the sglist if we didn't fill the list completely 567 */ 568 if (outcount < incount) { 569 outs = sg_next(outs); 570 outs->dma_address = DMA_MAPPING_ERROR; 571 outs->dma_length = 0; 572 } 573 574 /* Make sure updates are seen by hardware */ 575 mb(); 576 577 return outcount; 578 579 failure: 580 for_each_sg(sglist, s, nelems, i) { 581 if (s->dma_length != 0) { 582 unsigned long vaddr, npages; 583 584 vaddr = s->dma_address & IOMMU_PAGE_MASK(tbl); 585 npages = iommu_num_pages(s->dma_address, s->dma_length, 586 IOMMU_PAGE_SIZE(tbl)); 587 __iommu_free(tbl, vaddr, npages); 588 s->dma_address = DMA_MAPPING_ERROR; 589 s->dma_length = 0; 590 } 591 if (s == outs) 592 break; 593 } 594 return 0; 595 } 596 597 598 void ppc_iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist, 599 int nelems, enum dma_data_direction direction, 600 unsigned long attrs) 601 { 602 struct scatterlist *sg; 603 604 BUG_ON(direction == DMA_NONE); 605 606 if (!tbl) 607 return; 608 609 sg = sglist; 610 while (nelems--) { 611 unsigned int npages; 612 dma_addr_t dma_handle = sg->dma_address; 613 614 if (sg->dma_length == 0) 615 break; 616 npages = iommu_num_pages(dma_handle, sg->dma_length, 617 IOMMU_PAGE_SIZE(tbl)); 618 __iommu_free(tbl, dma_handle, npages); 619 sg = sg_next(sg); 620 } 621 622 /* Flush/invalidate TLBs if necessary. As for iommu_free(), we 623 * do not do an mb() here, the affected platforms do not need it 624 * when freeing. 625 */ 626 if (tbl->it_ops->flush) 627 tbl->it_ops->flush(tbl); 628 } 629 630 static void iommu_table_clear(struct iommu_table *tbl) 631 { 632 /* 633 * In case of firmware assisted dump system goes through clean 634 * reboot process at the time of system crash. Hence it's safe to 635 * clear the TCE entries if firmware assisted dump is active. 
636 */ 637 if (!is_kdump_kernel() || is_fadump_active()) { 638 /* Clear the table in case firmware left allocations in it */ 639 tbl->it_ops->clear(tbl, tbl->it_offset, tbl->it_size); 640 return; 641 } 642 643 #ifdef CONFIG_CRASH_DUMP 644 if (tbl->it_ops->get) { 645 unsigned long index, tceval, tcecount = 0; 646 647 /* Reserve the existing mappings left by the first kernel. */ 648 for (index = 0; index < tbl->it_size; index++) { 649 tceval = tbl->it_ops->get(tbl, index + tbl->it_offset); 650 /* 651 * Freed TCE entry contains 0x7fffffffffffffff on JS20 652 */ 653 if (tceval && (tceval != 0x7fffffffffffffffUL)) { 654 __set_bit(index, tbl->it_map); 655 tcecount++; 656 } 657 } 658 659 if ((tbl->it_size - tcecount) < KDUMP_MIN_TCE_ENTRIES) { 660 printk(KERN_WARNING "TCE table is full; freeing "); 661 printk(KERN_WARNING "%d entries for the kdump boot\n", 662 KDUMP_MIN_TCE_ENTRIES); 663 for (index = tbl->it_size - KDUMP_MIN_TCE_ENTRIES; 664 index < tbl->it_size; index++) 665 __clear_bit(index, tbl->it_map); 666 } 667 } 668 #endif 669 } 670 671 static void iommu_table_reserve_pages(struct iommu_table *tbl, 672 unsigned long res_start, unsigned long res_end) 673 { 674 int i; 675 676 WARN_ON_ONCE(res_end < res_start); 677 /* 678 * Reserve page 0 so it will not be used for any mappings. 679 * This avoids buggy drivers that consider page 0 to be invalid 680 * to crash the machine or even lose data. 
681 */ 682 if (tbl->it_offset == 0) 683 set_bit(0, tbl->it_map); 684 685 tbl->it_reserved_start = res_start; 686 tbl->it_reserved_end = res_end; 687 688 /* Check if res_start..res_end isn't empty and overlaps the table */ 689 if (res_start && res_end && 690 (tbl->it_offset + tbl->it_size < res_start || 691 res_end < tbl->it_offset)) 692 return; 693 694 for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i) 695 set_bit(i - tbl->it_offset, tbl->it_map); 696 } 697 698 static void iommu_table_release_pages(struct iommu_table *tbl) 699 { 700 int i; 701 702 /* 703 * In case we have reserved the first bit, we should not emit 704 * the warning below. 705 */ 706 if (tbl->it_offset == 0) 707 clear_bit(0, tbl->it_map); 708 709 for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i) 710 clear_bit(i - tbl->it_offset, tbl->it_map); 711 } 712 713 /* 714 * Build a iommu_table structure. This contains a bit map which 715 * is used to manage allocation of the tce space. 716 */ 717 struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid, 718 unsigned long res_start, unsigned long res_end) 719 { 720 unsigned long sz; 721 static int welcomed = 0; 722 struct page *page; 723 unsigned int i; 724 struct iommu_pool *p; 725 726 BUG_ON(!tbl->it_ops); 727 728 /* number of bytes needed for the bitmap */ 729 sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long); 730 731 page = alloc_pages_node(nid, GFP_KERNEL, get_order(sz)); 732 if (!page) 733 panic("iommu_init_table: Can't allocate %ld bytes\n", sz); 734 tbl->it_map = page_address(page); 735 memset(tbl->it_map, 0, sz); 736 737 iommu_table_reserve_pages(tbl, res_start, res_end); 738 739 /* We only split the IOMMU table if we have 1GB or more of space */ 740 if ((tbl->it_size << tbl->it_page_shift) >= (1UL * 1024 * 1024 * 1024)) 741 tbl->nr_pools = IOMMU_NR_POOLS; 742 else 743 tbl->nr_pools = 1; 744 745 /* We reserve the top 1/4 of the table for large allocations */ 746 tbl->poolsize = (tbl->it_size * 3 / 4) / 
tbl->nr_pools; 747 748 for (i = 0; i < tbl->nr_pools; i++) { 749 p = &tbl->pools[i]; 750 spin_lock_init(&(p->lock)); 751 p->start = tbl->poolsize * i; 752 p->hint = p->start; 753 p->end = p->start + tbl->poolsize; 754 } 755 756 p = &tbl->large_pool; 757 spin_lock_init(&(p->lock)); 758 p->start = tbl->poolsize * i; 759 p->hint = p->start; 760 p->end = tbl->it_size; 761 762 iommu_table_clear(tbl); 763 764 if (!welcomed) { 765 printk(KERN_INFO "IOMMU table initialized, virtual merging %s\n", 766 novmerge ? "disabled" : "enabled"); 767 welcomed = 1; 768 } 769 770 iommu_debugfs_add(tbl); 771 772 return tbl; 773 } 774 775 static void iommu_table_free(struct kref *kref) 776 { 777 unsigned long bitmap_sz; 778 unsigned int order; 779 struct iommu_table *tbl; 780 781 tbl = container_of(kref, struct iommu_table, it_kref); 782 783 if (tbl->it_ops->free) 784 tbl->it_ops->free(tbl); 785 786 if (!tbl->it_map) { 787 kfree(tbl); 788 return; 789 } 790 791 iommu_debugfs_del(tbl); 792 793 iommu_table_release_pages(tbl); 794 795 /* verify that table contains no entries */ 796 if (!bitmap_empty(tbl->it_map, tbl->it_size)) 797 pr_warn("%s: Unexpected TCEs\n", __func__); 798 799 /* calculate bitmap size in bytes */ 800 bitmap_sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long); 801 802 /* free bitmap */ 803 order = get_order(bitmap_sz); 804 free_pages((unsigned long) tbl->it_map, order); 805 806 /* free table */ 807 kfree(tbl); 808 } 809 810 struct iommu_table *iommu_tce_table_get(struct iommu_table *tbl) 811 { 812 if (kref_get_unless_zero(&tbl->it_kref)) 813 return tbl; 814 815 return NULL; 816 } 817 EXPORT_SYMBOL_GPL(iommu_tce_table_get); 818 819 int iommu_tce_table_put(struct iommu_table *tbl) 820 { 821 if (WARN_ON(!tbl)) 822 return 0; 823 824 return kref_put(&tbl->it_kref, iommu_table_free); 825 } 826 EXPORT_SYMBOL_GPL(iommu_tce_table_put); 827 828 /* Creates TCEs for a user provided buffer. The user buffer must be 829 * contiguous real kernel storage (not vmalloc). 
The address passed here 830 * comprises a page address and offset into that page. The dma_addr_t 831 * returned will point to the same byte within the page as was passed in. 832 */ 833 dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl, 834 struct page *page, unsigned long offset, size_t size, 835 unsigned long mask, enum dma_data_direction direction, 836 unsigned long attrs) 837 { 838 dma_addr_t dma_handle = DMA_MAPPING_ERROR; 839 void *vaddr; 840 unsigned long uaddr; 841 unsigned int npages, align; 842 843 BUG_ON(direction == DMA_NONE); 844 845 vaddr = page_address(page) + offset; 846 uaddr = (unsigned long)vaddr; 847 848 if (tbl) { 849 npages = iommu_num_pages(uaddr, size, IOMMU_PAGE_SIZE(tbl)); 850 align = 0; 851 if (tbl->it_page_shift < PAGE_SHIFT && size >= PAGE_SIZE && 852 ((unsigned long)vaddr & ~PAGE_MASK) == 0) 853 align = PAGE_SHIFT - tbl->it_page_shift; 854 855 dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction, 856 mask >> tbl->it_page_shift, align, 857 attrs); 858 if (dma_handle == DMA_MAPPING_ERROR) { 859 if (!(attrs & DMA_ATTR_NO_WARN) && 860 printk_ratelimit()) { 861 dev_info(dev, "iommu_alloc failed, tbl %p " 862 "vaddr %p npages %d\n", tbl, vaddr, 863 npages); 864 } 865 } else 866 dma_handle |= (uaddr & ~IOMMU_PAGE_MASK(tbl)); 867 } 868 869 return dma_handle; 870 } 871 872 void iommu_unmap_page(struct iommu_table *tbl, dma_addr_t dma_handle, 873 size_t size, enum dma_data_direction direction, 874 unsigned long attrs) 875 { 876 unsigned int npages; 877 878 BUG_ON(direction == DMA_NONE); 879 880 if (tbl) { 881 npages = iommu_num_pages(dma_handle, size, 882 IOMMU_PAGE_SIZE(tbl)); 883 iommu_free(tbl, dma_handle, npages); 884 } 885 } 886 887 /* Allocates a contiguous real buffer and creates mappings over it. 888 * Returns the virtual address of the buffer and sets dma_handle 889 * to the dma address (mapping) of the first page. 
890 */ 891 void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl, 892 size_t size, dma_addr_t *dma_handle, 893 unsigned long mask, gfp_t flag, int node) 894 { 895 void *ret = NULL; 896 dma_addr_t mapping; 897 unsigned int order; 898 unsigned int nio_pages, io_order; 899 struct page *page; 900 901 size = PAGE_ALIGN(size); 902 order = get_order(size); 903 904 /* 905 * Client asked for way too much space. This is checked later 906 * anyway. It is easier to debug here for the drivers than in 907 * the tce tables. 908 */ 909 if (order >= IOMAP_MAX_ORDER) { 910 dev_info(dev, "iommu_alloc_consistent size too large: 0x%lx\n", 911 size); 912 return NULL; 913 } 914 915 if (!tbl) 916 return NULL; 917 918 /* Alloc enough pages (and possibly more) */ 919 page = alloc_pages_node(node, flag, order); 920 if (!page) 921 return NULL; 922 ret = page_address(page); 923 memset(ret, 0, size); 924 925 /* Set up tces to cover the allocated range */ 926 nio_pages = size >> tbl->it_page_shift; 927 io_order = get_iommu_order(size, tbl); 928 mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL, 929 mask >> tbl->it_page_shift, io_order, 0); 930 if (mapping == DMA_MAPPING_ERROR) { 931 free_pages((unsigned long)ret, order); 932 return NULL; 933 } 934 *dma_handle = mapping; 935 return ret; 936 } 937 938 void iommu_free_coherent(struct iommu_table *tbl, size_t size, 939 void *vaddr, dma_addr_t dma_handle) 940 { 941 if (tbl) { 942 unsigned int nio_pages; 943 944 size = PAGE_ALIGN(size); 945 nio_pages = size >> tbl->it_page_shift; 946 iommu_free(tbl, dma_handle, nio_pages); 947 size = PAGE_ALIGN(size); 948 free_pages((unsigned long)vaddr, get_order(size)); 949 } 950 } 951 952 unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir) 953 { 954 switch (dir) { 955 case DMA_BIDIRECTIONAL: 956 return TCE_PCI_READ | TCE_PCI_WRITE; 957 case DMA_FROM_DEVICE: 958 return TCE_PCI_WRITE; 959 case DMA_TO_DEVICE: 960 return TCE_PCI_READ; 961 default: 962 return 0; 963 } 
964 } 965 EXPORT_SYMBOL_GPL(iommu_direction_to_tce_perm); 966 967 #ifdef CONFIG_IOMMU_API 968 /* 969 * SPAPR TCE API 970 */ 971 static void group_release(void *iommu_data) 972 { 973 struct iommu_table_group *table_group = iommu_data; 974 975 table_group->group = NULL; 976 } 977 978 void iommu_register_group(struct iommu_table_group *table_group, 979 int pci_domain_number, unsigned long pe_num) 980 { 981 struct iommu_group *grp; 982 char *name; 983 984 grp = iommu_group_alloc(); 985 if (IS_ERR(grp)) { 986 pr_warn("powerpc iommu api: cannot create new group, err=%ld\n", 987 PTR_ERR(grp)); 988 return; 989 } 990 table_group->group = grp; 991 iommu_group_set_iommudata(grp, table_group, group_release); 992 name = kasprintf(GFP_KERNEL, "domain%d-pe%lx", 993 pci_domain_number, pe_num); 994 if (!name) 995 return; 996 iommu_group_set_name(grp, name); 997 kfree(name); 998 } 999 1000 enum dma_data_direction iommu_tce_direction(unsigned long tce) 1001 { 1002 if ((tce & TCE_PCI_READ) && (tce & TCE_PCI_WRITE)) 1003 return DMA_BIDIRECTIONAL; 1004 else if (tce & TCE_PCI_READ) 1005 return DMA_TO_DEVICE; 1006 else if (tce & TCE_PCI_WRITE) 1007 return DMA_FROM_DEVICE; 1008 else 1009 return DMA_NONE; 1010 } 1011 EXPORT_SYMBOL_GPL(iommu_tce_direction); 1012 1013 void iommu_flush_tce(struct iommu_table *tbl) 1014 { 1015 /* Flush/invalidate TLB caches if necessary */ 1016 if (tbl->it_ops->flush) 1017 tbl->it_ops->flush(tbl); 1018 1019 /* Make sure updates are seen by hardware */ 1020 mb(); 1021 } 1022 EXPORT_SYMBOL_GPL(iommu_flush_tce); 1023 1024 int iommu_tce_check_ioba(unsigned long page_shift, 1025 unsigned long offset, unsigned long size, 1026 unsigned long ioba, unsigned long npages) 1027 { 1028 unsigned long mask = (1UL << page_shift) - 1; 1029 1030 if (ioba & mask) 1031 return -EINVAL; 1032 1033 ioba >>= page_shift; 1034 if (ioba < offset) 1035 return -EINVAL; 1036 1037 if ((ioba + 1) > (offset + size)) 1038 return -EINVAL; 1039 1040 return 0; 1041 } 1042 
EXPORT_SYMBOL_GPL(iommu_tce_check_ioba); 1043 1044 int iommu_tce_check_gpa(unsigned long page_shift, unsigned long gpa) 1045 { 1046 unsigned long mask = (1UL << page_shift) - 1; 1047 1048 if (gpa & mask) 1049 return -EINVAL; 1050 1051 return 0; 1052 } 1053 EXPORT_SYMBOL_GPL(iommu_tce_check_gpa); 1054 1055 extern long iommu_tce_xchg_no_kill(struct mm_struct *mm, 1056 struct iommu_table *tbl, 1057 unsigned long entry, unsigned long *hpa, 1058 enum dma_data_direction *direction) 1059 { 1060 long ret; 1061 unsigned long size = 0; 1062 1063 ret = tbl->it_ops->xchg_no_kill(tbl, entry, hpa, direction, false); 1064 if (!ret && ((*direction == DMA_FROM_DEVICE) || 1065 (*direction == DMA_BIDIRECTIONAL)) && 1066 !mm_iommu_is_devmem(mm, *hpa, tbl->it_page_shift, 1067 &size)) 1068 SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT)); 1069 1070 return ret; 1071 } 1072 EXPORT_SYMBOL_GPL(iommu_tce_xchg_no_kill); 1073 1074 void iommu_tce_kill(struct iommu_table *tbl, 1075 unsigned long entry, unsigned long pages) 1076 { 1077 if (tbl->it_ops->tce_kill) 1078 tbl->it_ops->tce_kill(tbl, entry, pages, false); 1079 } 1080 EXPORT_SYMBOL_GPL(iommu_tce_kill); 1081 1082 int iommu_take_ownership(struct iommu_table *tbl) 1083 { 1084 unsigned long flags, i, sz = (tbl->it_size + 7) >> 3; 1085 int ret = 0; 1086 1087 /* 1088 * VFIO does not control TCE entries allocation and the guest 1089 * can write new TCEs on top of existing ones so iommu_tce_build() 1090 * must be able to release old pages. This functionality 1091 * requires exchange() callback defined so if it is not 1092 * implemented, we disallow taking ownership over the table. 
1093 */ 1094 if (!tbl->it_ops->xchg_no_kill) 1095 return -EINVAL; 1096 1097 spin_lock_irqsave(&tbl->large_pool.lock, flags); 1098 for (i = 0; i < tbl->nr_pools; i++) 1099 spin_lock(&tbl->pools[i].lock); 1100 1101 iommu_table_release_pages(tbl); 1102 1103 if (!bitmap_empty(tbl->it_map, tbl->it_size)) { 1104 pr_err("iommu_tce: it_map is not empty"); 1105 ret = -EBUSY; 1106 /* Undo iommu_table_release_pages, i.e. restore bit#0, etc */ 1107 iommu_table_reserve_pages(tbl, tbl->it_reserved_start, 1108 tbl->it_reserved_end); 1109 } else { 1110 memset(tbl->it_map, 0xff, sz); 1111 } 1112 1113 for (i = 0; i < tbl->nr_pools; i++) 1114 spin_unlock(&tbl->pools[i].lock); 1115 spin_unlock_irqrestore(&tbl->large_pool.lock, flags); 1116 1117 return ret; 1118 } 1119 EXPORT_SYMBOL_GPL(iommu_take_ownership); 1120 1121 void iommu_release_ownership(struct iommu_table *tbl) 1122 { 1123 unsigned long flags, i, sz = (tbl->it_size + 7) >> 3; 1124 1125 spin_lock_irqsave(&tbl->large_pool.lock, flags); 1126 for (i = 0; i < tbl->nr_pools; i++) 1127 spin_lock(&tbl->pools[i].lock); 1128 1129 memset(tbl->it_map, 0, sz); 1130 1131 iommu_table_reserve_pages(tbl, tbl->it_reserved_start, 1132 tbl->it_reserved_end); 1133 1134 for (i = 0; i < tbl->nr_pools; i++) 1135 spin_unlock(&tbl->pools[i].lock); 1136 spin_unlock_irqrestore(&tbl->large_pool.lock, flags); 1137 } 1138 EXPORT_SYMBOL_GPL(iommu_release_ownership); 1139 1140 int iommu_add_device(struct iommu_table_group *table_group, struct device *dev) 1141 { 1142 /* 1143 * The sysfs entries should be populated before 1144 * binding IOMMU group. If sysfs entries isn't 1145 * ready, we simply bail. 
1146 */ 1147 if (!device_is_registered(dev)) 1148 return -ENOENT; 1149 1150 if (device_iommu_mapped(dev)) { 1151 pr_debug("%s: Skipping device %s with iommu group %d\n", 1152 __func__, dev_name(dev), 1153 iommu_group_id(dev->iommu_group)); 1154 return -EBUSY; 1155 } 1156 1157 pr_debug("%s: Adding %s to iommu group %d\n", 1158 __func__, dev_name(dev), iommu_group_id(table_group->group)); 1159 1160 return iommu_group_add_device(table_group->group, dev); 1161 } 1162 EXPORT_SYMBOL_GPL(iommu_add_device); 1163 1164 void iommu_del_device(struct device *dev) 1165 { 1166 /* 1167 * Some devices might not have IOMMU table and group 1168 * and we needn't detach them from the associated 1169 * IOMMU groups 1170 */ 1171 if (!device_iommu_mapped(dev)) { 1172 pr_debug("iommu_tce: skipping device %s with no tbl\n", 1173 dev_name(dev)); 1174 return; 1175 } 1176 1177 iommu_group_remove_device(dev); 1178 } 1179 EXPORT_SYMBOL_GPL(iommu_del_device); 1180 #endif /* CONFIG_IOMMU_API */ 1181