/*
 * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation
 *
 * Rewrite, cleanup, new allocation schemes, virtual merging:
 * Copyright (C) 2004 Olof Johansson, IBM Corporation
 *               and  Ben. Herrenschmidt, IBM Corporation
 *
 * Dynamic DMA mapping support, bus-independent parts.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
 */


#include <linux/init.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/dma-mapping.h>
#include <linux/bitops.h>
#include <asm/io.h>
#include <asm/prom.h>
#include <asm/iommu.h>
#include <asm/pci-bridge.h>
#include <asm/machdep.h>
#include <asm/kdump.h>

#define DBG(...)

#ifdef CONFIG_IOMMU_VMERGE
static int novmerge = 0;
#else
static int novmerge = 1;
#endif

static inline unsigned long iommu_num_pages(unsigned long vaddr,
					    unsigned long slen)
{
	unsigned long npages;

	npages = IOMMU_PAGE_ALIGN(vaddr + slen) - (vaddr & IOMMU_PAGE_MASK);
	npages >>= IOMMU_PAGE_SHIFT;

	return npages;
}
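
/*
 * Worked example (illustrative only, assuming the usual 4K IOMMU page
 * size, i.e. IOMMU_PAGE_SHIFT == 12): a 256 byte buffer starting at page
 * offset 0xf80 ends at offset 0x1080, so iommu_num_pages() returns 2.
 * The buffer straddles a page boundary and therefore needs two TCE
 * entries even though it is much smaller than a single page.
 */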

static int __init setup_iommu(char *str)
{
	if (!strcmp(str, "novmerge"))
		novmerge = 1;
	else if (!strcmp(str, "vmerge"))
		novmerge = 0;
	return 1;
}

__setup("iommu=", setup_iommu);

static unsigned long iommu_range_alloc(struct iommu_table *tbl,
				       unsigned long npages,
				       unsigned long *handle,
				       unsigned long mask,
				       unsigned int align_order)
{
	unsigned long n, end, i, start;
	unsigned long limit;
	int largealloc = npages > 15;
	int pass = 0;
	unsigned long align_mask;

	align_mask = 0xffffffffffffffffl >> (64 - align_order);

	/* This allocator was derived from x86_64's bit string search */

	/* Sanity check */
	if (unlikely(npages == 0)) {
		if (printk_ratelimit())
			WARN_ON(1);
		return DMA_ERROR_CODE;
	}

	if (handle && *handle)
		start = *handle;
	else
		start = largealloc ? tbl->it_largehint : tbl->it_hint;

	/* Use only half of the table for small allocs (15 pages or less) */
	limit = largealloc ? tbl->it_size : tbl->it_halfpoint;

	if (largealloc && start < tbl->it_halfpoint)
		start = tbl->it_halfpoint;

	/* The case below can happen if we have a small segment appended
	 * to a large, or when the previous alloc was at the very end of
	 * the available space. If so, go back to the initial start.
	 */
	if (start >= limit)
		start = largealloc ? tbl->it_largehint : tbl->it_hint;

 again:

	if (limit + tbl->it_offset > mask) {
		limit = mask - tbl->it_offset + 1;
		/* If we're constrained on address range, first try
		 * at the masked hint to avoid O(n) search complexity,
		 * but on second pass, start at 0.
		 */
		if ((start & mask) >= limit || pass > 0)
			start = 0;
		else
			start &= mask;
	}

	n = find_next_zero_bit(tbl->it_map, limit, start);

	/* Align allocation */
	n = (n + align_mask) & ~align_mask;

	end = n + npages;

	if (unlikely(end >= limit)) {
		if (likely(pass < 2)) {
			/* First failure, just rescan the half of the table.
			 * Second failure, rescan the other half of the table.
			 */
			start = (largealloc ^ pass) ? tbl->it_halfpoint : 0;
			limit = pass ? tbl->it_size : limit;
			pass++;
			goto again;
		} else {
			/* Third failure, give up */
			return DMA_ERROR_CODE;
		}
	}

	for (i = n; i < end; i++)
		if (test_bit(i, tbl->it_map)) {
			start = i+1;
			goto again;
		}

	for (i = n; i < end; i++)
		__set_bit(i, tbl->it_map);

	/* Bump the hint to a new block for small allocs. */
	if (largealloc) {
		/* Don't bump to new block to avoid fragmentation */
		tbl->it_largehint = end;
	} else {
		/* Overflow will be taken care of at the next allocation */
		tbl->it_hint = (end + tbl->it_blocksize - 1) &
		               ~(tbl->it_blocksize - 1);
	}

	/* Update handle for SG allocations */
	if (handle)
		*handle = end;

	return n;
}

static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *page,
		unsigned int npages, enum dma_data_direction direction,
		unsigned long mask, unsigned int align_order)
{
	unsigned long entry, flags;
	dma_addr_t ret = DMA_ERROR_CODE;

	spin_lock_irqsave(&(tbl->it_lock), flags);

	entry = iommu_range_alloc(tbl, npages, NULL, mask, align_order);

	if (unlikely(entry == DMA_ERROR_CODE)) {
		spin_unlock_irqrestore(&(tbl->it_lock), flags);
		return DMA_ERROR_CODE;
	}

	entry += tbl->it_offset;	/* Offset into real TCE table */
	ret = entry << IOMMU_PAGE_SHIFT;	/* Set the return dma address */

	/* Put the TCEs in the HW table */
	ppc_md.tce_build(tbl, entry, npages, (unsigned long)page & IOMMU_PAGE_MASK,
			 direction);


	/* Flush/invalidate TLB caches if necessary */
	if (ppc_md.tce_flush)
		ppc_md.tce_flush(tbl);

	spin_unlock_irqrestore(&(tbl->it_lock), flags);

	/* Make sure updates are seen by hardware */
	mb();

	return ret;
}
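
/*
 * Illustration of the address math in iommu_alloc() above (again assuming
 * 4K IOMMU pages): an entry 0x10 relative to the start of this table,
 * with it_offset 0x100, becomes absolute TCE entry 0x110 and hence bus
 * address 0x110000.  Callers such as iommu_map_single() below OR the
 * intra-page offset of the buffer back into the returned address.
 */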

static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
			 unsigned int npages)
{
	unsigned long entry, free_entry;
	unsigned long i;

	entry = dma_addr >> IOMMU_PAGE_SHIFT;
	free_entry = entry - tbl->it_offset;

	if (((free_entry + npages) > tbl->it_size) ||
	    (entry < tbl->it_offset)) {
		if (printk_ratelimit()) {
			printk(KERN_INFO "iommu_free: invalid entry\n");
			printk(KERN_INFO "\tentry = 0x%lx\n", entry);
			printk(KERN_INFO "\tdma_addr = 0x%lx\n", (u64)dma_addr);
			printk(KERN_INFO "\tTable = 0x%lx\n", (u64)tbl);
			printk(KERN_INFO "\tbus# = 0x%lx\n", (u64)tbl->it_busno);
			printk(KERN_INFO "\tsize = 0x%lx\n", (u64)tbl->it_size);
			printk(KERN_INFO "\tstartOff = 0x%lx\n", (u64)tbl->it_offset);
			printk(KERN_INFO "\tindex = 0x%lx\n", (u64)tbl->it_index);
			WARN_ON(1);
		}
		return;
	}

	ppc_md.tce_free(tbl, entry, npages);

	for (i = 0; i < npages; i++)
		__clear_bit(free_entry+i, tbl->it_map);
}

static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
		unsigned int npages)
{
	unsigned long flags;

	spin_lock_irqsave(&(tbl->it_lock), flags);

	__iommu_free(tbl, dma_addr, npages);

	/* Make sure TLB cache is flushed if the HW needs it. We do
	 * not do an mb() here on purpose, it is not needed on any of
	 * the current platforms.
	 */
	if (ppc_md.tce_flush)
		ppc_md.tce_flush(tbl);

	spin_unlock_irqrestore(&(tbl->it_lock), flags);
}

int iommu_map_sg(struct iommu_table *tbl, struct scatterlist *sglist,
		 int nelems, unsigned long mask,
		 enum dma_data_direction direction)
{
	dma_addr_t dma_next = 0, dma_addr;
	unsigned long flags;
	struct scatterlist *s, *outs, *segstart;
	int outcount, incount;
	unsigned long handle;

	BUG_ON(direction == DMA_NONE);

	if ((nelems == 0) || !tbl)
		return 0;

	outs = s = segstart = &sglist[0];
	outcount = 1;
	incount = nelems;
	handle = 0;

	/* Init first segment length for backout at failure */
	outs->dma_length = 0;

	DBG("sg mapping %d elements:\n", nelems);

	spin_lock_irqsave(&(tbl->it_lock), flags);

	for (s = outs; nelems; nelems--, s++) {
		unsigned long vaddr, npages, entry, slen;

		slen = s->length;
		/* Sanity check */
		if (slen == 0) {
			dma_next = 0;
			continue;
		}
		/* Allocate iommu entries for that segment */
		vaddr = (unsigned long)page_address(s->page) + s->offset;
		npages = iommu_num_pages(vaddr, slen);
		entry = iommu_range_alloc(tbl, npages, &handle, mask >> IOMMU_PAGE_SHIFT, 0);

		DBG("  - vaddr: %lx, size: %lx\n", vaddr, slen);

		/* Handle failure */
		if (unlikely(entry == DMA_ERROR_CODE)) {
			if (printk_ratelimit())
				printk(KERN_INFO "iommu_alloc failed, tbl %p vaddr %lx"
				       " npages %lx\n", tbl, vaddr, npages);
			goto failure;
		}

		/* Convert entry to a dma_addr_t */
		entry += tbl->it_offset;
		dma_addr = entry << IOMMU_PAGE_SHIFT;
		dma_addr |= (s->offset & ~IOMMU_PAGE_MASK);

		DBG("  - %lu pages, entry: %lx, dma_addr: %lx\n",
			    npages, entry, dma_addr);

		/* Insert into HW table */
		ppc_md.tce_build(tbl, entry, npages, vaddr & IOMMU_PAGE_MASK, direction);

		/* If we are in an open segment, try merging */
		if (segstart != s) {
			DBG("  - trying merge...\n");
			/* We cannot merge if:
			 * - allocated dma_addr isn't contiguous to previous allocation
			 */
			if (novmerge || (dma_addr != dma_next)) {
				/* Can't merge: create a new segment */
				segstart = s;
				outcount++; outs++;
				DBG("    can't merge, new segment.\n");
			} else {
				outs->dma_length += s->length;
				DBG("    merged, new len: %ux\n", outs->dma_length);
			}
		}

		if (segstart == s) {
			/* This is a new segment, fill entries */
			DBG("  - filling new segment.\n");
			outs->dma_address = dma_addr;
			outs->dma_length = slen;
		}

		/* Calculate next page pointer for contiguous check */
		dma_next = dma_addr + slen;

		DBG("  - dma next is: %lx\n", dma_next);
	}

	/* Flush/invalidate TLB caches if necessary */
	if (ppc_md.tce_flush)
		ppc_md.tce_flush(tbl);

	spin_unlock_irqrestore(&(tbl->it_lock), flags);

	DBG("mapped %d elements:\n", outcount);

	/* For the sake of iommu_unmap_sg, we clear out the length in the
	 * next entry of the sglist if we didn't fill the list completely
	 */
	if (outcount < incount) {
		outs++;
		outs->dma_address = DMA_ERROR_CODE;
		outs->dma_length = 0;
	}

	/* Make sure updates are seen by hardware */
	mb();

	return outcount;

 failure:
	for (s = &sglist[0]; s <= outs; s++) {
		if (s->dma_length != 0) {
			unsigned long vaddr, npages;

			vaddr = s->dma_address & IOMMU_PAGE_MASK;
			npages = iommu_num_pages(s->dma_address, s->dma_length);
			__iommu_free(tbl, vaddr, npages);
			s->dma_address = DMA_ERROR_CODE;
			s->dma_length = 0;
		}
	}
	spin_unlock_irqrestore(&(tbl->it_lock), flags);
	return 0;
}
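
/*
 * Example of the merge logic above: two scatterlist entries whose TCE
 * allocations land in consecutive IOMMU pages (say bus ranges
 * 0x1000-0x1fff and 0x2000-0x2fff) are returned as a single 8K DMA
 * segment, unless "iommu=novmerge" was given on the kernel command line.
 *
 * Illustrative calling pattern ("tbl", "sg", "nents" and "dma_mask" are
 * hypothetical values normally supplied by the bus-specific DMA mapping
 * glue, not by this file):
 *
 *	int mapped = iommu_map_sg(tbl, sg, nents, dma_mask, DMA_TO_DEVICE);
 *	if (mapped == 0)
 *		goto map_failed;
 *	...
 *	iommu_unmap_sg(tbl, sg, nents, DMA_TO_DEVICE);
 *
 * Note that iommu_unmap_sg() below takes the original nelems; it stops at
 * the first entry whose dma_length is zero.
 */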


void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
		int nelems, enum dma_data_direction direction)
{
	unsigned long flags;

	BUG_ON(direction == DMA_NONE);

	if (!tbl)
		return;

	spin_lock_irqsave(&(tbl->it_lock), flags);

	while (nelems--) {
		unsigned int npages;
		dma_addr_t dma_handle = sglist->dma_address;

		if (sglist->dma_length == 0)
			break;
		npages = iommu_num_pages(dma_handle, sglist->dma_length);
		__iommu_free(tbl, dma_handle, npages);
		sglist++;
	}

	/* Flush/invalidate TLBs if necessary. As for iommu_free(), we
	 * do not do an mb() here, the affected platforms do not need it
	 * when freeing.
	 */
	if (ppc_md.tce_flush)
		ppc_md.tce_flush(tbl);

	spin_unlock_irqrestore(&(tbl->it_lock), flags);
}

/*
 * Build an iommu_table structure.  This contains a bit map which
 * is used to manage allocation of the tce space.
 */
struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
{
	unsigned long sz;
	static int welcomed = 0;
	struct page *page;

	/* Set aside 1/4 of the table for large allocations. */
	tbl->it_halfpoint = tbl->it_size * 3 / 4;

	/* number of bytes needed for the bitmap */
	sz = (tbl->it_size + 7) >> 3;

	page = alloc_pages_node(nid, GFP_ATOMIC, get_order(sz));
	if (!page)
		panic("iommu_init_table: Can't allocate %ld bytes\n", sz);
	tbl->it_map = page_address(page);
	memset(tbl->it_map, 0, sz);

	tbl->it_hint = 0;
	tbl->it_largehint = tbl->it_halfpoint;
	spin_lock_init(&tbl->it_lock);

#ifdef CONFIG_CRASH_DUMP
	if (ppc_md.tce_get) {
		unsigned long index, tceval;
		unsigned long tcecount = 0;

		/*
		 * Reserve the existing mappings left by the first kernel.
		 */
		for (index = 0; index < tbl->it_size; index++) {
			tceval = ppc_md.tce_get(tbl, index + tbl->it_offset);
			/*
			 * Freed TCE entry contains 0x7fffffffffffffff on JS20
			 */
			if (tceval && (tceval != 0x7fffffffffffffffUL)) {
				__set_bit(index, tbl->it_map);
				tcecount++;
			}
		}
		if ((tbl->it_size - tcecount) < KDUMP_MIN_TCE_ENTRIES) {
			printk(KERN_WARNING "TCE table is full; ");
			printk(KERN_WARNING "freeing %d entries for the kdump boot\n",
				KDUMP_MIN_TCE_ENTRIES);
			for (index = tbl->it_size - KDUMP_MIN_TCE_ENTRIES;
				index < tbl->it_size; index++)
				__clear_bit(index, tbl->it_map);
		}
	}
#else
	/* Clear the hardware table in case firmware left allocations in it */
	ppc_md.tce_free(tbl, tbl->it_offset, tbl->it_size);
#endif

	if (!welcomed) {
		printk(KERN_INFO "IOMMU table initialized, virtual merging %s\n",
		       novmerge ? "disabled" : "enabled");
		welcomed = 1;
	}

	return tbl;
}

void iommu_free_table(struct device_node *dn)
{
	struct pci_dn *pdn = dn->data;
	struct iommu_table *tbl = pdn->iommu_table;
	unsigned long bitmap_sz, i;
	unsigned int order;

	if (!tbl || !tbl->it_map) {
		printk(KERN_ERR "%s: expected TCE map for %s\n", __FUNCTION__,
				dn->full_name);
		return;
	}

	/* verify that table contains no entries */
	/* it_size is in entries, and we're examining 64 at a time */
	for (i = 0; i < (tbl->it_size/64); i++) {
		if (tbl->it_map[i] != 0) {
			printk(KERN_WARNING "%s: Unexpected TCEs for %s\n",
				__FUNCTION__, dn->full_name);
			break;
		}
	}

	/* calculate bitmap size in bytes */
	bitmap_sz = (tbl->it_size + 7) / 8;

	/* free bitmap */
	order = get_order(bitmap_sz);
	free_pages((unsigned long) tbl->it_map, order);

	/* free table */
	kfree(tbl);
}

/* Creates TCEs for a user provided buffer.  The user buffer must be
 * contiguous real kernel storage (not vmalloc).  The address of the buffer
 * passed here is the kernel (virtual) address of the buffer.  The buffer
 * need not be page aligned, the dma_addr_t returned will point to the same
 * byte within the page as vaddr.
 */
dma_addr_t iommu_map_single(struct iommu_table *tbl, void *vaddr,
		size_t size, unsigned long mask,
		enum dma_data_direction direction)
{
	dma_addr_t dma_handle = DMA_ERROR_CODE;
	unsigned long uaddr;
	unsigned int npages;

	BUG_ON(direction == DMA_NONE);

	uaddr = (unsigned long)vaddr;
	npages = iommu_num_pages(uaddr, size);

	if (tbl) {
		dma_handle = iommu_alloc(tbl, vaddr, npages, direction,
					 mask >> IOMMU_PAGE_SHIFT, 0);
		if (dma_handle == DMA_ERROR_CODE) {
			if (printk_ratelimit()) {
				printk(KERN_INFO "iommu_alloc failed, "
						"tbl %p vaddr %p npages %d\n",
						tbl, vaddr, npages);
			}
		} else
			dma_handle |= (uaddr & ~IOMMU_PAGE_MASK);
	}

	return dma_handle;
}

void iommu_unmap_single(struct iommu_table *tbl, dma_addr_t dma_handle,
		size_t size, enum dma_data_direction direction)
{
	unsigned int npages;

	BUG_ON(direction == DMA_NONE);

	if (tbl) {
		npages = iommu_num_pages(dma_handle, size);
		iommu_free(tbl, dma_handle, npages);
	}
}
594 */ 595 if (order >= IOMAP_MAX_ORDER) { 596 printk("iommu_alloc_consistent size too large: 0x%lx\n", size); 597 return NULL; 598 } 599 600 if (!tbl) 601 return NULL; 602 603 /* Alloc enough pages (and possibly more) */ 604 page = alloc_pages_node(node, flag, order); 605 if (!page) 606 return NULL; 607 ret = page_address(page); 608 memset(ret, 0, size); 609 610 /* Set up tces to cover the allocated range */ 611 nio_pages = size >> IOMMU_PAGE_SHIFT; 612 io_order = get_iommu_order(size); 613 mapping = iommu_alloc(tbl, ret, nio_pages, DMA_BIDIRECTIONAL, 614 mask >> IOMMU_PAGE_SHIFT, io_order); 615 if (mapping == DMA_ERROR_CODE) { 616 free_pages((unsigned long)ret, order); 617 return NULL; 618 } 619 *dma_handle = mapping; 620 return ret; 621 } 622 623 void iommu_free_coherent(struct iommu_table *tbl, size_t size, 624 void *vaddr, dma_addr_t dma_handle) 625 { 626 if (tbl) { 627 unsigned int nio_pages; 628 629 size = PAGE_ALIGN(size); 630 nio_pages = size >> IOMMU_PAGE_SHIFT; 631 iommu_free(tbl, dma_handle, nio_pages); 632 size = PAGE_ALIGN(size); 633 free_pages((unsigned long)vaddr, get_order(size)); 634 } 635 } 636