xref: /linux/arch/powerpc/kernel/iommu.c (revision ea8b474b5550d353a02f25a5813cb1682509d5e6)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation
4  *
5  * Rewrite, cleanup, new allocation schemes, virtual merging:
6  * Copyright (C) 2004 Olof Johansson, IBM Corporation
7  *               and  Ben. Herrenschmidt, IBM Corporation
8  *
9  * Dynamic DMA mapping support, bus-independent parts.
10  */
11 
12 
13 #include <linux/init.h>
14 #include <linux/types.h>
15 #include <linux/slab.h>
16 #include <linux/sysfs.h>
17 #include <linux/mm.h>
18 #include <linux/spinlock.h>
19 #include <linux/string.h>
20 #include <linux/string_choices.h>
21 #include <linux/dma-mapping.h>
22 #include <linux/bitmap.h>
23 #include <linux/iommu-helper.h>
24 #include <linux/crash_dump.h>
25 #include <linux/hash.h>
26 #include <linux/fault-inject.h>
27 #include <linux/pci.h>
28 #include <linux/iommu.h>
29 #include <linux/sched.h>
30 #include <linux/debugfs.h>
31 #include <linux/vmalloc.h>
32 #include <asm/io.h>
33 #include <asm/iommu.h>
34 #include <asm/pci-bridge.h>
35 #include <asm/machdep.h>
36 #include <asm/kdump.h>
37 #include <asm/fadump.h>
38 #include <asm/vio.h>
39 #include <asm/tce.h>
40 #include <asm/mmu_context.h>
41 #include <asm/ppc-pci.h>
42 
43 #define DBG(...)
44 
45 #ifdef CONFIG_IOMMU_DEBUGFS
46 static int iommu_debugfs_weight_get(void *data, u64 *val)
47 {
48 	struct iommu_table *tbl = data;
49 	*val = bitmap_weight(tbl->it_map, tbl->it_size);
50 	return 0;
51 }
52 DEFINE_DEBUGFS_ATTRIBUTE(iommu_debugfs_fops_weight, iommu_debugfs_weight_get, NULL, "%llu\n");
53 
54 static void iommu_debugfs_add(struct iommu_table *tbl)
55 {
56 	char name[10];
57 	struct dentry *liobn_entry;
58 
59 	sprintf(name, "%08lx", tbl->it_index);
60 	liobn_entry = debugfs_create_dir(name, iommu_debugfs_dir);
61 
62 	debugfs_create_file_unsafe("weight", 0400, liobn_entry, tbl, &iommu_debugfs_fops_weight);
63 	debugfs_create_ulong("it_size", 0400, liobn_entry, &tbl->it_size);
64 	debugfs_create_ulong("it_page_shift", 0400, liobn_entry, &tbl->it_page_shift);
65 	debugfs_create_ulong("it_reserved_start", 0400, liobn_entry, &tbl->it_reserved_start);
66 	debugfs_create_ulong("it_reserved_end", 0400, liobn_entry, &tbl->it_reserved_end);
67 	debugfs_create_ulong("it_indirect_levels", 0400, liobn_entry, &tbl->it_indirect_levels);
68 	debugfs_create_ulong("it_level_size", 0400, liobn_entry, &tbl->it_level_size);
69 }
70 
71 static void iommu_debugfs_del(struct iommu_table *tbl)
72 {
73 	char name[10];
74 
75 	sprintf(name, "%08lx", tbl->it_index);
76 	debugfs_lookup_and_remove(name, iommu_debugfs_dir);
77 }
78 #else
/* Without CONFIG_IOMMU_DEBUGFS the debugfs hooks are empty. */
static void iommu_debugfs_add(struct iommu_table *tbl)
{
}

static void iommu_debugfs_del(struct iommu_table *tbl)
{
}
81 #endif
82 
83 static int novmerge;
84 
85 static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned int);
86 
87 static int __init setup_iommu(char *str)
88 {
89 	if (!strcmp(str, "novmerge"))
90 		novmerge = 1;
91 	else if (!strcmp(str, "vmerge"))
92 		novmerge = 0;
93 	return 1;
94 }
95 
96 __setup("iommu=", setup_iommu);
97 
98 static DEFINE_PER_CPU(unsigned int, iommu_pool_hash);
99 
100 /*
101  * We precalculate the hash to avoid doing it on every allocation.
102  *
103  * The hash is important to spread CPUs across all the pools. For example,
104  * on a POWER7 with 4 way SMT we want interrupts on the primary threads and
105  * with 4 pools all primary threads would map to the same pool.
106  */
107 static int __init setup_iommu_pool_hash(void)
108 {
109 	unsigned int i;
110 
111 	for_each_possible_cpu(i)
112 		per_cpu(iommu_pool_hash, i) = hash_32(i, IOMMU_POOL_HASHBITS);
113 
114 	return 0;
115 }
116 subsys_initcall(setup_iommu_pool_hash);
117 
118 #ifdef CONFIG_FAIL_IOMMU
119 
120 static DECLARE_FAULT_ATTR(fail_iommu);
121 
122 static int __init setup_fail_iommu(char *str)
123 {
124 	return setup_fault_attr(&fail_iommu, str);
125 }
126 __setup("fail_iommu=", setup_fail_iommu);
127 
128 static bool should_fail_iommu(struct device *dev)
129 {
130 	return dev->archdata.fail_iommu && should_fail(&fail_iommu, 1);
131 }
132 
133 static int __init fail_iommu_debugfs(void)
134 {
135 	struct dentry *dir = fault_create_debugfs_attr("fail_iommu",
136 						       NULL, &fail_iommu);
137 
138 	return PTR_ERR_OR_ZERO(dir);
139 }
140 late_initcall(fail_iommu_debugfs);
141 
142 static ssize_t fail_iommu_show(struct device *dev,
143 			       struct device_attribute *attr, char *buf)
144 {
145 	return sysfs_emit(buf, "%d\n", dev->archdata.fail_iommu);
146 }
147 
148 static ssize_t fail_iommu_store(struct device *dev,
149 				struct device_attribute *attr, const char *buf,
150 				size_t count)
151 {
152 	int i;
153 
154 	if (count > 0 && sscanf(buf, "%d", &i) > 0)
155 		dev->archdata.fail_iommu = (i == 0) ? 0 : 1;
156 
157 	return count;
158 }
159 
160 static DEVICE_ATTR_RW(fail_iommu);
161 
162 static int fail_iommu_bus_notify(struct notifier_block *nb,
163 				 unsigned long action, void *data)
164 {
165 	struct device *dev = data;
166 
167 	if (action == BUS_NOTIFY_ADD_DEVICE) {
168 		if (device_create_file(dev, &dev_attr_fail_iommu))
169 			pr_warn("Unable to create IOMMU fault injection sysfs "
170 				"entries\n");
171 	} else if (action == BUS_NOTIFY_DEL_DEVICE) {
172 		device_remove_file(dev, &dev_attr_fail_iommu);
173 	}
174 
175 	return 0;
176 }
177 
178 /*
179  * PCI and VIO buses need separate notifier_block structs, since they're linked
180  * list nodes.  Sharing a notifier_block would mean that any notifiers later
181  * registered for PCI buses would also get called by VIO buses and vice versa.
182  */
183 static struct notifier_block fail_iommu_pci_bus_notifier = {
184 	.notifier_call = fail_iommu_bus_notify
185 };
186 
187 #ifdef CONFIG_IBMVIO
188 static struct notifier_block fail_iommu_vio_bus_notifier = {
189 	.notifier_call = fail_iommu_bus_notify
190 };
191 #endif
192 
193 static int __init fail_iommu_setup(void)
194 {
195 #ifdef CONFIG_PCI
196 	bus_register_notifier(&pci_bus_type, &fail_iommu_pci_bus_notifier);
197 #endif
198 #ifdef CONFIG_IBMVIO
199 	bus_register_notifier(&vio_bus_type, &fail_iommu_vio_bus_notifier);
200 #endif
201 
202 	return 0;
203 }
204 /*
205  * Must execute after PCI and VIO subsystem have initialised but before
206  * devices are probed.
207  */
208 arch_initcall(fail_iommu_setup);
209 #else
210 static inline bool should_fail_iommu(struct device *dev)
211 {
212 	return false;
213 }
214 #endif
215 
216 static unsigned long iommu_range_alloc(struct device *dev,
217 				       struct iommu_table *tbl,
218                                        unsigned long npages,
219                                        unsigned long *handle,
220                                        unsigned long mask,
221                                        unsigned int align_order)
222 {
223 	unsigned long n, end, start;
224 	unsigned long limit;
225 	int largealloc = npages > 15;
226 	int pass = 0;
227 	unsigned long align_mask;
228 	unsigned long flags;
229 	unsigned int pool_nr;
230 	struct iommu_pool *pool;
231 
232 	align_mask = (1ull << align_order) - 1;
233 
234 	/* This allocator was derived from x86_64's bit string search */
235 
236 	/* Sanity check */
237 	if (unlikely(npages == 0)) {
238 		if (printk_ratelimit())
239 			WARN_ON(1);
240 		return DMA_MAPPING_ERROR;
241 	}
242 
243 	if (should_fail_iommu(dev))
244 		return DMA_MAPPING_ERROR;
245 
246 	/*
247 	 * We don't need to disable preemption here because any CPU can
248 	 * safely use any IOMMU pool.
249 	 */
250 	pool_nr = raw_cpu_read(iommu_pool_hash) & (tbl->nr_pools - 1);
251 
252 	if (largealloc)
253 		pool = &(tbl->large_pool);
254 	else
255 		pool = &(tbl->pools[pool_nr]);
256 
257 	spin_lock_irqsave(&(pool->lock), flags);
258 
259 again:
260 	if ((pass == 0) && handle && *handle &&
261 	    (*handle >= pool->start) && (*handle < pool->end))
262 		start = *handle;
263 	else
264 		start = pool->hint;
265 
266 	limit = pool->end;
267 
268 	/* The case below can happen if we have a small segment appended
269 	 * to a large, or when the previous alloc was at the very end of
270 	 * the available space. If so, go back to the initial start.
271 	 */
272 	if (start >= limit)
273 		start = pool->start;
274 
275 	if (limit + tbl->it_offset > mask) {
276 		limit = mask - tbl->it_offset + 1;
277 		/* If we're constrained on address range, first try
278 		 * at the masked hint to avoid O(n) search complexity,
279 		 * but on second pass, start at 0 in pool 0.
280 		 */
281 		if ((start & mask) >= limit || pass > 0) {
282 			spin_unlock(&(pool->lock));
283 			pool = &(tbl->pools[0]);
284 			spin_lock(&(pool->lock));
285 			start = pool->start;
286 		} else {
287 			start &= mask;
288 		}
289 	}
290 
291 	n = iommu_area_alloc(tbl->it_map, limit, start, npages, tbl->it_offset,
292 			dma_get_seg_boundary_nr_pages(dev, tbl->it_page_shift),
293 			align_mask);
294 	if (n == -1) {
295 		if (likely(pass == 0)) {
296 			/* First try the pool from the start */
297 			pool->hint = pool->start;
298 			pass++;
299 			goto again;
300 
301 		} else if (pass <= tbl->nr_pools) {
302 			/* Now try scanning all the other pools */
303 			spin_unlock(&(pool->lock));
304 			pool_nr = (pool_nr + 1) & (tbl->nr_pools - 1);
305 			pool = &tbl->pools[pool_nr];
306 			spin_lock(&(pool->lock));
307 			pool->hint = pool->start;
308 			pass++;
309 			goto again;
310 
311 		} else if (pass == tbl->nr_pools + 1) {
312 			/* Last resort: try largepool */
313 			spin_unlock(&pool->lock);
314 			pool = &tbl->large_pool;
315 			spin_lock(&pool->lock);
316 			pool->hint = pool->start;
317 			pass++;
318 			goto again;
319 
320 		} else {
321 			/* Give up */
322 			spin_unlock_irqrestore(&(pool->lock), flags);
323 			return DMA_MAPPING_ERROR;
324 		}
325 	}
326 
327 	end = n + npages;
328 
329 	/* Bump the hint to a new block for small allocs. */
330 	if (largealloc) {
331 		/* Don't bump to new block to avoid fragmentation */
332 		pool->hint = end;
333 	} else {
334 		/* Overflow will be taken care of at the next allocation */
335 		pool->hint = (end + tbl->it_blocksize - 1) &
336 		                ~(tbl->it_blocksize - 1);
337 	}
338 
339 	/* Update handle for SG allocations */
340 	if (handle)
341 		*handle = end;
342 
343 	spin_unlock_irqrestore(&(pool->lock), flags);
344 
345 	return n;
346 }
347 
348 static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
349 			      void *page, unsigned int npages,
350 			      enum dma_data_direction direction,
351 			      unsigned long mask, unsigned int align_order,
352 			      unsigned long attrs)
353 {
354 	unsigned long entry;
355 	dma_addr_t ret = DMA_MAPPING_ERROR;
356 	int build_fail;
357 
358 	entry = iommu_range_alloc(dev, tbl, npages, NULL, mask, align_order);
359 
360 	if (unlikely(entry == DMA_MAPPING_ERROR))
361 		return DMA_MAPPING_ERROR;
362 
363 	entry += tbl->it_offset;	/* Offset into real TCE table */
364 	ret = entry << tbl->it_page_shift;	/* Set the return dma address */
365 
366 	/* Put the TCEs in the HW table */
367 	build_fail = tbl->it_ops->set(tbl, entry, npages,
368 				      (unsigned long)page &
369 				      IOMMU_PAGE_MASK(tbl), direction, attrs);
370 
371 	/* tbl->it_ops->set() only returns non-zero for transient errors.
372 	 * Clean up the table bitmap in this case and return
373 	 * DMA_MAPPING_ERROR. For all other errors the functionality is
374 	 * not altered.
375 	 */
376 	if (unlikely(build_fail)) {
377 		__iommu_free(tbl, ret, npages);
378 		return DMA_MAPPING_ERROR;
379 	}
380 
381 	/* Flush/invalidate TLB caches if necessary */
382 	if (tbl->it_ops->flush)
383 		tbl->it_ops->flush(tbl);
384 
385 	/* Make sure updates are seen by hardware */
386 	mb();
387 
388 	return ret;
389 }
390 
391 static bool iommu_free_check(struct iommu_table *tbl, dma_addr_t dma_addr,
392 			     unsigned int npages)
393 {
394 	unsigned long entry, free_entry;
395 
396 	entry = dma_addr >> tbl->it_page_shift;
397 	free_entry = entry - tbl->it_offset;
398 
399 	if (((free_entry + npages) > tbl->it_size) ||
400 	    (entry < tbl->it_offset)) {
401 		if (printk_ratelimit()) {
402 			printk(KERN_INFO "iommu_free: invalid entry\n");
403 			printk(KERN_INFO "\tentry     = 0x%lx\n", entry);
404 			printk(KERN_INFO "\tdma_addr  = 0x%llx\n", (u64)dma_addr);
405 			printk(KERN_INFO "\tTable     = 0x%llx\n", (u64)tbl);
406 			printk(KERN_INFO "\tbus#      = 0x%llx\n", (u64)tbl->it_busno);
407 			printk(KERN_INFO "\tsize      = 0x%llx\n", (u64)tbl->it_size);
408 			printk(KERN_INFO "\tstartOff  = 0x%llx\n", (u64)tbl->it_offset);
409 			printk(KERN_INFO "\tindex     = 0x%llx\n", (u64)tbl->it_index);
410 			WARN_ON(1);
411 		}
412 
413 		return false;
414 	}
415 
416 	return true;
417 }
418 
419 static struct iommu_pool *get_pool(struct iommu_table *tbl,
420 				   unsigned long entry)
421 {
422 	struct iommu_pool *p;
423 	unsigned long largepool_start = tbl->large_pool.start;
424 
425 	/* The large pool is the last pool at the top of the table */
426 	if (entry >= largepool_start) {
427 		p = &tbl->large_pool;
428 	} else {
429 		unsigned int pool_nr = entry / tbl->poolsize;
430 
431 		BUG_ON(pool_nr > tbl->nr_pools);
432 		p = &tbl->pools[pool_nr];
433 	}
434 
435 	return p;
436 }
437 
438 static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
439 			 unsigned int npages)
440 {
441 	unsigned long entry, free_entry;
442 	unsigned long flags;
443 	struct iommu_pool *pool;
444 
445 	entry = dma_addr >> tbl->it_page_shift;
446 	free_entry = entry - tbl->it_offset;
447 
448 	pool = get_pool(tbl, free_entry);
449 
450 	if (!iommu_free_check(tbl, dma_addr, npages))
451 		return;
452 
453 	tbl->it_ops->clear(tbl, entry, npages);
454 
455 	spin_lock_irqsave(&(pool->lock), flags);
456 	bitmap_clear(tbl->it_map, free_entry, npages);
457 	spin_unlock_irqrestore(&(pool->lock), flags);
458 }
459 
460 static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
461 		unsigned int npages)
462 {
463 	__iommu_free(tbl, dma_addr, npages);
464 
465 	/* Make sure TLB cache is flushed if the HW needs it. We do
466 	 * not do an mb() here on purpose, it is not needed on any of
467 	 * the current platforms.
468 	 */
469 	if (tbl->it_ops->flush)
470 		tbl->it_ops->flush(tbl);
471 }
472 
473 int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl,
474 		     struct scatterlist *sglist, int nelems,
475 		     unsigned long mask, enum dma_data_direction direction,
476 		     unsigned long attrs)
477 {
478 	dma_addr_t dma_next = 0, dma_addr;
479 	struct scatterlist *s, *outs, *segstart;
480 	int outcount, incount, i, build_fail = 0;
481 	unsigned int align;
482 	unsigned long handle;
483 	unsigned int max_seg_size;
484 
485 	BUG_ON(direction == DMA_NONE);
486 
487 	if ((nelems == 0) || !tbl)
488 		return -EINVAL;
489 
490 	outs = s = segstart = &sglist[0];
491 	outcount = 1;
492 	incount = nelems;
493 	handle = 0;
494 
495 	/* Init first segment length for backout at failure */
496 	outs->dma_length = 0;
497 
498 	DBG("sg mapping %d elements:\n", nelems);
499 
500 	max_seg_size = dma_get_max_seg_size(dev);
501 	for_each_sg(sglist, s, nelems, i) {
502 		unsigned long vaddr, npages, entry, slen;
503 
504 		slen = s->length;
505 		/* Sanity check */
506 		if (slen == 0) {
507 			dma_next = 0;
508 			continue;
509 		}
510 		/* Allocate iommu entries for that segment */
511 		vaddr = (unsigned long) sg_virt(s);
512 		npages = iommu_num_pages(vaddr, slen, IOMMU_PAGE_SIZE(tbl));
513 		align = 0;
514 		if (tbl->it_page_shift < PAGE_SHIFT && slen >= PAGE_SIZE &&
515 		    (vaddr & ~PAGE_MASK) == 0)
516 			align = PAGE_SHIFT - tbl->it_page_shift;
517 		entry = iommu_range_alloc(dev, tbl, npages, &handle,
518 					  mask >> tbl->it_page_shift, align);
519 
520 		DBG("  - vaddr: %lx, size: %lx\n", vaddr, slen);
521 
522 		/* Handle failure */
523 		if (unlikely(entry == DMA_MAPPING_ERROR)) {
524 			if (!(attrs & DMA_ATTR_NO_WARN) &&
525 			    printk_ratelimit())
526 				dev_info(dev, "iommu_alloc failed, tbl %p "
527 					 "vaddr %lx npages %lu\n", tbl, vaddr,
528 					 npages);
529 			goto failure;
530 		}
531 
532 		/* Convert entry to a dma_addr_t */
533 		entry += tbl->it_offset;
534 		dma_addr = entry << tbl->it_page_shift;
535 		dma_addr |= (vaddr & ~IOMMU_PAGE_MASK(tbl));
536 
537 		DBG("  - %lu pages, entry: %lx, dma_addr: %lx\n",
538 			    npages, entry, dma_addr);
539 
540 		/* Insert into HW table */
541 		build_fail = tbl->it_ops->set(tbl, entry, npages,
542 					      vaddr & IOMMU_PAGE_MASK(tbl),
543 					      direction, attrs);
544 		if(unlikely(build_fail))
545 			goto failure;
546 
547 		/* If we are in an open segment, try merging */
548 		if (segstart != s) {
549 			DBG("  - trying merge...\n");
550 			/* We cannot merge if:
551 			 * - allocated dma_addr isn't contiguous to previous allocation
552 			 */
553 			if (novmerge || (dma_addr != dma_next) ||
554 			    (outs->dma_length + s->length > max_seg_size)) {
555 				/* Can't merge: create a new segment */
556 				segstart = s;
557 				outcount++;
558 				outs = sg_next(outs);
559 				DBG("    can't merge, new segment.\n");
560 			} else {
561 				outs->dma_length += s->length;
562 				DBG("    merged, new len: %ux\n", outs->dma_length);
563 			}
564 		}
565 
566 		if (segstart == s) {
567 			/* This is a new segment, fill entries */
568 			DBG("  - filling new segment.\n");
569 			outs->dma_address = dma_addr;
570 			outs->dma_length = slen;
571 		}
572 
573 		/* Calculate next page pointer for contiguous check */
574 		dma_next = dma_addr + slen;
575 
576 		DBG("  - dma next is: %lx\n", dma_next);
577 	}
578 
579 	/* Flush/invalidate TLB caches if necessary */
580 	if (tbl->it_ops->flush)
581 		tbl->it_ops->flush(tbl);
582 
583 	DBG("mapped %d elements:\n", outcount);
584 
585 	/* For the sake of ppc_iommu_unmap_sg, we clear out the length in the
586 	 * next entry of the sglist if we didn't fill the list completely
587 	 */
588 	if (outcount < incount) {
589 		outs = sg_next(outs);
590 		outs->dma_length = 0;
591 	}
592 
593 	/* Make sure updates are seen by hardware */
594 	mb();
595 
596 	return outcount;
597 
598  failure:
599 	for_each_sg(sglist, s, nelems, i) {
600 		if (s->dma_length != 0) {
601 			unsigned long vaddr, npages;
602 
603 			vaddr = s->dma_address & IOMMU_PAGE_MASK(tbl);
604 			npages = iommu_num_pages(s->dma_address, s->dma_length,
605 						 IOMMU_PAGE_SIZE(tbl));
606 			__iommu_free(tbl, vaddr, npages);
607 			s->dma_length = 0;
608 		}
609 		if (s == outs)
610 			break;
611 	}
612 	return -EIO;
613 }
614 
615 
616 void ppc_iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
617 			int nelems, enum dma_data_direction direction,
618 			unsigned long attrs)
619 {
620 	struct scatterlist *sg;
621 
622 	BUG_ON(direction == DMA_NONE);
623 
624 	if (!tbl)
625 		return;
626 
627 	sg = sglist;
628 	while (nelems--) {
629 		unsigned int npages;
630 		dma_addr_t dma_handle = sg->dma_address;
631 
632 		if (sg->dma_length == 0)
633 			break;
634 		npages = iommu_num_pages(dma_handle, sg->dma_length,
635 					 IOMMU_PAGE_SIZE(tbl));
636 		__iommu_free(tbl, dma_handle, npages);
637 		sg = sg_next(sg);
638 	}
639 
640 	/* Flush/invalidate TLBs if necessary. As for iommu_free(), we
641 	 * do not do an mb() here, the affected platforms do not need it
642 	 * when freeing.
643 	 */
644 	if (tbl->it_ops->flush)
645 		tbl->it_ops->flush(tbl);
646 }
647 
648 void iommu_table_clear(struct iommu_table *tbl)
649 {
650 	/*
651 	 * In case of firmware assisted dump system goes through clean
652 	 * reboot process at the time of system crash. Hence it's safe to
653 	 * clear the TCE entries if firmware assisted dump is active.
654 	 */
655 	if (!is_kdump_kernel() || is_fadump_active()) {
656 		/* Clear the table in case firmware left allocations in it */
657 		tbl->it_ops->clear(tbl, tbl->it_offset, tbl->it_size);
658 		return;
659 	}
660 
661 #ifdef CONFIG_CRASH_DUMP
662 	if (tbl->it_ops->get) {
663 		unsigned long index, tceval, tcecount = 0;
664 
665 		/* Reserve the existing mappings left by the first kernel. */
666 		for (index = 0; index < tbl->it_size; index++) {
667 			tceval = tbl->it_ops->get(tbl, index + tbl->it_offset);
668 			/*
669 			 * Freed TCE entry contains 0x7fffffffffffffff on JS20
670 			 */
671 			if (tceval && (tceval != 0x7fffffffffffffffUL)) {
672 				__set_bit(index, tbl->it_map);
673 				tcecount++;
674 			}
675 		}
676 
677 		if ((tbl->it_size - tcecount) < KDUMP_MIN_TCE_ENTRIES) {
678 			printk(KERN_WARNING "TCE table is full; freeing ");
679 			printk(KERN_WARNING "%d entries for the kdump boot\n",
680 				KDUMP_MIN_TCE_ENTRIES);
681 			for (index = tbl->it_size - KDUMP_MIN_TCE_ENTRIES;
682 				index < tbl->it_size; index++)
683 				__clear_bit(index, tbl->it_map);
684 		}
685 	}
686 #endif
687 }
688 
689 void iommu_table_reserve_pages(struct iommu_table *tbl,
690 		unsigned long res_start, unsigned long res_end)
691 {
692 	unsigned long i;
693 
694 	WARN_ON_ONCE(res_end < res_start);
695 	/*
696 	 * Reserve page 0 so it will not be used for any mappings.
697 	 * This avoids buggy drivers that consider page 0 to be invalid
698 	 * to crash the machine or even lose data.
699 	 */
700 	if (tbl->it_offset == 0)
701 		set_bit(0, tbl->it_map);
702 
703 	if (res_start < tbl->it_offset)
704 		res_start = tbl->it_offset;
705 
706 	if (res_end > (tbl->it_offset + tbl->it_size))
707 		res_end = tbl->it_offset + tbl->it_size;
708 
709 	/* Check if res_start..res_end is a valid range in the table */
710 	if (res_start >= res_end) {
711 		tbl->it_reserved_start = tbl->it_offset;
712 		tbl->it_reserved_end = tbl->it_offset;
713 		return;
714 	}
715 
716 	tbl->it_reserved_start = res_start;
717 	tbl->it_reserved_end = res_end;
718 
719 	for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)
720 		set_bit(i - tbl->it_offset, tbl->it_map);
721 }
722 
723 /*
724  * Build a iommu_table structure.  This contains a bit map which
725  * is used to manage allocation of the tce space.
726  */
727 struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid,
728 		unsigned long res_start, unsigned long res_end)
729 {
730 	unsigned long sz;
731 	static int welcomed = 0;
732 	unsigned int i;
733 	struct iommu_pool *p;
734 
735 	BUG_ON(!tbl->it_ops);
736 
737 	/* number of bytes needed for the bitmap */
738 	sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
739 
740 	tbl->it_map = vzalloc_node(sz, nid);
741 	if (!tbl->it_map) {
742 		pr_err("%s: Can't allocate %ld bytes\n", __func__, sz);
743 		return NULL;
744 	}
745 
746 	iommu_table_reserve_pages(tbl, res_start, res_end);
747 
748 	/* We only split the IOMMU table if we have 1GB or more of space */
749 	if ((tbl->it_size << tbl->it_page_shift) >= (1UL * 1024 * 1024 * 1024))
750 		tbl->nr_pools = IOMMU_NR_POOLS;
751 	else
752 		tbl->nr_pools = 1;
753 
754 	/* We reserve the top 1/4 of the table for large allocations */
755 	tbl->poolsize = (tbl->it_size * 3 / 4) / tbl->nr_pools;
756 
757 	for (i = 0; i < tbl->nr_pools; i++) {
758 		p = &tbl->pools[i];
759 		spin_lock_init(&(p->lock));
760 		p->start = tbl->poolsize * i;
761 		p->hint = p->start;
762 		p->end = p->start + tbl->poolsize;
763 	}
764 
765 	p = &tbl->large_pool;
766 	spin_lock_init(&(p->lock));
767 	p->start = tbl->poolsize * i;
768 	p->hint = p->start;
769 	p->end = tbl->it_size;
770 
771 	iommu_table_clear(tbl);
772 
773 	if (!welcomed) {
774 		pr_info("IOMMU table initialized, virtual merging %s\n",
775 			str_disabled_enabled(novmerge));
776 		welcomed = 1;
777 	}
778 
779 	iommu_debugfs_add(tbl);
780 
781 	return tbl;
782 }
783 
784 bool iommu_table_in_use(struct iommu_table *tbl)
785 {
786 	unsigned long start = 0, end;
787 
788 	/* ignore reserved bit0 */
789 	if (tbl->it_offset == 0)
790 		start = 1;
791 
792 	/* Simple case with no reserved MMIO32 region */
793 	if (!tbl->it_reserved_start && !tbl->it_reserved_end)
794 		return find_next_bit(tbl->it_map, tbl->it_size, start) != tbl->it_size;
795 
796 	end = tbl->it_reserved_start - tbl->it_offset;
797 	if (find_next_bit(tbl->it_map, end, start) != end)
798 		return true;
799 
800 	start = tbl->it_reserved_end - tbl->it_offset;
801 	end = tbl->it_size;
802 	return find_next_bit(tbl->it_map, end, start) != end;
803 }
804 
805 static void iommu_table_free(struct kref *kref)
806 {
807 	struct iommu_table *tbl;
808 
809 	tbl = container_of(kref, struct iommu_table, it_kref);
810 
811 	if (tbl->it_ops->free)
812 		tbl->it_ops->free(tbl);
813 
814 	if (!tbl->it_map) {
815 		kfree(tbl);
816 		return;
817 	}
818 
819 	iommu_debugfs_del(tbl);
820 
821 	/* verify that table contains no entries */
822 	if (iommu_table_in_use(tbl))
823 		pr_warn("%s: Unexpected TCEs\n", __func__);
824 
825 	/* free bitmap */
826 	vfree(tbl->it_map);
827 
828 	/* free table */
829 	kfree(tbl);
830 }
831 
832 struct iommu_table *iommu_tce_table_get(struct iommu_table *tbl)
833 {
834 	if (kref_get_unless_zero(&tbl->it_kref))
835 		return tbl;
836 
837 	return NULL;
838 }
839 EXPORT_SYMBOL_GPL(iommu_tce_table_get);
840 
841 int iommu_tce_table_put(struct iommu_table *tbl)
842 {
843 	if (WARN_ON(!tbl))
844 		return 0;
845 
846 	return kref_put(&tbl->it_kref, iommu_table_free);
847 }
848 EXPORT_SYMBOL_GPL(iommu_tce_table_put);
849 
850 /* Creates TCEs for a user provided buffer.  The user buffer must be
851  * contiguous real kernel storage (not vmalloc).  The address passed here
852  * is physical address into that page. The dma_addr_t returned will point
853  * to the same byte within the page as was passed in.
854  */
855 dma_addr_t iommu_map_phys(struct device *dev, struct iommu_table *tbl,
856 			  phys_addr_t phys, size_t size, unsigned long mask,
857 			  enum dma_data_direction direction,
858 			  unsigned long attrs)
859 {
860 	dma_addr_t dma_handle = DMA_MAPPING_ERROR;
861 	void *vaddr;
862 	unsigned long uaddr;
863 	unsigned int npages, align;
864 
865 	BUG_ON(direction == DMA_NONE);
866 
867 	vaddr = phys_to_virt(phys);
868 	uaddr = (unsigned long)vaddr;
869 
870 	if (tbl) {
871 		npages = iommu_num_pages(uaddr, size, IOMMU_PAGE_SIZE(tbl));
872 		align = 0;
873 		if (tbl->it_page_shift < PAGE_SHIFT && size >= PAGE_SIZE &&
874 		    ((unsigned long)vaddr & ~PAGE_MASK) == 0)
875 			align = PAGE_SHIFT - tbl->it_page_shift;
876 
877 		dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction,
878 					 mask >> tbl->it_page_shift, align,
879 					 attrs);
880 		if (dma_handle == DMA_MAPPING_ERROR) {
881 			if (!(attrs & DMA_ATTR_NO_WARN) &&
882 			    printk_ratelimit())  {
883 				dev_info(dev, "iommu_alloc failed, tbl %p "
884 					 "vaddr %p npages %d\n", tbl, vaddr,
885 					 npages);
886 			}
887 		} else
888 			dma_handle |= (uaddr & ~IOMMU_PAGE_MASK(tbl));
889 	}
890 
891 	return dma_handle;
892 }
893 
894 void iommu_unmap_phys(struct iommu_table *tbl, dma_addr_t dma_handle,
895 		      size_t size, enum dma_data_direction direction,
896 		      unsigned long attrs)
897 {
898 	unsigned int npages;
899 
900 	BUG_ON(direction == DMA_NONE);
901 
902 	if (tbl) {
903 		npages = iommu_num_pages(dma_handle, size,
904 					 IOMMU_PAGE_SIZE(tbl));
905 		iommu_free(tbl, dma_handle, npages);
906 	}
907 }
908 
909 /* Allocates a contiguous real buffer and creates mappings over it.
910  * Returns the virtual address of the buffer and sets dma_handle
911  * to the dma address (mapping) of the first page.
912  */
913 void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl,
914 			   size_t size,	dma_addr_t *dma_handle,
915 			   unsigned long mask, gfp_t flag, int node)
916 {
917 	void *ret = NULL;
918 	dma_addr_t mapping;
919 	unsigned int order;
920 	unsigned int nio_pages, io_order;
921 	struct page *page;
922 	int tcesize = (1 << tbl->it_page_shift);
923 
924 	size = PAGE_ALIGN(size);
925 	order = get_order(size);
926 
927  	/*
928 	 * Client asked for way too much space.  This is checked later
929 	 * anyway.  It is easier to debug here for the drivers than in
930 	 * the tce tables.
931 	 */
932 	if (order >= IOMAP_MAX_ORDER) {
933 		dev_info(dev, "iommu_alloc_consistent size too large: 0x%lx\n",
934 			 size);
935 		return NULL;
936 	}
937 
938 	if (!tbl)
939 		return NULL;
940 
941 	/* Alloc enough pages (and possibly more) */
942 	page = alloc_pages_node(node, flag, order);
943 	if (!page)
944 		return NULL;
945 	ret = page_address(page);
946 	memset(ret, 0, size);
947 
948 	/* Set up tces to cover the allocated range */
949 	nio_pages = IOMMU_PAGE_ALIGN(size, tbl) >> tbl->it_page_shift;
950 
951 	io_order = get_iommu_order(size, tbl);
952 	mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL,
953 			      mask >> tbl->it_page_shift, io_order, 0);
954 	if (mapping == DMA_MAPPING_ERROR) {
955 		free_pages((unsigned long)ret, order);
956 		return NULL;
957 	}
958 
959 	*dma_handle = mapping | ((u64)ret & (tcesize - 1));
960 	return ret;
961 }
962 
963 void iommu_free_coherent(struct iommu_table *tbl, size_t size,
964 			 void *vaddr, dma_addr_t dma_handle)
965 {
966 	if (tbl) {
967 		unsigned int nio_pages;
968 
969 		size = PAGE_ALIGN(size);
970 		nio_pages = IOMMU_PAGE_ALIGN(size, tbl) >> tbl->it_page_shift;
971 		iommu_free(tbl, dma_handle, nio_pages);
972 		size = PAGE_ALIGN(size);
973 		free_pages((unsigned long)vaddr, get_order(size));
974 	}
975 }
976 
977 unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir)
978 {
979 	switch (dir) {
980 	case DMA_BIDIRECTIONAL:
981 		return TCE_PCI_READ | TCE_PCI_WRITE;
982 	case DMA_FROM_DEVICE:
983 		return TCE_PCI_WRITE;
984 	case DMA_TO_DEVICE:
985 		return TCE_PCI_READ;
986 	default:
987 		return 0;
988 	}
989 }
990 EXPORT_SYMBOL_GPL(iommu_direction_to_tce_perm);
991 
992 #ifdef CONFIG_IOMMU_API
993 
/*
 * Iterator callback.  When @dev is attached to an IOMMU, store its pci_dev
 * through @data (a struct pci_dev **) and return 1; otherwise return 0.
 * A NULL @dev also returns 0.
 */
int dev_has_iommu_table(struct device *dev, void *data)
{
	struct pci_dev **ppdev = data;

	if (!dev)
		return 0;

	if (!device_iommu_mapped(dev))
		return 0;

	/* Convert only now that dev is known to be non-NULL. */
	*ppdev = to_pci_dev(dev);

	return 1;
}
1009 
1010 /*
1011  * SPAPR TCE API
1012  */
1013 static void group_release(void *iommu_data)
1014 {
1015 	struct iommu_table_group *table_group = iommu_data;
1016 
1017 	table_group->group = NULL;
1018 }
1019 
1020 void iommu_register_group(struct iommu_table_group *table_group,
1021 		int pci_domain_number, unsigned long pe_num)
1022 {
1023 	struct iommu_group *grp;
1024 	char *name;
1025 
1026 	grp = iommu_group_alloc();
1027 	if (IS_ERR(grp)) {
1028 		pr_warn("powerpc iommu api: cannot create new group, err=%ld\n",
1029 				PTR_ERR(grp));
1030 		return;
1031 	}
1032 	table_group->group = grp;
1033 	iommu_group_set_iommudata(grp, table_group, group_release);
1034 	name = kasprintf(GFP_KERNEL, "domain%d-pe%lx",
1035 			pci_domain_number, pe_num);
1036 	if (!name)
1037 		return;
1038 	iommu_group_set_name(grp, name);
1039 	kfree(name);
1040 }
1041 
1042 enum dma_data_direction iommu_tce_direction(unsigned long tce)
1043 {
1044 	if ((tce & TCE_PCI_READ) && (tce & TCE_PCI_WRITE))
1045 		return DMA_BIDIRECTIONAL;
1046 	else if (tce & TCE_PCI_READ)
1047 		return DMA_TO_DEVICE;
1048 	else if (tce & TCE_PCI_WRITE)
1049 		return DMA_FROM_DEVICE;
1050 	else
1051 		return DMA_NONE;
1052 }
1053 EXPORT_SYMBOL_GPL(iommu_tce_direction);
1054 
1055 void iommu_flush_tce(struct iommu_table *tbl)
1056 {
1057 	/* Flush/invalidate TLB caches if necessary */
1058 	if (tbl->it_ops->flush)
1059 		tbl->it_ops->flush(tbl);
1060 
1061 	/* Make sure updates are seen by hardware */
1062 	mb();
1063 }
1064 EXPORT_SYMBOL_GPL(iommu_flush_tce);
1065 
/*
 * Validate an I/O bus address range against a table window.
 *
 * @page_shift: log2 of the IOMMU page size
 * @offset:     first entry number of the window
 * @size:       number of entries in the window
 * @ioba:       I/O bus address of the first page, must be page aligned
 * @npages:     number of pages in the range; 0 is treated like 1, so only
 *              the first page is checked
 *
 * Returns 0 when @ioba is aligned and all @npages entries starting at it lie
 * inside [@offset, @offset + @size), -EINVAL otherwise.
 */
int iommu_tce_check_ioba(unsigned long page_shift,
		unsigned long offset, unsigned long size,
		unsigned long ioba, unsigned long npages)
{
	unsigned long mask = (1UL << page_shift) - 1;
	unsigned long count = npages ? npages : 1;
	unsigned long first;

	if (ioba & mask)
		return -EINVAL;

	first = ioba >> page_shift;
	if (first < offset)
		return -EINVAL;

	/*
	 * Equivalent to "first + count > offset + size", written so that
	 * neither side can wrap around.
	 */
	if (count > size || first - offset > size - count)
		return -EINVAL;

	return 0;
}
1084 EXPORT_SYMBOL_GPL(iommu_tce_check_ioba);
1085 
/*
 * Check that @gpa is aligned to an IOMMU page of 2^@page_shift bytes.
 *
 * Returns 0 if aligned, -EINVAL otherwise.
 */
int iommu_tce_check_gpa(unsigned long page_shift, unsigned long gpa)
{
	const unsigned long in_page_bits = (1UL << page_shift) - 1;

	return (gpa & in_page_bits) ? -EINVAL : 0;
}
1095 EXPORT_SYMBOL_GPL(iommu_tce_check_gpa);
1096 
1097 long iommu_tce_xchg_no_kill(struct mm_struct *mm,
1098 			    struct iommu_table *tbl,
1099 			    unsigned long entry, unsigned long *hpa,
1100 			    enum dma_data_direction *direction)
1101 {
1102 	long ret;
1103 	unsigned long size = 0;
1104 
1105 	ret = tbl->it_ops->xchg_no_kill(tbl, entry, hpa, direction);
1106 	if (!ret && ((*direction == DMA_FROM_DEVICE) ||
1107 			(*direction == DMA_BIDIRECTIONAL)) &&
1108 			!mm_iommu_is_devmem(mm, *hpa, tbl->it_page_shift,
1109 					&size))
1110 		SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT));
1111 
1112 	return ret;
1113 }
1114 EXPORT_SYMBOL_GPL(iommu_tce_xchg_no_kill);
1115 
1116 void iommu_tce_kill(struct iommu_table *tbl,
1117 		unsigned long entry, unsigned long pages)
1118 {
1119 	if (tbl->it_ops->tce_kill)
1120 		tbl->it_ops->tce_kill(tbl, entry, pages);
1121 }
1122 EXPORT_SYMBOL_GPL(iommu_tce_kill);
1123 
1124 int iommu_add_device(struct iommu_table_group *table_group, struct device *dev)
1125 {
1126 	/*
1127 	 * The sysfs entries should be populated before
1128 	 * binding IOMMU group. If sysfs entries isn't
1129 	 * ready, we simply bail.
1130 	 */
1131 	if (!device_is_registered(dev))
1132 		return -ENOENT;
1133 
1134 	if (device_iommu_mapped(dev)) {
1135 		pr_debug("%s: Skipping device %s with iommu group %d\n",
1136 			 __func__, dev_name(dev),
1137 			 iommu_group_id(dev->iommu_group));
1138 		return -EBUSY;
1139 	}
1140 
1141 	pr_debug("%s: Adding %s to iommu group %d\n",
1142 		 __func__, dev_name(dev),  iommu_group_id(table_group->group));
1143 	/*
1144 	 * This is still not adding devices via the IOMMU bus notifier because
1145 	 * of pcibios_init() from arch/powerpc/kernel/pci_64.c which calls
1146 	 * pcibios_scan_phb() first (and this guy adds devices and triggers
1147 	 * the notifier) and only then it calls pci_bus_add_devices() which
1148 	 * configures DMA for buses which also creates PEs and IOMMU groups.
1149 	 */
1150 	return iommu_probe_device(dev);
1151 }
1152 EXPORT_SYMBOL_GPL(iommu_add_device);
1153 
1154 #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
1155 /*
1156  * A simple iommu_ops to allow less cruft in generic VFIO code.
1157  */
1158 static int
1159 spapr_tce_platform_iommu_attach_dev(struct iommu_domain *platform_domain,
1160 				    struct device *dev,
1161 				    struct iommu_domain *old)
1162 {
1163 	struct iommu_domain *domain = iommu_driver_get_domain_for_dev(dev);
1164 	struct iommu_table_group *table_group;
1165 	struct iommu_group *grp;
1166 
1167 	/* At first attach the ownership is already set */
1168 	if (!domain)
1169 		return 0;
1170 
1171 	grp = iommu_group_get(dev);
1172 	table_group = iommu_group_get_iommudata(grp);
1173 	/*
1174 	 * The domain being set to PLATFORM from earlier
1175 	 * BLOCKED. The table_group ownership has to be released.
1176 	 */
1177 	table_group->ops->release_ownership(table_group, dev);
1178 	iommu_group_put(grp);
1179 
1180 	return 0;
1181 }
1182 
1183 static const struct iommu_domain_ops spapr_tce_platform_domain_ops = {
1184 	.attach_dev = spapr_tce_platform_iommu_attach_dev,
1185 };
1186 
1187 static struct iommu_domain spapr_tce_platform_domain = {
1188 	.type = IOMMU_DOMAIN_PLATFORM,
1189 	.ops = &spapr_tce_platform_domain_ops,
1190 };
1191 
1192 static int
1193 spapr_tce_blocked_iommu_attach_dev(struct iommu_domain *platform_domain,
1194 				   struct device *dev, struct iommu_domain *old)
1195 {
1196 	struct iommu_group *grp = iommu_group_get(dev);
1197 	struct iommu_table_group *table_group;
1198 	int ret = -EINVAL;
1199 
1200 	/*
1201 	 * FIXME: SPAPR mixes blocked and platform behaviors, the blocked domain
1202 	 * also sets the dma_api ops
1203 	 */
1204 	table_group = iommu_group_get_iommudata(grp);
1205 	ret = table_group->ops->take_ownership(table_group, dev);
1206 	iommu_group_put(grp);
1207 
1208 	return ret;
1209 }
1210 
1211 static const struct iommu_domain_ops spapr_tce_blocked_domain_ops = {
1212 	.attach_dev = spapr_tce_blocked_iommu_attach_dev,
1213 };
1214 
1215 static struct iommu_domain spapr_tce_blocked_domain = {
1216 	.type = IOMMU_DOMAIN_BLOCKED,
1217 	.ops = &spapr_tce_blocked_domain_ops,
1218 };
1219 
1220 static bool spapr_tce_iommu_capable(struct device *dev, enum iommu_cap cap)
1221 {
1222 	switch (cap) {
1223 	case IOMMU_CAP_CACHE_COHERENCY:
1224 		return true;
1225 	default:
1226 		break;
1227 	}
1228 
1229 	return false;
1230 }
1231 
1232 static struct iommu_device *spapr_tce_iommu_probe_device(struct device *dev)
1233 {
1234 	struct pci_dev *pdev;
1235 	struct pci_controller *hose;
1236 
1237 	if (!dev_is_pci(dev))
1238 		return ERR_PTR(-ENODEV);
1239 
1240 	pdev = to_pci_dev(dev);
1241 	hose = pdev->bus->sysdata;
1242 
1243 	return &hose->iommu;
1244 }
1245 
/* release_device callback: intentionally empty, there is nothing to undo. */
static void spapr_tce_iommu_release_device(struct device *dev)
{
	/* Nothing to do. */
}
1249 
1250 static struct iommu_group *spapr_tce_iommu_device_group(struct device *dev)
1251 {
1252 	struct pci_controller *hose;
1253 	struct pci_dev *pdev;
1254 
1255 	pdev = to_pci_dev(dev);
1256 	hose = pdev->bus->sysdata;
1257 
1258 	if (!hose->controller_ops.device_group)
1259 		return ERR_PTR(-ENOENT);
1260 
1261 	return hose->controller_ops.device_group(hose, pdev);
1262 }
1263 
1264 static const struct iommu_ops spapr_tce_iommu_ops = {
1265 	.default_domain = &spapr_tce_platform_domain,
1266 	.blocked_domain = &spapr_tce_blocked_domain,
1267 	.capable = spapr_tce_iommu_capable,
1268 	.probe_device = spapr_tce_iommu_probe_device,
1269 	.release_device = spapr_tce_iommu_release_device,
1270 	.device_group = spapr_tce_iommu_device_group,
1271 };
1272 
1273 static struct attribute *spapr_tce_iommu_attrs[] = {
1274 	NULL,
1275 };
1276 
1277 static struct attribute_group spapr_tce_iommu_group = {
1278 	.name = "spapr-tce-iommu",
1279 	.attrs = spapr_tce_iommu_attrs,
1280 };
1281 
1282 static const struct attribute_group *spapr_tce_iommu_groups[] = {
1283 	&spapr_tce_iommu_group,
1284 	NULL,
1285 };
1286 
1287 void ppc_iommu_register_device(struct pci_controller *phb)
1288 {
1289 	iommu_device_sysfs_add(&phb->iommu, phb->parent,
1290 				spapr_tce_iommu_groups, "iommu-phb%04x",
1291 				phb->global_number);
1292 	iommu_device_register(&phb->iommu, &spapr_tce_iommu_ops,
1293 				phb->parent);
1294 }
1295 
1296 void ppc_iommu_unregister_device(struct pci_controller *phb)
1297 {
1298 	iommu_device_unregister(&phb->iommu);
1299 	iommu_device_sysfs_remove(&phb->iommu);
1300 }
1301 
1302 /*
1303  * This registers IOMMU devices of PHBs. This needs to happen
1304  * after core_initcall(iommu_init) + postcore_initcall(pci_driver_init) and
1305  * before subsys_initcall(iommu_subsys_init).
1306  */
1307 static int __init spapr_tce_setup_phb_iommus_initcall(void)
1308 {
1309 	struct pci_controller *hose;
1310 
1311 	list_for_each_entry(hose, &hose_list, list_node) {
1312 		ppc_iommu_register_device(hose);
1313 	}
1314 	return 0;
1315 }
1316 postcore_initcall_sync(spapr_tce_setup_phb_iommus_initcall);
1317 #endif
1318 
1319 #endif /* CONFIG_IOMMU_API */
1320