xref: /freebsd/sys/dev/iommu/busdma_iommu.c (revision a134ebd6e63f658f2d3d04ac0c60d23bcaa86dd7)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2013 The FreeBSD Foundation
5  * All rights reserved.
6  *
7  * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
8  * under sponsorship from the FreeBSD Foundation.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
34 
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/domainset.h>
38 #include <sys/malloc.h>
39 #include <sys/bus.h>
40 #include <sys/conf.h>
41 #include <sys/interrupt.h>
42 #include <sys/kernel.h>
43 #include <sys/ktr.h>
44 #include <sys/lock.h>
45 #include <sys/proc.h>
46 #include <sys/memdesc.h>
47 #include <sys/mutex.h>
48 #include <sys/sysctl.h>
49 #include <sys/rman.h>
50 #include <sys/taskqueue.h>
51 #include <sys/tree.h>
52 #include <sys/uio.h>
53 #include <sys/vmem.h>
54 #include <dev/pci/pcireg.h>
55 #include <dev/pci/pcivar.h>
56 #include <vm/vm.h>
57 #include <vm/vm_extern.h>
58 #include <vm/vm_kern.h>
59 #include <vm/vm_object.h>
60 #include <vm/vm_page.h>
61 #include <vm/vm_map.h>
62 #include <machine/atomic.h>
63 #include <machine/bus.h>
64 #include <machine/md_var.h>
65 #if defined(__amd64__) || defined(__i386__)
66 #include <machine/specialreg.h>
67 #include <x86/include/busdma_impl.h>
68 #include <x86/iommu/intel_reg.h>
69 #include <dev/iommu/busdma_iommu.h>
70 #include <dev/iommu/iommu.h>
71 #include <x86/iommu/intel_dmar.h>
72 #endif
73 
74 /*
75  * busdma_iommu.c, the implementation of the busdma(9) interface using
76  * IOMMU units from Intel VT-d.
77  */
78 
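/*
 * Check whether the administrator asked for bounce (non-IOMMU) busdma
 * for the given device.  The "hw.busdma.default" kernel environment
 * variable selects the global default ("bounce" or "iommu"/"dmar"),
 * and "hw.busdma.pci<domain>.<bus>.<slot>.<func>" overrides it for a
 * single device; for example, for a hypothetical device at pci0:0:27:0
 * a loader.conf(5) line of hw.busdma.pci0.0.27.0="bounce" forces
 * bounce buffering.
 */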
79 static bool
80 iommu_bus_dma_is_dev_disabled(int domain, int bus, int slot, int func)
81 {
82 	char str[128], *env;
83 	int default_bounce;
84 	bool ret;
85 	static const char bounce_str[] = "bounce";
86 	static const char iommu_str[] = "iommu";
87 	static const char dmar_str[] = "dmar"; /* compatibility */
88 
89 	default_bounce = 0;
90 	env = kern_getenv("hw.busdma.default");
91 	if (env != NULL) {
92 		if (strcmp(env, bounce_str) == 0)
93 			default_bounce = 1;
94 		else if (strcmp(env, iommu_str) == 0 ||
95 		    strcmp(env, dmar_str) == 0)
96 			default_bounce = 0;
97 		freeenv(env);
98 	}
99 
100 	snprintf(str, sizeof(str), "hw.busdma.pci%d.%d.%d.%d",
101 	    domain, bus, slot, func);
102 	env = kern_getenv(str);
103 	if (env == NULL)
104 		return (default_bounce != 0);
105 	if (strcmp(env, bounce_str) == 0)
106 		ret = true;
107 	else if (strcmp(env, iommu_str) == 0 ||
108 	    strcmp(env, dmar_str) == 0)
109 		ret = false;
110 	else
111 		ret = default_bounce != 0;
112 	freeenv(env);
113 	return (ret);
114 }
115 
116 /*
117  * Given the original device, find the requester ID that will be seen by
118  * the IOMMU unit and used for page table lookup.  PCI bridges may take
119  * ownership of transactions from downstream devices, so it may not be
120  * the same as the BSF of the target device.  In those cases, all
121  * devices downstream of the bridge must share a single mapping
122  * domain, and must collectively be assigned to use either IOMMU or
123  * bounce mapping.
124  */
125 device_t
126 iommu_get_requester(device_t dev, uint16_t *rid)
127 {
128 	devclass_t pci_class;
129 	device_t l, pci, pcib, pcip, pcibp, requester;
130 	int cap_offset;
131 	uint16_t pcie_flags;
132 	bool bridge_is_pcie;
133 
134 	pci_class = devclass_find("pci");
135 	l = requester = dev;
136 
137 	*rid = pci_get_rid(dev);
138 
139 	/*
140 	 * Walk the bridge hierarchy from the target device to the
141 	 * host port to find the translating bridge nearest the IOMMU
142 	 * unit.
143 	 */
144 	for (;;) {
145 		pci = device_get_parent(l);
146 		KASSERT(pci != NULL, ("iommu_get_requester(%s): NULL parent "
147 		    "for %s", device_get_name(dev), device_get_name(l)));
148 		KASSERT(device_get_devclass(pci) == pci_class,
149 		    ("iommu_get_requester(%s): non-pci parent %s for %s",
150 		    device_get_name(dev), device_get_name(pci),
151 		    device_get_name(l)));
152 
153 		pcib = device_get_parent(pci);
154 		KASSERT(pcib != NULL, ("iommu_get_requester(%s): NULL bridge "
155 		    "for %s", device_get_name(dev), device_get_name(pci)));
156 
157 		/*
158 		 * The parent of our "bridge" isn't another PCI bus,
159 		 * so pcib isn't a PCI->PCI bridge but rather a host
160 		 * port, and the requester ID won't be translated
161 		 * further.
162 		 */
163 		pcip = device_get_parent(pcib);
164 		if (device_get_devclass(pcip) != pci_class)
165 			break;
166 		pcibp = device_get_parent(pcip);
167 
168 		if (pci_find_cap(l, PCIY_EXPRESS, &cap_offset) == 0) {
169 			/*
170 			 * Do not stop the loop even if the target
171 			 * device is PCIe, because it is possible (but
172 			 * unlikely) to have a PCI->PCIe bridge
173 			 * somewhere in the hierarchy.
174 			 */
175 			l = pcib;
176 		} else {
177 			/*
178 			 * The device is not PCIe, so it cannot be seen
179 			 * as a requester by the IOMMU unit.  Check
180 			 * whether the bridge is PCIe.
181 			 */
182 			bridge_is_pcie = pci_find_cap(pcib, PCIY_EXPRESS,
183 			    &cap_offset) == 0;
184 			requester = pcib;
185 
186 			/*
187 			 * Check for a buggy PCIe/PCI bridge that
188 			 * doesn't report the express capability.  If
189 			 * the bridge above it is express but isn't a
190 			 * PCI bridge, then we know pcib is actually a
191 			 * PCIe/PCI bridge.
192 			 */
193 			if (!bridge_is_pcie && pci_find_cap(pcibp,
194 			    PCIY_EXPRESS, &cap_offset) == 0) {
195 				pcie_flags = pci_read_config(pcibp,
196 				    cap_offset + PCIER_FLAGS, 2);
197 				if ((pcie_flags & PCIEM_FLAGS_TYPE) !=
198 				    PCIEM_TYPE_PCI_BRIDGE)
199 					bridge_is_pcie = true;
200 			}
201 
202 			if (bridge_is_pcie) {
203 				/*
204 				 * The current device is not PCIe, but
205 				 * the bridge above it is.  This is a
206 				 * PCIe->PCI bridge.  Assume that the
207 				 * requester ID will be the secondary
208 				 * bus number with slot and function
209 				 * set to zero.
210 				 *
211 				 * XXX: Doesn't handle the case where
212 				 * the bridge is PCIe->PCI-X, and the
213 				 * bridge will only take ownership of
214 				 * requests in some cases.  We should
215 				 * provide context entries with the
216 				 * same page tables for taken and
217 				 * non-taken transactions.
218 				 */
219 				*rid = PCI_RID(pci_get_bus(l), 0, 0);
220 				l = pcibp;
221 			} else {
222 				/*
223 				 * Neither the device nor the bridge
224 				 * above it are PCIe.  This is a
225 				 * conventional PCI->PCI bridge, which
226 				 * will use the bridge's BSF as the
227 				 * requester ID.
228 				 */
229 				*rid = pci_get_rid(pcib);
230 				l = pcib;
231 			}
232 		}
233 	}
234 	return (requester);
235 }
236 
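/*
 * Find or create the IOMMU context for the requester of dev on the
 * given unit.  If the administrator disabled translation for the
 * device (see iommu_bus_dma_is_dev_disabled()), the context is still
 * instantiated so that it receives the identity mapping, but NULL is
 * returned; the first reference on such a context is kept and any
 * later references are released.
 */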
237 struct iommu_ctx *
238 iommu_instantiate_ctx(struct iommu_unit *unit, device_t dev, bool rmrr)
239 {
240 	device_t requester;
241 	struct iommu_ctx *ctx;
242 	bool disabled;
243 	uint16_t rid;
244 
245 	requester = iommu_get_requester(dev, &rid);
246 
247 	/*
248 	 * If the user requested that the IOMMU be disabled for the device,
249 	 * we cannot disable the whole IOMMU unit, due to the possibility of
250 	 * other devices on the same unit still requiring translation.
251 	 * Instead, provide the identity mapping for the device
252 	 * context.
253 	 */
254 	disabled = iommu_bus_dma_is_dev_disabled(pci_get_domain(requester),
255 	    pci_get_bus(requester), pci_get_slot(requester),
256 	    pci_get_function(requester));
257 	ctx = iommu_get_ctx(unit, requester, rid, disabled, rmrr);
258 	if (ctx == NULL)
259 		return (NULL);
260 	if (disabled) {
261 		/*
262 		 * Keep the first reference on the context, release
263 		 * any later refs.
264 		 */
265 		IOMMU_LOCK(unit);
266 		if ((ctx->flags & IOMMU_CTX_DISABLED) == 0) {
267 			ctx->flags |= IOMMU_CTX_DISABLED;
268 			IOMMU_UNLOCK(unit);
269 		} else {
270 			iommu_free_ctx_locked(unit, ctx);
271 		}
272 		ctx = NULL;
273 	}
274 	return (ctx);
275 }
276 
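/*
 * Return the busdma tag of the IOMMU context covering the child
 * device, or NULL when the child is outside the scope of any IOMMU
 * unit or DMA translation is disabled for the unit, in which case the
 * regular (bounce) busdma implementation is used instead.
 */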
277 bus_dma_tag_t
278 acpi_iommu_get_dma_tag(device_t dev, device_t child)
279 {
280 	struct iommu_unit *unit;
281 	struct iommu_ctx *ctx;
282 	bus_dma_tag_t res;
283 
284 	unit = iommu_find(child, bootverbose);
285 	/* Not in scope of any IOMMU? */
286 	if (unit == NULL)
287 		return (NULL);
288 	if (!unit->dma_enabled)
289 		return (NULL);
290 
291 #if defined(__amd64__) || defined(__i386__)
292 	dmar_quirks_pre_use(unit);
293 	dmar_instantiate_rmrr_ctxs(unit);
294 #endif
295 
296 	ctx = iommu_instantiate_ctx(unit, child, false);
297 	res = ctx == NULL ? NULL : (bus_dma_tag_t)ctx->tag;
298 	return (res);
299 }
300 
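/*
 * A driver may call this for a device at slot 0, function 0 of its bus
 * to request that every requester ID on that bus be handled by the
 * translation context of that device, presumably because the hardware
 * issues DMA tagged with varying slot/function numbers.  The request
 * is refused, returning false, if the device is not at slot 0,
 * function 0 or is not covered by an IOMMU unit.
 */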
301 bool
302 bus_dma_iommu_set_buswide(device_t dev)
303 {
304 	struct iommu_unit *unit;
305 	device_t parent;
306 	u_int busno, slot, func;
307 
308 	parent = device_get_parent(dev);
309 	if (device_get_devclass(parent) != devclass_find("pci"))
310 		return (false);
311 	unit = iommu_find(dev, bootverbose);
312 	if (unit == NULL)
313 		return (false);
314 	busno = pci_get_bus(dev);
315 	slot = pci_get_slot(dev);
316 	func = pci_get_function(dev);
317 	if (slot != 0 || func != 0) {
318 		if (bootverbose) {
319 			device_printf(dev,
320 			    "iommu%d pci%d:%d:%d requested buswide busdma\n",
321 			    unit->unit, busno, slot, func);
322 		}
323 		return (false);
324 	}
325 	iommu_set_buswide_ctx(unit, busno);
326 	return (true);
327 }
328 
329 void
330 iommu_set_buswide_ctx(struct iommu_unit *unit, u_int busno)
331 {
332 
333 	MPASS(busno <= PCI_BUSMAX);
334 	IOMMU_LOCK(unit);
335 	unit->buswide_ctxs[busno / NBBY / sizeof(uint32_t)] |=
336 	    1 << (busno % (NBBY * sizeof(uint32_t)));
337 	IOMMU_UNLOCK(unit);
338 }
339 
340 bool
341 iommu_is_buswide_ctx(struct iommu_unit *unit, u_int busno)
342 {
343 
344 	MPASS(busno <= PCI_BUSMAX);
345 	return ((unit->buswide_ctxs[busno / NBBY / sizeof(uint32_t)] &
346 	    (1U << (busno % (NBBY * sizeof(uint32_t))))) != 0);
347 }
348 
349 static MALLOC_DEFINE(M_IOMMU_DMAMAP, "iommu_dmamap", "IOMMU DMA Map");
350 
351 static void iommu_bus_schedule_dmamap(struct iommu_unit *unit,
352     struct bus_dmamap_iommu *map);
353 
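/*
 * Tag creation for the IOMMU busdma implementation.  The generic
 * fields are filled in by common_bus_dma_tag_create(); the IOMMU
 * context and owner device are inherited from the parent tag, which is
 * expected to be another IOMMU tag (the root tag is created together
 * with the context).
 */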
354 static int
355 iommu_bus_dma_tag_create(bus_dma_tag_t parent, bus_size_t alignment,
356     bus_addr_t boundary, bus_addr_t lowaddr, bus_addr_t highaddr,
357     bus_dma_filter_t *filter, void *filterarg, bus_size_t maxsize,
358     int nsegments, bus_size_t maxsegsz, int flags, bus_dma_lock_t *lockfunc,
359     void *lockfuncarg, bus_dma_tag_t *dmat)
360 {
361 	struct bus_dma_tag_iommu *newtag, *oldtag;
362 	int error;
363 
364 	*dmat = NULL;
365 	error = common_bus_dma_tag_create(parent != NULL ?
366 	    &((struct bus_dma_tag_iommu *)parent)->common : NULL, alignment,
367 	    boundary, lowaddr, highaddr, filter, filterarg, maxsize,
368 	    nsegments, maxsegsz, flags, lockfunc, lockfuncarg,
369 	    sizeof(struct bus_dma_tag_iommu), (void **)&newtag);
370 	if (error != 0)
371 		goto out;
372 
373 	oldtag = (struct bus_dma_tag_iommu *)parent;
374 	newtag->common.impl = &bus_dma_iommu_impl;
375 	newtag->ctx = oldtag->ctx;
376 	newtag->owner = oldtag->owner;
377 
378 	*dmat = (bus_dma_tag_t)newtag;
379 out:
380 	CTR4(KTR_BUSDMA, "%s returned tag %p tag flags 0x%x error %d",
381 	    __func__, newtag, (newtag != NULL ? newtag->common.flags : 0),
382 	    error);
383 	return (error);
384 }
385 
386 static int
387 iommu_bus_dma_tag_set_domain(bus_dma_tag_t dmat)
388 {
389 
390 	return (0);
391 }
392 
393 static int
394 iommu_bus_dma_tag_destroy(bus_dma_tag_t dmat1)
395 {
396 	struct bus_dma_tag_iommu *dmat, *dmat_copy, *parent;
397 	int error;
398 
399 	error = 0;
400 	dmat_copy = dmat = (struct bus_dma_tag_iommu *)dmat1;
401 
402 	if (dmat != NULL) {
403 		if (dmat->map_count != 0) {
404 			error = EBUSY;
405 			goto out;
406 		}
407 		while (dmat != NULL) {
408 			parent = (struct bus_dma_tag_iommu *)dmat->common.parent;
409 			if (atomic_fetchadd_int(&dmat->common.ref_count, -1) ==
410 			    1) {
411 				if (dmat == dmat->ctx->tag)
412 					iommu_free_ctx(dmat->ctx);
413 				free(dmat->segments, M_IOMMU_DMAMAP);
414 				free(dmat, M_DEVBUF);
415 				dmat = parent;
416 			} else
417 				dmat = NULL;
418 		}
419 	}
420 out:
421 	CTR3(KTR_BUSDMA, "%s tag %p error %d", __func__, dmat_copy, error);
422 	return (error);
423 }
424 
425 static bool
426 iommu_bus_dma_id_mapped(bus_dma_tag_t dmat, vm_paddr_t buf, bus_size_t buflen)
427 {
428 
429 	return (false);
430 }
431 
432 static int
433 iommu_bus_dmamap_create(bus_dma_tag_t dmat, int flags, bus_dmamap_t *mapp)
434 {
435 	struct bus_dma_tag_iommu *tag;
436 	struct bus_dmamap_iommu *map;
437 
438 	tag = (struct bus_dma_tag_iommu *)dmat;
439 	map = malloc_domainset(sizeof(*map), M_IOMMU_DMAMAP,
440 	    DOMAINSET_PREF(tag->common.domain), M_NOWAIT | M_ZERO);
441 	if (map == NULL) {
442 		*mapp = NULL;
443 		return (ENOMEM);
444 	}
445 	if (tag->segments == NULL) {
446 		tag->segments = malloc_domainset(sizeof(bus_dma_segment_t) *
447 		    tag->common.nsegments, M_IOMMU_DMAMAP,
448 		    DOMAINSET_PREF(tag->common.domain), M_NOWAIT);
449 		if (tag->segments == NULL) {
450 			free(map, M_IOMMU_DMAMAP);
451 			*mapp = NULL;
452 			return (ENOMEM);
453 		}
454 	}
455 	TAILQ_INIT(&map->map_entries);
456 	map->tag = tag;
457 	map->locked = true;
458 	map->cansleep = false;
459 	tag->map_count++;
460 	*mapp = (bus_dmamap_t)map;
461 
462 	return (0);
463 }
464 
465 static int
466 iommu_bus_dmamap_destroy(bus_dma_tag_t dmat, bus_dmamap_t map1)
467 {
468 	struct bus_dma_tag_iommu *tag;
469 	struct bus_dmamap_iommu *map;
470 	struct iommu_domain *domain;
471 
472 	tag = (struct bus_dma_tag_iommu *)dmat;
473 	map = (struct bus_dmamap_iommu *)map1;
474 	if (map != NULL) {
475 		domain = tag->ctx->domain;
476 		IOMMU_DOMAIN_LOCK(domain);
477 		if (!TAILQ_EMPTY(&map->map_entries)) {
478 			IOMMU_DOMAIN_UNLOCK(domain);
479 			return (EBUSY);
480 		}
481 		IOMMU_DOMAIN_UNLOCK(domain);
482 		free(map, M_IOMMU_DMAMAP);
483 	}
484 	tag->map_count--;
485 	return (0);
486 }
487 
488 
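/*
 * Allocate DMA-able memory for the tag.  Sub-page default-attribute
 * allocations whose alignment does not exceed their size are served by
 * malloc_domainset(9); everything else goes through
 * kmem_alloc_attr_domainset(9).  The chosen path is recorded in the
 * map flags so that iommu_bus_dmamem_free() releases the memory the
 * same way.
 */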
489 static int
490 iommu_bus_dmamem_alloc(bus_dma_tag_t dmat, void** vaddr, int flags,
491     bus_dmamap_t *mapp)
492 {
493 	struct bus_dma_tag_iommu *tag;
494 	struct bus_dmamap_iommu *map;
495 	int error, mflags;
496 	vm_memattr_t attr;
497 
498 	error = iommu_bus_dmamap_create(dmat, flags, mapp);
499 	if (error != 0)
500 		return (error);
501 
502 	mflags = (flags & BUS_DMA_NOWAIT) != 0 ? M_NOWAIT : M_WAITOK;
503 	mflags |= (flags & BUS_DMA_ZERO) != 0 ? M_ZERO : 0;
504 	attr = (flags & BUS_DMA_NOCACHE) != 0 ? VM_MEMATTR_UNCACHEABLE :
505 	    VM_MEMATTR_DEFAULT;
506 
507 	tag = (struct bus_dma_tag_iommu *)dmat;
508 	map = (struct bus_dmamap_iommu *)*mapp;
509 
510 	if (tag->common.maxsize < PAGE_SIZE &&
511 	    tag->common.alignment <= tag->common.maxsize &&
512 	    attr == VM_MEMATTR_DEFAULT) {
513 		*vaddr = malloc_domainset(tag->common.maxsize, M_DEVBUF,
514 		    DOMAINSET_PREF(tag->common.domain), mflags);
515 		map->flags |= BUS_DMAMAP_IOMMU_MALLOC;
516 	} else {
517 		*vaddr = (void *)kmem_alloc_attr_domainset(
518 		    DOMAINSET_PREF(tag->common.domain), tag->common.maxsize,
519 		    mflags, 0ul, BUS_SPACE_MAXADDR, attr);
520 		map->flags |= BUS_DMAMAP_IOMMU_KMEM_ALLOC;
521 	}
522 	if (*vaddr == NULL) {
523 		iommu_bus_dmamap_destroy(dmat, *mapp);
524 		*mapp = NULL;
525 		return (ENOMEM);
526 	}
527 	return (0);
528 }
529 
530 static void
531 iommu_bus_dmamem_free(bus_dma_tag_t dmat, void *vaddr, bus_dmamap_t map1)
532 {
533 	struct bus_dma_tag_iommu *tag;
534 	struct bus_dmamap_iommu *map;
535 
536 	tag = (struct bus_dma_tag_iommu *)dmat;
537 	map = (struct bus_dmamap_iommu *)map1;
538 
539 	if ((map->flags & BUS_DMAMAP_IOMMU_MALLOC) != 0) {
540 		free(vaddr, M_DEVBUF);
541 		map->flags &= ~BUS_DMAMAP_IOMMU_MALLOC;
542 	} else {
543 		KASSERT((map->flags & BUS_DMAMAP_IOMMU_KMEM_ALLOC) != 0,
544 		    ("iommu_bus_dmamem_free for non-allocated map %p", map));
545 		kmem_free((vm_offset_t)vaddr, tag->common.maxsize);
546 		map->flags &= ~BUS_DMAMAP_IOMMU_KMEM_ALLOC;
547 	}
548 
549 	iommu_bus_dmamap_destroy(dmat, map1);
550 }
551 
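/*
 * Map the physical pages in ma[], starting at offset into the first
 * page, into the domain's I/O address space and record the resulting
 * DMA segments in segs[].  Each created map entry is also appended to
 * unroll_list so that the caller can revert the work if the load fails
 * part way through.
 */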
552 static int
553 iommu_bus_dmamap_load_something1(struct bus_dma_tag_iommu *tag,
554     struct bus_dmamap_iommu *map, vm_page_t *ma, int offset, bus_size_t buflen,
555     int flags, bus_dma_segment_t *segs, int *segp,
556     struct iommu_map_entries_tailq *unroll_list)
557 {
558 	struct iommu_ctx *ctx;
559 	struct iommu_domain *domain;
560 	struct iommu_map_entry *entry;
561 	iommu_gaddr_t size;
562 	bus_size_t buflen1;
563 	int error, idx, gas_flags, seg;
564 
565 	KASSERT(offset < IOMMU_PAGE_SIZE, ("offset %d", offset));
566 	if (segs == NULL)
567 		segs = tag->segments;
568 	ctx = tag->ctx;
569 	domain = ctx->domain;
570 	seg = *segp;
571 	error = 0;
572 	idx = 0;
573 	while (buflen > 0) {
574 		seg++;
575 		if (seg >= tag->common.nsegments) {
576 			error = EFBIG;
577 			break;
578 		}
579 		buflen1 = buflen > tag->common.maxsegsz ?
580 		    tag->common.maxsegsz : buflen;
581 		size = round_page(offset + buflen1);
582 
583 		/*
584 		 * (Too) optimistically allow a split if there is more
585 		 * than one segment left.
586 		 */
587 		gas_flags = map->cansleep ? IOMMU_MF_CANWAIT : 0;
588 		if (seg + 1 < tag->common.nsegments)
589 			gas_flags |= IOMMU_MF_CANSPLIT;
590 
591 		error = iommu_map(domain, &tag->common, size, offset,
592 		    IOMMU_MAP_ENTRY_READ |
593 		    ((flags & BUS_DMA_NOWRITE) == 0 ? IOMMU_MAP_ENTRY_WRITE : 0),
594 		    gas_flags, ma + idx, &entry);
595 		if (error != 0)
596 			break;
597 		if ((gas_flags & IOMMU_MF_CANSPLIT) != 0) {
598 			KASSERT(size >= entry->end - entry->start,
599 			    ("split increased entry size %jx %jx %jx",
600 			    (uintmax_t)size, (uintmax_t)entry->start,
601 			    (uintmax_t)entry->end));
602 			size = entry->end - entry->start;
603 			if (buflen1 > size)
604 				buflen1 = size;
605 		} else {
606 			KASSERT(entry->end - entry->start == size,
607 			    ("no split allowed %jx %jx %jx",
608 			    (uintmax_t)size, (uintmax_t)entry->start,
609 			    (uintmax_t)entry->end));
610 		}
611 		if (offset + buflen1 > size)
612 			buflen1 = size - offset;
613 		if (buflen1 > tag->common.maxsegsz)
614 			buflen1 = tag->common.maxsegsz;
615 
616 		KASSERT(((entry->start + offset) & (tag->common.alignment - 1))
617 		    == 0,
618 		    ("alignment failed: ctx %p start 0x%jx offset %x "
619 		    "align 0x%jx", ctx, (uintmax_t)entry->start, offset,
620 		    (uintmax_t)tag->common.alignment));
621 		KASSERT(entry->end <= tag->common.lowaddr ||
622 		    entry->start >= tag->common.highaddr,
623 		    ("entry placement failed: ctx %p start 0x%jx end 0x%jx "
624 		    "lowaddr 0x%jx highaddr 0x%jx", ctx,
625 		    (uintmax_t)entry->start, (uintmax_t)entry->end,
626 		    (uintmax_t)tag->common.lowaddr,
627 		    (uintmax_t)tag->common.highaddr));
628 		KASSERT(iommu_test_boundary(entry->start + offset, buflen1,
629 		    tag->common.boundary),
630 		    ("boundary failed: ctx %p start 0x%jx end 0x%jx "
631 		    "boundary 0x%jx", ctx, (uintmax_t)entry->start,
632 		    (uintmax_t)entry->end, (uintmax_t)tag->common.boundary));
633 		KASSERT(buflen1 <= tag->common.maxsegsz,
634 		    ("segment too large: ctx %p start 0x%jx end 0x%jx "
635 		    "buflen1 0x%jx maxsegsz 0x%jx", ctx,
636 		    (uintmax_t)entry->start, (uintmax_t)entry->end,
637 		    (uintmax_t)buflen1, (uintmax_t)tag->common.maxsegsz));
638 
639 		IOMMU_DOMAIN_LOCK(domain);
640 		TAILQ_INSERT_TAIL(&map->map_entries, entry, dmamap_link);
641 		entry->flags |= IOMMU_MAP_ENTRY_MAP;
642 		IOMMU_DOMAIN_UNLOCK(domain);
643 		TAILQ_INSERT_TAIL(unroll_list, entry, unroll_link);
644 
645 		segs[seg].ds_addr = entry->start + offset;
646 		segs[seg].ds_len = buflen1;
647 
648 		idx += OFF_TO_IDX(trunc_page(offset + buflen1));
649 		offset += buflen1;
650 		offset &= IOMMU_PAGE_MASK;
651 		buflen -= buflen1;
652 	}
653 	if (error == 0)
654 		*segp = seg;
655 	return (error);
656 }
657 
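/*
 * Wrapper around iommu_bus_dmamap_load_something1() that unloads any
 * map entries created by a failed load, and that converts ENOMEM into
 * EINPROGRESS when the caller allows waiting (BUS_DMA_NOWAIT not set)
 * but the load was attempted from a context that cannot sleep, in
 * which case the load is rescheduled onto the unit taskqueue.
 */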
658 static int
659 iommu_bus_dmamap_load_something(struct bus_dma_tag_iommu *tag,
660     struct bus_dmamap_iommu *map, vm_page_t *ma, int offset, bus_size_t buflen,
661     int flags, bus_dma_segment_t *segs, int *segp)
662 {
663 	struct iommu_ctx *ctx;
664 	struct iommu_domain *domain;
665 	struct iommu_map_entry *entry, *entry1;
666 	struct iommu_map_entries_tailq unroll_list;
667 	int error;
668 
669 	ctx = tag->ctx;
670 	domain = ctx->domain;
671 	atomic_add_long(&ctx->loads, 1);
672 
673 	TAILQ_INIT(&unroll_list);
674 	error = iommu_bus_dmamap_load_something1(tag, map, ma, offset,
675 	    buflen, flags, segs, segp, &unroll_list);
676 	if (error != 0) {
677 		/*
678 		 * The busdma interface does not allow us to report a
679 		 * partial buffer load, so unfortunately we have to
680 		 * revert all the work done.
681 		 */
682 		IOMMU_DOMAIN_LOCK(domain);
683 		TAILQ_FOREACH_SAFE(entry, &unroll_list, unroll_link,
684 		    entry1) {
685 			/*
686 			 * No entries other than those we created
687 			 * during the failed run could have been
688 			 * inserted in the meantime, since we own the
689 			 * ctx pglock.
690 			 */
691 			TAILQ_REMOVE(&map->map_entries, entry, dmamap_link);
692 			TAILQ_REMOVE(&unroll_list, entry, unroll_link);
693 			TAILQ_INSERT_TAIL(&domain->unload_entries, entry,
694 			    dmamap_link);
695 		}
696 		IOMMU_DOMAIN_UNLOCK(domain);
697 		taskqueue_enqueue(domain->iommu->delayed_taskqueue,
698 		    &domain->unload_task);
699 	}
700 
701 	if (error == ENOMEM && (flags & BUS_DMA_NOWAIT) == 0 &&
702 	    !map->cansleep)
703 		error = EINPROGRESS;
704 	if (error == EINPROGRESS)
705 		iommu_bus_schedule_dmamap(domain->iommu, map);
706 	return (error);
707 }
708 
709 static int
710 iommu_bus_dmamap_load_ma(bus_dma_tag_t dmat, bus_dmamap_t map1,
711     struct vm_page **ma, bus_size_t tlen, int ma_offs, int flags,
712     bus_dma_segment_t *segs, int *segp)
713 {
714 	struct bus_dma_tag_iommu *tag;
715 	struct bus_dmamap_iommu *map;
716 
717 	tag = (struct bus_dma_tag_iommu *)dmat;
718 	map = (struct bus_dmamap_iommu *)map1;
719 	return (iommu_bus_dmamap_load_something(tag, map, ma, ma_offs, tlen,
720 	    flags, segs, segp));
721 }
722 
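/*
 * Load a physically contiguous buffer.  The backing pages are looked
 * up with PHYS_TO_VM_PAGE(); addresses that are not backed by an
 * initialized vm_page (presumably device or otherwise unmanaged
 * memory) are represented by temporary fake pages for the duration of
 * the mapping.
 */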
723 static int
724 iommu_bus_dmamap_load_phys(bus_dma_tag_t dmat, bus_dmamap_t map1,
725     vm_paddr_t buf, bus_size_t buflen, int flags, bus_dma_segment_t *segs,
726     int *segp)
727 {
728 	struct bus_dma_tag_iommu *tag;
729 	struct bus_dmamap_iommu *map;
730 	vm_page_t *ma, fma;
731 	vm_paddr_t pstart, pend, paddr;
732 	int error, i, ma_cnt, mflags, offset;
733 
734 	tag = (struct bus_dma_tag_iommu *)dmat;
735 	map = (struct bus_dmamap_iommu *)map1;
736 	pstart = trunc_page(buf);
737 	pend = round_page(buf + buflen);
738 	offset = buf & PAGE_MASK;
739 	ma_cnt = OFF_TO_IDX(pend - pstart);
740 	mflags = map->cansleep ? M_WAITOK : M_NOWAIT;
741 	ma = malloc(sizeof(vm_page_t) * ma_cnt, M_DEVBUF, mflags);
742 	if (ma == NULL)
743 		return (ENOMEM);
744 	fma = NULL;
745 	for (i = 0; i < ma_cnt; i++) {
746 		paddr = pstart + ptoa(i);
747 		ma[i] = PHYS_TO_VM_PAGE(paddr);
748 		if (ma[i] == NULL || VM_PAGE_TO_PHYS(ma[i]) != paddr) {
749 			/*
750 			 * If PHYS_TO_VM_PAGE() returned NULL or the
751 			 * vm_page was not initialized, we'll use a
752 			 * fake page.
753 			 */
754 			if (fma == NULL) {
755 				fma = malloc(sizeof(struct vm_page) * ma_cnt,
756 				    M_DEVBUF, M_ZERO | mflags);
757 				if (fma == NULL) {
758 					free(ma, M_DEVBUF);
759 					return (ENOMEM);
760 				}
761 			}
762 			vm_page_initfake(&fma[i], pstart + ptoa(i),
763 			    VM_MEMATTR_DEFAULT);
764 			ma[i] = &fma[i];
765 		}
766 	}
767 	error = iommu_bus_dmamap_load_something(tag, map, ma, offset, buflen,
768 	    flags, segs, segp);
769 	free(fma, M_DEVBUF);
770 	free(ma, M_DEVBUF);
771 	return (error);
772 }
773 
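/*
 * Load a KVA or user buffer.  Physical addresses are obtained through
 * the supplied pmap (pmap_kextract() for the kernel pmap), and, as in
 * iommu_bus_dmamap_load_phys(), fake pages stand in for addresses
 * without an initialized vm_page.
 */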
774 static int
775 iommu_bus_dmamap_load_buffer(bus_dma_tag_t dmat, bus_dmamap_t map1, void *buf,
776     bus_size_t buflen, pmap_t pmap, int flags, bus_dma_segment_t *segs,
777     int *segp)
778 {
779 	struct bus_dma_tag_iommu *tag;
780 	struct bus_dmamap_iommu *map;
781 	vm_page_t *ma, fma;
782 	vm_paddr_t pstart, pend, paddr;
783 	int error, i, ma_cnt, mflags, offset;
784 
785 	tag = (struct bus_dma_tag_iommu *)dmat;
786 	map = (struct bus_dmamap_iommu *)map1;
787 	pstart = trunc_page((vm_offset_t)buf);
788 	pend = round_page((vm_offset_t)buf + buflen);
789 	offset = (vm_offset_t)buf & PAGE_MASK;
790 	ma_cnt = OFF_TO_IDX(pend - pstart);
791 	mflags = map->cansleep ? M_WAITOK : M_NOWAIT;
792 	ma = malloc(sizeof(vm_page_t) * ma_cnt, M_DEVBUF, mflags);
793 	if (ma == NULL)
794 		return (ENOMEM);
795 	fma = NULL;
796 	for (i = 0; i < ma_cnt; i++, pstart += PAGE_SIZE) {
797 		if (pmap == kernel_pmap)
798 			paddr = pmap_kextract(pstart);
799 		else
800 			paddr = pmap_extract(pmap, pstart);
801 		ma[i] = PHYS_TO_VM_PAGE(paddr);
802 		if (ma[i] == NULL || VM_PAGE_TO_PHYS(ma[i]) != paddr) {
803 			/*
804 			 * If PHYS_TO_VM_PAGE() returned NULL or the
805 			 * vm_page was not initialized, we'll use a
806 			 * fake page.
807 			 */
808 			if (fma == NULL) {
809 				fma = malloc(sizeof(struct vm_page) * ma_cnt,
810 				    M_DEVBUF, M_ZERO | mflags);
811 				if (fma == NULL) {
812 					free(ma, M_DEVBUF);
813 					return (ENOMEM);
814 				}
815 			}
816 			vm_page_initfake(&fma[i], paddr, VM_MEMATTR_DEFAULT);
817 			ma[i] = &fma[i];
818 		}
819 	}
820 	error = iommu_bus_dmamap_load_something(tag, map, ma, offset, buflen,
821 	    flags, segs, segp);
822 	free(ma, M_DEVBUF);
823 	free(fma, M_DEVBUF);
824 	return (error);
825 }
826 
827 static void
828 iommu_bus_dmamap_waitok(bus_dma_tag_t dmat, bus_dmamap_t map1,
829     struct memdesc *mem, bus_dmamap_callback_t *callback, void *callback_arg)
830 {
831 	struct bus_dmamap_iommu *map;
832 
833 	if (map1 == NULL)
834 		return;
835 	map = (struct bus_dmamap_iommu *)map1;
836 	map->mem = *mem;
837 	map->tag = (struct bus_dma_tag_iommu *)dmat;
838 	map->callback = callback;
839 	map->callback_arg = callback_arg;
840 }
841 
842 static bus_dma_segment_t *
843 iommu_bus_dmamap_complete(bus_dma_tag_t dmat, bus_dmamap_t map1,
844     bus_dma_segment_t *segs, int nsegs, int error)
845 {
846 	struct bus_dma_tag_iommu *tag;
847 	struct bus_dmamap_iommu *map;
848 
849 	tag = (struct bus_dma_tag_iommu *)dmat;
850 	map = (struct bus_dmamap_iommu *)map1;
851 
852 	if (!map->locked) {
853 		KASSERT(map->cansleep,
854 		    ("map not locked and not sleepable context %p", map));
855 
856 		/*
857 		 * We are called from the delayed context.  Relock the
858 		 * driver.
859 		 */
860 		(tag->common.lockfunc)(tag->common.lockfuncarg, BUS_DMA_LOCK);
861 		map->locked = true;
862 	}
863 
864 	if (segs == NULL)
865 		segs = tag->segments;
866 	return (segs);
867 }
868 
869 /*
870  * The limitations of the busdma KPI force the IOMMU to perform the
871  * actual unload, consisting of unmapping the map entries' page tables,
872  * from the delayed context on i386, since mapping a page table page
873  * might require a sleep to be successful.  The unfortunate
874  * consequence is that DMA requests can still be served for some time
875  * after the bus_dmamap_unload() call has returned.
876  *
877  * On amd64, we assume that sf allocation cannot fail.
878  */
879 static void
880 iommu_bus_dmamap_unload(bus_dma_tag_t dmat, bus_dmamap_t map1)
881 {
882 	struct bus_dma_tag_iommu *tag;
883 	struct bus_dmamap_iommu *map;
884 	struct iommu_ctx *ctx;
885 	struct iommu_domain *domain;
886 #if defined(__amd64__)
887 	struct iommu_map_entries_tailq entries;
888 #endif
889 
890 	tag = (struct bus_dma_tag_iommu *)dmat;
891 	map = (struct bus_dmamap_iommu *)map1;
892 	ctx = tag->ctx;
893 	domain = ctx->domain;
894 	atomic_add_long(&ctx->unloads, 1);
895 
896 #if defined(__i386__)
897 	IOMMU_DOMAIN_LOCK(domain);
898 	TAILQ_CONCAT(&domain->unload_entries, &map->map_entries, dmamap_link);
899 	IOMMU_DOMAIN_UNLOCK(domain);
900 	taskqueue_enqueue(domain->iommu->delayed_taskqueue,
901 	    &domain->unload_task);
902 #else /* defined(__amd64__) */
903 	TAILQ_INIT(&entries);
904 	IOMMU_DOMAIN_LOCK(domain);
905 	TAILQ_CONCAT(&entries, &map->map_entries, dmamap_link);
906 	IOMMU_DOMAIN_UNLOCK(domain);
907 	THREAD_NO_SLEEPING();
908 	iommu_domain_unload(domain, &entries, false);
909 	THREAD_SLEEPING_OK();
910 	KASSERT(TAILQ_EMPTY(&entries), ("lazy iommu_ctx_unload %p", ctx));
911 #endif
912 }
913 
914 static void
915 iommu_bus_dmamap_sync(bus_dma_tag_t dmat, bus_dmamap_t map,
916     bus_dmasync_op_t op)
917 {
918 }
919 
920 struct bus_dma_impl bus_dma_iommu_impl = {
921 	.tag_create = iommu_bus_dma_tag_create,
922 	.tag_destroy = iommu_bus_dma_tag_destroy,
923 	.tag_set_domain = iommu_bus_dma_tag_set_domain,
924 	.id_mapped = iommu_bus_dma_id_mapped,
925 	.map_create = iommu_bus_dmamap_create,
926 	.map_destroy = iommu_bus_dmamap_destroy,
927 	.mem_alloc = iommu_bus_dmamem_alloc,
928 	.mem_free = iommu_bus_dmamem_free,
929 	.load_phys = iommu_bus_dmamap_load_phys,
930 	.load_buffer = iommu_bus_dmamap_load_buffer,
931 	.load_ma = iommu_bus_dmamap_load_ma,
932 	.map_waitok = iommu_bus_dmamap_waitok,
933 	.map_complete = iommu_bus_dmamap_complete,
934 	.map_unload = iommu_bus_dmamap_unload,
935 	.map_sync = iommu_bus_dmamap_sync,
936 };
937 
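/*
 * Taskqueue handler for deferred map loads.  Each map queued by
 * iommu_bus_schedule_dmamap() is re-loaded in a sleepable context with
 * bus_dmamap_load_mem(), which invokes the driver callback; the driver
 * lock is dropped afterwards if the completion handler left it held.
 */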
938 static void
939 iommu_bus_task_dmamap(void *arg, int pending)
940 {
941 	struct bus_dma_tag_iommu *tag;
942 	struct bus_dmamap_iommu *map;
943 	struct iommu_unit *unit;
944 
945 	unit = arg;
946 	IOMMU_LOCK(unit);
947 	while ((map = TAILQ_FIRST(&unit->delayed_maps)) != NULL) {
948 		TAILQ_REMOVE(&unit->delayed_maps, map, delay_link);
949 		IOMMU_UNLOCK(unit);
950 		tag = map->tag;
951 		map->cansleep = true;
952 		map->locked = false;
953 		bus_dmamap_load_mem((bus_dma_tag_t)tag, (bus_dmamap_t)map,
954 		    &map->mem, map->callback, map->callback_arg,
955 		    BUS_DMA_WAITOK);
956 		map->cansleep = false;
957 		if (map->locked) {
958 			(tag->common.lockfunc)(tag->common.lockfuncarg,
959 			    BUS_DMA_UNLOCK);
960 		} else
961 			map->locked = true;
962 		map->cansleep = false;
963 		IOMMU_LOCK(unit);
964 	}
965 	IOMMU_UNLOCK(unit);
966 }
967 
968 static void
969 iommu_bus_schedule_dmamap(struct iommu_unit *unit, struct bus_dmamap_iommu *map)
970 {
971 
972 	map->locked = false;
973 	IOMMU_LOCK(unit);
974 	TAILQ_INSERT_TAIL(&unit->delayed_maps, map, delay_link);
975 	IOMMU_UNLOCK(unit);
976 	taskqueue_enqueue(unit->delayed_taskqueue, &unit->dmamap_load_task);
977 }
978 
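/*
 * Per-unit busdma initialization.  DMA translation is enabled by
 * default and can be turned off with the "hw.iommu.dma" loader
 * tunable (or the older "hw.dmar.dma" for compatibility), e.g. a
 * loader.conf(5) line of hw.iommu.dma=0.  The routine also creates
 * the single-threaded taskqueue used for deferred map loads and
 * unloads.
 */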
979 int
980 iommu_init_busdma(struct iommu_unit *unit)
981 {
982 	int error;
983 
984 	unit->dma_enabled = 1;
985 	error = TUNABLE_INT_FETCH("hw.iommu.dma", &unit->dma_enabled);
986 	if (error == 0) /* compatibility */
987 		TUNABLE_INT_FETCH("hw.dmar.dma", &unit->dma_enabled);
988 	TAILQ_INIT(&unit->delayed_maps);
989 	TASK_INIT(&unit->dmamap_load_task, 0, iommu_bus_task_dmamap, unit);
990 	unit->delayed_taskqueue = taskqueue_create("iommu", M_WAITOK,
991 	    taskqueue_thread_enqueue, &unit->delayed_taskqueue);
992 	taskqueue_start_threads(&unit->delayed_taskqueue, 1, PI_DISK,
993 	    "iommu%d busdma taskq", unit->unit);
994 	return (0);
995 }
996 
997 void
998 iommu_fini_busdma(struct iommu_unit *unit)
999 {
1000 
1001 	if (unit->delayed_taskqueue == NULL)
1002 		return;
1003 
1004 	taskqueue_drain(unit->delayed_taskqueue, &unit->dmamap_load_task);
1005 	taskqueue_free(unit->delayed_taskqueue);
1006 	unit->delayed_taskqueue = NULL;
1007 }
1008 
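/*
 * Install an identity (1:1) mapping for the physical range
 * [start, start + length) into the given IOMMU-backed map.  The range
 * must be page-aligned.  For a tag that is not backed by the IOMMU the
 * call is a no-op returning 0.  Fake vm_page structures describe the
 * range, so it does not have to be covered by the vm_page array.
 */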
1009 int
1010 bus_dma_iommu_load_ident(bus_dma_tag_t dmat, bus_dmamap_t map1,
1011     vm_paddr_t start, vm_size_t length, int flags)
1012 {
1013 	struct bus_dma_tag_common *tc;
1014 	struct bus_dma_tag_iommu *tag;
1015 	struct bus_dmamap_iommu *map;
1016 	struct iommu_ctx *ctx;
1017 	struct iommu_domain *domain;
1018 	struct iommu_map_entry *entry;
1019 	vm_page_t *ma;
1020 	vm_size_t i;
1021 	int error;
1022 	bool waitok;
1023 
1024 	MPASS((start & PAGE_MASK) == 0);
1025 	MPASS((length & PAGE_MASK) == 0);
1026 	MPASS(length > 0);
1027 	MPASS(start + length >= start);
1028 	MPASS((flags & ~(BUS_DMA_NOWAIT | BUS_DMA_NOWRITE)) == 0);
1029 
1030 	tc = (struct bus_dma_tag_common *)dmat;
1031 	if (tc->impl != &bus_dma_iommu_impl)
1032 		return (0);
1033 
1034 	tag = (struct bus_dma_tag_iommu *)dmat;
1035 	ctx = tag->ctx;
1036 	domain = ctx->domain;
1037 	map = (struct bus_dmamap_iommu *)map1;
1038 	waitok = (flags & BUS_DMA_NOWAIT) == 0;
1039 
1040 	entry = iommu_map_alloc_entry(domain, waitok ? IOMMU_PGF_WAITOK : 0);
1041 	if (entry == NULL)
1042 		return (ENOMEM);
1043 	entry->start = start;
1044 	entry->end = start + length;
1045 	ma = malloc(sizeof(vm_page_t) * atop(length), M_TEMP, waitok ?
1046 	    M_WAITOK : M_NOWAIT);
1047 	if (ma == NULL) {
1048 		iommu_map_free_entry(domain, entry);
1049 		return (ENOMEM);
1050 	}
1051 	for (i = 0; i < atop(length); i++) {
1052 		ma[i] = vm_page_getfake(entry->start + PAGE_SIZE * i,
1053 		    VM_MEMATTR_DEFAULT);
1054 	}
1055 	error = iommu_map_region(domain, entry, IOMMU_MAP_ENTRY_READ |
1056 	    ((flags & BUS_DMA_NOWRITE) ? 0 : IOMMU_MAP_ENTRY_WRITE),
1057 	    waitok ? IOMMU_MF_CANWAIT : 0, ma);
1058 	if (error == 0) {
1059 		IOMMU_DOMAIN_LOCK(domain);
1060 		TAILQ_INSERT_TAIL(&map->map_entries, entry, dmamap_link);
1061 		entry->flags |= IOMMU_MAP_ENTRY_MAP;
1062 		IOMMU_DOMAIN_UNLOCK(domain);
1063 	} else {
1064 		iommu_domain_unload_entry(entry, true);
1065 	}
1066 	for (i = 0; i < atop(length); i++)
1067 		vm_page_putfake(ma[i]);
1068 	free(ma, M_TEMP);
1069 	return (error);
1070 }
1071