xref: /freebsd/sys/dev/iommu/busdma_iommu.c (revision d7d962ead0b6e5e8a39202d0590022082bf5bfb6)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2013 The FreeBSD Foundation
5  *
6  * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
7  * under sponsorship from the FreeBSD Foundation.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/domainset.h>
37 #include <sys/malloc.h>
38 #include <sys/bus.h>
39 #include <sys/conf.h>
40 #include <sys/interrupt.h>
41 #include <sys/kernel.h>
42 #include <sys/ktr.h>
43 #include <sys/lock.h>
44 #include <sys/proc.h>
45 #include <sys/memdesc.h>
46 #include <sys/msan.h>
47 #include <sys/mutex.h>
48 #include <sys/sysctl.h>
49 #include <sys/rman.h>
50 #include <sys/taskqueue.h>
51 #include <sys/tree.h>
52 #include <sys/uio.h>
53 #include <sys/vmem.h>
54 #include <dev/pci/pcireg.h>
55 #include <dev/pci/pcivar.h>
56 #include <vm/vm.h>
57 #include <vm/vm_extern.h>
58 #include <vm/vm_kern.h>
59 #include <vm/vm_object.h>
60 #include <vm/vm_page.h>
61 #include <vm/vm_map.h>
62 #include <dev/iommu/iommu.h>
63 #include <machine/atomic.h>
64 #include <machine/bus.h>
65 #include <machine/md_var.h>
66 #include <machine/iommu.h>
67 #include <dev/iommu/busdma_iommu.h>
68 
69 /*
70  * busdma_iommu.c, the implementation of the busdma(9) interface using
71  * IOMMU units from Intel VT-d.
72  */
73 
74 static bool
75 iommu_bus_dma_is_dev_disabled(int domain, int bus, int slot, int func)
76 {
77 	char str[128], *env;
78 	int default_bounce;
79 	bool ret;
80 	static const char bounce_str[] = "bounce";
81 	static const char iommu_str[] = "iommu";
82 	static const char dmar_str[] = "dmar"; /* compatibility */
83 
84 	default_bounce = 0;
85 	env = kern_getenv("hw.busdma.default");
86 	if (env != NULL) {
87 		if (strcmp(env, bounce_str) == 0)
88 			default_bounce = 1;
89 		else if (strcmp(env, iommu_str) == 0 ||
90 		    strcmp(env, dmar_str) == 0)
91 			default_bounce = 0;
92 		freeenv(env);
93 	}
94 
95 	snprintf(str, sizeof(str), "hw.busdma.pci%d.%d.%d.%d",
96 	    domain, bus, slot, func);
97 	env = kern_getenv(str);
98 	if (env == NULL)
99 		return (default_bounce != 0);
100 	if (strcmp(env, bounce_str) == 0)
101 		ret = true;
102 	else if (strcmp(env, iommu_str) == 0 ||
103 	    strcmp(env, dmar_str) == 0)
104 		ret = false;
105 	else
106 		ret = default_bounce != 0;
107 	freeenv(env);
108 	return (ret);
109 }
110 
111 /*
112  * Given original device, find the requester ID that will be seen by
113  * the IOMMU unit and used for page table lookup.  PCI bridges may take
114  * ownership of transactions from downstream devices, so it may not be
115  * the same as the BSF of the target device.  In those cases, all
116  * devices downstream of the bridge must share a single mapping
117  * domain, and must collectively be assigned to use either IOMMU or
118  * bounce mapping.
119  */
120 device_t
121 iommu_get_requester(device_t dev, uint16_t *rid)
122 {
123 	devclass_t pci_class;
124 	device_t l, pci, pcib, pcip, pcibp, requester;
125 	int cap_offset;
126 	uint16_t pcie_flags;
127 	bool bridge_is_pcie;
128 
129 	pci_class = devclass_find("pci");
130 	l = requester = dev;
131 
132 	*rid = pci_get_rid(dev);
133 
134 	/*
135 	 * Walk the bridge hierarchy from the target device to the
136 	 * host port to find the translating bridge nearest the IOMMU
137 	 * unit.
138 	 */
139 	for (;;) {
140 		pci = device_get_parent(l);
141 		KASSERT(pci != NULL, ("iommu_get_requester(%s): NULL parent "
142 		    "for %s", device_get_name(dev), device_get_name(l)));
143 		KASSERT(device_get_devclass(pci) == pci_class,
144 		    ("iommu_get_requester(%s): non-pci parent %s for %s",
145 		    device_get_name(dev), device_get_name(pci),
146 		    device_get_name(l)));
147 
148 		pcib = device_get_parent(pci);
149 		KASSERT(pcib != NULL, ("iommu_get_requester(%s): NULL bridge "
150 		    "for %s", device_get_name(dev), device_get_name(pci)));
151 
152 		/*
153 		 * The parent of our "bridge" isn't another PCI bus,
154 		 * so pcib isn't a PCI->PCI bridge but rather a host
155 		 * port, and the requester ID won't be translated
156 		 * further.
157 		 */
158 		pcip = device_get_parent(pcib);
159 		if (device_get_devclass(pcip) != pci_class)
160 			break;
161 		pcibp = device_get_parent(pcip);
162 
163 		if (pci_find_cap(l, PCIY_EXPRESS, &cap_offset) == 0) {
164 			/*
165 			 * Do not stop the loop even if the target
166 			 * device is PCIe, because it is possible (but
167 			 * unlikely) to have a PCI->PCIe bridge
168 			 * somewhere in the hierarchy.
169 			 */
170 			l = pcib;
171 		} else {
172 			/*
173 			 * Device is not PCIe, it cannot be seen as a
174 			 * requester by IOMMU unit.  Check whether the
175 			 * bridge is PCIe.
176 			 */
177 			bridge_is_pcie = pci_find_cap(pcib, PCIY_EXPRESS,
178 			    &cap_offset) == 0;
179 			requester = pcib;
180 
181 			/*
182 			 * Check for a buggy PCIe/PCI bridge that
183 			 * doesn't report the express capability.  If
184 			 * the bridge above it is express but isn't a
185 			 * PCI bridge, then we know pcib is actually a
186 			 * PCIe/PCI bridge.
187 			 */
188 			if (!bridge_is_pcie && pci_find_cap(pcibp,
189 			    PCIY_EXPRESS, &cap_offset) == 0) {
190 				pcie_flags = pci_read_config(pcibp,
191 				    cap_offset + PCIER_FLAGS, 2);
192 				if ((pcie_flags & PCIEM_FLAGS_TYPE) !=
193 				    PCIEM_TYPE_PCI_BRIDGE)
194 					bridge_is_pcie = true;
195 			}
196 
197 			if (bridge_is_pcie) {
198 				/*
199 				 * The current device is not PCIe, but
200 				 * the bridge above it is.  This is a
201 				 * PCIe->PCI bridge.  Assume that the
202 				 * requester ID will be the secondary
203 				 * bus number with slot and function
204 				 * set to zero.
205 				 *
206 				 * XXX: Doesn't handle the case where
207 				 * the bridge is PCIe->PCI-X, and the
208 				 * bridge will only take ownership of
209 				 * requests in some cases.  We should
210 				 * provide context entries with the
211 				 * same page tables for taken and
212 				 * non-taken transactions.
213 				 */
214 				*rid = PCI_RID(pci_get_bus(l), 0, 0);
215 				l = pcibp;
216 			} else {
217 				/*
218 				 * Neither the device nor the bridge
219 				 * above it are PCIe.  This is a
220 				 * conventional PCI->PCI bridge, which
221 				 * will use the bridge's BSF as the
222 				 * requester ID.
223 				 */
224 				*rid = pci_get_rid(pcib);
225 				l = pcib;
226 			}
227 		}
228 	}
229 	return (requester);
230 }
231 
232 struct iommu_ctx *
233 iommu_instantiate_ctx(struct iommu_unit *unit, device_t dev, bool rmrr)
234 {
235 	device_t requester;
236 	struct iommu_ctx *ctx;
237 	bool disabled;
238 	uint16_t rid;
239 
240 	requester = iommu_get_requester(dev, &rid);
241 
242 	/*
243 	 * If the user requested the IOMMU disabled for the device, we
244 	 * cannot disable the IOMMU unit, due to possibility of other
245 	 * devices on the same IOMMU unit still requiring translation.
246 	 * Instead provide the identity mapping for the device
247 	 * context.
248 	 */
249 	disabled = iommu_bus_dma_is_dev_disabled(pci_get_domain(requester),
250 	    pci_get_bus(requester), pci_get_slot(requester),
251 	    pci_get_function(requester));
252 	ctx = iommu_get_ctx(unit, requester, rid, disabled, rmrr);
253 	if (ctx == NULL)
254 		return (NULL);
255 	if (disabled) {
256 		/*
257 		 * Keep the first reference on context, release the
258 		 * later refs.
259 		 */
260 		IOMMU_LOCK(unit);
261 		if ((ctx->flags & IOMMU_CTX_DISABLED) == 0) {
262 			ctx->flags |= IOMMU_CTX_DISABLED;
263 			IOMMU_UNLOCK(unit);
264 		} else {
265 			iommu_free_ctx_locked(unit, ctx);
266 		}
267 		ctx = NULL;
268 	}
269 	return (ctx);
270 }
271 
272 struct iommu_ctx *
273 iommu_get_dev_ctx(device_t dev)
274 {
275 	struct iommu_unit *unit;
276 
277 	unit = iommu_find(dev, bootverbose);
278 	/* Not in scope of any IOMMU ? */
279 	if (unit == NULL)
280 		return (NULL);
281 	if (!unit->dma_enabled)
282 		return (NULL);
283 
284 #if defined(__amd64__) || defined(__i386__)
285 	dmar_quirks_pre_use(unit);
286 	dmar_instantiate_rmrr_ctxs(unit);
287 #endif
288 
289 	return (iommu_instantiate_ctx(unit, dev, false));
290 }
291 
292 bus_dma_tag_t
293 iommu_get_dma_tag(device_t dev, device_t child)
294 {
295 	struct iommu_ctx *ctx;
296 	bus_dma_tag_t res;
297 
298 	ctx = iommu_get_dev_ctx(child);
299 	if (ctx == NULL)
300 		return (NULL);
301 
302 	res = (bus_dma_tag_t)ctx->tag;
303 	return (res);
304 }
305 
306 bool
307 bus_dma_iommu_set_buswide(device_t dev)
308 {
309 	struct iommu_unit *unit;
310 	device_t parent;
311 	u_int busno, slot, func;
312 
313 	parent = device_get_parent(dev);
314 	if (device_get_devclass(parent) != devclass_find("pci"))
315 		return (false);
316 	unit = iommu_find(dev, bootverbose);
317 	if (unit == NULL)
318 		return (false);
319 	busno = pci_get_bus(dev);
320 	slot = pci_get_slot(dev);
321 	func = pci_get_function(dev);
322 	if (slot != 0 || func != 0) {
323 		if (bootverbose) {
324 			device_printf(dev,
325 			    "iommu%d pci%d:%d:%d requested buswide busdma\n",
326 			    unit->unit, busno, slot, func);
327 		}
328 		return (false);
329 	}
330 	iommu_set_buswide_ctx(unit, busno);
331 	return (true);
332 }
333 
334 void
335 iommu_set_buswide_ctx(struct iommu_unit *unit, u_int busno)
336 {
337 
338 	MPASS(busno <= PCI_BUSMAX);
339 	IOMMU_LOCK(unit);
340 	unit->buswide_ctxs[busno / NBBY / sizeof(uint32_t)] |=
341 	    1 << (busno % (NBBY * sizeof(uint32_t)));
342 	IOMMU_UNLOCK(unit);
343 }
344 
345 bool
346 iommu_is_buswide_ctx(struct iommu_unit *unit, u_int busno)
347 {
348 
349 	MPASS(busno <= PCI_BUSMAX);
350 	return ((unit->buswide_ctxs[busno / NBBY / sizeof(uint32_t)] &
351 	    (1U << (busno % (NBBY * sizeof(uint32_t))))) != 0);
352 }
353 
354 static MALLOC_DEFINE(M_IOMMU_DMAMAP, "iommu_dmamap", "IOMMU DMA Map");
355 
356 static void iommu_bus_schedule_dmamap(struct iommu_unit *unit,
357     struct bus_dmamap_iommu *map);
358 
359 static int
360 iommu_bus_dma_tag_create(bus_dma_tag_t parent, bus_size_t alignment,
361     bus_addr_t boundary, bus_addr_t lowaddr, bus_addr_t highaddr,
362     bus_dma_filter_t *filter, void *filterarg, bus_size_t maxsize,
363     int nsegments, bus_size_t maxsegsz, int flags, bus_dma_lock_t *lockfunc,
364     void *lockfuncarg, bus_dma_tag_t *dmat)
365 {
366 	struct bus_dma_tag_iommu *newtag, *oldtag;
367 	int error;
368 
369 	*dmat = NULL;
370 	error = common_bus_dma_tag_create(parent != NULL ?
371 	    &((struct bus_dma_tag_iommu *)parent)->common : NULL, alignment,
372 	    boundary, lowaddr, highaddr, filter, filterarg, maxsize,
373 	    nsegments, maxsegsz, flags, lockfunc, lockfuncarg,
374 	    sizeof(struct bus_dma_tag_iommu), (void **)&newtag);
375 	if (error != 0)
376 		goto out;
377 
378 	oldtag = (struct bus_dma_tag_iommu *)parent;
379 	newtag->common.impl = &bus_dma_iommu_impl;
380 	newtag->ctx = oldtag->ctx;
381 	newtag->owner = oldtag->owner;
382 
383 	*dmat = (bus_dma_tag_t)newtag;
384 out:
385 	CTR4(KTR_BUSDMA, "%s returned tag %p tag flags 0x%x error %d",
386 	    __func__, newtag, (newtag != NULL ? newtag->common.flags : 0),
387 	    error);
388 	return (error);
389 }
390 
391 static int
392 iommu_bus_dma_tag_set_domain(bus_dma_tag_t dmat)
393 {
394 
395 	return (0);
396 }
397 
398 static int
399 iommu_bus_dma_tag_destroy(bus_dma_tag_t dmat1)
400 {
401 	struct bus_dma_tag_iommu *dmat, *dmat_copy, *parent;
402 	int error;
403 
404 	error = 0;
405 	dmat_copy = dmat = (struct bus_dma_tag_iommu *)dmat1;
406 
407 	if (dmat != NULL) {
408 		if (dmat->map_count != 0) {
409 			error = EBUSY;
410 			goto out;
411 		}
412 		while (dmat != NULL) {
413 			parent = (struct bus_dma_tag_iommu *)dmat->common.parent;
414 			if (atomic_fetchadd_int(&dmat->common.ref_count, -1) ==
415 			    1) {
416 				if (dmat == dmat->ctx->tag)
417 					iommu_free_ctx(dmat->ctx);
418 				free(dmat->segments, M_IOMMU_DMAMAP);
419 				free(dmat, M_DEVBUF);
420 				dmat = parent;
421 			} else
422 				dmat = NULL;
423 		}
424 	}
425 out:
426 	CTR3(KTR_BUSDMA, "%s tag %p error %d", __func__, dmat_copy, error);
427 	return (error);
428 }
429 
430 static bool
431 iommu_bus_dma_id_mapped(bus_dma_tag_t dmat, vm_paddr_t buf, bus_size_t buflen)
432 {
433 
434 	return (false);
435 }
436 
437 static int
438 iommu_bus_dmamap_create(bus_dma_tag_t dmat, int flags, bus_dmamap_t *mapp)
439 {
440 	struct bus_dma_tag_iommu *tag;
441 	struct bus_dmamap_iommu *map;
442 
443 	tag = (struct bus_dma_tag_iommu *)dmat;
444 	map = malloc_domainset(sizeof(*map), M_IOMMU_DMAMAP,
445 	    DOMAINSET_PREF(tag->common.domain), M_NOWAIT | M_ZERO);
446 	if (map == NULL) {
447 		*mapp = NULL;
448 		return (ENOMEM);
449 	}
450 	if (tag->segments == NULL) {
451 		tag->segments = malloc_domainset(sizeof(bus_dma_segment_t) *
452 		    tag->common.nsegments, M_IOMMU_DMAMAP,
453 		    DOMAINSET_PREF(tag->common.domain), M_NOWAIT);
454 		if (tag->segments == NULL) {
455 			free(map, M_IOMMU_DMAMAP);
456 			*mapp = NULL;
457 			return (ENOMEM);
458 		}
459 	}
460 	TAILQ_INIT(&map->map_entries);
461 	map->tag = tag;
462 	map->locked = true;
463 	map->cansleep = false;
464 	tag->map_count++;
465 	*mapp = (bus_dmamap_t)map;
466 
467 	return (0);
468 }
469 
470 static int
471 iommu_bus_dmamap_destroy(bus_dma_tag_t dmat, bus_dmamap_t map1)
472 {
473 	struct bus_dma_tag_iommu *tag;
474 	struct bus_dmamap_iommu *map;
475 	struct iommu_domain *domain;
476 
477 	tag = (struct bus_dma_tag_iommu *)dmat;
478 	map = (struct bus_dmamap_iommu *)map1;
479 	if (map != NULL) {
480 		domain = tag->ctx->domain;
481 		IOMMU_DOMAIN_LOCK(domain);
482 		if (!TAILQ_EMPTY(&map->map_entries)) {
483 			IOMMU_DOMAIN_UNLOCK(domain);
484 			return (EBUSY);
485 		}
486 		IOMMU_DOMAIN_UNLOCK(domain);
487 		free(map, M_IOMMU_DMAMAP);
488 	}
489 	tag->map_count--;
490 	return (0);
491 }
492 
493 
494 static int
495 iommu_bus_dmamem_alloc(bus_dma_tag_t dmat, void** vaddr, int flags,
496     bus_dmamap_t *mapp)
497 {
498 	struct bus_dma_tag_iommu *tag;
499 	struct bus_dmamap_iommu *map;
500 	int error, mflags;
501 	vm_memattr_t attr;
502 
503 	error = iommu_bus_dmamap_create(dmat, flags, mapp);
504 	if (error != 0)
505 		return (error);
506 
507 	mflags = (flags & BUS_DMA_NOWAIT) != 0 ? M_NOWAIT : M_WAITOK;
508 	mflags |= (flags & BUS_DMA_ZERO) != 0 ? M_ZERO : 0;
509 	attr = (flags & BUS_DMA_NOCACHE) != 0 ? VM_MEMATTR_UNCACHEABLE :
510 	    VM_MEMATTR_DEFAULT;
511 
512 	tag = (struct bus_dma_tag_iommu *)dmat;
513 	map = (struct bus_dmamap_iommu *)*mapp;
514 
515 	if (tag->common.maxsize < PAGE_SIZE &&
516 	    tag->common.alignment <= tag->common.maxsize &&
517 	    attr == VM_MEMATTR_DEFAULT) {
518 		*vaddr = malloc_domainset(tag->common.maxsize, M_DEVBUF,
519 		    DOMAINSET_PREF(tag->common.domain), mflags);
520 		map->flags |= BUS_DMAMAP_IOMMU_MALLOC;
521 	} else {
522 		*vaddr = (void *)kmem_alloc_attr_domainset(
523 		    DOMAINSET_PREF(tag->common.domain), tag->common.maxsize,
524 		    mflags, 0ul, BUS_SPACE_MAXADDR, attr);
525 		map->flags |= BUS_DMAMAP_IOMMU_KMEM_ALLOC;
526 	}
527 	if (*vaddr == NULL) {
528 		iommu_bus_dmamap_destroy(dmat, *mapp);
529 		*mapp = NULL;
530 		return (ENOMEM);
531 	}
532 	return (0);
533 }
534 
535 static void
536 iommu_bus_dmamem_free(bus_dma_tag_t dmat, void *vaddr, bus_dmamap_t map1)
537 {
538 	struct bus_dma_tag_iommu *tag;
539 	struct bus_dmamap_iommu *map;
540 
541 	tag = (struct bus_dma_tag_iommu *)dmat;
542 	map = (struct bus_dmamap_iommu *)map1;
543 
544 	if ((map->flags & BUS_DMAMAP_IOMMU_MALLOC) != 0) {
545 		free(vaddr, M_DEVBUF);
546 		map->flags &= ~BUS_DMAMAP_IOMMU_MALLOC;
547 	} else {
548 		KASSERT((map->flags & BUS_DMAMAP_IOMMU_KMEM_ALLOC) != 0,
549 		    ("iommu_bus_dmamem_free for non alloced map %p", map));
550 		kmem_free((vm_offset_t)vaddr, tag->common.maxsize);
551 		map->flags &= ~BUS_DMAMAP_IOMMU_KMEM_ALLOC;
552 	}
553 
554 	iommu_bus_dmamap_destroy(dmat, map1);
555 }
556 
557 static int
558 iommu_bus_dmamap_load_something1(struct bus_dma_tag_iommu *tag,
559     struct bus_dmamap_iommu *map, vm_page_t *ma, int offset, bus_size_t buflen,
560     int flags, bus_dma_segment_t *segs, int *segp,
561     struct iommu_map_entries_tailq *unroll_list)
562 {
563 	struct iommu_ctx *ctx;
564 	struct iommu_domain *domain;
565 	struct iommu_map_entry *entry;
566 	iommu_gaddr_t size;
567 	bus_size_t buflen1;
568 	int error, idx, gas_flags, seg;
569 
570 	KASSERT(offset < IOMMU_PAGE_SIZE, ("offset %d", offset));
571 	if (segs == NULL)
572 		segs = tag->segments;
573 	ctx = tag->ctx;
574 	domain = ctx->domain;
575 	seg = *segp;
576 	error = 0;
577 	idx = 0;
578 	while (buflen > 0) {
579 		seg++;
580 		if (seg >= tag->common.nsegments) {
581 			error = EFBIG;
582 			break;
583 		}
584 		buflen1 = buflen > tag->common.maxsegsz ?
585 		    tag->common.maxsegsz : buflen;
586 		size = round_page(offset + buflen1);
587 
588 		/*
589 		 * (Too) optimistically allow split if there are more
590 		 * then one segments left.
591 		 */
592 		gas_flags = map->cansleep ? IOMMU_MF_CANWAIT : 0;
593 		if (seg + 1 < tag->common.nsegments)
594 			gas_flags |= IOMMU_MF_CANSPLIT;
595 
596 		error = iommu_map(domain, &tag->common, size, offset,
597 		    IOMMU_MAP_ENTRY_READ |
598 		    ((flags & BUS_DMA_NOWRITE) == 0 ? IOMMU_MAP_ENTRY_WRITE : 0),
599 		    gas_flags, ma + idx, &entry);
600 		if (error != 0)
601 			break;
602 		if ((gas_flags & IOMMU_MF_CANSPLIT) != 0) {
603 			KASSERT(size >= entry->end - entry->start,
604 			    ("split increased entry size %jx %jx %jx",
605 			    (uintmax_t)size, (uintmax_t)entry->start,
606 			    (uintmax_t)entry->end));
607 			size = entry->end - entry->start;
608 			if (buflen1 > size)
609 				buflen1 = size;
610 		} else {
611 			KASSERT(entry->end - entry->start == size,
612 			    ("no split allowed %jx %jx %jx",
613 			    (uintmax_t)size, (uintmax_t)entry->start,
614 			    (uintmax_t)entry->end));
615 		}
616 		if (offset + buflen1 > size)
617 			buflen1 = size - offset;
618 		if (buflen1 > tag->common.maxsegsz)
619 			buflen1 = tag->common.maxsegsz;
620 
621 		KASSERT(((entry->start + offset) & (tag->common.alignment - 1))
622 		    == 0,
623 		    ("alignment failed: ctx %p start 0x%jx offset %x "
624 		    "align 0x%jx", ctx, (uintmax_t)entry->start, offset,
625 		    (uintmax_t)tag->common.alignment));
626 		KASSERT(entry->end <= tag->common.lowaddr ||
627 		    entry->start >= tag->common.highaddr,
628 		    ("entry placement failed: ctx %p start 0x%jx end 0x%jx "
629 		    "lowaddr 0x%jx highaddr 0x%jx", ctx,
630 		    (uintmax_t)entry->start, (uintmax_t)entry->end,
631 		    (uintmax_t)tag->common.lowaddr,
632 		    (uintmax_t)tag->common.highaddr));
633 		KASSERT(iommu_test_boundary(entry->start + offset, buflen1,
634 		    tag->common.boundary),
635 		    ("boundary failed: ctx %p start 0x%jx end 0x%jx "
636 		    "boundary 0x%jx", ctx, (uintmax_t)entry->start,
637 		    (uintmax_t)entry->end, (uintmax_t)tag->common.boundary));
638 		KASSERT(buflen1 <= tag->common.maxsegsz,
639 		    ("segment too large: ctx %p start 0x%jx end 0x%jx "
640 		    "buflen1 0x%jx maxsegsz 0x%jx", ctx,
641 		    (uintmax_t)entry->start, (uintmax_t)entry->end,
642 		    (uintmax_t)buflen1, (uintmax_t)tag->common.maxsegsz));
643 
644 		IOMMU_DOMAIN_LOCK(domain);
645 		TAILQ_INSERT_TAIL(&map->map_entries, entry, dmamap_link);
646 		entry->flags |= IOMMU_MAP_ENTRY_MAP;
647 		IOMMU_DOMAIN_UNLOCK(domain);
648 		TAILQ_INSERT_TAIL(unroll_list, entry, unroll_link);
649 
650 		segs[seg].ds_addr = entry->start + offset;
651 		segs[seg].ds_len = buflen1;
652 
653 		idx += OFF_TO_IDX(trunc_page(offset + buflen1));
654 		offset += buflen1;
655 		offset &= IOMMU_PAGE_MASK;
656 		buflen -= buflen1;
657 	}
658 	if (error == 0)
659 		*segp = seg;
660 	return (error);
661 }
662 
663 static int
664 iommu_bus_dmamap_load_something(struct bus_dma_tag_iommu *tag,
665     struct bus_dmamap_iommu *map, vm_page_t *ma, int offset, bus_size_t buflen,
666     int flags, bus_dma_segment_t *segs, int *segp)
667 {
668 	struct iommu_ctx *ctx;
669 	struct iommu_domain *domain;
670 	struct iommu_map_entry *entry, *entry1;
671 	struct iommu_map_entries_tailq unroll_list;
672 	int error;
673 
674 	ctx = tag->ctx;
675 	domain = ctx->domain;
676 	atomic_add_long(&ctx->loads, 1);
677 
678 	TAILQ_INIT(&unroll_list);
679 	error = iommu_bus_dmamap_load_something1(tag, map, ma, offset,
680 	    buflen, flags, segs, segp, &unroll_list);
681 	if (error != 0) {
682 		/*
683 		 * The busdma interface does not allow us to report
684 		 * partial buffer load, so unfortunately we have to
685 		 * revert all work done.
686 		 */
687 		IOMMU_DOMAIN_LOCK(domain);
688 		TAILQ_FOREACH_SAFE(entry, &unroll_list, unroll_link,
689 		    entry1) {
690 			/*
691 			 * No entries other than what we have created
692 			 * during the failed run might have been
693 			 * inserted there in between, since we own ctx
694 			 * pglock.
695 			 */
696 			TAILQ_REMOVE(&map->map_entries, entry, dmamap_link);
697 			TAILQ_REMOVE(&unroll_list, entry, unroll_link);
698 			TAILQ_INSERT_TAIL(&domain->unload_entries, entry,
699 			    dmamap_link);
700 		}
701 		IOMMU_DOMAIN_UNLOCK(domain);
702 		taskqueue_enqueue(domain->iommu->delayed_taskqueue,
703 		    &domain->unload_task);
704 	}
705 
706 	if (error == ENOMEM && (flags & BUS_DMA_NOWAIT) == 0 &&
707 	    !map->cansleep)
708 		error = EINPROGRESS;
709 	if (error == EINPROGRESS)
710 		iommu_bus_schedule_dmamap(domain->iommu, map);
711 	return (error);
712 }
713 
714 static int
715 iommu_bus_dmamap_load_ma(bus_dma_tag_t dmat, bus_dmamap_t map1,
716     struct vm_page **ma, bus_size_t tlen, int ma_offs, int flags,
717     bus_dma_segment_t *segs, int *segp)
718 {
719 	struct bus_dma_tag_iommu *tag;
720 	struct bus_dmamap_iommu *map;
721 
722 	tag = (struct bus_dma_tag_iommu *)dmat;
723 	map = (struct bus_dmamap_iommu *)map1;
724 	return (iommu_bus_dmamap_load_something(tag, map, ma, ma_offs, tlen,
725 	    flags, segs, segp));
726 }
727 
728 static int
729 iommu_bus_dmamap_load_phys(bus_dma_tag_t dmat, bus_dmamap_t map1,
730     vm_paddr_t buf, bus_size_t buflen, int flags, bus_dma_segment_t *segs,
731     int *segp)
732 {
733 	struct bus_dma_tag_iommu *tag;
734 	struct bus_dmamap_iommu *map;
735 	vm_page_t *ma, fma;
736 	vm_paddr_t pstart, pend, paddr;
737 	int error, i, ma_cnt, mflags, offset;
738 
739 	tag = (struct bus_dma_tag_iommu *)dmat;
740 	map = (struct bus_dmamap_iommu *)map1;
741 	pstart = trunc_page(buf);
742 	pend = round_page(buf + buflen);
743 	offset = buf & PAGE_MASK;
744 	ma_cnt = OFF_TO_IDX(pend - pstart);
745 	mflags = map->cansleep ? M_WAITOK : M_NOWAIT;
746 	ma = malloc(sizeof(vm_page_t) * ma_cnt, M_DEVBUF, mflags);
747 	if (ma == NULL)
748 		return (ENOMEM);
749 	fma = NULL;
750 	for (i = 0; i < ma_cnt; i++) {
751 		paddr = pstart + ptoa(i);
752 		ma[i] = PHYS_TO_VM_PAGE(paddr);
753 		if (ma[i] == NULL || VM_PAGE_TO_PHYS(ma[i]) != paddr) {
754 			/*
755 			 * If PHYS_TO_VM_PAGE() returned NULL or the
756 			 * vm_page was not initialized we'll use a
757 			 * fake page.
758 			 */
759 			if (fma == NULL) {
760 				fma = malloc(sizeof(struct vm_page) * ma_cnt,
761 				    M_DEVBUF, M_ZERO | mflags);
762 				if (fma == NULL) {
763 					free(ma, M_DEVBUF);
764 					return (ENOMEM);
765 				}
766 			}
767 			vm_page_initfake(&fma[i], pstart + ptoa(i),
768 			    VM_MEMATTR_DEFAULT);
769 			ma[i] = &fma[i];
770 		}
771 	}
772 	error = iommu_bus_dmamap_load_something(tag, map, ma, offset, buflen,
773 	    flags, segs, segp);
774 	free(fma, M_DEVBUF);
775 	free(ma, M_DEVBUF);
776 	return (error);
777 }
778 
779 static int
780 iommu_bus_dmamap_load_buffer(bus_dma_tag_t dmat, bus_dmamap_t map1, void *buf,
781     bus_size_t buflen, pmap_t pmap, int flags, bus_dma_segment_t *segs,
782     int *segp)
783 {
784 	struct bus_dma_tag_iommu *tag;
785 	struct bus_dmamap_iommu *map;
786 	vm_page_t *ma, fma;
787 	vm_paddr_t pstart, pend, paddr;
788 	int error, i, ma_cnt, mflags, offset;
789 
790 	tag = (struct bus_dma_tag_iommu *)dmat;
791 	map = (struct bus_dmamap_iommu *)map1;
792 	pstart = trunc_page((vm_offset_t)buf);
793 	pend = round_page((vm_offset_t)buf + buflen);
794 	offset = (vm_offset_t)buf & PAGE_MASK;
795 	ma_cnt = OFF_TO_IDX(pend - pstart);
796 	mflags = map->cansleep ? M_WAITOK : M_NOWAIT;
797 	ma = malloc(sizeof(vm_page_t) * ma_cnt, M_DEVBUF, mflags);
798 	if (ma == NULL)
799 		return (ENOMEM);
800 	fma = NULL;
801 	for (i = 0; i < ma_cnt; i++, pstart += PAGE_SIZE) {
802 		if (pmap == kernel_pmap)
803 			paddr = pmap_kextract(pstart);
804 		else
805 			paddr = pmap_extract(pmap, pstart);
806 		ma[i] = PHYS_TO_VM_PAGE(paddr);
807 		if (ma[i] == NULL || VM_PAGE_TO_PHYS(ma[i]) != paddr) {
808 			/*
809 			 * If PHYS_TO_VM_PAGE() returned NULL or the
810 			 * vm_page was not initialized we'll use a
811 			 * fake page.
812 			 */
813 			if (fma == NULL) {
814 				fma = malloc(sizeof(struct vm_page) * ma_cnt,
815 				    M_DEVBUF, M_ZERO | mflags);
816 				if (fma == NULL) {
817 					free(ma, M_DEVBUF);
818 					return (ENOMEM);
819 				}
820 			}
821 			vm_page_initfake(&fma[i], paddr, VM_MEMATTR_DEFAULT);
822 			ma[i] = &fma[i];
823 		}
824 	}
825 	error = iommu_bus_dmamap_load_something(tag, map, ma, offset, buflen,
826 	    flags, segs, segp);
827 	free(ma, M_DEVBUF);
828 	free(fma, M_DEVBUF);
829 	return (error);
830 }
831 
832 static void
833 iommu_bus_dmamap_waitok(bus_dma_tag_t dmat, bus_dmamap_t map1,
834     struct memdesc *mem, bus_dmamap_callback_t *callback, void *callback_arg)
835 {
836 	struct bus_dmamap_iommu *map;
837 
838 	if (map1 == NULL)
839 		return;
840 	map = (struct bus_dmamap_iommu *)map1;
841 	map->mem = *mem;
842 	map->tag = (struct bus_dma_tag_iommu *)dmat;
843 	map->callback = callback;
844 	map->callback_arg = callback_arg;
845 }
846 
847 static bus_dma_segment_t *
848 iommu_bus_dmamap_complete(bus_dma_tag_t dmat, bus_dmamap_t map1,
849     bus_dma_segment_t *segs, int nsegs, int error)
850 {
851 	struct bus_dma_tag_iommu *tag;
852 	struct bus_dmamap_iommu *map;
853 
854 	tag = (struct bus_dma_tag_iommu *)dmat;
855 	map = (struct bus_dmamap_iommu *)map1;
856 
857 	if (!map->locked) {
858 		KASSERT(map->cansleep,
859 		    ("map not locked and not sleepable context %p", map));
860 
861 		/*
862 		 * We are called from the delayed context.  Relock the
863 		 * driver.
864 		 */
865 		(tag->common.lockfunc)(tag->common.lockfuncarg, BUS_DMA_LOCK);
866 		map->locked = true;
867 	}
868 
869 	if (segs == NULL)
870 		segs = tag->segments;
871 	return (segs);
872 }
873 
/*
 * The limitations of the busdma KPI force the iommu to perform the
 * actual unload, consisting of the unmapping of the map entries' page
 * tables, from the delayed context on i386, since page table page
 * mapping might require a sleep to be successful.  The unfortunate
 * consequence is that DMA requests can still be served for some time
 * after the bus_dmamap_unload() call has returned.
 *
 * On amd64, we assume that sf allocation cannot fail.
 */
884 static void
885 iommu_bus_dmamap_unload(bus_dma_tag_t dmat, bus_dmamap_t map1)
886 {
887 	struct bus_dma_tag_iommu *tag;
888 	struct bus_dmamap_iommu *map;
889 	struct iommu_ctx *ctx;
890 	struct iommu_domain *domain;
891 #ifndef IOMMU_DOMAIN_UNLOAD_SLEEP
892 	struct iommu_map_entries_tailq entries;
893 #endif
894 
895 	tag = (struct bus_dma_tag_iommu *)dmat;
896 	map = (struct bus_dmamap_iommu *)map1;
897 	ctx = tag->ctx;
898 	domain = ctx->domain;
899 	atomic_add_long(&ctx->unloads, 1);
900 
901 #if defined(IOMMU_DOMAIN_UNLOAD_SLEEP)
902 	IOMMU_DOMAIN_LOCK(domain);
903 	TAILQ_CONCAT(&domain->unload_entries, &map->map_entries, dmamap_link);
904 	IOMMU_DOMAIN_UNLOCK(domain);
905 	taskqueue_enqueue(domain->iommu->delayed_taskqueue,
906 	    &domain->unload_task);
907 #else
908 	TAILQ_INIT(&entries);
909 	IOMMU_DOMAIN_LOCK(domain);
910 	TAILQ_CONCAT(&entries, &map->map_entries, dmamap_link);
911 	IOMMU_DOMAIN_UNLOCK(domain);
912 	THREAD_NO_SLEEPING();
913 	iommu_domain_unload(domain, &entries, false);
914 	THREAD_SLEEPING_OK();
915 	KASSERT(TAILQ_EMPTY(&entries), ("lazy iommu_ctx_unload %p", ctx));
916 #endif
917 }
918 
919 static void
920 iommu_bus_dmamap_sync(bus_dma_tag_t dmat, bus_dmamap_t map1,
921     bus_dmasync_op_t op)
922 {
923 	struct bus_dmamap_iommu *map;
924 
925 	map = (struct bus_dmamap_iommu *)map1;
926 	kmsan_bus_dmamap_sync(&map->kmsan_mem, op);
927 }
928 
#ifdef KMSAN
/*
 * busdma load_kmsan method: keep a copy of the loaded memory
 * descriptor in the map for later use by the sync method.  A NULL map
 * is ignored.
 */
static void
iommu_bus_dmamap_load_kmsan(bus_dmamap_t map1, struct memdesc *mem)
{
	struct bus_dmamap_iommu *map = (struct bus_dmamap_iommu *)map1;

	if (map != NULL)
		memcpy(&map->kmsan_mem, mem, sizeof(struct memdesc));
}
#endif
941 
942 struct bus_dma_impl bus_dma_iommu_impl = {
943 	.tag_create = iommu_bus_dma_tag_create,
944 	.tag_destroy = iommu_bus_dma_tag_destroy,
945 	.tag_set_domain = iommu_bus_dma_tag_set_domain,
946 	.id_mapped = iommu_bus_dma_id_mapped,
947 	.map_create = iommu_bus_dmamap_create,
948 	.map_destroy = iommu_bus_dmamap_destroy,
949 	.mem_alloc = iommu_bus_dmamem_alloc,
950 	.mem_free = iommu_bus_dmamem_free,
951 	.load_phys = iommu_bus_dmamap_load_phys,
952 	.load_buffer = iommu_bus_dmamap_load_buffer,
953 	.load_ma = iommu_bus_dmamap_load_ma,
954 	.map_waitok = iommu_bus_dmamap_waitok,
955 	.map_complete = iommu_bus_dmamap_complete,
956 	.map_unload = iommu_bus_dmamap_unload,
957 	.map_sync = iommu_bus_dmamap_sync,
958 #ifdef KMSAN
959 	.load_kmsan = iommu_bus_dmamap_load_kmsan,
960 #endif
961 };
962 
963 static void
964 iommu_bus_task_dmamap(void *arg, int pending)
965 {
966 	struct bus_dma_tag_iommu *tag;
967 	struct bus_dmamap_iommu *map;
968 	struct iommu_unit *unit;
969 
970 	unit = arg;
971 	IOMMU_LOCK(unit);
972 	while ((map = TAILQ_FIRST(&unit->delayed_maps)) != NULL) {
973 		TAILQ_REMOVE(&unit->delayed_maps, map, delay_link);
974 		IOMMU_UNLOCK(unit);
975 		tag = map->tag;
976 		map->cansleep = true;
977 		map->locked = false;
978 		bus_dmamap_load_mem((bus_dma_tag_t)tag, (bus_dmamap_t)map,
979 		    &map->mem, map->callback, map->callback_arg,
980 		    BUS_DMA_WAITOK);
981 		map->cansleep = false;
982 		if (map->locked) {
983 			(tag->common.lockfunc)(tag->common.lockfuncarg,
984 			    BUS_DMA_UNLOCK);
985 		} else
986 			map->locked = true;
987 		map->cansleep = false;
988 		IOMMU_LOCK(unit);
989 	}
990 	IOMMU_UNLOCK(unit);
991 }
992 
/*
 * Queue a map for a deferred load.
 *
 * Clears map->locked, appends the map to unit->delayed_maps while the
 * unit lock is held, and then enqueues unit->dmamap_load_task on the
 * unit's delayed_taskqueue.  iommu_init_busdma() binds that task to
 * iommu_bus_task_dmamap().
 */
993 static void
994 iommu_bus_schedule_dmamap(struct iommu_unit *unit, struct bus_dmamap_iommu *map)
995 {
996 
997 	map->locked = false;
998 	IOMMU_LOCK(unit);
999 	TAILQ_INSERT_TAIL(&unit->delayed_maps, map, delay_link);
1000 	IOMMU_UNLOCK(unit);
1001 	taskqueue_enqueue(unit->delayed_taskqueue, &unit->dmamap_load_task);
1002 }
1003 
1004 int
1005 iommu_init_busdma(struct iommu_unit *unit)
1006 {
1007 	int error;
1008 
1009 	unit->dma_enabled = 1;
1010 	error = TUNABLE_INT_FETCH("hw.iommu.dma", &unit->dma_enabled);
1011 	if (error == 0) /* compatibility */
1012 		TUNABLE_INT_FETCH("hw.dmar.dma", &unit->dma_enabled);
1013 	TAILQ_INIT(&unit->delayed_maps);
1014 	TASK_INIT(&unit->dmamap_load_task, 0, iommu_bus_task_dmamap, unit);
1015 	unit->delayed_taskqueue = taskqueue_create("iommu", M_WAITOK,
1016 	    taskqueue_thread_enqueue, &unit->delayed_taskqueue);
1017 	taskqueue_start_threads(&unit->delayed_taskqueue, 1, PI_DISK,
1018 	    "iommu%d busdma taskq", unit->unit);
1019 	return (0);
1020 }
1021 
1022 void
1023 iommu_fini_busdma(struct iommu_unit *unit)
1024 {
1025 
1026 	if (unit->delayed_taskqueue == NULL)
1027 		return;
1028 
1029 	taskqueue_drain(unit->delayed_taskqueue, &unit->dmamap_load_task);
1030 	taskqueue_free(unit->delayed_taskqueue);
1031 	unit->delayed_taskqueue = NULL;
1032 }
1033 
1034 int
1035 bus_dma_iommu_load_ident(bus_dma_tag_t dmat, bus_dmamap_t map1,
1036     vm_paddr_t start, vm_size_t length, int flags)
1037 {
1038 	struct bus_dma_tag_common *tc;
1039 	struct bus_dma_tag_iommu *tag;
1040 	struct bus_dmamap_iommu *map;
1041 	struct iommu_ctx *ctx;
1042 	struct iommu_domain *domain;
1043 	struct iommu_map_entry *entry;
1044 	vm_page_t *ma;
1045 	vm_size_t i;
1046 	int error;
1047 	bool waitok;
1048 
1049 	MPASS((start & PAGE_MASK) == 0);
1050 	MPASS((length & PAGE_MASK) == 0);
1051 	MPASS(length > 0);
1052 	MPASS(start + length >= start);
1053 	MPASS((flags & ~(BUS_DMA_NOWAIT | BUS_DMA_NOWRITE)) == 0);
1054 
1055 	tc = (struct bus_dma_tag_common *)dmat;
1056 	if (tc->impl != &bus_dma_iommu_impl)
1057 		return (0);
1058 
1059 	tag = (struct bus_dma_tag_iommu *)dmat;
1060 	ctx = tag->ctx;
1061 	domain = ctx->domain;
1062 	map = (struct bus_dmamap_iommu *)map1;
1063 	waitok = (flags & BUS_DMA_NOWAIT) != 0;
1064 
1065 	entry = iommu_map_alloc_entry(domain, waitok ? 0 : IOMMU_PGF_WAITOK);
1066 	if (entry == NULL)
1067 		return (ENOMEM);
1068 	entry->start = start;
1069 	entry->end = start + length;
1070 	ma = malloc(sizeof(vm_page_t) * atop(length), M_TEMP, waitok ?
1071 	    M_WAITOK : M_NOWAIT);
1072 	if (ma == NULL) {
1073 		iommu_map_free_entry(domain, entry);
1074 		return (ENOMEM);
1075 	}
1076 	for (i = 0; i < atop(length); i++) {
1077 		ma[i] = vm_page_getfake(entry->start + PAGE_SIZE * i,
1078 		    VM_MEMATTR_DEFAULT);
1079 	}
1080 	error = iommu_map_region(domain, entry, IOMMU_MAP_ENTRY_READ |
1081 	    ((flags & BUS_DMA_NOWRITE) ? 0 : IOMMU_MAP_ENTRY_WRITE),
1082 	    waitok ? IOMMU_MF_CANWAIT : 0, ma);
1083 	if (error == 0) {
1084 		IOMMU_DOMAIN_LOCK(domain);
1085 		TAILQ_INSERT_TAIL(&map->map_entries, entry, dmamap_link);
1086 		entry->flags |= IOMMU_MAP_ENTRY_MAP;
1087 		IOMMU_DOMAIN_UNLOCK(domain);
1088 	} else {
1089 		iommu_domain_unload_entry(entry, true);
1090 	}
1091 	for (i = 0; i < atop(length); i++)
1092 		vm_page_putfake(ma[i]);
1093 	free(ma, M_TEMP);
1094 	return (error);
1095 }
1096 
1097 static void
1098 iommu_domain_unload_task(void *arg, int pending)
1099 {
1100 	struct iommu_domain *domain;
1101 	struct iommu_map_entries_tailq entries;
1102 
1103 	domain = arg;
1104 	TAILQ_INIT(&entries);
1105 
1106 	for (;;) {
1107 		IOMMU_DOMAIN_LOCK(domain);
1108 		TAILQ_SWAP(&domain->unload_entries, &entries,
1109 		    iommu_map_entry, dmamap_link);
1110 		IOMMU_DOMAIN_UNLOCK(domain);
1111 		if (TAILQ_EMPTY(&entries))
1112 			break;
1113 		iommu_domain_unload(domain, &entries, true);
1114 	}
1115 }
1116 
1117 void
1118 iommu_domain_init(struct iommu_unit *unit, struct iommu_domain *domain,
1119     const struct iommu_domain_map_ops *ops)
1120 {
1121 
1122 	domain->ops = ops;
1123 	domain->iommu = unit;
1124 
1125 	TASK_INIT(&domain->unload_task, 0, iommu_domain_unload_task, domain);
1126 	RB_INIT(&domain->rb_root);
1127 	TAILQ_INIT(&domain->unload_entries);
1128 	mtx_init(&domain->lock, "iodom", NULL, MTX_DEF);
1129 }
1130 
/*
 * Release the generic part of an IOMMU domain: destroys the domain
 * mutex that iommu_domain_init() created.  Nothing else is torn down
 * here.
 */
1131 void
1132 iommu_domain_fini(struct iommu_domain *domain)
1133 {
1134 
1135 	mtx_destroy(&domain->lock);
1136 }
1137