xref: /freebsd/sys/dev/iommu/busdma_iommu.c (revision b2d2a78ad80ec68d4a17f5aef97d21686cb1e29b)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2013 The FreeBSD Foundation
5  *
6  * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
7  * under sponsorship from the FreeBSD Foundation.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/domainset.h>
34 #include <sys/malloc.h>
35 #include <sys/bus.h>
36 #include <sys/conf.h>
37 #include <sys/interrupt.h>
38 #include <sys/kernel.h>
39 #include <sys/ktr.h>
40 #include <sys/lock.h>
41 #include <sys/proc.h>
42 #include <sys/memdesc.h>
43 #include <sys/msan.h>
44 #include <sys/mutex.h>
45 #include <sys/sysctl.h>
46 #include <sys/rman.h>
47 #include <sys/taskqueue.h>
48 #include <sys/tree.h>
49 #include <sys/uio.h>
50 #include <sys/vmem.h>
51 #include <dev/pci/pcireg.h>
52 #include <dev/pci/pcivar.h>
53 #include <vm/vm.h>
54 #include <vm/vm_extern.h>
55 #include <vm/vm_kern.h>
56 #include <vm/vm_object.h>
57 #include <vm/vm_page.h>
58 #include <vm/vm_map.h>
59 #include <dev/iommu/iommu.h>
60 #include <machine/atomic.h>
61 #include <machine/bus.h>
62 #include <machine/md_var.h>
63 #include <machine/iommu.h>
64 #include <dev/iommu/busdma_iommu.h>
65 
66 /*
67  * busdma_iommu.c, the implementation of the busdma(9) interface using
68  * IOMMU units from Intel VT-d.
69  */
70 
71 static bool
72 iommu_bus_dma_is_dev_disabled(int domain, int bus, int slot, int func)
73 {
74 	char str[128], *env;
75 	int default_bounce;
76 	bool ret;
77 	static const char bounce_str[] = "bounce";
78 	static const char iommu_str[] = "iommu";
79 	static const char dmar_str[] = "dmar"; /* compatibility */
80 
81 	default_bounce = 0;
82 	env = kern_getenv("hw.busdma.default");
83 	if (env != NULL) {
84 		if (strcmp(env, bounce_str) == 0)
85 			default_bounce = 1;
86 		else if (strcmp(env, iommu_str) == 0 ||
87 		    strcmp(env, dmar_str) == 0)
88 			default_bounce = 0;
89 		freeenv(env);
90 	}
91 
92 	snprintf(str, sizeof(str), "hw.busdma.pci%d.%d.%d.%d",
93 	    domain, bus, slot, func);
94 	env = kern_getenv(str);
95 	if (env == NULL)
96 		return (default_bounce != 0);
97 	if (strcmp(env, bounce_str) == 0)
98 		ret = true;
99 	else if (strcmp(env, iommu_str) == 0 ||
100 	    strcmp(env, dmar_str) == 0)
101 		ret = false;
102 	else
103 		ret = default_bounce != 0;
104 	freeenv(env);
105 	return (ret);
106 }
107 
/*
 * Given original device, find the requester ID that will be seen by
 * the IOMMU unit and used for page table lookup.  PCI bridges may take
 * ownership of transactions from downstream devices, so it may not be
 * the same as the BSF of the target device.  In those cases, all
 * devices downstream of the bridge must share a single mapping
 * domain, and must collectively be assigned to use either IOMMU or
 * bounce mapping.
 *
 * The requester ID is stored through rid and the device acting as the
 * requester is returned.  If the parent of dev is not in the "pci"
 * devclass, dev itself is returned and *rid is set to 0.
 */
device_t
iommu_get_requester(device_t dev, uint16_t *rid)
{
	devclass_t pci_class;
	device_t l, pci, pcib, pcip, pcibp, requester;
	int cap_offset;
	uint16_t pcie_flags;
	bool bridge_is_pcie;

	pci_class = devclass_find("pci");
	/* l is the device currently examined; it moves up the tree. */
	l = requester = dev;

	pci = device_get_parent(dev);
	if (pci == NULL || device_get_devclass(pci) != pci_class) {
		*rid = 0;	/* XXXKIB: Could be ACPI HID */
		return (requester);
	}

	/* Start from the device's own RID; bridges above may replace it. */
	*rid = pci_get_rid(dev);

	/*
	 * Walk the bridge hierarchy from the target device to the
	 * host port to find the translating bridge nearest the IOMMU
	 * unit.
	 */
	for (;;) {
		pci = device_get_parent(l);
		KASSERT(pci != NULL, ("iommu_get_requester(%s): NULL parent "
		    "for %s", device_get_name(dev), device_get_name(l)));
		KASSERT(device_get_devclass(pci) == pci_class,
		    ("iommu_get_requester(%s): non-pci parent %s for %s",
		    device_get_name(dev), device_get_name(pci),
		    device_get_name(l)));

		pcib = device_get_parent(pci);
		KASSERT(pcib != NULL, ("iommu_get_requester(%s): NULL bridge "
		    "for %s", device_get_name(dev), device_get_name(pci)));

		/*
		 * The parent of our "bridge" isn't another PCI bus,
		 * so pcib isn't a PCI->PCI bridge but rather a host
		 * port, and the requester ID won't be translated
		 * further.
		 */
		pcip = device_get_parent(pcib);
		if (device_get_devclass(pcip) != pci_class)
			break;
		/* The bridge one level above pcib. */
		pcibp = device_get_parent(pcip);

		if (pci_find_cap(l, PCIY_EXPRESS, &cap_offset) == 0) {
			/*
			 * Do not stop the loop even if the target
			 * device is PCIe, because it is possible (but
			 * unlikely) to have a PCI->PCIe bridge
			 * somewhere in the hierarchy.
			 */
			l = pcib;
		} else {
			/*
			 * Device is not PCIe, it cannot be seen as a
			 * requester by IOMMU unit.  Check whether the
			 * bridge is PCIe.
			 */
			bridge_is_pcie = pci_find_cap(pcib, PCIY_EXPRESS,
			    &cap_offset) == 0;
			requester = pcib;

			/*
			 * Check for a buggy PCIe/PCI bridge that
			 * doesn't report the express capability.  If
			 * the bridge above it is express but isn't a
			 * PCI bridge, then we know pcib is actually a
			 * PCIe/PCI bridge.
			 */
			if (!bridge_is_pcie && pci_find_cap(pcibp,
			    PCIY_EXPRESS, &cap_offset) == 0) {
				pcie_flags = pci_read_config(pcibp,
				    cap_offset + PCIER_FLAGS, 2);
				if ((pcie_flags & PCIEM_FLAGS_TYPE) !=
				    PCIEM_TYPE_PCI_BRIDGE)
					bridge_is_pcie = true;
			}

			if (bridge_is_pcie) {
				/*
				 * The current device is not PCIe, but
				 * the bridge above it is.  This is a
				 * PCIe->PCI bridge.  Assume that the
				 * requester ID will be the secondary
				 * bus number with slot and function
				 * set to zero.
				 *
				 * XXX: Doesn't handle the case where
				 * the bridge is PCIe->PCI-X, and the
				 * bridge will only take ownership of
				 * requests in some cases.  We should
				 * provide context entries with the
				 * same page tables for taken and
				 * non-taken transactions.
				 */
				*rid = PCI_RID(pci_get_bus(l), 0, 0);
				/* Continue from the bridge above pcib. */
				l = pcibp;
			} else {
				/*
				 * Neither the device nor the bridge
				 * above it are PCIe.  This is a
				 * conventional PCI->PCI bridge, which
				 * will use the bridge's BSF as the
				 * requester ID.
				 */
				*rid = pci_get_rid(pcib);
				l = pcib;
			}
		}
	}
	return (requester);
}
234 
235 struct iommu_ctx *
236 iommu_instantiate_ctx(struct iommu_unit *unit, device_t dev, bool rmrr)
237 {
238 	device_t requester;
239 	struct iommu_ctx *ctx;
240 	bool disabled;
241 	uint16_t rid;
242 
243 	requester = iommu_get_requester(dev, &rid);
244 
245 	/*
246 	 * If the user requested the IOMMU disabled for the device, we
247 	 * cannot disable the IOMMU unit, due to possibility of other
248 	 * devices on the same IOMMU unit still requiring translation.
249 	 * Instead provide the identity mapping for the device
250 	 * context.
251 	 */
252 	disabled = iommu_bus_dma_is_dev_disabled(pci_get_domain(requester),
253 	    pci_get_bus(requester), pci_get_slot(requester),
254 	    pci_get_function(requester));
255 	ctx = iommu_get_ctx(unit, requester, rid, disabled, rmrr);
256 	if (ctx == NULL)
257 		return (NULL);
258 	if (disabled) {
259 		/*
260 		 * Keep the first reference on context, release the
261 		 * later refs.
262 		 */
263 		IOMMU_LOCK(unit);
264 		if ((ctx->flags & IOMMU_CTX_DISABLED) == 0) {
265 			ctx->flags |= IOMMU_CTX_DISABLED;
266 			IOMMU_UNLOCK(unit);
267 		} else {
268 			iommu_free_ctx_locked(unit, ctx);
269 		}
270 		ctx = NULL;
271 	}
272 	return (ctx);
273 }
274 
275 struct iommu_ctx *
276 iommu_get_dev_ctx(device_t dev)
277 {
278 	struct iommu_unit *unit;
279 
280 	unit = iommu_find(dev, bootverbose);
281 	/* Not in scope of any IOMMU ? */
282 	if (unit == NULL)
283 		return (NULL);
284 	if (!unit->dma_enabled)
285 		return (NULL);
286 
287 	iommu_unit_pre_instantiate_ctx(unit);
288 	return (iommu_instantiate_ctx(unit, dev, false));
289 }
290 
291 bus_dma_tag_t
292 iommu_get_dma_tag(device_t dev, device_t child)
293 {
294 	struct iommu_ctx *ctx;
295 	bus_dma_tag_t res;
296 
297 	ctx = iommu_get_dev_ctx(child);
298 	if (ctx == NULL)
299 		return (NULL);
300 
301 	res = (bus_dma_tag_t)ctx->tag;
302 	return (res);
303 }
304 
305 bool
306 bus_dma_iommu_set_buswide(device_t dev)
307 {
308 	struct iommu_unit *unit;
309 	device_t parent;
310 	u_int busno, slot, func;
311 
312 	parent = device_get_parent(dev);
313 	if (device_get_devclass(parent) != devclass_find("pci"))
314 		return (false);
315 	unit = iommu_find(dev, bootverbose);
316 	if (unit == NULL)
317 		return (false);
318 	busno = pci_get_bus(dev);
319 	slot = pci_get_slot(dev);
320 	func = pci_get_function(dev);
321 	if (slot != 0 || func != 0) {
322 		if (bootverbose) {
323 			device_printf(dev,
324 			    "iommu%d pci%d:%d:%d requested buswide busdma\n",
325 			    unit->unit, busno, slot, func);
326 		}
327 		return (false);
328 	}
329 	iommu_set_buswide_ctx(unit, busno);
330 	return (true);
331 }
332 
333 void
334 iommu_set_buswide_ctx(struct iommu_unit *unit, u_int busno)
335 {
336 
337 	MPASS(busno <= PCI_BUSMAX);
338 	IOMMU_LOCK(unit);
339 	unit->buswide_ctxs[busno / NBBY / sizeof(uint32_t)] |=
340 	    1 << (busno % (NBBY * sizeof(uint32_t)));
341 	IOMMU_UNLOCK(unit);
342 }
343 
344 bool
345 iommu_is_buswide_ctx(struct iommu_unit *unit, u_int busno)
346 {
347 
348 	MPASS(busno <= PCI_BUSMAX);
349 	return ((unit->buswide_ctxs[busno / NBBY / sizeof(uint32_t)] &
350 	    (1U << (busno % (NBBY * sizeof(uint32_t))))) != 0);
351 }
352 
/*
 * malloc(9) type used in this file for DMA map structures and for the
 * per-tag segment arrays.
 */
static MALLOC_DEFINE(M_IOMMU_DMAMAP, "iommu_dmamap", "IOMMU DMA Map");

/*
 * Forward declaration; the load path calls this before its definition
 * to queue a map for a deferred load.
 */
static void iommu_bus_schedule_dmamap(struct iommu_unit *unit,
    struct bus_dmamap_iommu *map);
357 
358 static int
359 iommu_bus_dma_tag_create(bus_dma_tag_t parent, bus_size_t alignment,
360     bus_addr_t boundary, bus_addr_t lowaddr, bus_addr_t highaddr,
361     bus_size_t maxsize, int nsegments, bus_size_t maxsegsz, int flags,
362     bus_dma_lock_t *lockfunc, void *lockfuncarg, bus_dma_tag_t *dmat)
363 {
364 	struct bus_dma_tag_iommu *newtag, *oldtag;
365 	int error;
366 
367 	*dmat = NULL;
368 	error = common_bus_dma_tag_create(parent != NULL ?
369 	    &((struct bus_dma_tag_iommu *)parent)->common : NULL, alignment,
370 	    boundary, lowaddr, highaddr, maxsize, nsegments, maxsegsz, flags,
371 	    lockfunc, lockfuncarg, sizeof(struct bus_dma_tag_iommu),
372 	    (void **)&newtag);
373 	if (error != 0)
374 		goto out;
375 
376 	oldtag = (struct bus_dma_tag_iommu *)parent;
377 	newtag->common.impl = &bus_dma_iommu_impl;
378 	newtag->ctx = oldtag->ctx;
379 	newtag->owner = oldtag->owner;
380 
381 	*dmat = (bus_dma_tag_t)newtag;
382 out:
383 	CTR4(KTR_BUSDMA, "%s returned tag %p tag flags 0x%x error %d",
384 	    __func__, newtag, (newtag != NULL ? newtag->common.flags : 0),
385 	    error);
386 	return (error);
387 }
388 
389 static int
390 iommu_bus_dma_tag_set_domain(bus_dma_tag_t dmat)
391 {
392 
393 	return (0);
394 }
395 
396 static int
397 iommu_bus_dma_tag_destroy(bus_dma_tag_t dmat1)
398 {
399 	struct bus_dma_tag_iommu *dmat;
400 	struct iommu_unit *iommu;
401 	struct iommu_ctx *ctx;
402 	int error;
403 
404 	error = 0;
405 	dmat = (struct bus_dma_tag_iommu *)dmat1;
406 
407 	if (dmat != NULL) {
408 		if (dmat->map_count != 0) {
409 			error = EBUSY;
410 			goto out;
411 		}
412 		ctx = dmat->ctx;
413 		if (dmat == ctx->tag) {
414 			iommu = ctx->domain->iommu;
415 			IOMMU_LOCK(iommu);
416 			iommu_free_ctx_locked(iommu, dmat->ctx);
417 		}
418 		free(dmat->segments, M_IOMMU_DMAMAP);
419 		free(dmat, M_DEVBUF);
420 	}
421 out:
422 	CTR3(KTR_BUSDMA, "%s tag %p error %d", __func__, dmat, error);
423 	return (error);
424 }
425 
426 static bool
427 iommu_bus_dma_id_mapped(bus_dma_tag_t dmat, vm_paddr_t buf, bus_size_t buflen)
428 {
429 
430 	return (false);
431 }
432 
433 static int
434 iommu_bus_dmamap_create(bus_dma_tag_t dmat, int flags, bus_dmamap_t *mapp)
435 {
436 	struct bus_dma_tag_iommu *tag;
437 	struct bus_dmamap_iommu *map;
438 
439 	tag = (struct bus_dma_tag_iommu *)dmat;
440 	map = malloc_domainset(sizeof(*map), M_IOMMU_DMAMAP,
441 	    DOMAINSET_PREF(tag->common.domain), M_NOWAIT | M_ZERO);
442 	if (map == NULL) {
443 		*mapp = NULL;
444 		return (ENOMEM);
445 	}
446 	if (tag->segments == NULL) {
447 		tag->segments = malloc_domainset(sizeof(bus_dma_segment_t) *
448 		    tag->common.nsegments, M_IOMMU_DMAMAP,
449 		    DOMAINSET_PREF(tag->common.domain), M_NOWAIT);
450 		if (tag->segments == NULL) {
451 			free(map, M_IOMMU_DMAMAP);
452 			*mapp = NULL;
453 			return (ENOMEM);
454 		}
455 	}
456 	IOMMU_DMAMAP_INIT(map);
457 	TAILQ_INIT(&map->map_entries);
458 	map->tag = tag;
459 	map->locked = true;
460 	map->cansleep = false;
461 	tag->map_count++;
462 	*mapp = (bus_dmamap_t)map;
463 
464 	return (0);
465 }
466 
467 static int
468 iommu_bus_dmamap_destroy(bus_dma_tag_t dmat, bus_dmamap_t map1)
469 {
470 	struct bus_dma_tag_iommu *tag;
471 	struct bus_dmamap_iommu *map;
472 
473 	tag = (struct bus_dma_tag_iommu *)dmat;
474 	map = (struct bus_dmamap_iommu *)map1;
475 	if (map != NULL) {
476 		IOMMU_DMAMAP_LOCK(map);
477 		if (!TAILQ_EMPTY(&map->map_entries)) {
478 			IOMMU_DMAMAP_UNLOCK(map);
479 			return (EBUSY);
480 		}
481 		IOMMU_DMAMAP_DESTROY(map);
482 		free(map, M_IOMMU_DMAMAP);
483 	}
484 	tag->map_count--;
485 	return (0);
486 }
487 
488 
489 static int
490 iommu_bus_dmamem_alloc(bus_dma_tag_t dmat, void** vaddr, int flags,
491     bus_dmamap_t *mapp)
492 {
493 	struct bus_dma_tag_iommu *tag;
494 	struct bus_dmamap_iommu *map;
495 	int error, mflags;
496 	vm_memattr_t attr;
497 
498 	error = iommu_bus_dmamap_create(dmat, flags, mapp);
499 	if (error != 0)
500 		return (error);
501 
502 	mflags = (flags & BUS_DMA_NOWAIT) != 0 ? M_NOWAIT : M_WAITOK;
503 	mflags |= (flags & BUS_DMA_ZERO) != 0 ? M_ZERO : 0;
504 	attr = (flags & BUS_DMA_NOCACHE) != 0 ? VM_MEMATTR_UNCACHEABLE :
505 	    VM_MEMATTR_DEFAULT;
506 
507 	tag = (struct bus_dma_tag_iommu *)dmat;
508 	map = (struct bus_dmamap_iommu *)*mapp;
509 
510 	if (tag->common.maxsize < PAGE_SIZE &&
511 	    tag->common.alignment <= tag->common.maxsize &&
512 	    attr == VM_MEMATTR_DEFAULT) {
513 		*vaddr = malloc_domainset(tag->common.maxsize, M_DEVBUF,
514 		    DOMAINSET_PREF(tag->common.domain), mflags);
515 		map->flags |= BUS_DMAMAP_IOMMU_MALLOC;
516 	} else {
517 		*vaddr = kmem_alloc_attr_domainset(
518 		    DOMAINSET_PREF(tag->common.domain), tag->common.maxsize,
519 		    mflags, 0ul, BUS_SPACE_MAXADDR, attr);
520 		map->flags |= BUS_DMAMAP_IOMMU_KMEM_ALLOC;
521 	}
522 	if (*vaddr == NULL) {
523 		iommu_bus_dmamap_destroy(dmat, *mapp);
524 		*mapp = NULL;
525 		return (ENOMEM);
526 	}
527 	return (0);
528 }
529 
530 static void
531 iommu_bus_dmamem_free(bus_dma_tag_t dmat, void *vaddr, bus_dmamap_t map1)
532 {
533 	struct bus_dma_tag_iommu *tag;
534 	struct bus_dmamap_iommu *map;
535 
536 	tag = (struct bus_dma_tag_iommu *)dmat;
537 	map = (struct bus_dmamap_iommu *)map1;
538 
539 	if ((map->flags & BUS_DMAMAP_IOMMU_MALLOC) != 0) {
540 		free(vaddr, M_DEVBUF);
541 		map->flags &= ~BUS_DMAMAP_IOMMU_MALLOC;
542 	} else {
543 		KASSERT((map->flags & BUS_DMAMAP_IOMMU_KMEM_ALLOC) != 0,
544 		    ("iommu_bus_dmamem_free for non alloced map %p", map));
545 		kmem_free(vaddr, tag->common.maxsize);
546 		map->flags &= ~BUS_DMAMAP_IOMMU_KMEM_ALLOC;
547 	}
548 
549 	iommu_bus_dmamap_destroy(dmat, map1);
550 }
551 
/*
 * Map the buffer described by the page array ma, starting at byte
 * offset within the first page and spanning buflen bytes, into the
 * address space of the tag's domain.  One iommu_gas_map() call is made
 * per DMA segment.  Segments are written to segs (or to the tag's own
 * segment array when segs is NULL) starting at index *segp + 1, and
 * every map entry created is appended to entries.
 *
 * Returns 0 and stores the index of the last segment used in *segp.
 * Returns EFBIG when the tag's nsegments limit is exceeded, or the
 * error from iommu_gas_map().  On error *segp is left untouched and
 * the entries created so far remain on the entries list for the
 * caller to dispose of.
 */
static int
iommu_bus_dmamap_load_something1(struct bus_dma_tag_iommu *tag,
    struct bus_dmamap_iommu *map, vm_page_t *ma, int offset, bus_size_t buflen,
    int flags, bus_dma_segment_t *segs, int *segp,
    struct iommu_map_entries_tailq *entries)
{
	struct iommu_ctx *ctx;
	struct iommu_domain *domain;
	struct iommu_map_entry *entry;
	bus_size_t buflen1;
	int error, e_flags, idx, gas_flags, seg;

	KASSERT(offset < IOMMU_PAGE_SIZE, ("offset %d", offset));
	if (segs == NULL)
		segs = tag->segments;
	ctx = tag->ctx;
	domain = ctx->domain;
	/* Entries are always readable; writable unless BUS_DMA_NOWRITE. */
	e_flags = IOMMU_MAP_ENTRY_READ |
	    ((flags & BUS_DMA_NOWRITE) == 0 ? IOMMU_MAP_ENTRY_WRITE : 0);
	seg = *segp;
	error = 0;
	/* idx is the index into ma of the first page of the next chunk. */
	idx = 0;
	while (buflen > 0) {
		seg++;
		if (seg >= tag->common.nsegments) {
			error = EFBIG;
			break;
		}
		/* Never request more than one segment may hold. */
		buflen1 = buflen > tag->common.maxsegsz ?
		    tag->common.maxsegsz : buflen;

		/*
		 * (Too) optimistically allow split if there is more
		 * than one segment left.
		 */
		gas_flags = map->cansleep ? IOMMU_MF_CANWAIT : 0;
		if (seg + 1 < tag->common.nsegments)
			gas_flags |= IOMMU_MF_CANSPLIT;

		error = iommu_gas_map(domain, &tag->common, buflen1,
		    offset, e_flags, gas_flags, ma + idx, &entry);
		if (error != 0)
			break;
		/* Update buflen1 in case buffer split. */
		if (buflen1 > entry->end - entry->start - offset)
			buflen1 = entry->end - entry->start - offset;

		KASSERT(vm_addr_align_ok(entry->start + offset,
		    tag->common.alignment),
		    ("alignment failed: ctx %p start 0x%jx offset %x "
		    "align 0x%jx", ctx, (uintmax_t)entry->start, offset,
		    (uintmax_t)tag->common.alignment));
		KASSERT(entry->end <= tag->common.lowaddr ||
		    entry->start >= tag->common.highaddr,
		    ("entry placement failed: ctx %p start 0x%jx end 0x%jx "
		    "lowaddr 0x%jx highaddr 0x%jx", ctx,
		    (uintmax_t)entry->start, (uintmax_t)entry->end,
		    (uintmax_t)tag->common.lowaddr,
		    (uintmax_t)tag->common.highaddr));
		KASSERT(vm_addr_bound_ok(entry->start + offset, buflen1,
		    tag->common.boundary),
		    ("boundary failed: ctx %p start 0x%jx end 0x%jx "
		    "boundary 0x%jx", ctx, (uintmax_t)entry->start,
		    (uintmax_t)entry->end, (uintmax_t)tag->common.boundary));
		KASSERT(buflen1 <= tag->common.maxsegsz,
		    ("segment too large: ctx %p start 0x%jx end 0x%jx "
		    "buflen1 0x%jx maxsegsz 0x%jx", ctx,
		    (uintmax_t)entry->start, (uintmax_t)entry->end,
		    (uintmax_t)buflen1, (uintmax_t)tag->common.maxsegsz));

		KASSERT((entry->flags & IOMMU_MAP_ENTRY_MAP) != 0,
		    ("entry %p missing IOMMU_MAP_ENTRY_MAP", entry));
		TAILQ_INSERT_TAIL(entries, entry, dmamap_link);

		segs[seg].ds_addr = entry->start + offset;
		segs[seg].ds_len = buflen1;

		/*
		 * Advance past the pages fully consumed by this chunk and
		 * keep only the in-page part of the offset for the next.
		 */
		idx += OFF_TO_IDX(offset + buflen1);
		offset += buflen1;
		offset &= IOMMU_PAGE_MASK;
		buflen -= buflen1;
	}
	if (error == 0)
		*segp = seg;
	return (error);
}
638 
/*
 * Common backend of the load methods.  Counts the load in the context,
 * maps the pages through iommu_bus_dmamap_load_something1() and then
 * either attaches the resulting entries to the map (success) or hands
 * them to the domain's unload task (failure).
 *
 * An ENOMEM failure is turned into EINPROGRESS when the caller did not
 * pass BUS_DMA_NOWAIT and the map is not already in sleepable context;
 * in that case the map is queued for a deferred load.  Otherwise the
 * error from the helper is returned unchanged, and 0 on success.
 */
static int
iommu_bus_dmamap_load_something(struct bus_dma_tag_iommu *tag,
    struct bus_dmamap_iommu *map, vm_page_t *ma, int offset, bus_size_t buflen,
    int flags, bus_dma_segment_t *segs, int *segp)
{
	struct iommu_ctx *ctx;
	struct iommu_domain *domain;
	struct iommu_map_entries_tailq entries;
	int error;

	ctx = tag->ctx;
	domain = ctx->domain;
	atomic_add_long(&ctx->loads, 1);

	/* Collect new entries privately; publish them only on success. */
	TAILQ_INIT(&entries);
	error = iommu_bus_dmamap_load_something1(tag, map, ma, offset,
	    buflen, flags, segs, segp, &entries);
	if (error == 0) {
		IOMMU_DMAMAP_LOCK(map);
		TAILQ_CONCAT(&map->map_entries, &entries, dmamap_link);
		IOMMU_DMAMAP_UNLOCK(map);
	} else if (!TAILQ_EMPTY(&entries)) {
		/*
		 * The busdma interface does not allow us to report
		 * partial buffer load, so unfortunately we have to
		 * revert all work done.
		 */
		IOMMU_DOMAIN_LOCK(domain);
		TAILQ_CONCAT(&domain->unload_entries, &entries, dmamap_link);
		IOMMU_DOMAIN_UNLOCK(domain);
		taskqueue_enqueue(domain->iommu->delayed_taskqueue,
		    &domain->unload_task);
	}

	/* Out of memory while unable to sleep: retry from the taskqueue. */
	if (error == ENOMEM && (flags & BUS_DMA_NOWAIT) == 0 &&
	    !map->cansleep)
		error = EINPROGRESS;
	if (error == EINPROGRESS)
		iommu_bus_schedule_dmamap(domain->iommu, map);
	return (error);
}
680 
681 static int
682 iommu_bus_dmamap_load_ma(bus_dma_tag_t dmat, bus_dmamap_t map1,
683     struct vm_page **ma, bus_size_t tlen, int ma_offs, int flags,
684     bus_dma_segment_t *segs, int *segp)
685 {
686 	struct bus_dma_tag_iommu *tag;
687 	struct bus_dmamap_iommu *map;
688 
689 	tag = (struct bus_dma_tag_iommu *)dmat;
690 	map = (struct bus_dmamap_iommu *)map1;
691 	return (iommu_bus_dmamap_load_something(tag, map, ma, ma_offs, tlen,
692 	    flags, segs, segp));
693 }
694 
695 static int
696 iommu_bus_dmamap_load_phys(bus_dma_tag_t dmat, bus_dmamap_t map1,
697     vm_paddr_t buf, bus_size_t buflen, int flags, bus_dma_segment_t *segs,
698     int *segp)
699 {
700 	struct bus_dma_tag_iommu *tag;
701 	struct bus_dmamap_iommu *map;
702 	vm_page_t *ma, fma;
703 	vm_paddr_t pstart, pend, paddr;
704 	int error, i, ma_cnt, mflags, offset;
705 
706 	tag = (struct bus_dma_tag_iommu *)dmat;
707 	map = (struct bus_dmamap_iommu *)map1;
708 	pstart = trunc_page(buf);
709 	pend = round_page(buf + buflen);
710 	offset = buf & PAGE_MASK;
711 	ma_cnt = OFF_TO_IDX(pend - pstart);
712 	mflags = map->cansleep ? M_WAITOK : M_NOWAIT;
713 	ma = malloc(sizeof(vm_page_t) * ma_cnt, M_DEVBUF, mflags);
714 	if (ma == NULL)
715 		return (ENOMEM);
716 	fma = NULL;
717 	for (i = 0; i < ma_cnt; i++) {
718 		paddr = pstart + ptoa(i);
719 		ma[i] = PHYS_TO_VM_PAGE(paddr);
720 		if (ma[i] == NULL || VM_PAGE_TO_PHYS(ma[i]) != paddr) {
721 			/*
722 			 * If PHYS_TO_VM_PAGE() returned NULL or the
723 			 * vm_page was not initialized we'll use a
724 			 * fake page.
725 			 */
726 			if (fma == NULL) {
727 				fma = malloc(sizeof(struct vm_page) * ma_cnt,
728 				    M_DEVBUF, M_ZERO | mflags);
729 				if (fma == NULL) {
730 					free(ma, M_DEVBUF);
731 					return (ENOMEM);
732 				}
733 			}
734 			vm_page_initfake(&fma[i], pstart + ptoa(i),
735 			    VM_MEMATTR_DEFAULT);
736 			ma[i] = &fma[i];
737 		}
738 	}
739 	error = iommu_bus_dmamap_load_something(tag, map, ma, offset, buflen,
740 	    flags, segs, segp);
741 	free(fma, M_DEVBUF);
742 	free(ma, M_DEVBUF);
743 	return (error);
744 }
745 
746 static int
747 iommu_bus_dmamap_load_buffer(bus_dma_tag_t dmat, bus_dmamap_t map1, void *buf,
748     bus_size_t buflen, pmap_t pmap, int flags, bus_dma_segment_t *segs,
749     int *segp)
750 {
751 	struct bus_dma_tag_iommu *tag;
752 	struct bus_dmamap_iommu *map;
753 	vm_page_t *ma, fma;
754 	vm_paddr_t pstart, pend, paddr;
755 	int error, i, ma_cnt, mflags, offset;
756 
757 	tag = (struct bus_dma_tag_iommu *)dmat;
758 	map = (struct bus_dmamap_iommu *)map1;
759 	pstart = trunc_page((vm_offset_t)buf);
760 	pend = round_page((vm_offset_t)buf + buflen);
761 	offset = (vm_offset_t)buf & PAGE_MASK;
762 	ma_cnt = OFF_TO_IDX(pend - pstart);
763 	mflags = map->cansleep ? M_WAITOK : M_NOWAIT;
764 	ma = malloc(sizeof(vm_page_t) * ma_cnt, M_DEVBUF, mflags);
765 	if (ma == NULL)
766 		return (ENOMEM);
767 	fma = NULL;
768 	for (i = 0; i < ma_cnt; i++, pstart += PAGE_SIZE) {
769 		if (pmap == kernel_pmap)
770 			paddr = pmap_kextract(pstart);
771 		else
772 			paddr = pmap_extract(pmap, pstart);
773 		ma[i] = PHYS_TO_VM_PAGE(paddr);
774 		if (ma[i] == NULL || VM_PAGE_TO_PHYS(ma[i]) != paddr) {
775 			/*
776 			 * If PHYS_TO_VM_PAGE() returned NULL or the
777 			 * vm_page was not initialized we'll use a
778 			 * fake page.
779 			 */
780 			if (fma == NULL) {
781 				fma = malloc(sizeof(struct vm_page) * ma_cnt,
782 				    M_DEVBUF, M_ZERO | mflags);
783 				if (fma == NULL) {
784 					free(ma, M_DEVBUF);
785 					return (ENOMEM);
786 				}
787 			}
788 			vm_page_initfake(&fma[i], paddr, VM_MEMATTR_DEFAULT);
789 			ma[i] = &fma[i];
790 		}
791 	}
792 	error = iommu_bus_dmamap_load_something(tag, map, ma, offset, buflen,
793 	    flags, segs, segp);
794 	free(ma, M_DEVBUF);
795 	free(fma, M_DEVBUF);
796 	return (error);
797 }
798 
799 static void
800 iommu_bus_dmamap_waitok(bus_dma_tag_t dmat, bus_dmamap_t map1,
801     struct memdesc *mem, bus_dmamap_callback_t *callback, void *callback_arg)
802 {
803 	struct bus_dmamap_iommu *map;
804 
805 	if (map1 == NULL)
806 		return;
807 	map = (struct bus_dmamap_iommu *)map1;
808 	map->mem = *mem;
809 	map->tag = (struct bus_dma_tag_iommu *)dmat;
810 	map->callback = callback;
811 	map->callback_arg = callback_arg;
812 }
813 
814 static bus_dma_segment_t *
815 iommu_bus_dmamap_complete(bus_dma_tag_t dmat, bus_dmamap_t map1,
816     bus_dma_segment_t *segs, int nsegs, int error)
817 {
818 	struct bus_dma_tag_iommu *tag;
819 	struct bus_dmamap_iommu *map;
820 
821 	tag = (struct bus_dma_tag_iommu *)dmat;
822 	map = (struct bus_dmamap_iommu *)map1;
823 
824 	if (!map->locked) {
825 		KASSERT(map->cansleep,
826 		    ("map not locked and not sleepable context %p", map));
827 
828 		/*
829 		 * We are called from the delayed context.  Relock the
830 		 * driver.
831 		 */
832 		(tag->common.lockfunc)(tag->common.lockfuncarg, BUS_DMA_LOCK);
833 		map->locked = true;
834 	}
835 
836 	if (segs == NULL)
837 		segs = tag->segments;
838 	return (segs);
839 }
840 
/*
 * The limitations of busdma KPI forces the iommu to perform the actual
 * unload, consisting of the unmapping of the map entries page tables,
 * from the delayed context on i386, since page table page mapping
 * might require a sleep to be successful.  The unfortunate
 * consequence is that the DMA requests can be served some time after
 * the bus_dmamap_unload() call returned.
 *
 * On amd64, we assume that sf allocation cannot fail.
 *
 * In both variants the map's entries are first detached from the map
 * under the map lock and the unload is counted in the context.
 */
static void
iommu_bus_dmamap_unload(bus_dma_tag_t dmat, bus_dmamap_t map1)
{
	struct bus_dma_tag_iommu *tag;
	struct bus_dmamap_iommu *map;
	struct iommu_ctx *ctx;
	struct iommu_domain *domain;
	struct iommu_map_entries_tailq entries;

	tag = (struct bus_dma_tag_iommu *)dmat;
	map = (struct bus_dmamap_iommu *)map1;
	ctx = tag->ctx;
	domain = ctx->domain;
	atomic_add_long(&ctx->unloads, 1);

	/* Move all entries of the map onto a private list. */
	TAILQ_INIT(&entries);
	IOMMU_DMAMAP_LOCK(map);
	TAILQ_CONCAT(&entries, &map->map_entries, dmamap_link);
	IOMMU_DMAMAP_UNLOCK(map);
#if defined(IOMMU_DOMAIN_UNLOAD_SLEEP)
	/* Deferred variant: queue the entries for the domain unload task. */
	IOMMU_DOMAIN_LOCK(domain);
	TAILQ_CONCAT(&domain->unload_entries, &entries, dmamap_link);
	IOMMU_DOMAIN_UNLOCK(domain);
	taskqueue_enqueue(domain->iommu->delayed_taskqueue,
	    &domain->unload_task);
#else
	/* Immediate variant: unload here with sleeping forbidden. */
	THREAD_NO_SLEEPING();
	iommu_domain_unload(domain, &entries, false);
	THREAD_SLEEPING_OK();
	KASSERT(TAILQ_EMPTY(&entries), ("lazy iommu_ctx_unload %p", ctx));
#endif
}
883 
884 static void
885 iommu_bus_dmamap_sync(bus_dma_tag_t dmat, bus_dmamap_t map1,
886     bus_dmasync_op_t op)
887 {
888 	struct bus_dmamap_iommu *map __unused;
889 
890 	map = (struct bus_dmamap_iommu *)map1;
891 	kmsan_bus_dmamap_sync(&map->kmsan_mem, op);
892 }
893 
#ifdef KMSAN
/*
 * busdma load_kmsan method.  Copies the memory descriptor of the load
 * into the map so that the sync method can pass it on later.  A NULL
 * map is ignored.
 */
static void
iommu_bus_dmamap_load_kmsan(bus_dmamap_t map1, struct memdesc *mem)
{
	struct bus_dmamap_iommu *map;

	if (map1 == NULL)
		return;
	map = (struct bus_dmamap_iommu *)map1;
	memcpy(&map->kmsan_mem, mem, sizeof(*mem));
}
#endif
906 
/*
 * Method table binding the busdma operations to the IOMMU-backed
 * implementations in this file.  Tags created here point their
 * common.impl at this table, and bus_dma_iommu_load_ident() compares
 * against its address to recognize IOMMU tags.
 */
struct bus_dma_impl bus_dma_iommu_impl = {
	.tag_create = iommu_bus_dma_tag_create,
	.tag_destroy = iommu_bus_dma_tag_destroy,
	.tag_set_domain = iommu_bus_dma_tag_set_domain,
	.id_mapped = iommu_bus_dma_id_mapped,
	.map_create = iommu_bus_dmamap_create,
	.map_destroy = iommu_bus_dmamap_destroy,
	.mem_alloc = iommu_bus_dmamem_alloc,
	.mem_free = iommu_bus_dmamem_free,
	.load_phys = iommu_bus_dmamap_load_phys,
	.load_buffer = iommu_bus_dmamap_load_buffer,
	.load_ma = iommu_bus_dmamap_load_ma,
	.map_waitok = iommu_bus_dmamap_waitok,
	.map_complete = iommu_bus_dmamap_complete,
	.map_unload = iommu_bus_dmamap_unload,
	.map_sync = iommu_bus_dmamap_sync,
#ifdef KMSAN
	/* Only present in kernels built with KMSAN. */
	.load_kmsan = iommu_bus_dmamap_load_kmsan,
#endif
};
927 
/*
 * Taskqueue handler that retries deferred loads.  arg is the IOMMU
 * unit; pending is not used.  Each map on the unit's delayed_maps list
 * is removed under the unit lock and then reloaded, with the unit lock
 * dropped, through bus_dmamap_load_mem() using the request saved in
 * the map and BUS_DMA_WAITOK.
 */
static void
iommu_bus_task_dmamap(void *arg, int pending)
{
	struct bus_dma_tag_iommu *tag;
	struct bus_dmamap_iommu *map;
	struct iommu_unit *unit;

	unit = arg;
	IOMMU_LOCK(unit);
	while ((map = TAILQ_FIRST(&unit->delayed_maps)) != NULL) {
		TAILQ_REMOVE(&unit->delayed_maps, map, delay_link);
		IOMMU_UNLOCK(unit);
		tag = map->tag;
		/* Mark the map as loading from a sleepable, unlocked context. */
		map->cansleep = true;
		map->locked = false;
		bus_dmamap_load_mem((bus_dma_tag_t)tag, (bus_dmamap_t)map,
		    &map->mem, map->callback, map->callback_arg,
		    BUS_DMA_WAITOK);
		map->cansleep = false;
		/*
		 * If the map ended up marked locked during the load (the
		 * map_complete method does this after taking the driver
		 * lock), release the driver lock again; otherwise just
		 * restore the default locked state of the map.
		 */
		if (map->locked) {
			(tag->common.lockfunc)(tag->common.lockfuncarg,
			    BUS_DMA_UNLOCK);
		} else
			map->locked = true;
		/* Repeats the store made right after the load. */
		map->cansleep = false;
		IOMMU_LOCK(unit);
	}
	IOMMU_UNLOCK(unit);
}
957 
/*
 * Queue map for a deferred load: mark it unlocked, append it to the
 * unit's delayed_maps list under the unit lock and kick the unit's
 * dmamap_load_task on the delayed taskqueue.
 */
static void
iommu_bus_schedule_dmamap(struct iommu_unit *unit, struct bus_dmamap_iommu *map)
{

	map->locked = false;
	IOMMU_LOCK(unit);
	TAILQ_INSERT_TAIL(&unit->delayed_maps, map, delay_link);
	IOMMU_UNLOCK(unit);
	taskqueue_enqueue(unit->delayed_taskqueue, &unit->dmamap_load_task);
}
968 
969 int
970 iommu_init_busdma(struct iommu_unit *unit)
971 {
972 	int error;
973 
974 	unit->dma_enabled = 0;
975 	error = TUNABLE_INT_FETCH("hw.iommu.dma", &unit->dma_enabled);
976 	if (error == 0) /* compatibility */
977 		TUNABLE_INT_FETCH("hw.dmar.dma", &unit->dma_enabled);
978 	SYSCTL_ADD_INT(&unit->sysctl_ctx,
979 	    SYSCTL_CHILDREN(device_get_sysctl_tree(unit->dev)),
980 	    OID_AUTO, "dma", CTLFLAG_RD, &unit->dma_enabled, 0,
981 	    "DMA ops enabled");
982 	TAILQ_INIT(&unit->delayed_maps);
983 	TASK_INIT(&unit->dmamap_load_task, 0, iommu_bus_task_dmamap, unit);
984 	unit->delayed_taskqueue = taskqueue_create("iommu", M_WAITOK,
985 	    taskqueue_thread_enqueue, &unit->delayed_taskqueue);
986 	taskqueue_start_threads(&unit->delayed_taskqueue, 1, PI_DISK,
987 	    "iommu%d busdma taskq", unit->unit);
988 	return (0);
989 }
990 
991 void
992 iommu_fini_busdma(struct iommu_unit *unit)
993 {
994 
995 	if (unit->delayed_taskqueue == NULL)
996 		return;
997 
998 	taskqueue_drain(unit->delayed_taskqueue, &unit->dmamap_load_task);
999 	taskqueue_free(unit->delayed_taskqueue);
1000 	unit->delayed_taskqueue = NULL;
1001 }
1002 
1003 int
1004 bus_dma_iommu_load_ident(bus_dma_tag_t dmat, bus_dmamap_t map1,
1005     vm_paddr_t start, vm_size_t length, int flags)
1006 {
1007 	struct bus_dma_tag_common *tc;
1008 	struct bus_dma_tag_iommu *tag;
1009 	struct bus_dmamap_iommu *map;
1010 	struct iommu_ctx *ctx;
1011 	struct iommu_domain *domain;
1012 	struct iommu_map_entry *entry;
1013 	vm_page_t *ma;
1014 	vm_size_t i;
1015 	int error;
1016 	bool waitok;
1017 
1018 	MPASS((start & PAGE_MASK) == 0);
1019 	MPASS((length & PAGE_MASK) == 0);
1020 	MPASS(length > 0);
1021 	MPASS(start + length >= start);
1022 	MPASS((flags & ~(BUS_DMA_NOWAIT | BUS_DMA_NOWRITE)) == 0);
1023 
1024 	tc = (struct bus_dma_tag_common *)dmat;
1025 	if (tc->impl != &bus_dma_iommu_impl)
1026 		return (0);
1027 
1028 	tag = (struct bus_dma_tag_iommu *)dmat;
1029 	ctx = tag->ctx;
1030 	domain = ctx->domain;
1031 	map = (struct bus_dmamap_iommu *)map1;
1032 	waitok = (flags & BUS_DMA_NOWAIT) != 0;
1033 
1034 	entry = iommu_gas_alloc_entry(domain, waitok ? 0 : IOMMU_PGF_WAITOK);
1035 	if (entry == NULL)
1036 		return (ENOMEM);
1037 	entry->start = start;
1038 	entry->end = start + length;
1039 	ma = malloc(sizeof(vm_page_t) * atop(length), M_TEMP, waitok ?
1040 	    M_WAITOK : M_NOWAIT);
1041 	if (ma == NULL) {
1042 		iommu_gas_free_entry(entry);
1043 		return (ENOMEM);
1044 	}
1045 	for (i = 0; i < atop(length); i++) {
1046 		ma[i] = vm_page_getfake(entry->start + PAGE_SIZE * i,
1047 		    VM_MEMATTR_DEFAULT);
1048 	}
1049 	error = iommu_gas_map_region(domain, entry, IOMMU_MAP_ENTRY_READ |
1050 	    ((flags & BUS_DMA_NOWRITE) ? 0 : IOMMU_MAP_ENTRY_WRITE) |
1051 	    IOMMU_MAP_ENTRY_MAP, waitok ? IOMMU_MF_CANWAIT : 0, ma);
1052 	if (error == 0) {
1053 		IOMMU_DMAMAP_LOCK(map);
1054 		TAILQ_INSERT_TAIL(&map->map_entries, entry, dmamap_link);
1055 		IOMMU_DMAMAP_UNLOCK(map);
1056 	} else {
1057 		iommu_gas_free_entry(entry);
1058 	}
1059 	for (i = 0; i < atop(length); i++)
1060 		vm_page_putfake(ma[i]);
1061 	free(ma, M_TEMP);
1062 	return (error);
1063 }
1064 
1065 static void
1066 iommu_domain_unload_task(void *arg, int pending)
1067 {
1068 	struct iommu_domain *domain;
1069 	struct iommu_map_entries_tailq entries;
1070 
1071 	domain = arg;
1072 	TAILQ_INIT(&entries);
1073 
1074 	for (;;) {
1075 		IOMMU_DOMAIN_LOCK(domain);
1076 		TAILQ_SWAP(&domain->unload_entries, &entries,
1077 		    iommu_map_entry, dmamap_link);
1078 		IOMMU_DOMAIN_UNLOCK(domain);
1079 		if (TAILQ_EMPTY(&entries))
1080 			break;
1081 		iommu_domain_unload(domain, &entries, true);
1082 	}
1083 }
1084 
1085 void
1086 iommu_domain_init(struct iommu_unit *unit, struct iommu_domain *domain,
1087     const struct iommu_domain_map_ops *ops)
1088 {
1089 
1090 	domain->ops = ops;
1091 	domain->iommu = unit;
1092 
1093 	TASK_INIT(&domain->unload_task, 0, iommu_domain_unload_task, domain);
1094 	RB_INIT(&domain->rb_root);
1095 	TAILQ_INIT(&domain->unload_entries);
1096 	mtx_init(&domain->lock, "iodom", NULL, MTX_DEF);
1097 }
1098 
1099 void
1100 iommu_domain_fini(struct iommu_domain *domain)
1101 {
1102 
1103 	mtx_destroy(&domain->lock);
1104 }
1105