/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2013 The FreeBSD Foundation
 *
 * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domainset.h>
#include <sys/malloc.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/interrupt.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/memdesc.h>
#include <sys/msan.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <sys/rman.h>
#include <sys/taskqueue.h>
#include <sys/tree.h>
#include <sys/uio.h>
#include <sys/vmem.h>
#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <dev/iommu/iommu.h>
#include <machine/atomic.h>
#include <machine/bus.h>
#include <machine/md_var.h>
#include <machine/iommu.h>
#include <dev/iommu/busdma_iommu.h>

/*
 * busdma_iommu.c, the implementation of the busdma(9) interface using
 * IOMMU units from Intel VT-d.
 */

static bool
iommu_bus_dma_is_dev_disabled(int domain, int bus, int slot, int func)
{
	char str[128], *env;
	int default_bounce;
	bool ret;
	static const char bounce_str[] = "bounce";
	static const char iommu_str[] = "iommu";
	static const char dmar_str[] = "dmar"; /* compatibility */

	default_bounce = 0;
	env = kern_getenv("hw.busdma.default");
	if (env != NULL) {
		if (strcmp(env, bounce_str) == 0)
			default_bounce = 1;
		else if (strcmp(env, iommu_str) == 0 ||
		    strcmp(env, dmar_str) == 0)
			default_bounce = 0;
		freeenv(env);
	}

	snprintf(str, sizeof(str), "hw.busdma.pci%d.%d.%d.%d",
	    domain, bus, slot, func);
	env = kern_getenv(str);
	if (env == NULL)
		return (default_bounce != 0);
	if (strcmp(env, bounce_str) == 0)
		ret = true;
	else if (strcmp(env, iommu_str) == 0 ||
	    strcmp(env, dmar_str) == 0)
		ret = false;
	else
		ret = default_bounce != 0;
	freeenv(env);
	return (ret);
}
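
/*
 * Illustrative configuration example (not part of the original file):
 * the tunables consumed above can be set from loader.conf; the device
 * address pci0:3:0:0 below is hypothetical.
 *
 *	hw.busdma.default="iommu"	# or "bounce"; "dmar" is accepted as
 *					# a compatibility alias for "iommu"
 *	hw.busdma.pci0.3.0.0="bounce"	# force bounce busdma for one device
 */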

/*
 * Given original device, find the requester ID that will be seen by
 * the IOMMU unit and used for page table lookup.  PCI bridges may take
 * ownership of transactions from downstream devices, so it may not be
 * the same as the BSF of the target device.  In those cases, all
 * devices downstream of the bridge must share a single mapping
 * domain, and must collectively be assigned to use either IOMMU or
 * bounce mapping.
 */
device_t
iommu_get_requester(device_t dev, uint16_t *rid)
{
	devclass_t pci_class;
	device_t l, pci, pcib, pcip, pcibp, requester;
	int cap_offset;
	uint16_t pcie_flags;
	bool bridge_is_pcie;

	pci_class = devclass_find("pci");
	l = requester = dev;

	*rid = pci_get_rid(dev);

	/*
	 * Walk the bridge hierarchy from the target device to the
	 * host port to find the translating bridge nearest the IOMMU
	 * unit.
	 */
	for (;;) {
		pci = device_get_parent(l);
		KASSERT(pci != NULL, ("iommu_get_requester(%s): NULL parent "
		    "for %s", device_get_name(dev), device_get_name(l)));
		KASSERT(device_get_devclass(pci) == pci_class,
		    ("iommu_get_requester(%s): non-pci parent %s for %s",
		    device_get_name(dev), device_get_name(pci),
		    device_get_name(l)));

		pcib = device_get_parent(pci);
		KASSERT(pcib != NULL, ("iommu_get_requester(%s): NULL bridge "
		    "for %s", device_get_name(dev), device_get_name(pci)));

		/*
		 * The parent of our "bridge" isn't another PCI bus,
		 * so pcib isn't a PCI->PCI bridge but rather a host
		 * port, and the requester ID won't be translated
		 * further.
		 */
		pcip = device_get_parent(pcib);
		if (device_get_devclass(pcip) != pci_class)
			break;
		pcibp = device_get_parent(pcip);

		if (pci_find_cap(l, PCIY_EXPRESS, &cap_offset) == 0) {
			/*
			 * Do not stop the loop even if the target
			 * device is PCIe, because it is possible (but
			 * unlikely) to have a PCI->PCIe bridge
			 * somewhere in the hierarchy.
			 */
			l = pcib;
		} else {
			/*
			 * The device is not PCIe, so it cannot be seen
			 * as a requester by the IOMMU unit.  Check
			 * whether the bridge is PCIe.
			 */
			bridge_is_pcie = pci_find_cap(pcib, PCIY_EXPRESS,
			    &cap_offset) == 0;
			requester = pcib;

			/*
			 * Check for a buggy PCIe/PCI bridge that
			 * doesn't report the express capability.  If
			 * the bridge above it is express but isn't a
			 * PCI bridge, then we know pcib is actually a
			 * PCIe/PCI bridge.
			 */
			if (!bridge_is_pcie && pci_find_cap(pcibp,
			    PCIY_EXPRESS, &cap_offset) == 0) {
				pcie_flags = pci_read_config(pcibp,
				    cap_offset + PCIER_FLAGS, 2);
				if ((pcie_flags & PCIEM_FLAGS_TYPE) !=
				    PCIEM_TYPE_PCI_BRIDGE)
					bridge_is_pcie = true;
			}

			if (bridge_is_pcie) {
				/*
				 * The current device is not PCIe, but
				 * the bridge above it is.  This is a
				 * PCIe->PCI bridge.  Assume that the
				 * requester ID will be the secondary
				 * bus number with slot and function
				 * set to zero.
				 *
				 * XXX: Doesn't handle the case where
				 * the bridge is PCIe->PCI-X, and the
				 * bridge will only take ownership of
				 * requests in some cases.  We should
				 * provide context entries with the
				 * same page tables for taken and
				 * non-taken transactions.
				 */
				*rid = PCI_RID(pci_get_bus(l), 0, 0);
				l = pcibp;
			} else {
				/*
				 * Neither the device nor the bridge
				 * above it are PCIe.  This is a
				 * conventional PCI->PCI bridge, which
				 * will use the bridge's BSF as the
				 * requester ID.
				 */
				*rid = pci_get_rid(pcib);
				l = pcib;
			}
		}
	}
	return (requester);
}
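
/*
 * Worked example (an illustration, not code from this file): for a
 * hypothetical conventional PCI device behind a PCIe->PCI bridge whose
 * secondary bus is 6, the walk above reports the bridge as the
 * requester and sets *rid to PCI_RID(6, 0, 0), i.e. 0x0600, since the
 * RID packs the bus in bits 15:8, the slot in bits 7:3 and the
 * function in bits 2:0.
 */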

struct iommu_ctx *
iommu_instantiate_ctx(struct iommu_unit *unit, device_t dev, bool rmrr)
{
	device_t requester;
	struct iommu_ctx *ctx;
	bool disabled;
	uint16_t rid;

	requester = iommu_get_requester(dev, &rid);

	/*
	 * If the user requested that the IOMMU be disabled for the
	 * device, we cannot disable the whole IOMMU unit, because
	 * other devices on the same unit may still require
	 * translation.  Instead, provide the identity mapping for the
	 * device context.
	 */
	disabled = iommu_bus_dma_is_dev_disabled(pci_get_domain(requester),
	    pci_get_bus(requester), pci_get_slot(requester),
	    pci_get_function(requester));
	ctx = iommu_get_ctx(unit, requester, rid, disabled, rmrr);
	if (ctx == NULL)
		return (NULL);
	if (disabled) {
		/*
		 * Keep the first reference on context, release the
		 * later refs.
		 */
		IOMMU_LOCK(unit);
		if ((ctx->flags & IOMMU_CTX_DISABLED) == 0) {
			ctx->flags |= IOMMU_CTX_DISABLED;
			IOMMU_UNLOCK(unit);
		} else {
			iommu_free_ctx_locked(unit, ctx);
		}
		ctx = NULL;
	}
	return (ctx);
}

struct iommu_ctx *
iommu_get_dev_ctx(device_t dev)
{
	struct iommu_unit *unit;

	unit = iommu_find(dev, bootverbose);
	/* Not in scope of any IOMMU? */
	if (unit == NULL)
		return (NULL);
	if (!unit->dma_enabled)
		return (NULL);

#if defined(__amd64__) || defined(__i386__)
	dmar_quirks_pre_use(unit);
	dmar_instantiate_rmrr_ctxs(unit);
#endif

	return (iommu_instantiate_ctx(unit, dev, false));
}

bus_dma_tag_t
iommu_get_dma_tag(device_t dev, device_t child)
{
	struct iommu_ctx *ctx;
	bus_dma_tag_t res;

	ctx = iommu_get_dev_ctx(child);
	if (ctx == NULL)
		return (NULL);

	res = (bus_dma_tag_t)ctx->tag;
	return (res);
}
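
/*
 * Minimal usage sketch (an assumption, not code from this file): a
 * bridge driver's BUS_GET_DMA_TAG() method can prefer the IOMMU-backed
 * tag and fall back to the generic parent tag when the child is not
 * covered by an enabled IOMMU unit.  The function name below is
 * hypothetical.
 */
#if 0
static bus_dma_tag_t
foo_pcib_get_dma_tag(device_t bus, device_t child)
{
	bus_dma_tag_t tag;

	tag = iommu_get_dma_tag(bus, child);
	if (tag != NULL)
		return (tag);
	return (bus_generic_get_dma_tag(bus, child));
}
#endif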

bool
bus_dma_iommu_set_buswide(device_t dev)
{
	struct iommu_unit *unit;
	device_t parent;
	u_int busno, slot, func;

	parent = device_get_parent(dev);
	if (device_get_devclass(parent) != devclass_find("pci"))
		return (false);
	unit = iommu_find(dev, bootverbose);
	if (unit == NULL)
		return (false);
	busno = pci_get_bus(dev);
	slot = pci_get_slot(dev);
	func = pci_get_function(dev);
	if (slot != 0 || func != 0) {
		if (bootverbose) {
			device_printf(dev,
			    "iommu%d pci%d:%d:%d requested buswide busdma\n",
			    unit->unit, busno, slot, func);
		}
		return (false);
	}
	iommu_set_buswide_ctx(unit, busno);
	return (true);
}
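
/*
 * Usage sketch (an assumption): a driver for hardware that emits DMA
 * with varying slot/function numbers on its bus can ask for bus-wide
 * contexts from its attach method; the request is honored only for a
 * device at slot 0, function 0.
 */
#if 0
	if (!bus_dma_iommu_set_buswide(dev))
		device_printf(dev, "bus-wide busdma contexts not enabled\n");
#endif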

void
iommu_set_buswide_ctx(struct iommu_unit *unit, u_int busno)
{

	MPASS(busno <= PCI_BUSMAX);
	IOMMU_LOCK(unit);
	unit->buswide_ctxs[busno / NBBY / sizeof(uint32_t)] |=
	    1 << (busno % (NBBY * sizeof(uint32_t)));
	IOMMU_UNLOCK(unit);
}

bool
iommu_is_buswide_ctx(struct iommu_unit *unit, u_int busno)
{

	MPASS(busno <= PCI_BUSMAX);
	return ((unit->buswide_ctxs[busno / NBBY / sizeof(uint32_t)] &
	    (1U << (busno % (NBBY * sizeof(uint32_t))))) != 0);
}
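
/*
 * Worked example (illustrative): buswide_ctxs keeps one bit per PCI bus
 * number in 32-bit words, so bus 0x47 is tracked in word 0x47 / 32 == 2,
 * bit 0x47 % 32 == 7.
 */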

static MALLOC_DEFINE(M_IOMMU_DMAMAP, "iommu_dmamap", "IOMMU DMA Map");

static void iommu_bus_schedule_dmamap(struct iommu_unit *unit,
    struct bus_dmamap_iommu *map);

static int
iommu_bus_dma_tag_create(bus_dma_tag_t parent, bus_size_t alignment,
    bus_addr_t boundary, bus_addr_t lowaddr, bus_addr_t highaddr,
    bus_dma_filter_t *filter, void *filterarg, bus_size_t maxsize,
    int nsegments, bus_size_t maxsegsz, int flags, bus_dma_lock_t *lockfunc,
    void *lockfuncarg, bus_dma_tag_t *dmat)
{
	struct bus_dma_tag_iommu *newtag, *oldtag;
	int error;

	*dmat = NULL;
	error = common_bus_dma_tag_create(parent != NULL ?
	    &((struct bus_dma_tag_iommu *)parent)->common : NULL, alignment,
	    boundary, lowaddr, highaddr, filter, filterarg, maxsize,
	    nsegments, maxsegsz, flags, lockfunc, lockfuncarg,
	    sizeof(struct bus_dma_tag_iommu), (void **)&newtag);
	if (error != 0)
		goto out;

	oldtag = (struct bus_dma_tag_iommu *)parent;
	newtag->common.impl = &bus_dma_iommu_impl;
	newtag->ctx = oldtag->ctx;
	newtag->owner = oldtag->owner;

	*dmat = (bus_dma_tag_t)newtag;
out:
	CTR4(KTR_BUSDMA, "%s returned tag %p tag flags 0x%x error %d",
	    __func__, newtag, (newtag != NULL ? newtag->common.flags : 0),
	    error);
	return (error);
}

static int
iommu_bus_dma_tag_set_domain(bus_dma_tag_t dmat)
{

	return (0);
}

static int
iommu_bus_dma_tag_destroy(bus_dma_tag_t dmat1)
{
	struct bus_dma_tag_iommu *dmat, *parent;
	struct bus_dma_tag_iommu *dmat_copy __unused;
	int error;

	error = 0;
	dmat_copy = dmat = (struct bus_dma_tag_iommu *)dmat1;

	if (dmat != NULL) {
		if (dmat->map_count != 0) {
			error = EBUSY;
			goto out;
		}
		while (dmat != NULL) {
			parent = (struct bus_dma_tag_iommu *)dmat->common.parent;
			if (atomic_fetchadd_int(&dmat->common.ref_count, -1) ==
			    1) {
				if (dmat == dmat->ctx->tag)
					iommu_free_ctx(dmat->ctx);
				free(dmat->segments, M_IOMMU_DMAMAP);
				free(dmat, M_DEVBUF);
				dmat = parent;
			} else
				dmat = NULL;
		}
	}
out:
	CTR3(KTR_BUSDMA, "%s tag %p error %d", __func__, dmat_copy, error);
	return (error);
}

static bool
iommu_bus_dma_id_mapped(bus_dma_tag_t dmat, vm_paddr_t buf, bus_size_t buflen)
{

	return (false);
}

static int
iommu_bus_dmamap_create(bus_dma_tag_t dmat, int flags, bus_dmamap_t *mapp)
{
	struct bus_dma_tag_iommu *tag;
	struct bus_dmamap_iommu *map;

	tag = (struct bus_dma_tag_iommu *)dmat;
	map = malloc_domainset(sizeof(*map), M_IOMMU_DMAMAP,
	    DOMAINSET_PREF(tag->common.domain), M_NOWAIT | M_ZERO);
	if (map == NULL) {
		*mapp = NULL;
		return (ENOMEM);
	}
	if (tag->segments == NULL) {
		tag->segments = malloc_domainset(sizeof(bus_dma_segment_t) *
		    tag->common.nsegments, M_IOMMU_DMAMAP,
		    DOMAINSET_PREF(tag->common.domain), M_NOWAIT);
		if (tag->segments == NULL) {
			free(map, M_IOMMU_DMAMAP);
			*mapp = NULL;
			return (ENOMEM);
		}
	}
	IOMMU_DMAMAP_INIT(map);
	TAILQ_INIT(&map->map_entries);
	map->tag = tag;
	map->locked = true;
	map->cansleep = false;
	tag->map_count++;
	*mapp = (bus_dmamap_t)map;

	return (0);
}

static int
iommu_bus_dmamap_destroy(bus_dma_tag_t dmat, bus_dmamap_t map1)
{
	struct bus_dma_tag_iommu *tag;
	struct bus_dmamap_iommu *map;

	tag = (struct bus_dma_tag_iommu *)dmat;
	map = (struct bus_dmamap_iommu *)map1;
	if (map != NULL) {
		IOMMU_DMAMAP_LOCK(map);
		if (!TAILQ_EMPTY(&map->map_entries)) {
			IOMMU_DMAMAP_UNLOCK(map);
			return (EBUSY);
		}
		IOMMU_DMAMAP_DESTROY(map);
		free(map, M_IOMMU_DMAMAP);
	}
	tag->map_count--;
	return (0);
}

static int
iommu_bus_dmamem_alloc(bus_dma_tag_t dmat, void** vaddr, int flags,
    bus_dmamap_t *mapp)
{
	struct bus_dma_tag_iommu *tag;
	struct bus_dmamap_iommu *map;
	int error, mflags;
	vm_memattr_t attr;

	error = iommu_bus_dmamap_create(dmat, flags, mapp);
	if (error != 0)
		return (error);

	mflags = (flags & BUS_DMA_NOWAIT) != 0 ? M_NOWAIT : M_WAITOK;
	mflags |= (flags & BUS_DMA_ZERO) != 0 ? M_ZERO : 0;
	attr = (flags & BUS_DMA_NOCACHE) != 0 ? VM_MEMATTR_UNCACHEABLE :
	    VM_MEMATTR_DEFAULT;

	tag = (struct bus_dma_tag_iommu *)dmat;
	map = (struct bus_dmamap_iommu *)*mapp;

	if (tag->common.maxsize < PAGE_SIZE &&
	    tag->common.alignment <= tag->common.maxsize &&
	    attr == VM_MEMATTR_DEFAULT) {
		*vaddr = malloc_domainset(tag->common.maxsize, M_DEVBUF,
		    DOMAINSET_PREF(tag->common.domain), mflags);
		map->flags |= BUS_DMAMAP_IOMMU_MALLOC;
	} else {
		*vaddr = kmem_alloc_attr_domainset(
		    DOMAINSET_PREF(tag->common.domain), tag->common.maxsize,
		    mflags, 0ul, BUS_SPACE_MAXADDR, attr);
		map->flags |= BUS_DMAMAP_IOMMU_KMEM_ALLOC;
	}
	if (*vaddr == NULL) {
		iommu_bus_dmamap_destroy(dmat, *mapp);
		*mapp = NULL;
		return (ENOMEM);
	}
	return (0);
}
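
/*
 * Worked example (illustrative): with the default memory attribute, a
 * tag whose maxsize is 256 bytes and alignment is 16 takes the
 * malloc_domainset() path above, while a 64 KB maxsize, or any
 * BUS_DMA_NOCACHE request, goes through kmem_alloc_attr_domainset().
 */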

static void
iommu_bus_dmamem_free(bus_dma_tag_t dmat, void *vaddr, bus_dmamap_t map1)
{
	struct bus_dma_tag_iommu *tag;
	struct bus_dmamap_iommu *map;

	tag = (struct bus_dma_tag_iommu *)dmat;
	map = (struct bus_dmamap_iommu *)map1;

	if ((map->flags & BUS_DMAMAP_IOMMU_MALLOC) != 0) {
		free(vaddr, M_DEVBUF);
		map->flags &= ~BUS_DMAMAP_IOMMU_MALLOC;
	} else {
		KASSERT((map->flags & BUS_DMAMAP_IOMMU_KMEM_ALLOC) != 0,
		    ("iommu_bus_dmamem_free for non alloced map %p", map));
		kmem_free(vaddr, tag->common.maxsize);
		map->flags &= ~BUS_DMAMAP_IOMMU_KMEM_ALLOC;
	}

	iommu_bus_dmamap_destroy(dmat, map1);
}

static int
iommu_bus_dmamap_load_something1(struct bus_dma_tag_iommu *tag,
    struct bus_dmamap_iommu *map, vm_page_t *ma, int offset, bus_size_t buflen,
    int flags, bus_dma_segment_t *segs, int *segp,
    struct iommu_map_entries_tailq *entries)
{
	struct iommu_ctx *ctx;
	struct iommu_domain *domain;
	struct iommu_map_entry *entry;
	bus_size_t buflen1;
	int error, e_flags, idx, gas_flags, seg;

	KASSERT(offset < IOMMU_PAGE_SIZE, ("offset %d", offset));
	if (segs == NULL)
		segs = tag->segments;
	ctx = tag->ctx;
	domain = ctx->domain;
	e_flags = IOMMU_MAP_ENTRY_READ |
	    ((flags & BUS_DMA_NOWRITE) == 0 ? IOMMU_MAP_ENTRY_WRITE : 0);
	seg = *segp;
	error = 0;
	idx = 0;
	while (buflen > 0) {
		seg++;
		if (seg >= tag->common.nsegments) {
			error = EFBIG;
			break;
		}
		buflen1 = buflen > tag->common.maxsegsz ?
		    tag->common.maxsegsz : buflen;

		/*
		 * (Too) optimistically allow a split if there is more
		 * than one segment left.
		 */
		gas_flags = map->cansleep ? IOMMU_MF_CANWAIT : 0;
		if (seg + 1 < tag->common.nsegments)
			gas_flags |= IOMMU_MF_CANSPLIT;

		error = iommu_gas_map(domain, &tag->common, buflen1,
		    offset, e_flags, gas_flags, ma + idx, &entry);
		if (error != 0)
			break;
		/* Update buflen1 in case the buffer was split. */
		if (buflen1 > entry->end - entry->start - offset)
			buflen1 = entry->end - entry->start - offset;

		KASSERT(vm_addr_align_ok(entry->start + offset,
		    tag->common.alignment),
		    ("alignment failed: ctx %p start 0x%jx offset %x "
		    "align 0x%jx", ctx, (uintmax_t)entry->start, offset,
		    (uintmax_t)tag->common.alignment));
		KASSERT(entry->end <= tag->common.lowaddr ||
		    entry->start >= tag->common.highaddr,
		    ("entry placement failed: ctx %p start 0x%jx end 0x%jx "
		    "lowaddr 0x%jx highaddr 0x%jx", ctx,
		    (uintmax_t)entry->start, (uintmax_t)entry->end,
		    (uintmax_t)tag->common.lowaddr,
		    (uintmax_t)tag->common.highaddr));
		KASSERT(vm_addr_bound_ok(entry->start + offset, buflen1,
		    tag->common.boundary),
		    ("boundary failed: ctx %p start 0x%jx end 0x%jx "
		    "boundary 0x%jx", ctx, (uintmax_t)entry->start,
		    (uintmax_t)entry->end, (uintmax_t)tag->common.boundary));
		KASSERT(buflen1 <= tag->common.maxsegsz,
		    ("segment too large: ctx %p start 0x%jx end 0x%jx "
		    "buflen1 0x%jx maxsegsz 0x%jx", ctx,
		    (uintmax_t)entry->start, (uintmax_t)entry->end,
		    (uintmax_t)buflen1, (uintmax_t)tag->common.maxsegsz));

		KASSERT((entry->flags & IOMMU_MAP_ENTRY_MAP) != 0,
		    ("entry %p missing IOMMU_MAP_ENTRY_MAP", entry));
		TAILQ_INSERT_TAIL(entries, entry, dmamap_link);

		segs[seg].ds_addr = entry->start + offset;
		segs[seg].ds_len = buflen1;

		idx += OFF_TO_IDX(offset + buflen1);
		offset += buflen1;
		offset &= IOMMU_PAGE_MASK;
		buflen -= buflen1;
	}
	if (error == 0)
		*segp = seg;
	return (error);
}
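
/*
 * Worked example (illustrative): loading a page-aligned 10 KB buffer
 * through a tag whose maxsegsz is 4 KB iterates the loop above three
 * times and, assuming the address-space allocator performs no extra
 * splits, produces segments of 4 KB, 4 KB and 2 KB.
 */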

static int
iommu_bus_dmamap_load_something(struct bus_dma_tag_iommu *tag,
    struct bus_dmamap_iommu *map, vm_page_t *ma, int offset, bus_size_t buflen,
    int flags, bus_dma_segment_t *segs, int *segp)
{
	struct iommu_ctx *ctx;
	struct iommu_domain *domain;
	struct iommu_map_entries_tailq entries;
	int error;

	ctx = tag->ctx;
	domain = ctx->domain;
	atomic_add_long(&ctx->loads, 1);

	TAILQ_INIT(&entries);
	error = iommu_bus_dmamap_load_something1(tag, map, ma, offset,
	    buflen, flags, segs, segp, &entries);
	if (error == 0) {
		IOMMU_DMAMAP_LOCK(map);
		TAILQ_CONCAT(&map->map_entries, &entries, dmamap_link);
		IOMMU_DMAMAP_UNLOCK(map);
	} else if (!TAILQ_EMPTY(&entries)) {
		/*
		 * The busdma interface does not allow us to report a
		 * partial buffer load, so unfortunately we have to
		 * revert all the work done.
		 */
		IOMMU_DOMAIN_LOCK(domain);
		TAILQ_CONCAT(&domain->unload_entries, &entries, dmamap_link);
		IOMMU_DOMAIN_UNLOCK(domain);
		taskqueue_enqueue(domain->iommu->delayed_taskqueue,
		    &domain->unload_task);
	}

	if (error == ENOMEM && (flags & BUS_DMA_NOWAIT) == 0 &&
	    !map->cansleep)
		error = EINPROGRESS;
	if (error == EINPROGRESS)
		iommu_bus_schedule_dmamap(domain->iommu, map);
	return (error);
}

static int
iommu_bus_dmamap_load_ma(bus_dma_tag_t dmat, bus_dmamap_t map1,
    struct vm_page **ma, bus_size_t tlen, int ma_offs, int flags,
    bus_dma_segment_t *segs, int *segp)
{
	struct bus_dma_tag_iommu *tag;
	struct bus_dmamap_iommu *map;

	tag = (struct bus_dma_tag_iommu *)dmat;
	map = (struct bus_dmamap_iommu *)map1;
	return (iommu_bus_dmamap_load_something(tag, map, ma, ma_offs, tlen,
	    flags, segs, segp));
}

static int
iommu_bus_dmamap_load_phys(bus_dma_tag_t dmat, bus_dmamap_t map1,
    vm_paddr_t buf, bus_size_t buflen, int flags, bus_dma_segment_t *segs,
    int *segp)
{
	struct bus_dma_tag_iommu *tag;
	struct bus_dmamap_iommu *map;
	vm_page_t *ma, fma;
	vm_paddr_t pstart, pend, paddr;
	int error, i, ma_cnt, mflags, offset;

	tag = (struct bus_dma_tag_iommu *)dmat;
	map = (struct bus_dmamap_iommu *)map1;
	pstart = trunc_page(buf);
	pend = round_page(buf + buflen);
	offset = buf & PAGE_MASK;
	ma_cnt = OFF_TO_IDX(pend - pstart);
	mflags = map->cansleep ? M_WAITOK : M_NOWAIT;
	ma = malloc(sizeof(vm_page_t) * ma_cnt, M_DEVBUF, mflags);
	if (ma == NULL)
		return (ENOMEM);
	fma = NULL;
	for (i = 0; i < ma_cnt; i++) {
		paddr = pstart + ptoa(i);
		ma[i] = PHYS_TO_VM_PAGE(paddr);
		if (ma[i] == NULL || VM_PAGE_TO_PHYS(ma[i]) != paddr) {
			/*
			 * If PHYS_TO_VM_PAGE() returned NULL or the
			 * vm_page was not initialized, we'll use a
			 * fake page.
			 */
			if (fma == NULL) {
				fma = malloc(sizeof(struct vm_page) * ma_cnt,
				    M_DEVBUF, M_ZERO | mflags);
				if (fma == NULL) {
					free(ma, M_DEVBUF);
					return (ENOMEM);
				}
			}
			vm_page_initfake(&fma[i], pstart + ptoa(i),
			    VM_MEMATTR_DEFAULT);
			ma[i] = &fma[i];
		}
	}
	error = iommu_bus_dmamap_load_something(tag, map, ma, offset, buflen,
	    flags, segs, segp);
	free(fma, M_DEVBUF);
	free(ma, M_DEVBUF);
	return (error);
}

static int
iommu_bus_dmamap_load_buffer(bus_dma_tag_t dmat, bus_dmamap_t map1, void *buf,
    bus_size_t buflen, pmap_t pmap, int flags, bus_dma_segment_t *segs,
    int *segp)
{
	struct bus_dma_tag_iommu *tag;
	struct bus_dmamap_iommu *map;
	vm_page_t *ma, fma;
	vm_paddr_t pstart, pend, paddr;
	int error, i, ma_cnt, mflags, offset;

	tag = (struct bus_dma_tag_iommu *)dmat;
	map = (struct bus_dmamap_iommu *)map1;
	pstart = trunc_page((vm_offset_t)buf);
	pend = round_page((vm_offset_t)buf + buflen);
	offset = (vm_offset_t)buf & PAGE_MASK;
	ma_cnt = OFF_TO_IDX(pend - pstart);
	mflags = map->cansleep ? M_WAITOK : M_NOWAIT;
	ma = malloc(sizeof(vm_page_t) * ma_cnt, M_DEVBUF, mflags);
	if (ma == NULL)
		return (ENOMEM);
	fma = NULL;
	for (i = 0; i < ma_cnt; i++, pstart += PAGE_SIZE) {
		if (pmap == kernel_pmap)
			paddr = pmap_kextract(pstart);
		else
			paddr = pmap_extract(pmap, pstart);
		ma[i] = PHYS_TO_VM_PAGE(paddr);
		if (ma[i] == NULL || VM_PAGE_TO_PHYS(ma[i]) != paddr) {
			/*
			 * If PHYS_TO_VM_PAGE() returned NULL or the
			 * vm_page was not initialized, we'll use a
			 * fake page.
			 */
			if (fma == NULL) {
				fma = malloc(sizeof(struct vm_page) * ma_cnt,
				    M_DEVBUF, M_ZERO | mflags);
				if (fma == NULL) {
					free(ma, M_DEVBUF);
					return (ENOMEM);
				}
			}
			vm_page_initfake(&fma[i], paddr, VM_MEMATTR_DEFAULT);
			ma[i] = &fma[i];
		}
	}
	error = iommu_bus_dmamap_load_something(tag, map, ma, offset, buflen,
	    flags, segs, segp);
	free(ma, M_DEVBUF);
	free(fma, M_DEVBUF);
	return (error);
}

static void
iommu_bus_dmamap_waitok(bus_dma_tag_t dmat, bus_dmamap_t map1,
    struct memdesc *mem, bus_dmamap_callback_t *callback, void *callback_arg)
{
	struct bus_dmamap_iommu *map;

	if (map1 == NULL)
		return;
	map = (struct bus_dmamap_iommu *)map1;
	map->mem = *mem;
	map->tag = (struct bus_dma_tag_iommu *)dmat;
	map->callback = callback;
	map->callback_arg = callback_arg;
}

static bus_dma_segment_t *
iommu_bus_dmamap_complete(bus_dma_tag_t dmat, bus_dmamap_t map1,
    bus_dma_segment_t *segs, int nsegs, int error)
{
	struct bus_dma_tag_iommu *tag;
	struct bus_dmamap_iommu *map;

	tag = (struct bus_dma_tag_iommu *)dmat;
	map = (struct bus_dmamap_iommu *)map1;

	if (!map->locked) {
		KASSERT(map->cansleep,
		    ("map not locked and not sleepable context %p", map));

		/*
		 * We are called from the delayed context.  Relock the
		 * driver.
		 */
		(tag->common.lockfunc)(tag->common.lockfuncarg, BUS_DMA_LOCK);
		map->locked = true;
	}

	if (segs == NULL)
		segs = tag->segments;
	return (segs);
}

/*
 * The limitations of the busdma KPI force the IOMMU to perform the
 * actual unload, which consists of unmapping the map entries from the
 * page tables, from the delayed context on i386, since mapping a page
 * table page might require a sleep to be successful.  The unfortunate
 * consequence is that DMA requests can still be served for some time
 * after the bus_dmamap_unload() call has returned.
 *
 * On amd64, we assume that sf allocation cannot fail.
 */
static void
iommu_bus_dmamap_unload(bus_dma_tag_t dmat, bus_dmamap_t map1)
{
	struct bus_dma_tag_iommu *tag;
	struct bus_dmamap_iommu *map;
	struct iommu_ctx *ctx;
	struct iommu_domain *domain;
	struct iommu_map_entries_tailq entries;

	tag = (struct bus_dma_tag_iommu *)dmat;
	map = (struct bus_dmamap_iommu *)map1;
	ctx = tag->ctx;
	domain = ctx->domain;
	atomic_add_long(&ctx->unloads, 1);

	TAILQ_INIT(&entries);
	IOMMU_DMAMAP_LOCK(map);
	TAILQ_CONCAT(&entries, &map->map_entries, dmamap_link);
	IOMMU_DMAMAP_UNLOCK(map);
#if defined(IOMMU_DOMAIN_UNLOAD_SLEEP)
	IOMMU_DOMAIN_LOCK(domain);
	TAILQ_CONCAT(&domain->unload_entries, &entries, dmamap_link);
	IOMMU_DOMAIN_UNLOCK(domain);
	taskqueue_enqueue(domain->iommu->delayed_taskqueue,
	    &domain->unload_task);
#else
	THREAD_NO_SLEEPING();
	iommu_domain_unload(domain, &entries, false);
	THREAD_SLEEPING_OK();
	KASSERT(TAILQ_EMPTY(&entries), ("lazy iommu_ctx_unload %p", ctx));
#endif
}

static void
iommu_bus_dmamap_sync(bus_dma_tag_t dmat, bus_dmamap_t map1,
    bus_dmasync_op_t op)
{
	struct bus_dmamap_iommu *map __unused;

	map = (struct bus_dmamap_iommu *)map1;
	kmsan_bus_dmamap_sync(&map->kmsan_mem, op);
}

#ifdef KMSAN
static void
iommu_bus_dmamap_load_kmsan(bus_dmamap_t map1, struct memdesc *mem)
{
	struct bus_dmamap_iommu *map;

	map = (struct bus_dmamap_iommu *)map1;
	if (map == NULL)
		return;
	memcpy(&map->kmsan_mem, mem, sizeof(struct memdesc));
}
#endif

struct bus_dma_impl bus_dma_iommu_impl = {
	.tag_create = iommu_bus_dma_tag_create,
	.tag_destroy = iommu_bus_dma_tag_destroy,
	.tag_set_domain = iommu_bus_dma_tag_set_domain,
	.id_mapped = iommu_bus_dma_id_mapped,
	.map_create = iommu_bus_dmamap_create,
	.map_destroy = iommu_bus_dmamap_destroy,
	.mem_alloc = iommu_bus_dmamem_alloc,
	.mem_free = iommu_bus_dmamem_free,
	.load_phys = iommu_bus_dmamap_load_phys,
	.load_buffer = iommu_bus_dmamap_load_buffer,
	.load_ma = iommu_bus_dmamap_load_ma,
	.map_waitok = iommu_bus_dmamap_waitok,
	.map_complete = iommu_bus_dmamap_complete,
	.map_unload = iommu_bus_dmamap_unload,
	.map_sync = iommu_bus_dmamap_sync,
#ifdef KMSAN
	.load_kmsan = iommu_bus_dmamap_load_kmsan,
#endif
};

static void
iommu_bus_task_dmamap(void *arg, int pending)
{
	struct bus_dma_tag_iommu *tag;
	struct bus_dmamap_iommu *map;
	struct iommu_unit *unit;

	unit = arg;
	IOMMU_LOCK(unit);
	while ((map = TAILQ_FIRST(&unit->delayed_maps)) != NULL) {
		TAILQ_REMOVE(&unit->delayed_maps, map, delay_link);
		IOMMU_UNLOCK(unit);
		tag = map->tag;
		map->cansleep = true;
		map->locked = false;
		bus_dmamap_load_mem((bus_dma_tag_t)tag, (bus_dmamap_t)map,
		    &map->mem, map->callback, map->callback_arg,
		    BUS_DMA_WAITOK);
		map->cansleep = false;
		if (map->locked) {
			(tag->common.lockfunc)(tag->common.lockfuncarg,
			    BUS_DMA_UNLOCK);
		} else
			map->locked = true;
		map->cansleep = false;
		IOMMU_LOCK(unit);
	}
	IOMMU_UNLOCK(unit);
}

static void
iommu_bus_schedule_dmamap(struct iommu_unit *unit, struct bus_dmamap_iommu *map)
{

	map->locked = false;
	IOMMU_LOCK(unit);
	TAILQ_INSERT_TAIL(&unit->delayed_maps, map, delay_link);
	IOMMU_UNLOCK(unit);
	taskqueue_enqueue(unit->delayed_taskqueue, &unit->dmamap_load_task);
}

int
iommu_init_busdma(struct iommu_unit *unit)
{
	int error;

	unit->dma_enabled = 1;
	error = TUNABLE_INT_FETCH("hw.iommu.dma", &unit->dma_enabled);
	if (error == 0) /* compatibility */
		TUNABLE_INT_FETCH("hw.dmar.dma", &unit->dma_enabled);
	TAILQ_INIT(&unit->delayed_maps);
	TASK_INIT(&unit->dmamap_load_task, 0, iommu_bus_task_dmamap, unit);
	unit->delayed_taskqueue = taskqueue_create("iommu", M_WAITOK,
	    taskqueue_thread_enqueue, &unit->delayed_taskqueue);
	taskqueue_start_threads(&unit->delayed_taskqueue, 1, PI_DISK,
	    "iommu%d busdma taskq", unit->unit);
	return (0);
}
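
/*
 * Illustrative example (not part of the original file): translated
 * busdma can be disabled for the IOMMU units from loader.conf with
 *
 *	hw.iommu.dma=0		# hw.dmar.dma=0 is the compatibility spelling
 */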

void
iommu_fini_busdma(struct iommu_unit *unit)
{

	if (unit->delayed_taskqueue == NULL)
		return;

	taskqueue_drain(unit->delayed_taskqueue, &unit->dmamap_load_task);
	taskqueue_free(unit->delayed_taskqueue);
	unit->delayed_taskqueue = NULL;
}

int
bus_dma_iommu_load_ident(bus_dma_tag_t dmat, bus_dmamap_t map1,
    vm_paddr_t start, vm_size_t length, int flags)
{
	struct bus_dma_tag_common *tc;
	struct bus_dma_tag_iommu *tag;
	struct bus_dmamap_iommu *map;
	struct iommu_ctx *ctx;
	struct iommu_domain *domain;
	struct iommu_map_entry *entry;
	vm_page_t *ma;
	vm_size_t i;
	int error;
	bool waitok;

	MPASS((start & PAGE_MASK) == 0);
	MPASS((length & PAGE_MASK) == 0);
	MPASS(length > 0);
	MPASS(start + length >= start);
	MPASS((flags & ~(BUS_DMA_NOWAIT | BUS_DMA_NOWRITE)) == 0);

	tc = (struct bus_dma_tag_common *)dmat;
	if (tc->impl != &bus_dma_iommu_impl)
		return (0);

	tag = (struct bus_dma_tag_iommu *)dmat;
	ctx = tag->ctx;
	domain = ctx->domain;
	map = (struct bus_dmamap_iommu *)map1;
	waitok = (flags & BUS_DMA_NOWAIT) != 0;

	entry = iommu_gas_alloc_entry(domain, waitok ? 0 : IOMMU_PGF_WAITOK);
	if (entry == NULL)
		return (ENOMEM);
	entry->start = start;
	entry->end = start + length;
	ma = malloc(sizeof(vm_page_t) * atop(length), M_TEMP, waitok ?
	    M_WAITOK : M_NOWAIT);
	if (ma == NULL) {
		iommu_gas_free_entry(entry);
		return (ENOMEM);
	}
	for (i = 0; i < atop(length); i++) {
		ma[i] = vm_page_getfake(entry->start + PAGE_SIZE * i,
		    VM_MEMATTR_DEFAULT);
	}
	error = iommu_gas_map_region(domain, entry, IOMMU_MAP_ENTRY_READ |
	    ((flags & BUS_DMA_NOWRITE) ? 0 : IOMMU_MAP_ENTRY_WRITE) |
	    IOMMU_MAP_ENTRY_MAP, waitok ? IOMMU_MF_CANWAIT : 0, ma);
	if (error == 0) {
		IOMMU_DMAMAP_LOCK(map);
		TAILQ_INSERT_TAIL(&map->map_entries, entry, dmamap_link);
		IOMMU_DMAMAP_UNLOCK(map);
	} else {
		iommu_gas_free_entry(entry);
	}
	for (i = 0; i < atop(length); i++)
		vm_page_putfake(ma[i]);
	free(ma, M_TEMP);
	return (error);
}
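
/*
 * Usage sketch (an assumption, with a hypothetical address range): a
 * driver that must expose a fixed physical window 1:1 to its device can
 * pre-load that window into an otherwise ordinary map.
 */
#if 0
	error = bus_dma_iommu_load_ident(dmat, map, 0x80000000ul /* paddr */,
	    16 * PAGE_SIZE, BUS_DMA_NOWAIT);
#endif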

static void
iommu_domain_unload_task(void *arg, int pending)
{
	struct iommu_domain *domain;
	struct iommu_map_entries_tailq entries;

	domain = arg;
	TAILQ_INIT(&entries);

	for (;;) {
		IOMMU_DOMAIN_LOCK(domain);
		TAILQ_SWAP(&domain->unload_entries, &entries,
		    iommu_map_entry, dmamap_link);
		IOMMU_DOMAIN_UNLOCK(domain);
		if (TAILQ_EMPTY(&entries))
			break;
		iommu_domain_unload(domain, &entries, true);
	}
}

void
iommu_domain_init(struct iommu_unit *unit, struct iommu_domain *domain,
    const struct iommu_domain_map_ops *ops)
{

	domain->ops = ops;
	domain->iommu = unit;

	TASK_INIT(&domain->unload_task, 0, iommu_domain_unload_task, domain);
	RB_INIT(&domain->rb_root);
	TAILQ_INIT(&domain->unload_entries);
	mtx_init(&domain->lock, "iodom", NULL, MTX_DEF);
}

void
iommu_domain_fini(struct iommu_domain *domain)
{

	mtx_destroy(&domain->lock);
}
1103