xref: /freebsd/sys/dev/iommu/iommu_gas.c (revision 0078721898754f6e71063e1f566c8671288a2218)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2013 The FreeBSD Foundation
5  *
6  * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
7  * under sponsorship from the FreeBSD Foundation.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #define	RB_AUGMENT_CHECK(entry) iommu_gas_augment_entry(entry)
35 
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/malloc.h>
39 #include <sys/bus.h>
40 #include <sys/interrupt.h>
41 #include <sys/kernel.h>
42 #include <sys/ktr.h>
43 #include <sys/lock.h>
44 #include <sys/proc.h>
45 #include <sys/rwlock.h>
46 #include <sys/memdesc.h>
47 #include <sys/mutex.h>
48 #include <sys/sysctl.h>
49 #include <sys/rman.h>
50 #include <sys/taskqueue.h>
51 #include <sys/tree.h>
52 #include <sys/uio.h>
53 #include <sys/vmem.h>
54 #include <vm/vm.h>
55 #include <vm/vm_extern.h>
56 #include <vm/vm_kern.h>
57 #include <vm/vm_object.h>
58 #include <vm/vm_page.h>
59 #include <vm/vm_map.h>
60 #include <vm/uma.h>
61 #include <dev/pci/pcireg.h>
62 #include <dev/pci/pcivar.h>
63 #include <dev/iommu/iommu.h>
64 #include <dev/iommu/iommu_gas.h>
65 #include <dev/iommu/iommu_msi.h>
66 #include <machine/atomic.h>
67 #include <machine/bus.h>
68 #include <machine/md_var.h>
69 #include <machine/iommu.h>
70 #include <dev/iommu/busdma_iommu.h>
71 
72 /*
73  * Guest Address Space management.
74  */
75 
/* UMA zone from which every struct iommu_map_entry in this file is allocated. */
static uma_zone_t iommu_map_entry_zone;

#ifdef INVARIANTS
/*
 * When non-zero, iommu_gas_check_free() is run after map and free-space
 * operations.  Exposed as the hw.iommu.check_free sysctl at the end of
 * this file.
 */
static int iommu_check_free;
#endif
81 
82 static void
83 intel_gas_init(void)
84 {
85 
86 	iommu_map_entry_zone = uma_zcreate("IOMMU_MAP_ENTRY",
87 	    sizeof(struct iommu_map_entry), NULL, NULL,
88 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NODUMP);
89 }
90 SYSINIT(intel_gas, SI_SUB_DRIVERS, SI_ORDER_FIRST, intel_gas_init, NULL);
91 
92 struct iommu_map_entry *
93 iommu_gas_alloc_entry(struct iommu_domain *domain, u_int flags)
94 {
95 	struct iommu_map_entry *res;
96 
97 	KASSERT((flags & ~(IOMMU_PGF_WAITOK)) == 0,
98 	    ("unsupported flags %x", flags));
99 
100 	res = uma_zalloc(iommu_map_entry_zone, ((flags & IOMMU_PGF_WAITOK) !=
101 	    0 ? M_WAITOK : M_NOWAIT) | M_ZERO);
102 	if (res != NULL && domain != NULL) {
103 		res->domain = domain;
104 		atomic_add_int(&domain->entries_cnt, 1);
105 	}
106 	return (res);
107 }
108 
109 void
110 iommu_gas_free_entry(struct iommu_map_entry *entry)
111 {
112 	struct iommu_domain *domain;
113 
114 	domain = entry->domain;
115 	if (domain != NULL)
116 		atomic_subtract_int(&domain->entries_cnt, 1);
117 	uma_zfree(iommu_map_entry_zone, entry);
118 }
119 
120 static int
121 iommu_gas_cmp_entries(struct iommu_map_entry *a, struct iommu_map_entry *b)
122 {
123 
124 	/* Last entry have zero size, so <= */
125 	KASSERT(a->start <= a->end, ("inverted entry %p (%jx, %jx)",
126 	    a, (uintmax_t)a->start, (uintmax_t)a->end));
127 	KASSERT(b->start <= b->end, ("inverted entry %p (%jx, %jx)",
128 	    b, (uintmax_t)b->start, (uintmax_t)b->end));
129 	KASSERT(a->end <= b->start || b->end <= a->start ||
130 	    a->end == a->start || b->end == b->start,
131 	    ("overlapping entries %p (%jx, %jx) %p (%jx, %jx)",
132 	    a, (uintmax_t)a->start, (uintmax_t)a->end,
133 	    b, (uintmax_t)b->start, (uintmax_t)b->end));
134 
135 	if (a->end < b->end)
136 		return (-1);
137 	else if (b->end < a->end)
138 		return (1);
139 	return (0);
140 }
141 
142 /*
143  * Update augmentation data based on data from children.
144  * Return true if and only if the update changes the augmentation data.
145  */
146 static bool
147 iommu_gas_augment_entry(struct iommu_map_entry *entry)
148 {
149 	struct iommu_map_entry *child;
150 	iommu_gaddr_t bound, delta, free_down;
151 
152 	free_down = 0;
153 	bound = entry->start;
154 	if ((child = RB_LEFT(entry, rb_entry)) != NULL) {
155 		free_down = MAX(child->free_down, bound - child->last);
156 		bound = child->first;
157 	}
158 	delta = bound - entry->first;
159 	entry->first = bound;
160 	bound = entry->end;
161 	if ((child = RB_RIGHT(entry, rb_entry)) != NULL) {
162 		free_down = MAX(free_down, child->free_down);
163 		free_down = MAX(free_down, child->first - bound);
164 		bound = child->last;
165 	}
166 	delta += entry->last - bound;
167 	if (delta == 0)
168 		delta = entry->free_down - free_down;
169 	entry->last = bound;
170 	entry->free_down = free_down;
171 
172 	/*
173 	 * Return true either if the value of last-first changed,
174 	 * or if free_down changed.
175 	 */
176 	return (delta != 0);
177 }
178 
/*
 * Instantiate the red-black tree functions for iommu_gas_entries_tree,
 * ordered by iommu_gas_cmp_entries.  RB_AUGMENT_CHECK is defined near the
 * top of this file to call iommu_gas_augment_entry.
 */
RB_GENERATE(iommu_gas_entries_tree, iommu_map_entry, rb_entry,
    iommu_gas_cmp_entries);
181 
#ifdef INVARIANTS
/*
 * Debug check: for every entry of the domain's tree, recompute free_down
 * from the entry's children and assert that it matches the cached value.
 * Also asserts that each entry belongs to 'domain'.
 */
static void
iommu_gas_check_free(struct iommu_domain *domain)
{
	struct iommu_map_entry *entry, *left, *right;
	iommu_gaddr_t expect;

	RB_FOREACH(entry, iommu_gas_entries_tree, &domain->rb_root) {
		KASSERT(domain == entry->domain,
		    ("mismatched free domain %p entry %p entry->domain %p",
		    domain, entry, entry->domain));

		expect = 0;
		left = RB_LEFT(entry, rb_entry);
		if (left != NULL) {
			expect = MAX(expect, left->free_down);
			expect = MAX(expect, entry->start - left->last);
		}
		right = RB_RIGHT(entry, rb_entry);
		if (right != NULL) {
			expect = MAX(expect, right->free_down);
			expect = MAX(expect, right->first - entry->end);
		}
		MPASS(entry->free_down == expect);
	}
}
#endif
208 
209 static void
210 iommu_gas_rb_remove(struct iommu_domain *domain, struct iommu_map_entry *entry)
211 {
212 	struct iommu_map_entry *nbr;
213 
214 	/* Removing entry may open a new free gap before domain->start_gap. */
215 	if (entry->end <= domain->start_gap->end) {
216 		if (RB_RIGHT(entry, rb_entry) != NULL)
217 			nbr = iommu_gas_entries_tree_RB_NEXT(entry);
218 		else if (RB_LEFT(entry, rb_entry) != NULL)
219 			nbr = RB_LEFT(entry, rb_entry);
220 		else
221 			nbr = RB_PARENT(entry, rb_entry);
222 		domain->start_gap = nbr;
223 	}
224 	RB_REMOVE(iommu_gas_entries_tree, &domain->rb_root, entry);
225 }
226 
227 struct iommu_domain *
228 iommu_get_ctx_domain(struct iommu_ctx *ctx)
229 {
230 
231 	return (ctx->domain);
232 }
233 
234 void
235 iommu_gas_init_domain(struct iommu_domain *domain)
236 {
237 	struct iommu_map_entry *begin, *end;
238 
239 	begin = iommu_gas_alloc_entry(domain, IOMMU_PGF_WAITOK);
240 	end = iommu_gas_alloc_entry(domain, IOMMU_PGF_WAITOK);
241 
242 	IOMMU_DOMAIN_LOCK(domain);
243 	KASSERT(domain->entries_cnt == 2, ("dirty domain %p", domain));
244 	KASSERT(RB_EMPTY(&domain->rb_root),
245 	    ("non-empty entries %p", domain));
246 
247 	/*
248 	 * The end entry must be inserted first because it has a zero-length gap
249 	 * between start and end.  Initially, all augmentation data for a new
250 	 * entry is zero.  Function iommu_gas_augment_entry will compute no
251 	 * change in the value of (start-end) and no change in the value of
252 	 * free_down, so it will return false to suggest that nothing changed in
253 	 * the entry.  Thus, inserting the end entry second prevents
254 	 * augmentation information to be propogated to the begin entry at the
255 	 * tree root.  So it is inserted first.
256 	 */
257 	end->start = domain->end;
258 	end->end = domain->end;
259 	end->flags = IOMMU_MAP_ENTRY_PLACE | IOMMU_MAP_ENTRY_UNMAPPED;
260 	RB_INSERT(iommu_gas_entries_tree, &domain->rb_root, end);
261 
262 	begin->start = 0;
263 	begin->end = IOMMU_PAGE_SIZE;
264 	begin->flags = IOMMU_MAP_ENTRY_PLACE | IOMMU_MAP_ENTRY_UNMAPPED;
265 	RB_INSERT_PREV(iommu_gas_entries_tree, &domain->rb_root, end, begin);
266 
267 	domain->start_gap = end;
268 	domain->first_place = begin;
269 	domain->last_place = end;
270 	domain->flags |= IOMMU_DOMAIN_GAS_INITED;
271 	IOMMU_DOMAIN_UNLOCK(domain);
272 }
273 
274 void
275 iommu_gas_fini_domain(struct iommu_domain *domain)
276 {
277 	struct iommu_map_entry *entry;
278 
279 	IOMMU_DOMAIN_ASSERT_LOCKED(domain);
280 	KASSERT(domain->entries_cnt == 2,
281 	    ("domain still in use %p", domain));
282 
283 	entry = RB_MIN(iommu_gas_entries_tree, &domain->rb_root);
284 	KASSERT(entry->start == 0, ("start entry start %p", domain));
285 	KASSERT(entry->end == IOMMU_PAGE_SIZE, ("start entry end %p", domain));
286 	KASSERT(entry->flags ==
287 	    (IOMMU_MAP_ENTRY_PLACE | IOMMU_MAP_ENTRY_UNMAPPED),
288 	    ("start entry flags %p", domain));
289 	iommu_gas_rb_remove(domain, entry);
290 	iommu_gas_free_entry(entry);
291 
292 	entry = RB_MAX(iommu_gas_entries_tree, &domain->rb_root);
293 	KASSERT(entry->start == domain->end, ("end entry start %p", domain));
294 	KASSERT(entry->end == domain->end, ("end entry end %p", domain));
295 	KASSERT(entry->flags ==
296 	    (IOMMU_MAP_ENTRY_PLACE | IOMMU_MAP_ENTRY_UNMAPPED),
297 	    ("end entry flags %p", domain));
298 	iommu_gas_rb_remove(domain, entry);
299 	iommu_gas_free_entry(entry);
300 }
301 
/*
 * Arguments of one free-space search, filled in by iommu_gas_map() and
 * consumed by iommu_gas_find_space() and iommu_gas_match_one().
 */
struct iommu_gas_match_args {
	iommu_gaddr_t size;		/* requested size */
	int offset;			/* offset added to the start address */
	/* tag supplying alignment, boundary, lowaddr and highaddr */
	const struct bus_dma_tag_common *common;
	u_int gas_flags;		/* IOMMU_MF_* flags; CANSPLIT is checked */
	struct iommu_map_entry *entry;	/* entry filled in on a match */
};
309 
310 /*
311  * The interval [beg, end) is a free interval between two iommu_map_entries.
312  * Addresses can be allocated only in the range [lbound, ubound). Try to
313  * allocate space in the free interval, subject to the conditions expressed by
314  * a, and return 'true' if and only if the allocation attempt succeeds.
315  */
316 static bool
317 iommu_gas_match_one(struct iommu_gas_match_args *a, iommu_gaddr_t beg,
318     iommu_gaddr_t end, iommu_gaddr_t lbound, iommu_gaddr_t ubound)
319 {
320 	struct iommu_map_entry *entry;
321 	iommu_gaddr_t first, size, start;
322 	int offset;
323 
324 	/*
325 	 * The prev->end is always aligned on the page size, which
326 	 * causes page alignment for the entry->start too.
327 	 *
328 	 * Create IOMMU_PAGE_SIZE gaps before, after new entry
329 	 * to ensure that out-of-bounds accesses fault.
330 	 */
331 	beg = MAX(beg + IOMMU_PAGE_SIZE, lbound);
332 	start = roundup2(beg, a->common->alignment);
333 	if (start < beg)
334 		return (false);
335 	end = MIN(end - IOMMU_PAGE_SIZE, ubound);
336 	offset = a->offset;
337 	size = a->size;
338 	if (start + offset + size > end)
339 		return (false);
340 
341 	/* Check for and try to skip past boundary crossing. */
342 	if (!vm_addr_bound_ok(start + offset, size, a->common->boundary)) {
343 		/*
344 		 * The start + offset to start + offset + size region crosses
345 		 * the boundary.  Check if there is enough space after the next
346 		 * boundary after the beg.
347 		 */
348 		first = start;
349 		beg = roundup2(start + offset + 1, a->common->boundary);
350 		start = roundup2(beg, a->common->alignment);
351 
352 		if (start + offset + size > end ||
353 		    !vm_addr_bound_ok(start + offset, size,
354 		    a->common->boundary)) {
355 			/*
356 			 * Not enough space to align at the requested boundary,
357 			 * or boundary is smaller than the size, but allowed to
358 			 * split.  We already checked that start + size does not
359 			 * overlap ubound.
360 			 *
361 			 * XXXKIB. It is possible that beg is exactly at the
362 			 * start of the next entry, then we do not have gap.
363 			 * Ignore for now.
364 			 */
365 			if ((a->gas_flags & IOMMU_MF_CANSPLIT) == 0)
366 				return (false);
367 			size = beg - first - offset;
368 			start = first;
369 		}
370 	}
371 	entry = a->entry;
372 	entry->start = start;
373 	entry->end = start + roundup2(size + offset, IOMMU_PAGE_SIZE);
374 	entry->flags = IOMMU_MAP_ENTRY_MAP;
375 	return (true);
376 }
377 
378 /* Find the next entry that might abut a big-enough range. */
379 static struct iommu_map_entry *
380 iommu_gas_next(struct iommu_map_entry *curr, iommu_gaddr_t min_free)
381 {
382 	struct iommu_map_entry *next;
383 
384 	if ((next = RB_RIGHT(curr, rb_entry)) != NULL &&
385 	    next->free_down >= min_free) {
386 		/* Find next entry in right subtree. */
387 		do
388 			curr = next;
389 		while ((next = RB_LEFT(curr, rb_entry)) != NULL &&
390 		    next->free_down >= min_free);
391 	} else {
392 		/* Find next entry in a left-parent ancestor. */
393 		while ((next = RB_PARENT(curr, rb_entry)) != NULL &&
394 		    curr == RB_RIGHT(next, rb_entry))
395 			curr = next;
396 		curr = next;
397 	}
398 	return (curr);
399 }
400 
401 /*
402  * Address-ordered first-fit search of 'domain' for free space satisfying the
403  * conditions of 'a'.  The space allocated is at least one page big, and is
404  * bounded by guard pages to the left and right.  The allocated space for
405  * 'domain' is described by an rb-tree of map entries at domain->rb_root, and
406  * domain->start_gap points to a map entry less than or adjacent to the first
407  * free-space of size at least 3 pages.
408  */
409 static int
410 iommu_gas_find_space(struct iommu_domain *domain,
411     struct iommu_gas_match_args *a)
412 {
413 	struct iommu_map_entry *curr, *first;
414 	iommu_gaddr_t addr, min_free;
415 
416 	IOMMU_DOMAIN_ASSERT_LOCKED(domain);
417 	KASSERT(a->entry->flags == 0,
418 	    ("dirty entry %p %p", domain, a->entry));
419 
420 	/*
421 	 * start_gap may point to an entry adjacent to gaps too small for any
422 	 * new allocation.  In that case, advance start_gap to the first free
423 	 * space big enough for a minimum allocation plus two guard pages.
424 	 */
425 	min_free = 3 * IOMMU_PAGE_SIZE;
426 	first = domain->start_gap;
427 	while (first != NULL && first->free_down < min_free)
428 		first = RB_PARENT(first, rb_entry);
429 	for (curr = first; curr != NULL;
430 	    curr = iommu_gas_next(curr, min_free)) {
431 		if ((first = RB_LEFT(curr, rb_entry)) != NULL &&
432 		    first->last + min_free <= curr->start)
433 			break;
434 		if ((first = RB_RIGHT(curr, rb_entry)) != NULL &&
435 		    curr->end + min_free <= first->first)
436 			break;
437 	}
438 	domain->start_gap = curr;
439 
440 	/*
441 	 * If the subtree doesn't have free space for the requested allocation
442 	 * plus two guard pages, skip it.
443 	 */
444 	min_free = 2 * IOMMU_PAGE_SIZE +
445 	    roundup2(a->size + a->offset, IOMMU_PAGE_SIZE);
446 
447 	/* Climb to find a node in the subtree of big-enough ranges. */
448 	first = curr;
449 	while (first != NULL && first->free_down < min_free)
450 		first = RB_PARENT(first, rb_entry);
451 
452 	/*
453 	 * Walk the big-enough ranges tree until one satisfies alignment
454 	 * requirements, or violates lowaddr address requirement.
455 	 */
456 	addr = a->common->lowaddr + 1;
457 	for (curr = first; curr != NULL;
458 	    curr = iommu_gas_next(curr, min_free)) {
459 		if ((first = RB_LEFT(curr, rb_entry)) != NULL &&
460 		    iommu_gas_match_one(a, first->last, curr->start,
461 		    0, addr)) {
462 			RB_INSERT_PREV(iommu_gas_entries_tree,
463 			    &domain->rb_root, curr, a->entry);
464 			return (0);
465 		}
466 		if (curr->end >= addr) {
467 			/* All remaining ranges >= addr */
468 			break;
469 		}
470 		if ((first = RB_RIGHT(curr, rb_entry)) != NULL &&
471 		    iommu_gas_match_one(a, curr->end, first->first,
472 		    0, addr)) {
473 			RB_INSERT_NEXT(iommu_gas_entries_tree,
474 			    &domain->rb_root, curr, a->entry);
475 			return (0);
476 		}
477 	}
478 
479 	/*
480 	 * To resume the search at the start of the upper region, first climb to
481 	 * the nearest ancestor that spans highaddr.  Then find the last entry
482 	 * before highaddr that could abut a big-enough range.
483 	 */
484 	addr = a->common->highaddr;
485 	while (curr != NULL && curr->last < addr)
486 		curr = RB_PARENT(curr, rb_entry);
487 	first = NULL;
488 	while (curr != NULL && curr->free_down >= min_free) {
489 		if (addr < curr->end)
490 			curr = RB_LEFT(curr, rb_entry);
491 		else {
492 			first = curr;
493 			curr = RB_RIGHT(curr, rb_entry);
494 		}
495 	}
496 
497 	/*
498 	 * Walk the remaining big-enough ranges until one satisfies alignment
499 	 * requirements.
500 	 */
501 	for (curr = first; curr != NULL;
502 	    curr = iommu_gas_next(curr, min_free)) {
503 		if ((first = RB_LEFT(curr, rb_entry)) != NULL &&
504 		    iommu_gas_match_one(a, first->last, curr->start,
505 		    addr + 1, domain->end)) {
506 			RB_INSERT_PREV(iommu_gas_entries_tree,
507 			    &domain->rb_root, curr, a->entry);
508 			return (0);
509 		}
510 		if ((first = RB_RIGHT(curr, rb_entry)) != NULL &&
511 		    iommu_gas_match_one(a, curr->end, first->first,
512 		    addr + 1, domain->end)) {
513 			RB_INSERT_NEXT(iommu_gas_entries_tree,
514 			    &domain->rb_root, curr, a->entry);
515 			return (0);
516 		}
517 	}
518 
519 	return (ENOMEM);
520 }
521 
522 static int
523 iommu_gas_alloc_region(struct iommu_domain *domain, struct iommu_map_entry *entry,
524     u_int flags)
525 {
526 	struct iommu_map_entry *next, *prev;
527 
528 	IOMMU_DOMAIN_ASSERT_LOCKED(domain);
529 
530 	if ((entry->start & IOMMU_PAGE_MASK) != 0 ||
531 	    (entry->end & IOMMU_PAGE_MASK) != 0)
532 		return (EINVAL);
533 	if (entry->start >= entry->end)
534 		return (EINVAL);
535 	if (entry->end >= domain->end)
536 		return (EINVAL);
537 
538 	next = RB_NFIND(iommu_gas_entries_tree, &domain->rb_root, entry);
539 	KASSERT(next != NULL, ("next must be non-null %p %jx", domain,
540 	    (uintmax_t)entry->start));
541 	prev = RB_PREV(iommu_gas_entries_tree, &domain->rb_root, next);
542 	/* prev could be NULL */
543 
544 	/*
545 	 * Adapt to broken BIOSes which specify overlapping RMRR
546 	 * entries.
547 	 *
548 	 * XXXKIB: this does not handle a case when prev or next
549 	 * entries are completely covered by the current one, which
550 	 * extends both ways.
551 	 */
552 	if (prev != NULL && prev->end > entry->start &&
553 	    (prev->flags & IOMMU_MAP_ENTRY_PLACE) == 0) {
554 		if ((flags & IOMMU_MF_RMRR) == 0 ||
555 		    (prev->flags & IOMMU_MAP_ENTRY_RMRR) == 0)
556 			return (EBUSY);
557 		entry->start = prev->end;
558 	}
559 	if (next->start < entry->end &&
560 	    (next->flags & IOMMU_MAP_ENTRY_PLACE) == 0) {
561 		if ((flags & IOMMU_MF_RMRR) == 0 ||
562 		    (next->flags & IOMMU_MAP_ENTRY_RMRR) == 0)
563 			return (EBUSY);
564 		entry->end = next->start;
565 	}
566 	if (entry->end == entry->start)
567 		return (0);
568 
569 	if (prev != NULL && prev->end > entry->start) {
570 		/* This assumes that prev is the placeholder entry. */
571 		iommu_gas_rb_remove(domain, prev);
572 		prev = NULL;
573 	}
574 	RB_INSERT_PREV(iommu_gas_entries_tree,
575 	    &domain->rb_root, next, entry);
576 	if (next->start < entry->end) {
577 		iommu_gas_rb_remove(domain, next);
578 		next = NULL;
579 	}
580 
581 	if ((flags & IOMMU_MF_RMRR) != 0)
582 		entry->flags = IOMMU_MAP_ENTRY_RMRR;
583 
584 #ifdef INVARIANTS
585 	struct iommu_map_entry *ip, *in;
586 	ip = RB_PREV(iommu_gas_entries_tree, &domain->rb_root, entry);
587 	in = RB_NEXT(iommu_gas_entries_tree, &domain->rb_root, entry);
588 	KASSERT(prev == NULL || ip == prev,
589 	    ("RMRR %p (%jx %jx) prev %p (%jx %jx) ins prev %p (%jx %jx)",
590 	    entry, entry->start, entry->end, prev,
591 	    prev == NULL ? 0 : prev->start, prev == NULL ? 0 : prev->end,
592 	    ip, ip == NULL ? 0 : ip->start, ip == NULL ? 0 : ip->end));
593 	KASSERT(next == NULL || in == next,
594 	    ("RMRR %p (%jx %jx) next %p (%jx %jx) ins next %p (%jx %jx)",
595 	    entry, entry->start, entry->end, next,
596 	    next == NULL ? 0 : next->start, next == NULL ? 0 : next->end,
597 	    in, in == NULL ? 0 : in->start, in == NULL ? 0 : in->end));
598 #endif
599 
600 	return (0);
601 }
602 
603 void
604 iommu_gas_free_space(struct iommu_map_entry *entry)
605 {
606 	struct iommu_domain *domain;
607 
608 	domain = entry->domain;
609 	KASSERT((entry->flags & (IOMMU_MAP_ENTRY_PLACE | IOMMU_MAP_ENTRY_RMRR |
610 	    IOMMU_MAP_ENTRY_MAP)) == IOMMU_MAP_ENTRY_MAP,
611 	    ("permanent entry %p %p", domain, entry));
612 
613 	IOMMU_DOMAIN_LOCK(domain);
614 	iommu_gas_rb_remove(domain, entry);
615 	entry->flags &= ~IOMMU_MAP_ENTRY_MAP;
616 #ifdef INVARIANTS
617 	if (iommu_check_free)
618 		iommu_gas_check_free(domain);
619 #endif
620 	IOMMU_DOMAIN_UNLOCK(domain);
621 }
622 
623 void
624 iommu_gas_free_region(struct iommu_map_entry *entry)
625 {
626 	struct iommu_domain *domain;
627 
628 	domain = entry->domain;
629 	KASSERT((entry->flags & (IOMMU_MAP_ENTRY_PLACE | IOMMU_MAP_ENTRY_RMRR |
630 	    IOMMU_MAP_ENTRY_MAP)) == IOMMU_MAP_ENTRY_RMRR,
631 	    ("non-RMRR entry %p %p", domain, entry));
632 
633 	IOMMU_DOMAIN_LOCK(domain);
634 	if (entry != domain->first_place &&
635 	    entry != domain->last_place)
636 		iommu_gas_rb_remove(domain, entry);
637 	entry->flags &= ~IOMMU_MAP_ENTRY_RMRR;
638 	IOMMU_DOMAIN_UNLOCK(domain);
639 }
640 
641 static struct iommu_map_entry *
642 iommu_gas_remove_clip_left(struct iommu_domain *domain, iommu_gaddr_t start,
643     iommu_gaddr_t end, struct iommu_map_entry **r)
644 {
645 	struct iommu_map_entry *entry, *res, fentry;
646 
647 	IOMMU_DOMAIN_ASSERT_LOCKED(domain);
648 	MPASS(start <= end);
649 	MPASS(end <= domain->end);
650 
651 	/*
652 	 * Find an entry which contains the supplied guest's address
653 	 * start, or the first entry after the start.  Since we
654 	 * asserted that start is below domain end, entry should
655 	 * exist.  Then clip it if needed.
656 	 */
657 	fentry.start = start + 1;
658 	fentry.end = start + 1;
659 	entry = RB_NFIND(iommu_gas_entries_tree, &domain->rb_root, &fentry);
660 
661 	if (entry->start >= start ||
662 	    (entry->flags & IOMMU_MAP_ENTRY_RMRR) != 0)
663 		return (entry);
664 
665 	res = *r;
666 	*r = NULL;
667 	*res = *entry;
668 	res->start = entry->end = start;
669 	RB_UPDATE_AUGMENT(entry, rb_entry);
670 	RB_INSERT_NEXT(iommu_gas_entries_tree,
671 	    &domain->rb_root, entry, res);
672 	return (res);
673 }
674 
675 static bool
676 iommu_gas_remove_clip_right(struct iommu_domain *domain,
677     iommu_gaddr_t end, struct iommu_map_entry *entry,
678     struct iommu_map_entry *r)
679 {
680 	if (entry->start >= end || (entry->flags & IOMMU_MAP_ENTRY_RMRR) != 0)
681 		return (false);
682 
683 	*r = *entry;
684 	r->end = entry->start = end;
685 	RB_UPDATE_AUGMENT(entry, rb_entry);
686 	RB_INSERT_PREV(iommu_gas_entries_tree,
687 	    &domain->rb_root, entry, r);
688 	return (true);
689 }
690 
691 static void
692 iommu_gas_remove_unmap(struct iommu_domain *domain,
693     struct iommu_map_entry *entry, struct iommu_map_entries_tailq *gcp)
694 {
695 	IOMMU_DOMAIN_ASSERT_LOCKED(domain);
696 
697 	if ((entry->flags & (IOMMU_MAP_ENTRY_UNMAPPED |
698 	    IOMMU_MAP_ENTRY_REMOVING)) != 0)
699 		return;
700 	MPASS((entry->flags & IOMMU_MAP_ENTRY_PLACE) == 0);
701 	entry->flags |= IOMMU_MAP_ENTRY_REMOVING;
702 	TAILQ_INSERT_TAIL(gcp, entry, dmamap_link);
703 }
704 
705 /*
706  * Remove specified range from the GAS of the domain.  Note that the
707  * removal is not guaranteed to occur upon the function return, it
708  * might be finalized some time after, when hardware reports that
709  * (queued) IOTLB invalidation was performed.
710  */
711 void
712 iommu_gas_remove(struct iommu_domain *domain, iommu_gaddr_t start,
713     iommu_gaddr_t size)
714 {
715 	struct iommu_map_entry *entry, *nentry, *r1, *r2;
716 	struct iommu_map_entries_tailq gc;
717 	iommu_gaddr_t end;
718 
719 	end = start + size;
720 	r1 = iommu_gas_alloc_entry(domain, IOMMU_PGF_WAITOK);
721 	r2 = iommu_gas_alloc_entry(domain, IOMMU_PGF_WAITOK);
722 	TAILQ_INIT(&gc);
723 
724 	IOMMU_DOMAIN_LOCK(domain);
725 
726 	nentry = iommu_gas_remove_clip_left(domain, start, end, &r1);
727 	RB_FOREACH_FROM(entry, iommu_gas_entries_tree, nentry) {
728 		if (entry->start >= end)
729 			break;
730 		KASSERT(start <= entry->start,
731 		    ("iommu_gas_remove entry (%#jx, %#jx) start %#jx",
732 		    entry->start, entry->end, start));
733 		if ((entry->flags & IOMMU_MAP_ENTRY_RMRR) != 0)
734 			continue;
735 		iommu_gas_remove_unmap(domain, entry, &gc);
736 	}
737 	if (iommu_gas_remove_clip_right(domain, end, entry, r2)) {
738 		iommu_gas_remove_unmap(domain, r2, &gc);
739 		r2 = NULL;
740 	}
741 
742 #ifdef INVARIANTS
743 	RB_FOREACH(entry, iommu_gas_entries_tree, &domain->rb_root) {
744 		if ((entry->flags & IOMMU_MAP_ENTRY_RMRR) != 0)
745 			continue;
746 		KASSERT(entry->end <= start || entry->start >= end,
747 		    ("iommu_gas_remove leftover entry (%#jx, %#jx) range "
748 		    "(%#jx, %#jx)",
749 		    entry->start, entry->end, start, end));
750 	}
751 #endif
752 
753 	IOMMU_DOMAIN_UNLOCK(domain);
754 	if (r1 != NULL)
755 		iommu_gas_free_entry(r1);
756 	if (r2 != NULL)
757 		iommu_gas_free_entry(r2);
758 	iommu_domain_unload(domain, &gc, true);
759 }
760 
761 int
762 iommu_gas_map(struct iommu_domain *domain,
763     const struct bus_dma_tag_common *common, iommu_gaddr_t size, int offset,
764     u_int eflags, u_int flags, vm_page_t *ma, struct iommu_map_entry **res)
765 {
766 	struct iommu_gas_match_args a;
767 	struct iommu_map_entry *entry;
768 	int error;
769 
770 	KASSERT((flags & ~(IOMMU_MF_CANWAIT | IOMMU_MF_CANSPLIT)) == 0,
771 	    ("invalid flags 0x%x", flags));
772 
773 	a.size = size;
774 	a.offset = offset;
775 	a.common = common;
776 	a.gas_flags = flags;
777 	entry = iommu_gas_alloc_entry(domain,
778 	    (flags & IOMMU_MF_CANWAIT) != 0 ? IOMMU_PGF_WAITOK : 0);
779 	if (entry == NULL)
780 		return (ENOMEM);
781 	a.entry = entry;
782 	IOMMU_DOMAIN_LOCK(domain);
783 	error = iommu_gas_find_space(domain, &a);
784 	if (error == ENOMEM) {
785 		IOMMU_DOMAIN_UNLOCK(domain);
786 		iommu_gas_free_entry(entry);
787 		return (error);
788 	}
789 #ifdef INVARIANTS
790 	if (iommu_check_free)
791 		iommu_gas_check_free(domain);
792 #endif
793 	KASSERT(error == 0,
794 	    ("unexpected error %d from iommu_gas_find_entry", error));
795 	KASSERT(entry->end < domain->end, ("allocated GPA %jx, max GPA %jx",
796 	    (uintmax_t)entry->end, (uintmax_t)domain->end));
797 	entry->flags |= eflags;
798 	IOMMU_DOMAIN_UNLOCK(domain);
799 
800 	error = domain->ops->map(domain, entry->start,
801 	    entry->end - entry->start, ma, eflags,
802 	    ((flags & IOMMU_MF_CANWAIT) != 0 ? IOMMU_PGF_WAITOK : 0));
803 	if (error == ENOMEM) {
804 		iommu_domain_unload_entry(entry, true,
805 		    (flags & IOMMU_MF_CANWAIT) != 0);
806 		return (error);
807 	}
808 	KASSERT(error == 0,
809 	    ("unexpected error %d from domain_map_buf", error));
810 
811 	*res = entry;
812 	return (0);
813 }
814 
815 int
816 iommu_gas_map_region(struct iommu_domain *domain, struct iommu_map_entry *entry,
817     u_int eflags, u_int flags, vm_page_t *ma)
818 {
819 	iommu_gaddr_t start;
820 	int error;
821 
822 	KASSERT(entry->domain == domain,
823 	    ("mismatched domain %p entry %p entry->domain %p", domain,
824 	    entry, entry->domain));
825 	KASSERT(entry->flags == 0, ("used RMRR entry %p %p %x", domain,
826 	    entry, entry->flags));
827 	KASSERT((flags & ~(IOMMU_MF_CANWAIT | IOMMU_MF_RMRR)) == 0,
828 	    ("invalid flags 0x%x", flags));
829 
830 	start = entry->start;
831 	IOMMU_DOMAIN_LOCK(domain);
832 	error = iommu_gas_alloc_region(domain, entry, flags);
833 	if (error != 0) {
834 		IOMMU_DOMAIN_UNLOCK(domain);
835 		return (error);
836 	}
837 	entry->flags |= eflags;
838 	IOMMU_DOMAIN_UNLOCK(domain);
839 	if (entry->end == entry->start)
840 		return (0);
841 
842 	error = domain->ops->map(domain, entry->start,
843 	    entry->end - entry->start, ma + OFF_TO_IDX(start - entry->start),
844 	    eflags, ((flags & IOMMU_MF_CANWAIT) != 0 ? IOMMU_PGF_WAITOK : 0));
845 	if (error == ENOMEM) {
846 		iommu_domain_unload_entry(entry, false,
847 		    (flags & IOMMU_MF_CANWAIT) != 0);
848 		return (error);
849 	}
850 	KASSERT(error == 0,
851 	    ("unexpected error %d from domain_map_buf", error));
852 
853 	return (0);
854 }
855 
856 static int
857 iommu_gas_reserve_region_locked(struct iommu_domain *domain,
858     iommu_gaddr_t start, iommu_gaddr_t end, struct iommu_map_entry *entry)
859 {
860 	int error;
861 
862 	IOMMU_DOMAIN_ASSERT_LOCKED(domain);
863 
864 	entry->start = start;
865 	entry->end = end;
866 	error = iommu_gas_alloc_region(domain, entry, IOMMU_MF_CANWAIT);
867 	if (error == 0)
868 		entry->flags |= IOMMU_MAP_ENTRY_UNMAPPED;
869 	return (error);
870 }
871 
872 int
873 iommu_gas_reserve_region(struct iommu_domain *domain, iommu_gaddr_t start,
874     iommu_gaddr_t end, struct iommu_map_entry **entry0)
875 {
876 	struct iommu_map_entry *entry;
877 	int error;
878 
879 	entry = iommu_gas_alloc_entry(domain, IOMMU_PGF_WAITOK);
880 	IOMMU_DOMAIN_LOCK(domain);
881 	error = iommu_gas_reserve_region_locked(domain, start, end, entry);
882 	IOMMU_DOMAIN_UNLOCK(domain);
883 	if (error != 0)
884 		iommu_gas_free_entry(entry);
885 	else if (entry0 != NULL)
886 		*entry0 = entry;
887 	return (error);
888 }
889 
890 /*
891  * As in iommu_gas_reserve_region, reserve [start, end), but allow for existing
892  * entries.
893  */
894 int
895 iommu_gas_reserve_region_extend(struct iommu_domain *domain,
896     iommu_gaddr_t start, iommu_gaddr_t end)
897 {
898 	struct iommu_map_entry *entry, *next, *prev, key = {};
899 	iommu_gaddr_t entry_start, entry_end;
900 	int error;
901 
902 	error = 0;
903 	entry = NULL;
904 	end = ummin(end, domain->end);
905 	while (start < end) {
906 		/* Preallocate an entry. */
907 		if (entry == NULL)
908 			entry = iommu_gas_alloc_entry(domain,
909 			    IOMMU_PGF_WAITOK);
910 		/* Calculate the free region from here to the next entry. */
911 		key.start = key.end = start;
912 		IOMMU_DOMAIN_LOCK(domain);
913 		next = RB_NFIND(iommu_gas_entries_tree, &domain->rb_root, &key);
914 		KASSERT(next != NULL, ("domain %p with end %#jx has no entry "
915 		    "after %#jx", domain, (uintmax_t)domain->end,
916 		    (uintmax_t)start));
917 		entry_end = ummin(end, next->start);
918 		prev = RB_PREV(iommu_gas_entries_tree, &domain->rb_root, next);
919 		if (prev != NULL)
920 			entry_start = ummax(start, prev->end);
921 		else
922 			entry_start = start;
923 		start = next->end;
924 		/* Reserve the region if non-empty. */
925 		if (entry_start != entry_end) {
926 			error = iommu_gas_reserve_region_locked(domain,
927 			    entry_start, entry_end, entry);
928 			if (error != 0) {
929 				IOMMU_DOMAIN_UNLOCK(domain);
930 				break;
931 			}
932 			entry = NULL;
933 		}
934 		IOMMU_DOMAIN_UNLOCK(domain);
935 	}
936 	/* Release a preallocated entry if it was not used. */
937 	if (entry != NULL)
938 		iommu_gas_free_entry(entry);
939 	return (error);
940 }
941 
942 void
943 iommu_unmap_msi(struct iommu_ctx *ctx)
944 {
945 	struct iommu_map_entry *entry;
946 	struct iommu_domain *domain;
947 
948 	domain = ctx->domain;
949 	entry = domain->msi_entry;
950 	if (entry == NULL)
951 		return;
952 
953 	domain->ops->unmap(domain, entry->start, entry->end -
954 	    entry->start, IOMMU_PGF_WAITOK);
955 
956 	iommu_gas_free_space(entry);
957 
958 	iommu_gas_free_entry(entry);
959 
960 	domain->msi_entry = NULL;
961 	domain->msi_base = 0;
962 	domain->msi_phys = 0;
963 }
964 
965 int
966 iommu_map_msi(struct iommu_ctx *ctx, iommu_gaddr_t size, int offset,
967     u_int eflags, u_int flags, vm_page_t *ma)
968 {
969 	struct iommu_domain *domain;
970 	struct iommu_map_entry *entry;
971 	int error;
972 
973 	error = 0;
974 	domain = ctx->domain;
975 
976 	/* Check if there is already an MSI page allocated */
977 	IOMMU_DOMAIN_LOCK(domain);
978 	entry = domain->msi_entry;
979 	IOMMU_DOMAIN_UNLOCK(domain);
980 
981 	if (entry == NULL) {
982 		error = iommu_gas_map(domain, &ctx->tag->common, size, offset,
983 		    eflags, flags, ma, &entry);
984 		IOMMU_DOMAIN_LOCK(domain);
985 		if (error == 0) {
986 			if (domain->msi_entry == NULL) {
987 				MPASS(domain->msi_base == 0);
988 				MPASS(domain->msi_phys == 0);
989 
990 				domain->msi_entry = entry;
991 				domain->msi_base = entry->start;
992 				domain->msi_phys = VM_PAGE_TO_PHYS(ma[0]);
993 			} else {
994 				/*
995 				 * We lost the race and already have an
996 				 * MSI page allocated. Free the unneeded entry.
997 				 */
998 				iommu_gas_free_entry(entry);
999 			}
1000 		} else if (domain->msi_entry != NULL) {
1001 			/*
1002 			 * The allocation failed, but another succeeded.
1003 			 * Return success as there is a valid MSI page.
1004 			 */
1005 			error = 0;
1006 		}
1007 		IOMMU_DOMAIN_UNLOCK(domain);
1008 	}
1009 
1010 	return (error);
1011 }
1012 
1013 void
1014 iommu_translate_msi(struct iommu_domain *domain, uint64_t *addr)
1015 {
1016 
1017 	*addr = (*addr - domain->msi_phys) + domain->msi_base;
1018 
1019 	KASSERT(*addr >= domain->msi_entry->start,
1020 	    ("%s: Address is below the MSI entry start address (%jx < %jx)",
1021 	    __func__, (uintmax_t)*addr, (uintmax_t)domain->msi_entry->start));
1022 
1023 	KASSERT(*addr + sizeof(*addr) <= domain->msi_entry->end,
1024 	    ("%s: Address is above the MSI entry end address (%jx < %jx)",
1025 	    __func__, (uintmax_t)*addr, (uintmax_t)domain->msi_entry->end));
1026 }
1027 
1028 SYSCTL_NODE(_hw, OID_AUTO, iommu, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "");
1029 
1030 #ifdef INVARIANTS
1031 SYSCTL_INT(_hw_iommu, OID_AUTO, check_free, CTLFLAG_RWTUN,
1032     &iommu_check_free, 0,
1033     "Check the GPA RBtree for free_down and free_after validity");
1034 #endif
1035