xref: /freebsd/sys/dev/iommu/iommu_gas.c (revision b64c5a0ace59af62eff52bfe110a521dc73c937b)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2013 The FreeBSD Foundation
5  *
6  * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
7  * under sponsorship from the FreeBSD Foundation.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
/*
 * Route the RB_AUGMENT_CHECK() hook to iommu_gas_augment_entry().  The
 * definition is placed ahead of the <sys/tree.h> include below.
 */
#define	RB_AUGMENT_CHECK(entry) iommu_gas_augment_entry(entry)
32 
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/malloc.h>
36 #include <sys/bus.h>
37 #include <sys/interrupt.h>
38 #include <sys/kernel.h>
39 #include <sys/ktr.h>
40 #include <sys/lock.h>
41 #include <sys/proc.h>
42 #include <sys/rwlock.h>
43 #include <sys/memdesc.h>
44 #include <sys/mutex.h>
45 #include <sys/sysctl.h>
46 #include <sys/rman.h>
47 #include <sys/taskqueue.h>
48 #include <sys/tree.h>
49 #include <sys/uio.h>
50 #include <sys/vmem.h>
51 #include <vm/vm.h>
52 #include <vm/vm_extern.h>
53 #include <vm/vm_kern.h>
54 #include <vm/vm_object.h>
55 #include <vm/vm_page.h>
56 #include <vm/vm_map.h>
57 #include <vm/uma.h>
58 #include <dev/pci/pcireg.h>
59 #include <dev/pci/pcivar.h>
60 #include <dev/iommu/iommu.h>
61 #include <dev/iommu/iommu_gas.h>
62 #include <dev/iommu/iommu_msi.h>
63 #include <machine/atomic.h>
64 #include <machine/bus.h>
65 #include <machine/md_var.h>
66 #include <machine/iommu.h>
67 #include <dev/iommu/busdma_iommu.h>
68 
69 /*
70  * Guest Address Space management.
71  */
72 
/* UMA zone from which struct iommu_map_entry objects are allocated. */
static uma_zone_t iommu_map_entry_zone;

#ifdef INVARIANTS
/*
 * When non-zero, iommu_gas_check_free() is run from iommu_gas_map() and
 * iommu_gas_free_space() to verify the free_down augmentation of the tree.
 */
static int iommu_check_free;
#endif
78 
79 static void
80 intel_gas_init(void)
81 {
82 
83 	iommu_map_entry_zone = uma_zcreate("IOMMU_MAP_ENTRY",
84 	    sizeof(struct iommu_map_entry), NULL, NULL,
85 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NODUMP);
86 }
87 SYSINIT(intel_gas, SI_SUB_DRIVERS, SI_ORDER_FIRST, intel_gas_init, NULL);
88 
89 struct iommu_map_entry *
90 iommu_gas_alloc_entry(struct iommu_domain *domain, u_int flags)
91 {
92 	struct iommu_map_entry *res;
93 
94 	KASSERT((flags & ~(IOMMU_PGF_WAITOK)) == 0,
95 	    ("unsupported flags %x", flags));
96 
97 	res = uma_zalloc(iommu_map_entry_zone, ((flags & IOMMU_PGF_WAITOK) !=
98 	    0 ? M_WAITOK : M_NOWAIT) | M_ZERO);
99 	if (res != NULL) {
100 		SLIST_INIT(&res->pgtbl_free);
101 		if (domain != NULL) {
102 			res->domain = domain;
103 			atomic_add_int(&domain->entries_cnt, 1);
104 		}
105 	}
106 	return (res);
107 }
108 
109 void
110 iommu_gas_free_entry(struct iommu_map_entry *entry)
111 {
112 	struct iommu_domain *domain;
113 	int n __unused;
114 
115 	n = vm_page_free_pages_toq(&entry->pgtbl_free, false);
116 #if defined(__i386__) || defined(__amd64__)
117 	atomic_subtract_int(&iommu_tbl_pagecnt, n);
118 #endif
119 	domain = entry->domain;
120 	if (domain != NULL)
121 		atomic_subtract_int(&domain->entries_cnt, 1);
122 	uma_zfree(iommu_map_entry_zone, entry);
123 }
124 
125 static int
126 iommu_gas_cmp_entries(struct iommu_map_entry *a, struct iommu_map_entry *b)
127 {
128 
129 	/* First and last entries have zero size, so <= */
130 	KASSERT(a->start <= a->end, ("inverted entry %p (%jx, %jx)",
131 	    a, (uintmax_t)a->start, (uintmax_t)a->end));
132 	KASSERT(b->start <= b->end, ("inverted entry %p (%jx, %jx)",
133 	    b, (uintmax_t)b->start, (uintmax_t)b->end));
134 	KASSERT(((a->flags | b->flags) & IOMMU_MAP_ENTRY_FAKE) != 0 ||
135 	    a->end <= b->start || b->end <= a->start ||
136 	    a->end == a->start || b->end == b->start,
137 	    ("overlapping entries %p (%jx, %jx) f %#x %p (%jx, %jx) f %#x"
138 	    " domain %p %p",
139 	    a, (uintmax_t)a->start, (uintmax_t)a->end, a->flags,
140 	    b, (uintmax_t)b->start, (uintmax_t)b->end, b->flags,
141 	    a->domain, b->domain));
142 
143 	if (a->end < b->end)
144 		return (-1);
145 	else if (b->end < a->end)
146 		return (1);
147 	return (0);
148 }
149 
/*
 * Update augmentation data based on data from children.
 * Return true if and only if the update changes the augmentation data.
 *
 * The fields recomputed for 'entry' are:
 *   first     - the left child's 'first', or entry->start when there is no
 *               left child;
 *   last      - the right child's 'last', or entry->end when there is no
 *               right child;
 *   free_down - the maximum of the children's free_down values and of the
 *               distances between this entry and each child's adjacent bound
 *               (left child's 'last', right child's 'first').
 */
static bool
iommu_gas_augment_entry(struct iommu_map_entry *entry)
{
	struct iommu_map_entry *child;
	iommu_gaddr_t bound, delta, free_down;

	free_down = 0;
	bound = entry->start;
	if ((child = RB_LEFT(entry, rb_entry)) != NULL) {
		/* Gap between the left subtree's 'last' and this entry. */
		free_down = MAX(child->free_down, bound - child->last);
		bound = child->first;
	}
	/* delta accumulates the differences between old and new first/last. */
	delta = bound - entry->first;
	entry->first = bound;
	bound = entry->end;
	if ((child = RB_RIGHT(entry, rb_entry)) != NULL) {
		free_down = MAX(free_down, child->free_down);
		/* Gap between this entry and the right subtree's 'first'. */
		free_down = MAX(free_down, child->first - bound);
		bound = child->last;
	}
	delta += entry->last - bound;
	/* If the first/last sum is unchanged, report a free_down change. */
	if (delta == 0)
		delta = entry->free_down - free_down;
	entry->last = bound;
	entry->free_down = free_down;

	/*
	 * Return true either if the value of last-first changed,
	 * or if free_down changed.
	 */
	return (delta != 0);
}
186 
/*
 * Generate the iommu_gas_entries_tree red-black tree functions for
 * struct iommu_map_entry, linked through rb_entry and ordered by
 * iommu_gas_cmp_entries().
 */
RB_GENERATE(iommu_gas_entries_tree, iommu_map_entry, rb_entry,
    iommu_gas_cmp_entries);
189 
#ifdef INVARIANTS
/*
 * Debugging aid: walk every entry of 'domain' and assert that it belongs to
 * the domain and that its free_down value equals the one recomputed from its
 * children.
 */
static void
iommu_gas_check_free(struct iommu_domain *domain)
{
	struct iommu_map_entry *entry, *left, *right;
	iommu_gaddr_t v;

	RB_FOREACH(entry, iommu_gas_entries_tree, &domain->rb_root) {
		KASSERT(domain == entry->domain,
		    ("mismatched free domain %p entry %p entry->domain %p",
		    domain, entry, entry->domain));

		v = 0;
		left = RB_LEFT(entry, rb_entry);
		if (left != NULL)
			v = MAX(left->free_down, entry->start - left->last);
		right = RB_RIGHT(entry, rb_entry);
		if (right != NULL) {
			v = MAX(v, right->free_down);
			v = MAX(v, right->first - entry->end);
		}
		MPASS(entry->free_down == v);
	}
}
#endif
216 
217 static void
218 iommu_gas_rb_remove(struct iommu_domain *domain, struct iommu_map_entry *entry)
219 {
220 	struct iommu_map_entry *nbr;
221 
222 	/* Removing entry may open a new free gap before domain->start_gap. */
223 	if (entry->end <= domain->start_gap->end) {
224 		if (RB_RIGHT(entry, rb_entry) != NULL)
225 			nbr = iommu_gas_entries_tree_RB_NEXT(entry);
226 		else if (RB_LEFT(entry, rb_entry) != NULL)
227 			nbr = RB_LEFT(entry, rb_entry);
228 		else
229 			nbr = RB_PARENT(entry, rb_entry);
230 		domain->start_gap = nbr;
231 	}
232 	RB_REMOVE(iommu_gas_entries_tree, &domain->rb_root, entry);
233 }
234 
/* Return the domain that the context 'ctx' points to. */
struct iommu_domain *
iommu_get_ctx_domain(struct iommu_ctx *ctx)
{

	return (ctx->domain);
}
241 
242 void
243 iommu_gas_init_domain(struct iommu_domain *domain)
244 {
245 	struct iommu_map_entry *begin, *end;
246 
247 	begin = iommu_gas_alloc_entry(domain, IOMMU_PGF_WAITOK);
248 	end = iommu_gas_alloc_entry(domain, IOMMU_PGF_WAITOK);
249 
250 	IOMMU_DOMAIN_LOCK(domain);
251 	KASSERT(domain->entries_cnt == 2, ("dirty domain %p", domain));
252 	KASSERT(RB_EMPTY(&domain->rb_root),
253 	    ("non-empty entries %p", domain));
254 
255 	end->start = domain->end;
256 	end->end = domain->end;
257 	end->flags = IOMMU_MAP_ENTRY_PLACE | IOMMU_MAP_ENTRY_UNMAPPED;
258 	RB_INSERT(iommu_gas_entries_tree, &domain->rb_root, end);
259 
260 	begin->start = 0;
261 	begin->end = 0;
262 	begin->flags = IOMMU_MAP_ENTRY_PLACE | IOMMU_MAP_ENTRY_UNMAPPED;
263 	RB_INSERT_PREV(iommu_gas_entries_tree, &domain->rb_root, end, begin);
264 	iommu_gas_augment_entry(end);
265 	iommu_gas_augment_entry(begin);
266 
267 	domain->start_gap = begin;
268 	domain->first_place = begin;
269 	domain->last_place = end;
270 	domain->flags |= IOMMU_DOMAIN_GAS_INITED;
271 	IOMMU_DOMAIN_UNLOCK(domain);
272 }
273 
274 void
275 iommu_gas_fini_domain(struct iommu_domain *domain)
276 {
277 	struct iommu_map_entry *entry;
278 
279 	IOMMU_DOMAIN_ASSERT_LOCKED(domain);
280 	KASSERT(domain->entries_cnt == 2,
281 	    ("domain still in use %p", domain));
282 
283 	entry = RB_MIN(iommu_gas_entries_tree, &domain->rb_root);
284 	KASSERT(entry->start == 0, ("start entry start %p", domain));
285 	KASSERT(entry->end == IOMMU_PAGE_SIZE, ("start entry end %p", domain));
286 	KASSERT(entry->flags ==
287 	    (IOMMU_MAP_ENTRY_PLACE | IOMMU_MAP_ENTRY_UNMAPPED),
288 	    ("start entry flags %p", domain));
289 	iommu_gas_rb_remove(domain, entry);
290 	iommu_gas_free_entry(entry);
291 
292 	entry = RB_MAX(iommu_gas_entries_tree, &domain->rb_root);
293 	KASSERT(entry->start == domain->end, ("end entry start %p", domain));
294 	KASSERT(entry->end == domain->end, ("end entry end %p", domain));
295 	KASSERT(entry->flags ==
296 	    (IOMMU_MAP_ENTRY_PLACE | IOMMU_MAP_ENTRY_UNMAPPED),
297 	    ("end entry flags %p", domain));
298 	iommu_gas_rb_remove(domain, entry);
299 	iommu_gas_free_entry(entry);
300 }
301 
/*
 * Parameters of one allocation request, shared between
 * iommu_gas_find_space() and iommu_gas_match_one().
 */
struct iommu_gas_match_args {
	/* Requested size; added to 'offset' when sizing the entry. */
	iommu_gaddr_t size;
	/* Offset added to the candidate start address before size checks. */
	int offset;
	/* Tag constraints: alignment, boundary, lowaddr and highaddr are used. */
	const struct bus_dma_tag_common *common;
	/* IOMMU_MF_* flags; only IOMMU_MF_CANSPLIT is consulted by the search. */
	u_int gas_flags;
	/* Preallocated entry that receives start/end/flags on success. */
	struct iommu_map_entry *entry;
};
309 
/*
 * The interval [beg, end) is a free interval between two iommu_map_entries.
 * Addresses can be allocated only in the range [lbound, ubound]. Try to
 * allocate space in the free interval, subject to the conditions expressed by
 * a, and return 'true' if and only if the allocation attempt succeeds.
 *
 * On success a->entry has its start, end and flags (IOMMU_MAP_ENTRY_MAP)
 * filled in; the entry is not inserted into the tree here.
 */
static bool
iommu_gas_match_one(struct iommu_gas_match_args *a, iommu_gaddr_t beg,
    iommu_gaddr_t end, iommu_gaddr_t lbound, iommu_gaddr_t ubound)
{
	struct iommu_map_entry *entry;
	iommu_gaddr_t first, size, start;
	int offset;

	/*
	 * The prev->end is always aligned on the page size, which
	 * causes page alignment for the entry->start too.
	 *
	 * Create IOMMU_PAGE_SIZE gaps before, after new entry
	 * to ensure that out-of-bounds accesses fault.
	 */
	beg = MAX(beg + IOMMU_PAGE_SIZE, lbound);
	start = roundup2(beg, a->common->alignment);
	/* Rounding up wrapped around the address space. */
	if (start < beg)
		return (false);
	/* Guard against underflow in the subtraction below. */
	if (end < IOMMU_PAGE_SIZE + 1)
		return (false);
	/* From here on 'end' is the last usable address, inclusive. */
	end = MIN(end - IOMMU_PAGE_SIZE - 1, ubound);
	offset = a->offset;
	size = a->size;
	if (start + offset + size - 1 > end)
		return (false);

	/* Check for and try to skip past boundary crossing. */
	if (!vm_addr_bound_ok(start + offset, size, a->common->boundary)) {
		/*
		 * The start + offset to start + offset + size region crosses
		 * the boundary.  Check if there is enough space after the next
		 * boundary after the beg.
		 */
		first = start;
		beg = roundup2(start + offset + 1, a->common->boundary);
		start = roundup2(beg, a->common->alignment);

		if (start + offset + size - 1 > end ||
		    !vm_addr_bound_ok(start + offset, size,
		    a->common->boundary)) {
			/*
			 * Not enough space to align at the requested boundary,
			 * or boundary is smaller than the size, but allowed to
			 * split.  We already checked that start + size does not
			 * overlap ubound.
			 *
			 * XXXKIB. It is possible that beg is exactly at the
			 * start of the next entry, then we do not have gap.
			 * Ignore for now.
			 */
			if ((a->gas_flags & IOMMU_MF_CANSPLIT) == 0)
				return (false);
			/* Shrink the allocation so it stops at the boundary. */
			size = beg - first - offset;
			start = first;
		}
	}
	entry = a->entry;
	entry->start = start;
	entry->end = start + roundup2(size + offset, IOMMU_PAGE_SIZE);
	entry->flags = IOMMU_MAP_ENTRY_MAP;
	return (true);
}
379 
/*
 * Find the next entry that might abut a big-enough range.
 *
 * Subtrees whose root has free_down < min_free are not descended into.
 * Returns NULL when the walk runs off the top of the tree.
 */
static struct iommu_map_entry *
iommu_gas_next(struct iommu_map_entry *curr, iommu_gaddr_t min_free)
{
	struct iommu_map_entry *next;

	if ((next = RB_RIGHT(curr, rb_entry)) != NULL &&
	    next->free_down >= min_free) {
		/* Find next entry in right subtree. */
		do
			curr = next;
		while ((next = RB_LEFT(curr, rb_entry)) != NULL &&
		    next->free_down >= min_free);
	} else {
		/* Find next entry in a left-parent ancestor. */
		while ((next = RB_PARENT(curr, rb_entry)) != NULL &&
		    curr == RB_RIGHT(next, rb_entry))
			curr = next;
		/* next is the first ancestor reached from its left, or NULL. */
		curr = next;
	}
	return (curr);
}
402 
/*
 * Address-ordered first-fit search of 'domain' for free space satisfying the
 * conditions of 'a'.  The space allocated is at least one page big, and is
 * bounded by guard pages to the left and right.  The allocated space for
 * 'domain' is described by an rb-tree of map entries at domain->rb_root, and
 * domain->start_gap points to a map entry less than or adjacent to the first
 * free-space of size at least 3 pages.
 *
 * The range up to a->common->lowaddr is searched first, then the range above
 * a->common->highaddr.  On success a->entry is filled in and inserted into
 * the tree and 0 is returned; otherwise ENOMEM is returned.  The domain lock
 * must be held.
 */
static int
iommu_gas_find_space(struct iommu_domain *domain,
    struct iommu_gas_match_args *a)
{
	struct iommu_map_entry *curr, *first;
	iommu_gaddr_t addr, min_free;

	IOMMU_DOMAIN_ASSERT_LOCKED(domain);
	KASSERT(a->entry->flags == 0,
	    ("dirty entry %p %p", domain, a->entry));

	/*
	 * start_gap may point to an entry adjacent to gaps too small for any
	 * new allocation.  In that case, advance start_gap to the first free
	 * space big enough for a minimum allocation plus two guard pages.
	 */
	min_free = 3 * IOMMU_PAGE_SIZE;
	first = domain->start_gap;
	while (first != NULL && first->free_down < min_free)
		first = RB_PARENT(first, rb_entry);
	for (curr = first; curr != NULL;
	    curr = iommu_gas_next(curr, min_free)) {
		if ((first = RB_LEFT(curr, rb_entry)) != NULL &&
		    first->last + min_free <= curr->start)
			break;
		if ((first = RB_RIGHT(curr, rb_entry)) != NULL &&
		    curr->end + min_free <= first->first)
			break;
	}
	/* May be NULL when no gap of at least three pages was found. */
	domain->start_gap = curr;

	/*
	 * If the subtree doesn't have free space for the requested allocation
	 * plus two guard pages, skip it.
	 */
	min_free = 2 * IOMMU_PAGE_SIZE +
	    roundup2(a->size + a->offset, IOMMU_PAGE_SIZE);

	/* Climb to find a node in the subtree of big-enough ranges. */
	first = curr;
	while (first != NULL && first->free_down < min_free)
		first = RB_PARENT(first, rb_entry);

	/*
	 * Walk the big-enough ranges tree until one satisfies alignment
	 * requirements, or violates lowaddr address requirement.
	 */
	addr = a->common->lowaddr;
	for (curr = first; curr != NULL;
	    curr = iommu_gas_next(curr, min_free)) {
		/* Try the gap immediately below curr. */
		if ((first = RB_LEFT(curr, rb_entry)) != NULL &&
		    iommu_gas_match_one(a, first->last, curr->start,
		    0, addr)) {
			RB_INSERT_PREV(iommu_gas_entries_tree,
			    &domain->rb_root, curr, a->entry);
			return (0);
		}
		if (curr->end >= addr) {
			/* All remaining ranges > addr */
			break;
		}
		/* Try the gap immediately above curr. */
		if ((first = RB_RIGHT(curr, rb_entry)) != NULL &&
		    iommu_gas_match_one(a, curr->end, first->first,
		    0, addr)) {
			RB_INSERT_NEXT(iommu_gas_entries_tree,
			    &domain->rb_root, curr, a->entry);
			return (0);
		}
	}

	/*
	 * To resume the search at the start of the upper region, first climb to
	 * the nearest ancestor that spans highaddr.  Then find the last entry
	 * before highaddr that could abut a big-enough range.
	 */
	addr = a->common->highaddr;
	while (curr != NULL && curr->last < addr)
		curr = RB_PARENT(curr, rb_entry);
	first = NULL;
	while (curr != NULL && curr->free_down >= min_free) {
		if (addr < curr->end)
			curr = RB_LEFT(curr, rb_entry);
		else {
			first = curr;
			curr = RB_RIGHT(curr, rb_entry);
		}
	}

	/*
	 * Walk the remaining big-enough ranges until one satisfies alignment
	 * requirements.
	 */
	for (curr = first; curr != NULL;
	    curr = iommu_gas_next(curr, min_free)) {
		if ((first = RB_LEFT(curr, rb_entry)) != NULL &&
		    iommu_gas_match_one(a, first->last, curr->start,
		    addr + 1, domain->end - 1)) {
			RB_INSERT_PREV(iommu_gas_entries_tree,
			    &domain->rb_root, curr, a->entry);
			return (0);
		}
		if ((first = RB_RIGHT(curr, rb_entry)) != NULL &&
		    iommu_gas_match_one(a, curr->end, first->first,
		    addr + 1, domain->end - 1)) {
			RB_INSERT_NEXT(iommu_gas_entries_tree,
			    &domain->rb_root, curr, a->entry);
			return (0);
		}
	}

	return (ENOMEM);
}
523 
/*
 * Insert 'entry', whose start and end were set by the caller, into the
 * domain's tree at that fixed location.  The domain lock must be held.
 *
 * Returns EINVAL when the range is not page aligned, is empty or inverted,
 * or does not end below domain->end.  Returns EBUSY when the range overlaps
 * a non-placeholder neighbour, unless IOMMU_MF_RMRR is set in 'flags' and the
 * neighbour is itself an RMRR entry, in which case the new entry is trimmed
 * to avoid the overlap.  Returns 0 on success, including the case where
 * trimming left an empty range and nothing was inserted.
 */
static int
iommu_gas_alloc_region(struct iommu_domain *domain, struct iommu_map_entry *entry,
    u_int flags)
{
	struct iommu_map_entry *next, *prev;

	IOMMU_DOMAIN_ASSERT_LOCKED(domain);

	if ((entry->start & IOMMU_PAGE_MASK) != 0 ||
	    (entry->end & IOMMU_PAGE_MASK) != 0)
		return (EINVAL);
	if (entry->start >= entry->end)
		return (EINVAL);
	if (entry->end >= domain->end)
		return (EINVAL);

	/*
	 * The entry is used as a lookup key only; FAKE suppresses the overlap
	 * assertion in iommu_gas_cmp_entries() for the duration of the lookup.
	 */
	entry->flags |= IOMMU_MAP_ENTRY_FAKE;
	next = RB_NFIND(iommu_gas_entries_tree, &domain->rb_root, entry);
	KASSERT(next != NULL, ("next must be non-null %p %jx", domain,
	    (uintmax_t)entry->start));
	prev = RB_PREV(iommu_gas_entries_tree, &domain->rb_root, next);
	/* prev could be NULL */
	entry->flags &= ~IOMMU_MAP_ENTRY_FAKE;

	/*
	 * Adapt to broken BIOSes which specify overlapping RMRR
	 * entries.
	 *
	 * XXXKIB: this does not handle a case when prev or next
	 * entries are completely covered by the current one, which
	 * extends both ways.
	 */
	if (prev != NULL && prev->end > entry->start &&
	    (prev->flags & IOMMU_MAP_ENTRY_PLACE) == 0) {
		if ((flags & IOMMU_MF_RMRR) == 0 ||
		    (prev->flags & IOMMU_MAP_ENTRY_RMRR) == 0)
			return (EBUSY);
		entry->start = prev->end;
	}
	if (next->start < entry->end &&
	    (next->flags & IOMMU_MAP_ENTRY_PLACE) == 0) {
		if ((flags & IOMMU_MF_RMRR) == 0 ||
		    (next->flags & IOMMU_MAP_ENTRY_RMRR) == 0)
			return (EBUSY);
		entry->end = next->start;
	}
	/* Trimming consumed the whole range: nothing left to insert. */
	if (entry->end == entry->start)
		return (0);

	if (prev != NULL && prev->end > entry->start) {
		/* This assumes that prev is the placeholder entry. */
		iommu_gas_rb_remove(domain, prev);
		prev = NULL;
	}
	RB_INSERT_PREV(iommu_gas_entries_tree,
	    &domain->rb_root, next, entry);
	/* A next entry that is still overlapped is removed from the tree. */
	if (next->start < entry->end) {
		iommu_gas_rb_remove(domain, next);
		next = NULL;
	}

	if ((flags & IOMMU_MF_RMRR) != 0)
		entry->flags = IOMMU_MAP_ENTRY_RMRR;

#ifdef INVARIANTS
	/* Verify that the surviving neighbours are adjacent to the new entry. */
	struct iommu_map_entry *ip, *in;
	ip = RB_PREV(iommu_gas_entries_tree, &domain->rb_root, entry);
	in = RB_NEXT(iommu_gas_entries_tree, &domain->rb_root, entry);
	KASSERT(prev == NULL || ip == prev,
	    ("RMRR %p (%jx %jx) prev %p (%jx %jx) ins prev %p (%jx %jx)",
	    entry, entry->start, entry->end, prev,
	    prev == NULL ? 0 : prev->start, prev == NULL ? 0 : prev->end,
	    ip, ip == NULL ? 0 : ip->start, ip == NULL ? 0 : ip->end));
	KASSERT(next == NULL || in == next,
	    ("RMRR %p (%jx %jx) next %p (%jx %jx) ins next %p (%jx %jx)",
	    entry, entry->start, entry->end, next,
	    next == NULL ? 0 : next->start, next == NULL ? 0 : next->end,
	    in, in == NULL ? 0 : in->start, in == NULL ? 0 : in->end));
#endif

	return (0);
}
606 
607 void
608 iommu_gas_free_space(struct iommu_map_entry *entry)
609 {
610 	struct iommu_domain *domain;
611 
612 	domain = entry->domain;
613 	KASSERT((entry->flags & (IOMMU_MAP_ENTRY_PLACE | IOMMU_MAP_ENTRY_RMRR |
614 	    IOMMU_MAP_ENTRY_MAP)) == IOMMU_MAP_ENTRY_MAP,
615 	    ("permanent entry %p %p", domain, entry));
616 
617 	IOMMU_DOMAIN_LOCK(domain);
618 	iommu_gas_rb_remove(domain, entry);
619 	entry->flags &= ~IOMMU_MAP_ENTRY_MAP;
620 #ifdef INVARIANTS
621 	if (iommu_check_free)
622 		iommu_gas_check_free(domain);
623 #endif
624 	IOMMU_DOMAIN_UNLOCK(domain);
625 }
626 
627 void
628 iommu_gas_free_region(struct iommu_map_entry *entry)
629 {
630 	struct iommu_domain *domain;
631 
632 	domain = entry->domain;
633 	KASSERT((entry->flags & (IOMMU_MAP_ENTRY_PLACE | IOMMU_MAP_ENTRY_RMRR |
634 	    IOMMU_MAP_ENTRY_MAP)) == IOMMU_MAP_ENTRY_RMRR,
635 	    ("non-RMRR entry %p %p", domain, entry));
636 
637 	IOMMU_DOMAIN_LOCK(domain);
638 	if (entry != domain->first_place &&
639 	    entry != domain->last_place)
640 		iommu_gas_rb_remove(domain, entry);
641 	entry->flags &= ~IOMMU_MAP_ENTRY_RMRR;
642 	IOMMU_DOMAIN_UNLOCK(domain);
643 }
644 
645 static struct iommu_map_entry *
646 iommu_gas_remove_clip_left(struct iommu_domain *domain, iommu_gaddr_t start,
647     iommu_gaddr_t end, struct iommu_map_entry **r)
648 {
649 	struct iommu_map_entry *entry, *res, fentry;
650 
651 	IOMMU_DOMAIN_ASSERT_LOCKED(domain);
652 	MPASS(start <= end);
653 	MPASS(end <= domain->end);
654 
655 	/*
656 	 * Find an entry which contains the supplied guest's address
657 	 * start, or the first entry after the start.  Since we
658 	 * asserted that start is below domain end, entry should
659 	 * exist.  Then clip it if needed.
660 	 */
661 	bzero(&fentry, sizeof(fentry));
662 	fentry.start = start + 1;
663 	fentry.end = start + 1;
664 	fentry.flags = IOMMU_MAP_ENTRY_FAKE;
665 	entry = RB_NFIND(iommu_gas_entries_tree, &domain->rb_root, &fentry);
666 
667 	if (entry->start >= start ||
668 	    (entry->flags & IOMMU_MAP_ENTRY_RMRR) != 0)
669 		return (entry);
670 
671 	res = *r;
672 	*r = NULL;
673 	*res = *entry;
674 	res->start = entry->end = start;
675 	RB_UPDATE_AUGMENT(entry, rb_entry);
676 	RB_INSERT_NEXT(iommu_gas_entries_tree,
677 	    &domain->rb_root, entry, res);
678 	return (res);
679 }
680 
681 static bool
682 iommu_gas_remove_clip_right(struct iommu_domain *domain,
683     iommu_gaddr_t end, struct iommu_map_entry *entry,
684     struct iommu_map_entry *r)
685 {
686 	if (entry->start >= end || (entry->flags & IOMMU_MAP_ENTRY_RMRR) != 0)
687 		return (false);
688 
689 	*r = *entry;
690 	r->end = entry->start = end;
691 	RB_UPDATE_AUGMENT(entry, rb_entry);
692 	RB_INSERT_PREV(iommu_gas_entries_tree,
693 	    &domain->rb_root, entry, r);
694 	return (true);
695 }
696 
697 static void
698 iommu_gas_remove_unmap(struct iommu_domain *domain,
699     struct iommu_map_entry *entry, struct iommu_map_entries_tailq *gcp)
700 {
701 	IOMMU_DOMAIN_ASSERT_LOCKED(domain);
702 
703 	if ((entry->flags & (IOMMU_MAP_ENTRY_UNMAPPED |
704 	    IOMMU_MAP_ENTRY_RMRR |
705 	    IOMMU_MAP_ENTRY_REMOVING)) != 0)
706 		return;
707 	MPASS((entry->flags & IOMMU_MAP_ENTRY_PLACE) == 0);
708 	entry->flags |= IOMMU_MAP_ENTRY_REMOVING;
709 	TAILQ_INSERT_TAIL(gcp, entry, dmamap_link);
710 }
711 
/*
 * Queue on 'gc' the removable entries intersecting [start, start + size).
 * The entry straddling 'start' is split first via
 * iommu_gas_remove_clip_left(); *r1 and *r2 are spare entries for the left
 * and right splits and are set to NULL when consumed.  The domain lock must
 * be held.
 */
static void
iommu_gas_remove_locked(struct iommu_domain *domain,
    iommu_gaddr_t start, iommu_gaddr_t size,
    struct iommu_map_entries_tailq *gc,
    struct iommu_map_entry **r1, struct iommu_map_entry **r2)
{
	struct iommu_map_entry *entry, *nentry;
	iommu_gaddr_t end;

	IOMMU_DOMAIN_ASSERT_LOCKED(domain);

	end = start + size;

	nentry = iommu_gas_remove_clip_left(domain, start, end, r1);
	RB_FOREACH_FROM(entry, iommu_gas_entries_tree, nentry) {
		if (entry->start >= end)
			break;
		KASSERT(start <= entry->start,
		    ("iommu_gas_remove entry (%#jx, %#jx) start %#jx",
		    entry->start, entry->end, start));
		iommu_gas_remove_unmap(domain, entry, gc);
	}
	/*
	 * NOTE(review): the loop above only stops early on
	 * entry->start >= end, which is also the first condition under
	 * which iommu_gas_remove_clip_right() declines to split, so this
	 * clip looks like it never triggers -- confirm against the
	 * intended handling of an entry that straddles 'end'.
	 */
	if (iommu_gas_remove_clip_right(domain, end, entry, *r2)) {
		iommu_gas_remove_unmap(domain, *r2, gc);
		*r2 = NULL;
	}

#ifdef INVARIANTS
	/* No entry other than RMRR or placeholder may intersect the range. */
	RB_FOREACH(entry, iommu_gas_entries_tree, &domain->rb_root) {
		if ((entry->flags & (IOMMU_MAP_ENTRY_RMRR |
		    IOMMU_MAP_ENTRY_PLACE)) != 0)
			continue;
		KASSERT(entry->end <= start || entry->start >= end,
		    ("iommu_gas_remove leftover entry (%#jx, %#jx) range "
		    "(%#jx, %#jx)",
		    entry->start, entry->end, start, end));
	}
#endif
}
751 
752 static void
753 iommu_gas_remove_init(struct iommu_domain *domain,
754     struct iommu_map_entries_tailq *gc, struct iommu_map_entry **r1,
755     struct iommu_map_entry **r2)
756 {
757 	TAILQ_INIT(gc);
758 	*r1 = iommu_gas_alloc_entry(domain, IOMMU_PGF_WAITOK);
759 	*r2 = iommu_gas_alloc_entry(domain, IOMMU_PGF_WAITOK);
760 }
761 
762 static void
763 iommu_gas_remove_cleanup(struct iommu_domain *domain,
764     struct iommu_map_entries_tailq *gc, struct iommu_map_entry **r1,
765     struct iommu_map_entry **r2)
766 {
767 	if (*r1 != NULL) {
768 		iommu_gas_free_entry(*r1);
769 		*r1 = NULL;
770 	}
771 	if (*r2 != NULL) {
772 		iommu_gas_free_entry(*r2);
773 		*r2 = NULL;
774 	}
775 	iommu_domain_unload(domain, gc, true);
776 }
777 
778 /*
779  * Remove specified range from the GAS of the domain.  Note that the
780  * removal is not guaranteed to occur upon the function return, it
781  * might be finalized some time after, when hardware reports that
782  * (queued) IOTLB invalidation was performed.
783  */
784 void
785 iommu_gas_remove(struct iommu_domain *domain, iommu_gaddr_t start,
786     iommu_gaddr_t size)
787 {
788 	struct iommu_map_entry *r1, *r2;
789 	struct iommu_map_entries_tailq gc;
790 
791 	iommu_gas_remove_init(domain, &gc, &r1, &r2);
792 	IOMMU_DOMAIN_LOCK(domain);
793 	iommu_gas_remove_locked(domain, start, size, &gc, &r1, &r2);
794 	IOMMU_DOMAIN_UNLOCK(domain);
795 	iommu_gas_remove_cleanup(domain, &gc, &r1, &r2);
796 }
797 
798 int
799 iommu_gas_map(struct iommu_domain *domain,
800     const struct bus_dma_tag_common *common, iommu_gaddr_t size, int offset,
801     u_int eflags, u_int flags, vm_page_t *ma, struct iommu_map_entry **res)
802 {
803 	struct iommu_gas_match_args a;
804 	struct iommu_map_entry *entry;
805 	int error;
806 
807 	KASSERT((flags & ~(IOMMU_MF_CANWAIT | IOMMU_MF_CANSPLIT)) == 0,
808 	    ("invalid flags 0x%x", flags));
809 
810 	a.size = size;
811 	a.offset = offset;
812 	a.common = common;
813 	a.gas_flags = flags;
814 	entry = iommu_gas_alloc_entry(domain,
815 	    (flags & IOMMU_MF_CANWAIT) != 0 ? IOMMU_PGF_WAITOK : 0);
816 	if (entry == NULL)
817 		return (ENOMEM);
818 	a.entry = entry;
819 	IOMMU_DOMAIN_LOCK(domain);
820 	error = iommu_gas_find_space(domain, &a);
821 	if (error == ENOMEM) {
822 		IOMMU_DOMAIN_UNLOCK(domain);
823 		iommu_gas_free_entry(entry);
824 		return (error);
825 	}
826 #ifdef INVARIANTS
827 	if (iommu_check_free)
828 		iommu_gas_check_free(domain);
829 #endif
830 	KASSERT(error == 0,
831 	    ("unexpected error %d from iommu_gas_find_entry", error));
832 	KASSERT(entry->end < domain->end, ("allocated GPA %jx, max GPA %jx",
833 	    (uintmax_t)entry->end, (uintmax_t)domain->end));
834 	entry->flags |= eflags;
835 	IOMMU_DOMAIN_UNLOCK(domain);
836 
837 	error = domain->ops->map(domain, entry, ma, eflags,
838 	    ((flags & IOMMU_MF_CANWAIT) != 0 ? IOMMU_PGF_WAITOK : 0));
839 	if (error == ENOMEM) {
840 		iommu_domain_unload_entry(entry, true,
841 		    (flags & IOMMU_MF_CANWAIT) != 0);
842 		return (error);
843 	}
844 	KASSERT(error == 0,
845 	    ("unexpected error %d from domain_map_buf", error));
846 
847 	*res = entry;
848 	return (0);
849 }
850 
851 int
852 iommu_gas_map_region(struct iommu_domain *domain, struct iommu_map_entry *entry,
853     u_int eflags, u_int flags, vm_page_t *ma)
854 {
855 	iommu_gaddr_t start;
856 	int error;
857 
858 	KASSERT(entry->domain == domain,
859 	    ("mismatched domain %p entry %p entry->domain %p", domain,
860 	    entry, entry->domain));
861 	KASSERT(entry->flags == 0, ("used RMRR entry %p %p %x", domain,
862 	    entry, entry->flags));
863 	KASSERT((flags & ~(IOMMU_MF_CANWAIT | IOMMU_MF_RMRR)) == 0,
864 	    ("invalid flags 0x%x", flags));
865 
866 	start = entry->start;
867 	IOMMU_DOMAIN_LOCK(domain);
868 	error = iommu_gas_alloc_region(domain, entry, flags);
869 	if (error != 0) {
870 		IOMMU_DOMAIN_UNLOCK(domain);
871 		return (error);
872 	}
873 	entry->flags |= eflags;
874 	IOMMU_DOMAIN_UNLOCK(domain);
875 	if (entry->end == entry->start)
876 		return (0);
877 
878 	error = domain->ops->map(domain, entry,
879 	    ma + OFF_TO_IDX(start - entry->start), eflags,
880 	    ((flags & IOMMU_MF_CANWAIT) != 0 ? IOMMU_PGF_WAITOK : 0));
881 	if (error == ENOMEM) {
882 		iommu_domain_unload_entry(entry, false,
883 		    (flags & IOMMU_MF_CANWAIT) != 0);
884 		return (error);
885 	}
886 	KASSERT(error == 0,
887 	    ("unexpected error %d from domain_map_buf", error));
888 
889 	return (0);
890 }
891 
892 static int
893 iommu_gas_reserve_region_locked(struct iommu_domain *domain,
894     iommu_gaddr_t start, iommu_gaddr_t end, struct iommu_map_entry *entry)
895 {
896 	int error;
897 
898 	IOMMU_DOMAIN_ASSERT_LOCKED(domain);
899 
900 	entry->start = start;
901 	entry->end = end;
902 	error = iommu_gas_alloc_region(domain, entry, IOMMU_MF_CANWAIT);
903 	if (error == 0)
904 		entry->flags |= IOMMU_MAP_ENTRY_UNMAPPED;
905 	return (error);
906 }
907 
908 int
909 iommu_gas_reserve_region(struct iommu_domain *domain, iommu_gaddr_t start,
910     iommu_gaddr_t end, struct iommu_map_entry **entry0)
911 {
912 	struct iommu_map_entry *entry;
913 	int error;
914 
915 	entry = iommu_gas_alloc_entry(domain, IOMMU_PGF_WAITOK);
916 	IOMMU_DOMAIN_LOCK(domain);
917 	error = iommu_gas_reserve_region_locked(domain, start, end, entry);
918 	IOMMU_DOMAIN_UNLOCK(domain);
919 	if (error != 0)
920 		iommu_gas_free_entry(entry);
921 	else if (entry0 != NULL)
922 		*entry0 = entry;
923 	return (error);
924 }
925 
/*
 * As in iommu_gas_reserve_region, reserve [start, end), but allow for existing
 * entries.
 *
 * 'end' is clamped to domain->end.  The range is walked one existing entry at
 * a time and each non-empty free piece between existing entries is reserved
 * with its own entry.  Returns 0 on success or the first error from
 * iommu_gas_reserve_region_locked().
 */
int
iommu_gas_reserve_region_extend(struct iommu_domain *domain,
    iommu_gaddr_t start, iommu_gaddr_t end)
{
	struct iommu_map_entry *entry, *next, *prev, key = {};
	iommu_gaddr_t entry_start, entry_end;
	int error;

	error = 0;
	entry = NULL;
	end = ummin(end, domain->end);
	while (start < end) {
		/* Preallocate an entry. */
		if (entry == NULL)
			entry = iommu_gas_alloc_entry(domain,
			    IOMMU_PGF_WAITOK);
		/* Calculate the free region from here to the next entry. */
		key.start = key.end = start;
		IOMMU_DOMAIN_LOCK(domain);
		next = RB_NFIND(iommu_gas_entries_tree, &domain->rb_root, &key);
		KASSERT(next != NULL, ("domain %p with end %#jx has no entry "
		    "after %#jx", domain, (uintmax_t)domain->end,
		    (uintmax_t)start));
		entry_end = ummin(end, next->start);
		prev = RB_PREV(iommu_gas_entries_tree, &domain->rb_root, next);
		if (prev != NULL)
			entry_start = ummax(start, prev->end);
		else
			entry_start = start;
		/* The next iteration resumes past the existing entry. */
		start = next->end;
		/* Reserve the region if non-empty. */
		if (entry_start != entry_end) {
			error = iommu_gas_reserve_region_locked(domain,
			    entry_start, entry_end, entry);
			if (error != 0) {
				IOMMU_DOMAIN_UNLOCK(domain);
				break;
			}
			/* The entry was consumed; allocate a fresh one next time. */
			entry = NULL;
		}
		IOMMU_DOMAIN_UNLOCK(domain);
	}
	/* Release a preallocated entry if it was not used. */
	if (entry != NULL)
		iommu_gas_free_entry(entry);
	return (error);
}
977 
978 void
979 iommu_unmap_msi(struct iommu_ctx *ctx)
980 {
981 	struct iommu_map_entry *entry;
982 	struct iommu_domain *domain;
983 
984 	domain = ctx->domain;
985 	entry = domain->msi_entry;
986 	if (entry == NULL)
987 		return;
988 
989 	domain->ops->unmap(domain, entry, IOMMU_PGF_WAITOK);
990 
991 	iommu_gas_free_space(entry);
992 
993 	iommu_gas_free_entry(entry);
994 
995 	domain->msi_entry = NULL;
996 	domain->msi_base = 0;
997 	domain->msi_phys = 0;
998 }
999 
1000 int
1001 iommu_map_msi(struct iommu_ctx *ctx, iommu_gaddr_t size, int offset,
1002     u_int eflags, u_int flags, vm_page_t *ma)
1003 {
1004 	struct iommu_domain *domain;
1005 	struct iommu_map_entry *entry;
1006 	int error;
1007 
1008 	error = 0;
1009 	domain = ctx->domain;
1010 
1011 	/* Check if there is already an MSI page allocated */
1012 	IOMMU_DOMAIN_LOCK(domain);
1013 	entry = domain->msi_entry;
1014 	IOMMU_DOMAIN_UNLOCK(domain);
1015 
1016 	if (entry == NULL) {
1017 		error = iommu_gas_map(domain, &ctx->tag->common, size, offset,
1018 		    eflags, flags, ma, &entry);
1019 		IOMMU_DOMAIN_LOCK(domain);
1020 		if (error == 0) {
1021 			if (domain->msi_entry == NULL) {
1022 				MPASS(domain->msi_base == 0);
1023 				MPASS(domain->msi_phys == 0);
1024 
1025 				domain->msi_entry = entry;
1026 				domain->msi_base = entry->start;
1027 				domain->msi_phys = VM_PAGE_TO_PHYS(ma[0]);
1028 			} else {
1029 				/*
1030 				 * We lost the race and already have an
1031 				 * MSI page allocated. Free the unneeded entry.
1032 				 */
1033 				iommu_gas_free_entry(entry);
1034 			}
1035 		} else if (domain->msi_entry != NULL) {
1036 			/*
1037 			 * The allocation failed, but another succeeded.
1038 			 * Return success as there is a valid MSI page.
1039 			 */
1040 			error = 0;
1041 		}
1042 		IOMMU_DOMAIN_UNLOCK(domain);
1043 	}
1044 
1045 	return (error);
1046 }
1047 
1048 void
1049 iommu_translate_msi(struct iommu_domain *domain, uint64_t *addr)
1050 {
1051 
1052 	*addr = (*addr - domain->msi_phys) + domain->msi_base;
1053 
1054 	KASSERT(*addr >= domain->msi_entry->start,
1055 	    ("%s: Address is below the MSI entry start address (%jx < %jx)",
1056 	    __func__, (uintmax_t)*addr, (uintmax_t)domain->msi_entry->start));
1057 
1058 	KASSERT(*addr + sizeof(*addr) <= domain->msi_entry->end,
1059 	    ("%s: Address is above the MSI entry end address (%jx < %jx)",
1060 	    __func__, (uintmax_t)*addr, (uintmax_t)domain->msi_entry->end));
1061 }
1062 
/*
 * Root of the hw.iommu sysctl subtree.  The node is declared with an empty
 * description string.
 */
SYSCTL_NODE(_hw, OID_AUTO, iommu, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "");

#ifdef INVARIANTS
/*
 * hw.iommu.check_free: integer knob backed by iommu_check_free.  It is only
 * declared when the kernel is built with INVARIANTS.
 */
SYSCTL_INT(_hw_iommu, OID_AUTO, check_free, CTLFLAG_RWTUN,
    &iommu_check_free, 0,
    "Check the GPA RBtree for free_down and free_after validity");
#endif
1070 
1071 #include "opt_ddb.h"
1072 #ifdef DDB
1073 
1074 #include <ddb/ddb.h>
1075 
1076 static void
1077 iommu_debug_dump_gas(struct iommu_domain *domain)
1078 {
1079 	struct iommu_map_entry *entry;
1080 
1081 	db_printf("iommu_domain %p tree %p iommu %p fl %#x\n", domain,
1082 	    &domain->rb_root, domain->iommu, domain->flags);
1083 	db_printf("iommu_domain %p tree %p\n", domain, &domain->rb_root);
1084 	RB_FOREACH(entry, iommu_gas_entries_tree, &domain->rb_root) {
1085 		db_printf(
1086 	    "  e %p [%#jx %#jx] fl %#x first %#jx last %#jx free_down %#jx",
1087 		    entry, (uintmax_t)entry->start, (uintmax_t)entry->end,
1088 		    entry->flags,
1089 		    (uintmax_t)entry->first, (uintmax_t)entry->last,
1090 		    (uintmax_t)entry->free_down);
1091 		if (entry == domain->start_gap)
1092 			db_printf(" start_gap");
1093 		if (entry == domain->first_place)
1094 			db_printf(" first_place");
1095 		if (entry == domain->last_place)
1096 			db_printf(" last_place");
1097 		db_printf("\n");
1098 	}
1099 }
1100 
1101 DB_SHOW_COMMAND(iommu_domain, iommu_domain_show)
1102 {
1103 	struct iommu_domain *domain;
1104 
1105 	if (!have_addr) {
1106 		db_printf("show iommu_domain addr\n");
1107 		return;
1108 	}
1109 
1110 	domain = (void *)addr;
1111 	iommu_debug_dump_gas(domain);
1112 }
1113 
1114 #endif
1115