/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2013 The FreeBSD Foundation
 *
 * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#define	RB_AUGMENT_CHECK(entry) iommu_gas_augment_entry(entry)

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/bus.h>
#include <sys/interrupt.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/memdesc.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <sys/rman.h>
#include <sys/taskqueue.h>
#include <sys/tree.h>
#include <sys/uio.h>
#include <sys/vmem.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/uma.h>
#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include <dev/iommu/iommu.h>
#include <dev/iommu/iommu_gas.h>
#include <dev/iommu/iommu_msi.h>
#include <machine/atomic.h>
#include <machine/bus.h>
#include <machine/md_var.h>
#include <machine/iommu.h>
#include <dev/iommu/busdma_iommu.h>

/*
 * Guest Address Space management.
 */

static uma_zone_t iommu_map_entry_zone;

#ifdef INVARIANTS
static int iommu_check_free;
#endif

static void
intel_gas_init(void)
{

	iommu_map_entry_zone = uma_zcreate("IOMMU_MAP_ENTRY",
	    sizeof(struct iommu_map_entry), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NODUMP);
}
SYSINIT(intel_gas, SI_SUB_DRIVERS, SI_ORDER_FIRST, intel_gas_init, NULL);

struct iommu_map_entry *
iommu_gas_alloc_entry(struct iommu_domain *domain, u_int flags)
{
	struct iommu_map_entry *res;

	KASSERT((flags & ~(IOMMU_PGF_WAITOK)) == 0,
	    ("unsupported flags %x", flags));

	res = uma_zalloc(iommu_map_entry_zone, ((flags & IOMMU_PGF_WAITOK) !=
	    0 ? M_WAITOK : M_NOWAIT) | M_ZERO);
	if (res != NULL && domain != NULL) {
		res->domain = domain;
		atomic_add_int(&domain->entries_cnt, 1);
	}
	return (res);
}

void
iommu_gas_free_entry(struct iommu_map_entry *entry)
{
	struct iommu_domain *domain;

	domain = entry->domain;
	if (domain != NULL)
		atomic_subtract_int(&domain->entries_cnt, 1);
	uma_zfree(iommu_map_entry_zone, entry);
}
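
/*
 * Illustrative sketch (not taken from any caller in this file): every
 * successful iommu_gas_alloc_entry() is paired with a later
 * iommu_gas_free_entry() once the entry has been unlinked from the
 * domain tree, e.g.:
 *
 *	struct iommu_map_entry *e;
 *
 *	e = iommu_gas_alloc_entry(domain, IOMMU_PGF_WAITOK);
 *	... insert e, use the mapping, remove e from the tree ...
 *	iommu_gas_free_entry(e);
 *
 * The per-domain entries_cnt bookkeeping is handled inside the pair.
 */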

static int
iommu_gas_cmp_entries(struct iommu_map_entry *a, struct iommu_map_entry *b)
{

	/* The last entry has zero size, hence the <=. */
	KASSERT(a->start <= a->end, ("inverted entry %p (%jx, %jx)",
	    a, (uintmax_t)a->start, (uintmax_t)a->end));
	KASSERT(b->start <= b->end, ("inverted entry %p (%jx, %jx)",
	    b, (uintmax_t)b->start, (uintmax_t)b->end));
	KASSERT(((a->flags | b->flags) & IOMMU_MAP_ENTRY_FAKE) != 0 ||
	    a->end <= b->start || b->end <= a->start ||
	    a->end == a->start || b->end == b->start,
	    ("overlapping entries %p (%jx, %jx) f %#x %p (%jx, %jx) f %#x"
	    " domain %p %p",
	    a, (uintmax_t)a->start, (uintmax_t)a->end, a->flags,
	    b, (uintmax_t)b->start, (uintmax_t)b->end, b->flags,
	    a->domain, b->domain));

	if (a->end < b->end)
		return (-1);
	else if (b->end < a->end)
		return (1);
	return (0);
}
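
/*
 * Example of the ordering implied by the comparator above, with
 * hypothetical addresses: entries (0x1000, 0x3000) and (0x3000, 0x5000)
 * compare as -1 since 0x3000 < 0x5000, and a zero-length entry
 * (0x5000, 0x5000) sorts before (0x5000, 0x7000).  Ordering by 'end' is
 * what lets the zero-sized end-of-domain placeholder coexist with a
 * neighbor that stops at the same address.
 */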

/*
 * Update augmentation data based on data from children.
 * Return true if and only if the update changes the augmentation data.
 */
static bool
iommu_gas_augment_entry(struct iommu_map_entry *entry)
{
	struct iommu_map_entry *child;
	iommu_gaddr_t bound, delta, free_down;

	free_down = 0;
	bound = entry->start;
	if ((child = RB_LEFT(entry, rb_entry)) != NULL) {
		free_down = MAX(child->free_down, bound - child->last);
		bound = child->first;
	}
	delta = bound - entry->first;
	entry->first = bound;
	bound = entry->end;
	if ((child = RB_RIGHT(entry, rb_entry)) != NULL) {
		free_down = MAX(free_down, child->free_down);
		free_down = MAX(free_down, child->first - bound);
		bound = child->last;
	}
	delta += entry->last - bound;
	if (delta == 0)
		delta = entry->free_down - free_down;
	entry->last = bound;
	entry->free_down = free_down;

	/*
	 * Return true if either the value of (last - first) or the value
	 * of free_down changed.
	 */
	return (delta != 0);
}
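
/*
 * Worked example for the update above, with hypothetical values:
 * consider a node spanning [0x4000, 0x5000) whose left child subtree
 * covers first..last = 0x1000..0x2000 and whose right child subtree
 * covers 0x7000..0x8000, both children with free_down = 0.  Then
 *
 *	left gap  = entry->start - left->last  = 0x4000 - 0x2000 = 0x2000
 *	right gap = right->first - entry->end  = 0x7000 - 0x5000 = 0x2000
 *	free_down = MAX(0x2000, 0, 0x2000)     = 0x2000
 *
 * and the node's subtree totals become first = 0x1000, last = 0x8000.
 */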

RB_GENERATE(iommu_gas_entries_tree, iommu_map_entry, rb_entry,
    iommu_gas_cmp_entries);

#ifdef INVARIANTS
static void
iommu_gas_check_free(struct iommu_domain *domain)
{
	struct iommu_map_entry *entry, *l, *r;
	iommu_gaddr_t v;

	RB_FOREACH(entry, iommu_gas_entries_tree, &domain->rb_root) {
		KASSERT(domain == entry->domain,
		    ("mismatched free domain %p entry %p entry->domain %p",
		    domain, entry, entry->domain));
		l = RB_LEFT(entry, rb_entry);
		r = RB_RIGHT(entry, rb_entry);
		v = 0;
		if (l != NULL) {
			v = MAX(v, l->free_down);
			v = MAX(v, entry->start - l->last);
		}
		if (r != NULL) {
			v = MAX(v, r->free_down);
			v = MAX(v, r->first - entry->end);
		}
		MPASS(entry->free_down == v);
	}
}
#endif

static void
iommu_gas_rb_remove(struct iommu_domain *domain, struct iommu_map_entry *entry)
{
	struct iommu_map_entry *nbr;

	/* Removing entry may open a new free gap before domain->start_gap. */
	if (entry->end <= domain->start_gap->end) {
		if (RB_RIGHT(entry, rb_entry) != NULL)
			nbr = iommu_gas_entries_tree_RB_NEXT(entry);
		else if (RB_LEFT(entry, rb_entry) != NULL)
			nbr = RB_LEFT(entry, rb_entry);
		else
			nbr = RB_PARENT(entry, rb_entry);
		domain->start_gap = nbr;
	}
	RB_REMOVE(iommu_gas_entries_tree, &domain->rb_root, entry);
}
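
/*
 * Note on the neighbor choice above: the successor, the left child, and
 * the parent (NULL only when the tree is emptying) all survive the
 * removal, so start_gap is never left pointing at the removed node, yet
 * stays at or near the newly opened gap.
 */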

struct iommu_domain *
iommu_get_ctx_domain(struct iommu_ctx *ctx)
{

	return (ctx->domain);
}

void
iommu_gas_init_domain(struct iommu_domain *domain)
{
	struct iommu_map_entry *begin, *end;

	begin = iommu_gas_alloc_entry(domain, IOMMU_PGF_WAITOK);
	end = iommu_gas_alloc_entry(domain, IOMMU_PGF_WAITOK);

	IOMMU_DOMAIN_LOCK(domain);
	KASSERT(domain->entries_cnt == 2, ("dirty domain %p", domain));
	KASSERT(RB_EMPTY(&domain->rb_root),
	    ("non-empty entries %p", domain));

	/*
	 * The end entry must be inserted first because it has a zero-length
	 * gap between start and end.  Initially, all augmentation data for a
	 * new entry is zero.  Function iommu_gas_augment_entry will compute
	 * no change in the value of (start - end) and no change in the value
	 * of free_down, so it will return false to suggest that nothing
	 * changed in the entry.  Thus, if the end entry were inserted second,
	 * the augmentation information would not be propagated up to the
	 * begin entry at the tree root.  So the end entry is inserted first.
	 */
	end->start = domain->end;
	end->end = domain->end;
	end->flags = IOMMU_MAP_ENTRY_PLACE | IOMMU_MAP_ENTRY_UNMAPPED;
	RB_INSERT(iommu_gas_entries_tree, &domain->rb_root, end);

	begin->start = 0;
	begin->end = IOMMU_PAGE_SIZE;
	begin->flags = IOMMU_MAP_ENTRY_PLACE | IOMMU_MAP_ENTRY_UNMAPPED;
	RB_INSERT_PREV(iommu_gas_entries_tree, &domain->rb_root, end, begin);

	domain->start_gap = begin;
	domain->first_place = begin;
	domain->last_place = end;
	domain->flags |= IOMMU_DOMAIN_GAS_INITED;
	IOMMU_DOMAIN_UNLOCK(domain);
}
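
/*
 * Resulting initial state, shown with a hypothetical domain->end of
 * 0x100000000:
 *
 *	begin: [0, IOMMU_PAGE_SIZE)          PLACE | UNMAPPED
 *	end:   [0x100000000, 0x100000000)    PLACE | UNMAPPED
 *
 * Every later allocation is carved out of the single gap between the
 * two placeholders; page 0 is never handed out.
 */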

void
iommu_gas_fini_domain(struct iommu_domain *domain)
{
	struct iommu_map_entry *entry;

	IOMMU_DOMAIN_ASSERT_LOCKED(domain);
	KASSERT(domain->entries_cnt == 2,
	    ("domain still in use %p", domain));

	entry = RB_MIN(iommu_gas_entries_tree, &domain->rb_root);
	KASSERT(entry->start == 0, ("start entry start %p", domain));
	KASSERT(entry->end == IOMMU_PAGE_SIZE, ("start entry end %p", domain));
	KASSERT(entry->flags ==
	    (IOMMU_MAP_ENTRY_PLACE | IOMMU_MAP_ENTRY_UNMAPPED),
	    ("start entry flags %p", domain));
	iommu_gas_rb_remove(domain, entry);
	iommu_gas_free_entry(entry);

	entry = RB_MAX(iommu_gas_entries_tree, &domain->rb_root);
	KASSERT(entry->start == domain->end, ("end entry start %p", domain));
	KASSERT(entry->end == domain->end, ("end entry end %p", domain));
	KASSERT(entry->flags ==
	    (IOMMU_MAP_ENTRY_PLACE | IOMMU_MAP_ENTRY_UNMAPPED),
	    ("end entry flags %p", domain));
	iommu_gas_rb_remove(domain, entry);
	iommu_gas_free_entry(entry);
}

struct iommu_gas_match_args {
	iommu_gaddr_t size;
	int offset;
	const struct bus_dma_tag_common *common;
	u_int gas_flags;
	struct iommu_map_entry *entry;
};

/*
 * The interval [beg, end) is a free interval between two iommu_map_entries.
 * Addresses can be allocated only in the range [lbound, ubound].  Try to
 * allocate space in the free interval, subject to the conditions expressed
 * by 'a', and return 'true' if and only if the allocation attempt succeeds.
 */
static bool
iommu_gas_match_one(struct iommu_gas_match_args *a, iommu_gaddr_t beg,
    iommu_gaddr_t end, iommu_gaddr_t lbound, iommu_gaddr_t ubound)
{
	struct iommu_map_entry *entry;
	iommu_gaddr_t first, size, start;
	int offset;

	/*
	 * The prev->end is always aligned on the page size, which
	 * causes page alignment for the entry->start too.
	 *
	 * Create IOMMU_PAGE_SIZE gaps before and after the new entry
	 * to ensure that out-of-bounds accesses fault.
	 */
	beg = MAX(beg + IOMMU_PAGE_SIZE, lbound);
	start = roundup2(beg, a->common->alignment);
	if (start < beg)
		return (false);
	if (end < IOMMU_PAGE_SIZE + 1)
		return (false);
	end = MIN(end - IOMMU_PAGE_SIZE - 1, ubound);
	offset = a->offset;
	size = a->size;
	if (start + offset + size - 1 > end)
		return (false);

	/* Check for and try to skip past boundary crossing. */
	if (!vm_addr_bound_ok(start + offset, size, a->common->boundary)) {
		/*
		 * The region [start + offset, start + offset + size) crosses
		 * the boundary.  Check if there is enough space after the
		 * next boundary past beg.
		 */
		first = start;
		beg = roundup2(start + offset + 1, a->common->boundary);
		start = roundup2(beg, a->common->alignment);

		if (start + offset + size - 1 > end ||
		    !vm_addr_bound_ok(start + offset, size,
		    a->common->boundary)) {
			/*
			 * Not enough space to align at the requested
			 * boundary, or the boundary is smaller than the
			 * size, but splitting is allowed.  We already
			 * checked that start + size does not overlap ubound.
			 *
			 * XXXKIB. It is possible that beg is exactly at the
			 * start of the next entry, in which case we do not
			 * have a gap.  Ignore this for now.
			 */
			if ((a->gas_flags & IOMMU_MF_CANSPLIT) == 0)
				return (false);
			size = beg - first - offset;
			start = first;
		}
	}
	entry = a->entry;
	entry->start = start;
	entry->end = start + roundup2(size + offset, IOMMU_PAGE_SIZE);
	entry->flags = IOMMU_MAP_ENTRY_MAP;
	return (true);
}
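
/*
 * Worked example of the arithmetic above (hypothetical request, 4 KB
 * pages): for beg = 0x10000, lbound = 0, alignment = 0x8000,
 * offset = 0x100 and size = 0x300,
 *
 *	beg         = MAX(0x10000 + 0x1000, 0)           = 0x11000
 *	entry start = roundup2(0x11000, 0x8000)          = 0x18000
 *	entry end   = 0x18000 + roundup2(0x400, 0x1000)  = 0x19000
 *
 * so the entry becomes [0x18000, 0x19000), with at least one unmapped
 * guard page on each side.
 */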

/* Find the next entry that might abut a big-enough range. */
static struct iommu_map_entry *
iommu_gas_next(struct iommu_map_entry *curr, iommu_gaddr_t min_free)
{
	struct iommu_map_entry *next;

	if ((next = RB_RIGHT(curr, rb_entry)) != NULL &&
	    next->free_down >= min_free) {
		/* Find next entry in right subtree. */
		do
			curr = next;
		while ((next = RB_LEFT(curr, rb_entry)) != NULL &&
		    next->free_down >= min_free);
	} else {
		/* Find next entry in a left-parent ancestor. */
		while ((next = RB_PARENT(curr, rb_entry)) != NULL &&
		    curr == RB_RIGHT(next, rb_entry))
			curr = next;
		curr = next;
	}
	return (curr);
}
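
/*
 * In effect this is RB_NEXT() with pruning: a subtree whose root shows
 * free_down < min_free cannot contain a qualifying gap, so the walk
 * skips it wholesale instead of visiting every node.
 */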

/*
 * Address-ordered first-fit search of 'domain' for free space satisfying the
 * conditions of 'a'.  The space allocated is at least one page big, and is
 * bounded by guard pages to the left and right.  The allocated space for
 * 'domain' is described by an rb-tree of map entries at domain->rb_root, and
 * domain->start_gap points to a map entry less than or adjacent to the first
 * free-space of size at least 3 pages.
 */
static int
iommu_gas_find_space(struct iommu_domain *domain,
    struct iommu_gas_match_args *a)
{
	struct iommu_map_entry *curr, *first;
	iommu_gaddr_t addr, min_free;

	IOMMU_DOMAIN_ASSERT_LOCKED(domain);
	KASSERT(a->entry->flags == 0,
	    ("dirty entry %p %p", domain, a->entry));

	/*
	 * start_gap may point to an entry adjacent to gaps too small for any
	 * new allocation.  In that case, advance start_gap to the first free
	 * space big enough for a minimum allocation plus two guard pages.
	 */
	min_free = 3 * IOMMU_PAGE_SIZE;
	first = domain->start_gap;
	while (first != NULL && first->free_down < min_free)
		first = RB_PARENT(first, rb_entry);
	for (curr = first; curr != NULL;
	    curr = iommu_gas_next(curr, min_free)) {
		if ((first = RB_LEFT(curr, rb_entry)) != NULL &&
		    first->last + min_free <= curr->start)
			break;
		if ((first = RB_RIGHT(curr, rb_entry)) != NULL &&
		    curr->end + min_free <= first->first)
			break;
	}
	domain->start_gap = curr;

	/*
	 * If the subtree doesn't have free space for the requested allocation
	 * plus two guard pages, skip it.
	 */
	min_free = 2 * IOMMU_PAGE_SIZE +
	    roundup2(a->size + a->offset, IOMMU_PAGE_SIZE);
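
	/*
	 * Worked example (hypothetical request): with 4 KB pages,
	 * a->size = 0x1800 and a->offset = 0x100 give min_free =
	 * 2 * 0x1000 + roundup2(0x1900, 0x1000) = 0x4000.
	 */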

	/* Climb to find a node in the subtree of big-enough ranges. */
	first = curr;
	while (first != NULL && first->free_down < min_free)
		first = RB_PARENT(first, rb_entry);

	/*
	 * Walk the big-enough ranges tree until one satisfies alignment
	 * requirements, or would violate the lowaddr requirement.
	 */
	addr = a->common->lowaddr;
	for (curr = first; curr != NULL;
	    curr = iommu_gas_next(curr, min_free)) {
		if ((first = RB_LEFT(curr, rb_entry)) != NULL &&
		    iommu_gas_match_one(a, first->last, curr->start,
		    0, addr)) {
			RB_INSERT_PREV(iommu_gas_entries_tree,
			    &domain->rb_root, curr, a->entry);
			return (0);
		}
		if (curr->end >= addr) {
			/* All remaining ranges > addr */
			break;
		}
		if ((first = RB_RIGHT(curr, rb_entry)) != NULL &&
		    iommu_gas_match_one(a, curr->end, first->first,
		    0, addr)) {
			RB_INSERT_NEXT(iommu_gas_entries_tree,
			    &domain->rb_root, curr, a->entry);
			return (0);
		}
	}

	/*
	 * To resume the search at the start of the upper region, first climb
	 * to the nearest ancestor that spans highaddr.  Then find the last
	 * entry before highaddr that could abut a big-enough range.
	 */
	addr = a->common->highaddr;
	while (curr != NULL && curr->last < addr)
		curr = RB_PARENT(curr, rb_entry);
	first = NULL;
	while (curr != NULL && curr->free_down >= min_free) {
		if (addr < curr->end)
			curr = RB_LEFT(curr, rb_entry);
		else {
			first = curr;
			curr = RB_RIGHT(curr, rb_entry);
		}
	}

	/*
	 * Walk the remaining big-enough ranges until one satisfies alignment
	 * requirements.
	 */
	for (curr = first; curr != NULL;
	    curr = iommu_gas_next(curr, min_free)) {
		if ((first = RB_LEFT(curr, rb_entry)) != NULL &&
		    iommu_gas_match_one(a, first->last, curr->start,
		    addr + 1, domain->end - 1)) {
			RB_INSERT_PREV(iommu_gas_entries_tree,
			    &domain->rb_root, curr, a->entry);
			return (0);
		}
		if ((first = RB_RIGHT(curr, rb_entry)) != NULL &&
		    iommu_gas_match_one(a, curr->end, first->first,
		    addr + 1, domain->end - 1)) {
			RB_INSERT_NEXT(iommu_gas_entries_tree,
			    &domain->rb_root, curr, a->entry);
			return (0);
		}
	}

	return (ENOMEM);
}

static int
iommu_gas_alloc_region(struct iommu_domain *domain,
    struct iommu_map_entry *entry, u_int flags)
{
	struct iommu_map_entry *next, *prev;

	IOMMU_DOMAIN_ASSERT_LOCKED(domain);

	if ((entry->start & IOMMU_PAGE_MASK) != 0 ||
	    (entry->end & IOMMU_PAGE_MASK) != 0)
		return (EINVAL);
	if (entry->start >= entry->end)
		return (EINVAL);
	if (entry->end >= domain->end)
		return (EINVAL);

	entry->flags |= IOMMU_MAP_ENTRY_FAKE;
	next = RB_NFIND(iommu_gas_entries_tree, &domain->rb_root, entry);
	KASSERT(next != NULL, ("next must be non-null %p %jx", domain,
	    (uintmax_t)entry->start));
	prev = RB_PREV(iommu_gas_entries_tree, &domain->rb_root, next);
	/* prev may be NULL. */
	entry->flags &= ~IOMMU_MAP_ENTRY_FAKE;

	/*
	 * Adapt to broken BIOSes which specify overlapping RMRR
	 * entries.
	 *
	 * XXXKIB: this does not handle the case when the prev or next
	 * entry is completely covered by the current one, which
	 * extends both ways.
	 */
	if (prev != NULL && prev->end > entry->start &&
	    (prev->flags & IOMMU_MAP_ENTRY_PLACE) == 0) {
		if ((flags & IOMMU_MF_RMRR) == 0 ||
		    (prev->flags & IOMMU_MAP_ENTRY_RMRR) == 0)
			return (EBUSY);
		entry->start = prev->end;
	}
	if (next->start < entry->end &&
	    (next->flags & IOMMU_MAP_ENTRY_PLACE) == 0) {
		if ((flags & IOMMU_MF_RMRR) == 0 ||
		    (next->flags & IOMMU_MAP_ENTRY_RMRR) == 0)
			return (EBUSY);
		entry->end = next->start;
	}
	if (entry->end == entry->start)
		return (0);

	if (prev != NULL && prev->end > entry->start) {
		/* This assumes that prev is the placeholder entry. */
		iommu_gas_rb_remove(domain, prev);
		prev = NULL;
	}
	RB_INSERT_PREV(iommu_gas_entries_tree,
	    &domain->rb_root, next, entry);
	if (next->start < entry->end) {
		iommu_gas_rb_remove(domain, next);
		next = NULL;
	}

	if ((flags & IOMMU_MF_RMRR) != 0)
		entry->flags = IOMMU_MAP_ENTRY_RMRR;

#ifdef INVARIANTS
	struct iommu_map_entry *ip, *in;
	ip = RB_PREV(iommu_gas_entries_tree, &domain->rb_root, entry);
	in = RB_NEXT(iommu_gas_entries_tree, &domain->rb_root, entry);
	KASSERT(prev == NULL || ip == prev,
	    ("RMRR %p (%jx %jx) prev %p (%jx %jx) ins prev %p (%jx %jx)",
	    entry, entry->start, entry->end, prev,
	    prev == NULL ? 0 : prev->start, prev == NULL ? 0 : prev->end,
	    ip, ip == NULL ? 0 : ip->start, ip == NULL ? 0 : ip->end));
	KASSERT(next == NULL || in == next,
	    ("RMRR %p (%jx %jx) next %p (%jx %jx) ins next %p (%jx %jx)",
	    entry, entry->start, entry->end, next,
	    next == NULL ? 0 : next->start, next == NULL ? 0 : next->end,
	    in, in == NULL ? 0 : in->start, in == NULL ? 0 : in->end));
#endif

	return (0);
}
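
/*
 * Hypothetical example of the overlap handling above: given an existing
 * RMRR entry [0x8000, 0xa000) and a new IOMMU_MF_RMRR request
 * [0x9000, 0xc000), the new entry is clipped to [0xa000, 0xc000)
 * instead of failing with EBUSY, which is how overlapping BIOS-supplied
 * ranges end up merged.
 */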

void
iommu_gas_free_space(struct iommu_map_entry *entry)
{
	struct iommu_domain *domain;

	domain = entry->domain;
	KASSERT((entry->flags & (IOMMU_MAP_ENTRY_PLACE | IOMMU_MAP_ENTRY_RMRR |
	    IOMMU_MAP_ENTRY_MAP)) == IOMMU_MAP_ENTRY_MAP,
	    ("permanent entry %p %p", domain, entry));

	IOMMU_DOMAIN_LOCK(domain);
	iommu_gas_rb_remove(domain, entry);
	entry->flags &= ~IOMMU_MAP_ENTRY_MAP;
#ifdef INVARIANTS
	if (iommu_check_free)
		iommu_gas_check_free(domain);
#endif
	IOMMU_DOMAIN_UNLOCK(domain);
}

void
iommu_gas_free_region(struct iommu_map_entry *entry)
{
	struct iommu_domain *domain;

	domain = entry->domain;
	KASSERT((entry->flags & (IOMMU_MAP_ENTRY_PLACE | IOMMU_MAP_ENTRY_RMRR |
	    IOMMU_MAP_ENTRY_MAP)) == IOMMU_MAP_ENTRY_RMRR,
	    ("non-RMRR entry %p %p", domain, entry));

	IOMMU_DOMAIN_LOCK(domain);
	if (entry != domain->first_place &&
	    entry != domain->last_place)
		iommu_gas_rb_remove(domain, entry);
	entry->flags &= ~IOMMU_MAP_ENTRY_RMRR;
	IOMMU_DOMAIN_UNLOCK(domain);
}

static struct iommu_map_entry *
iommu_gas_remove_clip_left(struct iommu_domain *domain, iommu_gaddr_t start,
    iommu_gaddr_t end, struct iommu_map_entry **r)
{
	struct iommu_map_entry *entry, *res, fentry;

	IOMMU_DOMAIN_ASSERT_LOCKED(domain);
	MPASS(start <= end);
	MPASS(end <= domain->end);

	/*
	 * Find the entry which contains the supplied guest address
	 * start, or the first entry after it.  Since we asserted that
	 * start is below the domain end, such an entry must exist.
	 * Then clip it if needed.
	 */
	bzero(&fentry, sizeof(fentry));
	fentry.start = start + 1;
	fentry.end = start + 1;
	fentry.flags = IOMMU_MAP_ENTRY_FAKE;
	entry = RB_NFIND(iommu_gas_entries_tree, &domain->rb_root, &fentry);

	if (entry->start >= start ||
	    (entry->flags & IOMMU_MAP_ENTRY_RMRR) != 0)
		return (entry);

	res = *r;
	*r = NULL;
	*res = *entry;
	res->start = entry->end = start;
	RB_UPDATE_AUGMENT(entry, rb_entry);
	RB_INSERT_NEXT(iommu_gas_entries_tree,
	    &domain->rb_root, entry, res);
	return (res);
}

static bool
iommu_gas_remove_clip_right(struct iommu_domain *domain,
    iommu_gaddr_t end, struct iommu_map_entry *entry,
    struct iommu_map_entry *r)
{
	if (entry->start >= end || (entry->flags & IOMMU_MAP_ENTRY_RMRR) != 0)
		return (false);

	*r = *entry;
	r->end = entry->start = end;
	RB_UPDATE_AUGMENT(entry, rb_entry);
	RB_INSERT_PREV(iommu_gas_entries_tree,
	    &domain->rb_root, entry, r);
	return (true);
}

static void
iommu_gas_remove_unmap(struct iommu_domain *domain,
    struct iommu_map_entry *entry, struct iommu_map_entries_tailq *gcp)
{
	IOMMU_DOMAIN_ASSERT_LOCKED(domain);

	if ((entry->flags & (IOMMU_MAP_ENTRY_UNMAPPED |
	    IOMMU_MAP_ENTRY_REMOVING)) != 0)
		return;
	MPASS((entry->flags & IOMMU_MAP_ENTRY_PLACE) == 0);
	entry->flags |= IOMMU_MAP_ENTRY_REMOVING;
	TAILQ_INSERT_TAIL(gcp, entry, dmamap_link);
}

/*
 * Remove the specified range from the GAS of the domain.  Note that
 * the removal is not guaranteed to have occurred upon function return;
 * it might be finalized some time later, when the hardware reports
 * that the (queued) IOTLB invalidation has been performed.
 */
void
iommu_gas_remove(struct iommu_domain *domain, iommu_gaddr_t start,
    iommu_gaddr_t size)
{
	struct iommu_map_entry *entry, *nentry, *r1, *r2;
	struct iommu_map_entries_tailq gc;
	iommu_gaddr_t end;

	end = start + size;
	r1 = iommu_gas_alloc_entry(domain, IOMMU_PGF_WAITOK);
	r2 = iommu_gas_alloc_entry(domain, IOMMU_PGF_WAITOK);
	TAILQ_INIT(&gc);

	IOMMU_DOMAIN_LOCK(domain);

	nentry = iommu_gas_remove_clip_left(domain, start, end, &r1);
	RB_FOREACH_FROM(entry, iommu_gas_entries_tree, nentry) {
		if (entry->start >= end)
			break;
		KASSERT(start <= entry->start,
		    ("iommu_gas_remove entry (%#jx, %#jx) start %#jx",
		    entry->start, entry->end, start));
		if ((entry->flags & IOMMU_MAP_ENTRY_RMRR) != 0)
			continue;
		iommu_gas_remove_unmap(domain, entry, &gc);
	}
	if (iommu_gas_remove_clip_right(domain, end, entry, r2)) {
		iommu_gas_remove_unmap(domain, r2, &gc);
		r2 = NULL;
	}

#ifdef INVARIANTS
	RB_FOREACH(entry, iommu_gas_entries_tree, &domain->rb_root) {
		if ((entry->flags & IOMMU_MAP_ENTRY_RMRR) != 0)
			continue;
		KASSERT(entry->end <= start || entry->start >= end,
		    ("iommu_gas_remove leftover entry (%#jx, %#jx) range "
		    "(%#jx, %#jx)",
		    entry->start, entry->end, start, end));
	}
#endif

	IOMMU_DOMAIN_UNLOCK(domain);
	if (r1 != NULL)
		iommu_gas_free_entry(r1);
	if (r2 != NULL)
		iommu_gas_free_entry(r2);
	iommu_domain_unload(domain, &gc, true);
}
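
/*
 * Clipping sketch with hypothetical numbers: removing with start =
 * 0x3000 from an entry [0x2000, 0x7000) trims that entry to
 * [0x2000, 0x3000) and inserts the remainder [0x3000, 0x7000) after
 * it; on the other side, clipping an entry [0x6000, 0x9000) at
 * end = 0x8000 shrinks it to [0x8000, 0x9000) and queues the
 * split-off piece [0x6000, 0x8000) for unmapping.
 */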

int
iommu_gas_map(struct iommu_domain *domain,
    const struct bus_dma_tag_common *common, iommu_gaddr_t size, int offset,
    u_int eflags, u_int flags, vm_page_t *ma, struct iommu_map_entry **res)
{
	struct iommu_gas_match_args a;
	struct iommu_map_entry *entry;
	int error;

	KASSERT((flags & ~(IOMMU_MF_CANWAIT | IOMMU_MF_CANSPLIT)) == 0,
	    ("invalid flags 0x%x", flags));

	a.size = size;
	a.offset = offset;
	a.common = common;
	a.gas_flags = flags;
	entry = iommu_gas_alloc_entry(domain,
	    (flags & IOMMU_MF_CANWAIT) != 0 ? IOMMU_PGF_WAITOK : 0);
	if (entry == NULL)
		return (ENOMEM);
	a.entry = entry;
	IOMMU_DOMAIN_LOCK(domain);
	error = iommu_gas_find_space(domain, &a);
	if (error == ENOMEM) {
		IOMMU_DOMAIN_UNLOCK(domain);
		iommu_gas_free_entry(entry);
		return (error);
	}
#ifdef INVARIANTS
	if (iommu_check_free)
		iommu_gas_check_free(domain);
#endif
	KASSERT(error == 0,
	    ("unexpected error %d from iommu_gas_find_space", error));
	KASSERT(entry->end < domain->end, ("allocated GPA %jx, max GPA %jx",
	    (uintmax_t)entry->end, (uintmax_t)domain->end));
	entry->flags |= eflags;
	IOMMU_DOMAIN_UNLOCK(domain);

	error = domain->ops->map(domain, entry->start,
	    entry->end - entry->start, ma, eflags,
	    ((flags & IOMMU_MF_CANWAIT) != 0 ? IOMMU_PGF_WAITOK : 0));
	if (error == ENOMEM) {
		iommu_domain_unload_entry(entry, true,
		    (flags & IOMMU_MF_CANWAIT) != 0);
		return (error);
	}
	KASSERT(error == 0,
	    ("unexpected error %d from domain_map_buf", error));

	*res = entry;
	return (0);
}
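
/*
 * Hypothetical caller sketch (identifiers invented for illustration):
 *
 *	struct iommu_map_entry *e;
 *	int error;
 *
 *	error = iommu_gas_map(domain, &tag->common, ptoa(npages), 0,
 *	    IOMMU_MAP_ENTRY_READ | IOMMU_MAP_ENTRY_WRITE,
 *	    IOMMU_MF_CANWAIT, ma, &e);
 *
 * On success the pages in ma[] are mapped starting at e->start, and
 * the entry stays live until iommu_gas_free_space() releases the range.
 */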

int
iommu_gas_map_region(struct iommu_domain *domain, struct iommu_map_entry *entry,
    u_int eflags, u_int flags, vm_page_t *ma)
{
	iommu_gaddr_t start;
	int error;

	KASSERT(entry->domain == domain,
	    ("mismatched domain %p entry %p entry->domain %p", domain,
	    entry, entry->domain));
	KASSERT(entry->flags == 0, ("used RMRR entry %p %p %x", domain,
	    entry, entry->flags));
	KASSERT((flags & ~(IOMMU_MF_CANWAIT | IOMMU_MF_RMRR)) == 0,
	    ("invalid flags 0x%x", flags));

	start = entry->start;
	IOMMU_DOMAIN_LOCK(domain);
	error = iommu_gas_alloc_region(domain, entry, flags);
	if (error != 0) {
		IOMMU_DOMAIN_UNLOCK(domain);
		return (error);
	}
	entry->flags |= eflags;
	IOMMU_DOMAIN_UNLOCK(domain);
	if (entry->end == entry->start)
		return (0);

	error = domain->ops->map(domain, entry->start,
	    entry->end - entry->start, ma + OFF_TO_IDX(start - entry->start),
	    eflags, ((flags & IOMMU_MF_CANWAIT) != 0 ? IOMMU_PGF_WAITOK : 0));
	if (error == ENOMEM) {
		iommu_domain_unload_entry(entry, false,
		    (flags & IOMMU_MF_CANWAIT) != 0);
		return (error);
	}
	KASSERT(error == 0,
	    ("unexpected error %d from domain_map_buf", error));

	return (0);
}

static int
iommu_gas_reserve_region_locked(struct iommu_domain *domain,
    iommu_gaddr_t start, iommu_gaddr_t end, struct iommu_map_entry *entry)
{
	int error;

	IOMMU_DOMAIN_ASSERT_LOCKED(domain);

	entry->start = start;
	entry->end = end;
	error = iommu_gas_alloc_region(domain, entry, IOMMU_MF_CANWAIT);
	if (error == 0)
		entry->flags |= IOMMU_MAP_ENTRY_UNMAPPED;
	return (error);
}

int
iommu_gas_reserve_region(struct iommu_domain *domain, iommu_gaddr_t start,
    iommu_gaddr_t end, struct iommu_map_entry **entry0)
{
	struct iommu_map_entry *entry;
	int error;

	entry = iommu_gas_alloc_entry(domain, IOMMU_PGF_WAITOK);
	IOMMU_DOMAIN_LOCK(domain);
	error = iommu_gas_reserve_region_locked(domain, start, end, entry);
	IOMMU_DOMAIN_UNLOCK(domain);
	if (error != 0)
		iommu_gas_free_entry(entry);
	else if (entry0 != NULL)
		*entry0 = entry;
	return (error);
}

/*
 * As in iommu_gas_reserve_region, reserve [start, end), but allow for existing
 * entries.
 */
int
iommu_gas_reserve_region_extend(struct iommu_domain *domain,
    iommu_gaddr_t start, iommu_gaddr_t end)
{
	struct iommu_map_entry *entry, *next, *prev, key = {};
	iommu_gaddr_t entry_start, entry_end;
	int error;

	error = 0;
	entry = NULL;
	end = ummin(end, domain->end);
	while (start < end) {
		/* Preallocate an entry. */
		if (entry == NULL)
			entry = iommu_gas_alloc_entry(domain,
			    IOMMU_PGF_WAITOK);
		/* Calculate the free region from here to the next entry. */
		key.start = key.end = start;
		IOMMU_DOMAIN_LOCK(domain);
		next = RB_NFIND(iommu_gas_entries_tree, &domain->rb_root, &key);
		KASSERT(next != NULL, ("domain %p with end %#jx has no entry "
		    "after %#jx", domain, (uintmax_t)domain->end,
		    (uintmax_t)start));
		entry_end = ummin(end, next->start);
		prev = RB_PREV(iommu_gas_entries_tree, &domain->rb_root, next);
		if (prev != NULL)
			entry_start = ummax(start, prev->end);
		else
			entry_start = start;
		start = next->end;
		/* Reserve the region if non-empty. */
		if (entry_start != entry_end) {
			error = iommu_gas_reserve_region_locked(domain,
			    entry_start, entry_end, entry);
			if (error != 0) {
				IOMMU_DOMAIN_UNLOCK(domain);
				break;
			}
			entry = NULL;
		}
		IOMMU_DOMAIN_UNLOCK(domain);
	}
	/* Release a preallocated entry if it was not used. */
	if (entry != NULL)
		iommu_gas_free_entry(entry);
	return (error);
}
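
/*
 * Hypothetical usage sketch: to keep the low 1 MB of device-visible
 * space out of circulation even when some entries already sit there,
 * a driver could call
 *
 *	error = iommu_gas_reserve_region_extend(domain, 0, 0x100000);
 *
 * Only the still-free gaps inside [0, 0x100000) are reserved; existing
 * entries in the range are skipped rather than causing EBUSY.
 */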

void
iommu_unmap_msi(struct iommu_ctx *ctx)
{
	struct iommu_map_entry *entry;
	struct iommu_domain *domain;

	domain = ctx->domain;
	entry = domain->msi_entry;
	if (entry == NULL)
		return;

	domain->ops->unmap(domain, entry->start, entry->end -
	    entry->start, IOMMU_PGF_WAITOK);

	iommu_gas_free_space(entry);

	iommu_gas_free_entry(entry);

	domain->msi_entry = NULL;
	domain->msi_base = 0;
	domain->msi_phys = 0;
}

int
iommu_map_msi(struct iommu_ctx *ctx, iommu_gaddr_t size, int offset,
    u_int eflags, u_int flags, vm_page_t *ma)
{
	struct iommu_domain *domain;
	struct iommu_map_entry *entry;
	int error;

	error = 0;
	domain = ctx->domain;

	/* Check if there is already an MSI page allocated. */
	IOMMU_DOMAIN_LOCK(domain);
	entry = domain->msi_entry;
	IOMMU_DOMAIN_UNLOCK(domain);

	if (entry == NULL) {
		error = iommu_gas_map(domain, &ctx->tag->common, size, offset,
		    eflags, flags, ma, &entry);
		IOMMU_DOMAIN_LOCK(domain);
		if (error == 0) {
			if (domain->msi_entry == NULL) {
				MPASS(domain->msi_base == 0);
				MPASS(domain->msi_phys == 0);

				domain->msi_entry = entry;
				domain->msi_base = entry->start;
				domain->msi_phys = VM_PAGE_TO_PHYS(ma[0]);
			} else {
				/*
				 * We lost the race and already have an
				 * MSI page allocated. Free the unneeded entry.
				 */
				iommu_gas_free_entry(entry);
			}
		} else if (domain->msi_entry != NULL) {
			/*
			 * The allocation failed, but another succeeded.
			 * Return success as there is a valid MSI page.
			 */
			error = 0;
		}
		IOMMU_DOMAIN_UNLOCK(domain);
	}

	return (error);
}

void
iommu_translate_msi(struct iommu_domain *domain, uint64_t *addr)
{

	*addr = (*addr - domain->msi_phys) + domain->msi_base;

	KASSERT(*addr >= domain->msi_entry->start,
	    ("%s: Address is below the MSI entry start address (%jx < %jx)",
	    __func__, (uintmax_t)*addr, (uintmax_t)domain->msi_entry->start));

	KASSERT(*addr + sizeof(*addr) <= domain->msi_entry->end,
	    ("%s: Address is above the MSI entry end address (%jx < %jx)",
	    __func__, (uintmax_t)*addr, (uintmax_t)domain->msi_entry->end));
}
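
/*
 * Worked example with hypothetical values: if the MSI doorbell page
 * sits at msi_phys = 0x9d000000 and the GAS placed it at msi_base =
 * 0x100000, a device-programmed address of 0x9d000040 is rewritten to
 * (0x9d000040 - 0x9d000000) + 0x100000 = 0x100040.
 */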

SYSCTL_NODE(_hw, OID_AUTO, iommu, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "");

#ifdef INVARIANTS
SYSCTL_INT(_hw_iommu, OID_AUTO, check_free, CTLFLAG_RWTUN,
    &iommu_check_free, 0,
    "Check the GPA RB tree for free_down validity");
#endif

#include "opt_ddb.h"
#ifdef DDB

#include <ddb/ddb.h>

static void
iommu_debug_dump_gas(struct iommu_domain *domain)
{
	struct iommu_map_entry *entry;

	db_printf("iommu_domain %p tree %p iommu %p fl %#x\n", domain,
	    &domain->rb_root, domain->iommu, domain->flags);
	RB_FOREACH(entry, iommu_gas_entries_tree, &domain->rb_root) {
		db_printf(
	    "  e %p [%#jx %#jx] fl %#x first %#jx last %#jx free_down %#jx",
		    entry, (uintmax_t)entry->start, (uintmax_t)entry->end,
		    entry->flags,
		    (uintmax_t)entry->first, (uintmax_t)entry->last,
		    (uintmax_t)entry->free_down);
		if (entry == domain->start_gap)
			db_printf(" start_gap");
		if (entry == domain->first_place)
			db_printf(" first_place");
		if (entry == domain->last_place)
			db_printf(" last_place");
		db_printf("\n");
	}
}

DB_SHOW_COMMAND(iommu_domain, iommu_domain_show)
{
	struct iommu_domain *domain;

	if (!have_addr) {
		db_printf("show iommu_domain addr\n");
		return;
	}

	domain = (void *)addr;
	iommu_debug_dump_gas(domain);
}

#endif