xref: /freebsd/sys/dev/iommu/iommu_gas.c (revision 78cd75393ec79565c63927bf200f06f839a1dc05)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2013 The FreeBSD Foundation
5  *
6  * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
7  * under sponsorship from the FreeBSD Foundation.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #define	RB_AUGMENT_CHECK(entry) iommu_gas_augment_entry(entry)
32 
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/malloc.h>
36 #include <sys/bus.h>
37 #include <sys/interrupt.h>
38 #include <sys/kernel.h>
39 #include <sys/ktr.h>
40 #include <sys/lock.h>
41 #include <sys/proc.h>
42 #include <sys/rwlock.h>
43 #include <sys/memdesc.h>
44 #include <sys/mutex.h>
45 #include <sys/sysctl.h>
46 #include <sys/rman.h>
47 #include <sys/taskqueue.h>
48 #include <sys/tree.h>
49 #include <sys/uio.h>
50 #include <sys/vmem.h>
51 #include <vm/vm.h>
52 #include <vm/vm_extern.h>
53 #include <vm/vm_kern.h>
54 #include <vm/vm_object.h>
55 #include <vm/vm_page.h>
56 #include <vm/vm_map.h>
57 #include <vm/uma.h>
58 #include <dev/pci/pcireg.h>
59 #include <dev/pci/pcivar.h>
60 #include <dev/iommu/iommu.h>
61 #include <dev/iommu/iommu_gas.h>
62 #include <dev/iommu/iommu_msi.h>
63 #include <machine/atomic.h>
64 #include <machine/bus.h>
65 #include <machine/md_var.h>
66 #include <machine/iommu.h>
67 #include <dev/iommu/busdma_iommu.h>
68 
69 /*
70  * Guest Address Space management.
71  */
72 
73 static uma_zone_t iommu_map_entry_zone;
74 
75 #ifdef INVARIANTS
76 static int iommu_check_free;
77 #endif
78 
79 static void
80 intel_gas_init(void)
81 {
82 
83 	iommu_map_entry_zone = uma_zcreate("IOMMU_MAP_ENTRY",
84 	    sizeof(struct iommu_map_entry), NULL, NULL,
85 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NODUMP);
86 }
87 SYSINIT(intel_gas, SI_SUB_DRIVERS, SI_ORDER_FIRST, intel_gas_init, NULL);
88 
89 struct iommu_map_entry *
90 iommu_gas_alloc_entry(struct iommu_domain *domain, u_int flags)
91 {
92 	struct iommu_map_entry *res;
93 
94 	KASSERT((flags & ~(IOMMU_PGF_WAITOK)) == 0,
95 	    ("unsupported flags %x", flags));
96 
97 	res = uma_zalloc(iommu_map_entry_zone, ((flags & IOMMU_PGF_WAITOK) !=
98 	    0 ? M_WAITOK : M_NOWAIT) | M_ZERO);
99 	if (res != NULL && domain != NULL) {
100 		res->domain = domain;
101 		atomic_add_int(&domain->entries_cnt, 1);
102 	}
103 	return (res);
104 }
105 
106 void
107 iommu_gas_free_entry(struct iommu_map_entry *entry)
108 {
109 	struct iommu_domain *domain;
110 
111 	domain = entry->domain;
112 	if (domain != NULL)
113 		atomic_subtract_int(&domain->entries_cnt, 1);
114 	uma_zfree(iommu_map_entry_zone, entry);
115 }
116 
117 static int
118 iommu_gas_cmp_entries(struct iommu_map_entry *a, struct iommu_map_entry *b)
119 {
120 
121 	/* First and last entries have zero size, so <= */
122 	KASSERT(a->start <= a->end, ("inverted entry %p (%jx, %jx)",
123 	    a, (uintmax_t)a->start, (uintmax_t)a->end));
124 	KASSERT(b->start <= b->end, ("inverted entry %p (%jx, %jx)",
125 	    b, (uintmax_t)b->start, (uintmax_t)b->end));
126 	KASSERT(((a->flags | b->flags) & IOMMU_MAP_ENTRY_FAKE) != 0 ||
127 	    a->end <= b->start || b->end <= a->start ||
128 	    a->end == a->start || b->end == b->start,
129 	    ("overlapping entries %p (%jx, %jx) f %#x %p (%jx, %jx) f %#x"
130 	    " domain %p %p",
131 	    a, (uintmax_t)a->start, (uintmax_t)a->end, a->flags,
132 	    b, (uintmax_t)b->start, (uintmax_t)b->end, b->flags,
133 	    a->domain, b->domain));
134 
135 	if (a->end < b->end)
136 		return (-1);
137 	else if (b->end < a->end)
138 		return (1);
139 	return (0);
140 }
141 
142 /*
143  * Update augmentation data based on data from children.
144  * Return true if and only if the update changes the augmentation data.
145  */
146 static bool
147 iommu_gas_augment_entry(struct iommu_map_entry *entry)
148 {
149 	struct iommu_map_entry *child;
150 	iommu_gaddr_t bound, delta, free_down;
151 
152 	free_down = 0;
153 	bound = entry->start;
154 	if ((child = RB_LEFT(entry, rb_entry)) != NULL) {
155 		free_down = MAX(child->free_down, bound - child->last);
156 		bound = child->first;
157 	}
158 	delta = bound - entry->first;
159 	entry->first = bound;
160 	bound = entry->end;
161 	if ((child = RB_RIGHT(entry, rb_entry)) != NULL) {
162 		free_down = MAX(free_down, child->free_down);
163 		free_down = MAX(free_down, child->first - bound);
164 		bound = child->last;
165 	}
166 	delta += entry->last - bound;
167 	if (delta == 0)
168 		delta = entry->free_down - free_down;
169 	entry->last = bound;
170 	entry->free_down = free_down;
171 
172 	/*
173 	 * Return true either if the value of last-first changed,
174 	 * or if free_down changed.
175 	 */
176 	return (delta != 0);
177 }
178 
179 RB_GENERATE(iommu_gas_entries_tree, iommu_map_entry, rb_entry,
180     iommu_gas_cmp_entries);
181 
182 #ifdef INVARIANTS
183 static void
184 iommu_gas_check_free(struct iommu_domain *domain)
185 {
186 	struct iommu_map_entry *entry, *l, *r;
187 	iommu_gaddr_t v;
188 
189 	RB_FOREACH(entry, iommu_gas_entries_tree, &domain->rb_root) {
190 		KASSERT(domain == entry->domain,
191 		    ("mismatched free domain %p entry %p entry->domain %p",
192 		    domain, entry, entry->domain));
193 		l = RB_LEFT(entry, rb_entry);
194 		r = RB_RIGHT(entry, rb_entry);
195 		v = 0;
196 		if (l != NULL) {
197 			v = MAX(v, l->free_down);
198 			v = MAX(v, entry->start - l->last);
199 		}
200 		if (r != NULL) {
201 			v = MAX(v, r->free_down);
202 			v = MAX(v, r->first - entry->end);
203 		}
204 		MPASS(entry->free_down == v);
205 	}
206 }
207 #endif
208 
209 static void
210 iommu_gas_rb_remove(struct iommu_domain *domain, struct iommu_map_entry *entry)
211 {
212 	struct iommu_map_entry *nbr;
213 
214 	/* Removing entry may open a new free gap before domain->start_gap. */
215 	if (entry->end <= domain->start_gap->end) {
216 		if (RB_RIGHT(entry, rb_entry) != NULL)
217 			nbr = iommu_gas_entries_tree_RB_NEXT(entry);
218 		else if (RB_LEFT(entry, rb_entry) != NULL)
219 			nbr = RB_LEFT(entry, rb_entry);
220 		else
221 			nbr = RB_PARENT(entry, rb_entry);
222 		domain->start_gap = nbr;
223 	}
224 	RB_REMOVE(iommu_gas_entries_tree, &domain->rb_root, entry);
225 }
226 
227 struct iommu_domain *
228 iommu_get_ctx_domain(struct iommu_ctx *ctx)
229 {
230 
231 	return (ctx->domain);
232 }
233 
234 void
235 iommu_gas_init_domain(struct iommu_domain *domain)
236 {
237 	struct iommu_map_entry *begin, *end;
238 
239 	begin = iommu_gas_alloc_entry(domain, IOMMU_PGF_WAITOK);
240 	end = iommu_gas_alloc_entry(domain, IOMMU_PGF_WAITOK);
241 
242 	IOMMU_DOMAIN_LOCK(domain);
243 	KASSERT(domain->entries_cnt == 2, ("dirty domain %p", domain));
244 	KASSERT(RB_EMPTY(&domain->rb_root),
245 	    ("non-empty entries %p", domain));
246 
247 	end->start = domain->end;
248 	end->end = domain->end;
249 	end->flags = IOMMU_MAP_ENTRY_PLACE | IOMMU_MAP_ENTRY_UNMAPPED;
250 	RB_INSERT(iommu_gas_entries_tree, &domain->rb_root, end);
251 
252 	begin->start = 0;
253 	begin->end = 0;
254 	begin->flags = IOMMU_MAP_ENTRY_PLACE | IOMMU_MAP_ENTRY_UNMAPPED;
255 	RB_INSERT_PREV(iommu_gas_entries_tree, &domain->rb_root, end, begin);
256 	iommu_gas_augment_entry(end);
257 	iommu_gas_augment_entry(begin);
258 
259 	domain->start_gap = begin;
260 	domain->first_place = begin;
261 	domain->last_place = end;
262 	domain->flags |= IOMMU_DOMAIN_GAS_INITED;
263 	IOMMU_DOMAIN_UNLOCK(domain);
264 }
265 
266 void
267 iommu_gas_fini_domain(struct iommu_domain *domain)
268 {
269 	struct iommu_map_entry *entry;
270 
271 	IOMMU_DOMAIN_ASSERT_LOCKED(domain);
272 	KASSERT(domain->entries_cnt == 2,
273 	    ("domain still in use %p", domain));
274 
275 	entry = RB_MIN(iommu_gas_entries_tree, &domain->rb_root);
276 	KASSERT(entry->start == 0, ("start entry start %p", domain));
277 	KASSERT(entry->end == IOMMU_PAGE_SIZE, ("start entry end %p", domain));
278 	KASSERT(entry->flags ==
279 	    (IOMMU_MAP_ENTRY_PLACE | IOMMU_MAP_ENTRY_UNMAPPED),
280 	    ("start entry flags %p", domain));
281 	iommu_gas_rb_remove(domain, entry);
282 	iommu_gas_free_entry(entry);
283 
284 	entry = RB_MAX(iommu_gas_entries_tree, &domain->rb_root);
285 	KASSERT(entry->start == domain->end, ("end entry start %p", domain));
286 	KASSERT(entry->end == domain->end, ("end entry end %p", domain));
287 	KASSERT(entry->flags ==
288 	    (IOMMU_MAP_ENTRY_PLACE | IOMMU_MAP_ENTRY_UNMAPPED),
289 	    ("end entry flags %p", domain));
290 	iommu_gas_rb_remove(domain, entry);
291 	iommu_gas_free_entry(entry);
292 }
293 
294 struct iommu_gas_match_args {
295 	iommu_gaddr_t size;
296 	int offset;
297 	const struct bus_dma_tag_common *common;
298 	u_int gas_flags;
299 	struct iommu_map_entry *entry;
300 };
301 
302 /*
303  * The interval [beg, end) is a free interval between two iommu_map_entries.
304  * Addresses can be allocated only in the range [lbound, ubound]. Try to
305  * allocate space in the free interval, subject to the conditions expressed by
306  * a, and return 'true' if and only if the allocation attempt succeeds.
307  */
308 static bool
309 iommu_gas_match_one(struct iommu_gas_match_args *a, iommu_gaddr_t beg,
310     iommu_gaddr_t end, iommu_gaddr_t lbound, iommu_gaddr_t ubound)
311 {
312 	struct iommu_map_entry *entry;
313 	iommu_gaddr_t first, size, start;
314 	int offset;
315 
316 	/*
317 	 * The prev->end is always aligned on the page size, which
318 	 * causes page alignment for the entry->start too.
319 	 *
320 	 * Create IOMMU_PAGE_SIZE gaps before, after new entry
321 	 * to ensure that out-of-bounds accesses fault.
322 	 */
323 	beg = MAX(beg + IOMMU_PAGE_SIZE, lbound);
324 	start = roundup2(beg, a->common->alignment);
325 	if (start < beg)
326 		return (false);
327 	if (end < IOMMU_PAGE_SIZE + 1)
328 		return (false);
329 	end = MIN(end - IOMMU_PAGE_SIZE - 1, ubound);
330 	offset = a->offset;
331 	size = a->size;
332 	if (start + offset + size - 1 > end)
333 		return (false);
334 
335 	/* Check for and try to skip past boundary crossing. */
336 	if (!vm_addr_bound_ok(start + offset, size, a->common->boundary)) {
337 		/*
338 		 * The start + offset to start + offset + size region crosses
339 		 * the boundary.  Check if there is enough space after the next
340 		 * boundary after the beg.
341 		 */
342 		first = start;
343 		beg = roundup2(start + offset + 1, a->common->boundary);
344 		start = roundup2(beg, a->common->alignment);
345 
346 		if (start + offset + size - 1 > end ||
347 		    !vm_addr_bound_ok(start + offset, size,
348 		    a->common->boundary)) {
349 			/*
350 			 * Not enough space to align at the requested boundary,
351 			 * or boundary is smaller than the size, but allowed to
352 			 * split.  We already checked that start + size does not
353 			 * overlap ubound.
354 			 *
355 			 * XXXKIB. It is possible that beg is exactly at the
356 			 * start of the next entry, then we do not have gap.
357 			 * Ignore for now.
358 			 */
359 			if ((a->gas_flags & IOMMU_MF_CANSPLIT) == 0)
360 				return (false);
361 			size = beg - first - offset;
362 			start = first;
363 		}
364 	}
365 	entry = a->entry;
366 	entry->start = start;
367 	entry->end = start + roundup2(size + offset, IOMMU_PAGE_SIZE);
368 	entry->flags = IOMMU_MAP_ENTRY_MAP;
369 	return (true);
370 }
371 
372 /* Find the next entry that might abut a big-enough range. */
373 static struct iommu_map_entry *
374 iommu_gas_next(struct iommu_map_entry *curr, iommu_gaddr_t min_free)
375 {
376 	struct iommu_map_entry *next;
377 
378 	if ((next = RB_RIGHT(curr, rb_entry)) != NULL &&
379 	    next->free_down >= min_free) {
380 		/* Find next entry in right subtree. */
381 		do
382 			curr = next;
383 		while ((next = RB_LEFT(curr, rb_entry)) != NULL &&
384 		    next->free_down >= min_free);
385 	} else {
386 		/* Find next entry in a left-parent ancestor. */
387 		while ((next = RB_PARENT(curr, rb_entry)) != NULL &&
388 		    curr == RB_RIGHT(next, rb_entry))
389 			curr = next;
390 		curr = next;
391 	}
392 	return (curr);
393 }
394 
395 /*
396  * Address-ordered first-fit search of 'domain' for free space satisfying the
397  * conditions of 'a'.  The space allocated is at least one page big, and is
398  * bounded by guard pages to the left and right.  The allocated space for
399  * 'domain' is described by an rb-tree of map entries at domain->rb_root, and
400  * domain->start_gap points to a map entry less than or adjacent to the first
401  * free-space of size at least 3 pages.
402  */
403 static int
404 iommu_gas_find_space(struct iommu_domain *domain,
405     struct iommu_gas_match_args *a)
406 {
407 	struct iommu_map_entry *curr, *first;
408 	iommu_gaddr_t addr, min_free;
409 
410 	IOMMU_DOMAIN_ASSERT_LOCKED(domain);
411 	KASSERT(a->entry->flags == 0,
412 	    ("dirty entry %p %p", domain, a->entry));
413 
414 	/*
415 	 * start_gap may point to an entry adjacent to gaps too small for any
416 	 * new allocation.  In that case, advance start_gap to the first free
417 	 * space big enough for a minimum allocation plus two guard pages.
418 	 */
419 	min_free = 3 * IOMMU_PAGE_SIZE;
420 	first = domain->start_gap;
421 	while (first != NULL && first->free_down < min_free)
422 		first = RB_PARENT(first, rb_entry);
423 	for (curr = first; curr != NULL;
424 	    curr = iommu_gas_next(curr, min_free)) {
425 		if ((first = RB_LEFT(curr, rb_entry)) != NULL &&
426 		    first->last + min_free <= curr->start)
427 			break;
428 		if ((first = RB_RIGHT(curr, rb_entry)) != NULL &&
429 		    curr->end + min_free <= first->first)
430 			break;
431 	}
432 	domain->start_gap = curr;
433 
434 	/*
435 	 * If the subtree doesn't have free space for the requested allocation
436 	 * plus two guard pages, skip it.
437 	 */
438 	min_free = 2 * IOMMU_PAGE_SIZE +
439 	    roundup2(a->size + a->offset, IOMMU_PAGE_SIZE);
440 
441 	/* Climb to find a node in the subtree of big-enough ranges. */
442 	first = curr;
443 	while (first != NULL && first->free_down < min_free)
444 		first = RB_PARENT(first, rb_entry);
445 
446 	/*
447 	 * Walk the big-enough ranges tree until one satisfies alignment
448 	 * requirements, or violates lowaddr address requirement.
449 	 */
450 	addr = a->common->lowaddr;
451 	for (curr = first; curr != NULL;
452 	    curr = iommu_gas_next(curr, min_free)) {
453 		if ((first = RB_LEFT(curr, rb_entry)) != NULL &&
454 		    iommu_gas_match_one(a, first->last, curr->start,
455 		    0, addr)) {
456 			RB_INSERT_PREV(iommu_gas_entries_tree,
457 			    &domain->rb_root, curr, a->entry);
458 			return (0);
459 		}
460 		if (curr->end >= addr) {
461 			/* All remaining ranges > addr */
462 			break;
463 		}
464 		if ((first = RB_RIGHT(curr, rb_entry)) != NULL &&
465 		    iommu_gas_match_one(a, curr->end, first->first,
466 		    0, addr)) {
467 			RB_INSERT_NEXT(iommu_gas_entries_tree,
468 			    &domain->rb_root, curr, a->entry);
469 			return (0);
470 		}
471 	}
472 
473 	/*
474 	 * To resume the search at the start of the upper region, first climb to
475 	 * the nearest ancestor that spans highaddr.  Then find the last entry
476 	 * before highaddr that could abut a big-enough range.
477 	 */
478 	addr = a->common->highaddr;
479 	while (curr != NULL && curr->last < addr)
480 		curr = RB_PARENT(curr, rb_entry);
481 	first = NULL;
482 	while (curr != NULL && curr->free_down >= min_free) {
483 		if (addr < curr->end)
484 			curr = RB_LEFT(curr, rb_entry);
485 		else {
486 			first = curr;
487 			curr = RB_RIGHT(curr, rb_entry);
488 		}
489 	}
490 
491 	/*
492 	 * Walk the remaining big-enough ranges until one satisfies alignment
493 	 * requirements.
494 	 */
495 	for (curr = first; curr != NULL;
496 	    curr = iommu_gas_next(curr, min_free)) {
497 		if ((first = RB_LEFT(curr, rb_entry)) != NULL &&
498 		    iommu_gas_match_one(a, first->last, curr->start,
499 		    addr + 1, domain->end - 1)) {
500 			RB_INSERT_PREV(iommu_gas_entries_tree,
501 			    &domain->rb_root, curr, a->entry);
502 			return (0);
503 		}
504 		if ((first = RB_RIGHT(curr, rb_entry)) != NULL &&
505 		    iommu_gas_match_one(a, curr->end, first->first,
506 		    addr + 1, domain->end - 1)) {
507 			RB_INSERT_NEXT(iommu_gas_entries_tree,
508 			    &domain->rb_root, curr, a->entry);
509 			return (0);
510 		}
511 	}
512 
513 	return (ENOMEM);
514 }
515 
516 static int
517 iommu_gas_alloc_region(struct iommu_domain *domain, struct iommu_map_entry *entry,
518     u_int flags)
519 {
520 	struct iommu_map_entry *next, *prev;
521 
522 	IOMMU_DOMAIN_ASSERT_LOCKED(domain);
523 
524 	if ((entry->start & IOMMU_PAGE_MASK) != 0 ||
525 	    (entry->end & IOMMU_PAGE_MASK) != 0)
526 		return (EINVAL);
527 	if (entry->start >= entry->end)
528 		return (EINVAL);
529 	if (entry->end >= domain->end)
530 		return (EINVAL);
531 
532 	entry->flags |= IOMMU_MAP_ENTRY_FAKE;
533 	next = RB_NFIND(iommu_gas_entries_tree, &domain->rb_root, entry);
534 	KASSERT(next != NULL, ("next must be non-null %p %jx", domain,
535 	    (uintmax_t)entry->start));
536 	prev = RB_PREV(iommu_gas_entries_tree, &domain->rb_root, next);
537 	/* prev could be NULL */
538 	entry->flags &= ~IOMMU_MAP_ENTRY_FAKE;
539 
540 	/*
541 	 * Adapt to broken BIOSes which specify overlapping RMRR
542 	 * entries.
543 	 *
544 	 * XXXKIB: this does not handle a case when prev or next
545 	 * entries are completely covered by the current one, which
546 	 * extends both ways.
547 	 */
548 	if (prev != NULL && prev->end > entry->start &&
549 	    (prev->flags & IOMMU_MAP_ENTRY_PLACE) == 0) {
550 		if ((flags & IOMMU_MF_RMRR) == 0 ||
551 		    (prev->flags & IOMMU_MAP_ENTRY_RMRR) == 0)
552 			return (EBUSY);
553 		entry->start = prev->end;
554 	}
555 	if (next->start < entry->end &&
556 	    (next->flags & IOMMU_MAP_ENTRY_PLACE) == 0) {
557 		if ((flags & IOMMU_MF_RMRR) == 0 ||
558 		    (next->flags & IOMMU_MAP_ENTRY_RMRR) == 0)
559 			return (EBUSY);
560 		entry->end = next->start;
561 	}
562 	if (entry->end == entry->start)
563 		return (0);
564 
565 	if (prev != NULL && prev->end > entry->start) {
566 		/* This assumes that prev is the placeholder entry. */
567 		iommu_gas_rb_remove(domain, prev);
568 		prev = NULL;
569 	}
570 	RB_INSERT_PREV(iommu_gas_entries_tree,
571 	    &domain->rb_root, next, entry);
572 	if (next->start < entry->end) {
573 		iommu_gas_rb_remove(domain, next);
574 		next = NULL;
575 	}
576 
577 	if ((flags & IOMMU_MF_RMRR) != 0)
578 		entry->flags = IOMMU_MAP_ENTRY_RMRR;
579 
580 #ifdef INVARIANTS
581 	struct iommu_map_entry *ip, *in;
582 	ip = RB_PREV(iommu_gas_entries_tree, &domain->rb_root, entry);
583 	in = RB_NEXT(iommu_gas_entries_tree, &domain->rb_root, entry);
584 	KASSERT(prev == NULL || ip == prev,
585 	    ("RMRR %p (%jx %jx) prev %p (%jx %jx) ins prev %p (%jx %jx)",
586 	    entry, entry->start, entry->end, prev,
587 	    prev == NULL ? 0 : prev->start, prev == NULL ? 0 : prev->end,
588 	    ip, ip == NULL ? 0 : ip->start, ip == NULL ? 0 : ip->end));
589 	KASSERT(next == NULL || in == next,
590 	    ("RMRR %p (%jx %jx) next %p (%jx %jx) ins next %p (%jx %jx)",
591 	    entry, entry->start, entry->end, next,
592 	    next == NULL ? 0 : next->start, next == NULL ? 0 : next->end,
593 	    in, in == NULL ? 0 : in->start, in == NULL ? 0 : in->end));
594 #endif
595 
596 	return (0);
597 }
598 
599 void
600 iommu_gas_free_space(struct iommu_map_entry *entry)
601 {
602 	struct iommu_domain *domain;
603 
604 	domain = entry->domain;
605 	KASSERT((entry->flags & (IOMMU_MAP_ENTRY_PLACE | IOMMU_MAP_ENTRY_RMRR |
606 	    IOMMU_MAP_ENTRY_MAP)) == IOMMU_MAP_ENTRY_MAP,
607 	    ("permanent entry %p %p", domain, entry));
608 
609 	IOMMU_DOMAIN_LOCK(domain);
610 	iommu_gas_rb_remove(domain, entry);
611 	entry->flags &= ~IOMMU_MAP_ENTRY_MAP;
612 #ifdef INVARIANTS
613 	if (iommu_check_free)
614 		iommu_gas_check_free(domain);
615 #endif
616 	IOMMU_DOMAIN_UNLOCK(domain);
617 }
618 
619 void
620 iommu_gas_free_region(struct iommu_map_entry *entry)
621 {
622 	struct iommu_domain *domain;
623 
624 	domain = entry->domain;
625 	KASSERT((entry->flags & (IOMMU_MAP_ENTRY_PLACE | IOMMU_MAP_ENTRY_RMRR |
626 	    IOMMU_MAP_ENTRY_MAP)) == IOMMU_MAP_ENTRY_RMRR,
627 	    ("non-RMRR entry %p %p", domain, entry));
628 
629 	IOMMU_DOMAIN_LOCK(domain);
630 	if (entry != domain->first_place &&
631 	    entry != domain->last_place)
632 		iommu_gas_rb_remove(domain, entry);
633 	entry->flags &= ~IOMMU_MAP_ENTRY_RMRR;
634 	IOMMU_DOMAIN_UNLOCK(domain);
635 }
636 
637 static struct iommu_map_entry *
638 iommu_gas_remove_clip_left(struct iommu_domain *domain, iommu_gaddr_t start,
639     iommu_gaddr_t end, struct iommu_map_entry **r)
640 {
641 	struct iommu_map_entry *entry, *res, fentry;
642 
643 	IOMMU_DOMAIN_ASSERT_LOCKED(domain);
644 	MPASS(start <= end);
645 	MPASS(end <= domain->end);
646 
647 	/*
648 	 * Find an entry which contains the supplied guest's address
649 	 * start, or the first entry after the start.  Since we
650 	 * asserted that start is below domain end, entry should
651 	 * exist.  Then clip it if needed.
652 	 */
653 	bzero(&fentry, sizeof(fentry));
654 	fentry.start = start + 1;
655 	fentry.end = start + 1;
656 	fentry.flags = IOMMU_MAP_ENTRY_FAKE;
657 	entry = RB_NFIND(iommu_gas_entries_tree, &domain->rb_root, &fentry);
658 
659 	if (entry->start >= start ||
660 	    (entry->flags & IOMMU_MAP_ENTRY_RMRR) != 0)
661 		return (entry);
662 
663 	res = *r;
664 	*r = NULL;
665 	*res = *entry;
666 	res->start = entry->end = start;
667 	RB_UPDATE_AUGMENT(entry, rb_entry);
668 	RB_INSERT_NEXT(iommu_gas_entries_tree,
669 	    &domain->rb_root, entry, res);
670 	return (res);
671 }
672 
673 static bool
674 iommu_gas_remove_clip_right(struct iommu_domain *domain,
675     iommu_gaddr_t end, struct iommu_map_entry *entry,
676     struct iommu_map_entry *r)
677 {
678 	if (entry->start >= end || (entry->flags & IOMMU_MAP_ENTRY_RMRR) != 0)
679 		return (false);
680 
681 	*r = *entry;
682 	r->end = entry->start = end;
683 	RB_UPDATE_AUGMENT(entry, rb_entry);
684 	RB_INSERT_PREV(iommu_gas_entries_tree,
685 	    &domain->rb_root, entry, r);
686 	return (true);
687 }
688 
689 static void
690 iommu_gas_remove_unmap(struct iommu_domain *domain,
691     struct iommu_map_entry *entry, struct iommu_map_entries_tailq *gcp)
692 {
693 	IOMMU_DOMAIN_ASSERT_LOCKED(domain);
694 
695 	if ((entry->flags & (IOMMU_MAP_ENTRY_UNMAPPED |
696 	    IOMMU_MAP_ENTRY_RMRR |
697 	    IOMMU_MAP_ENTRY_REMOVING)) != 0)
698 		return;
699 	MPASS((entry->flags & IOMMU_MAP_ENTRY_PLACE) == 0);
700 	entry->flags |= IOMMU_MAP_ENTRY_REMOVING;
701 	TAILQ_INSERT_TAIL(gcp, entry, dmamap_link);
702 }
703 
704 static void
705 iommu_gas_remove_locked(struct iommu_domain *domain,
706     iommu_gaddr_t start, iommu_gaddr_t size,
707     struct iommu_map_entries_tailq *gc,
708     struct iommu_map_entry **r1, struct iommu_map_entry **r2)
709 {
710 	struct iommu_map_entry *entry, *nentry;
711 	iommu_gaddr_t end;
712 
713 	IOMMU_DOMAIN_ASSERT_LOCKED(domain);
714 
715 	end = start + size;
716 
717 	nentry = iommu_gas_remove_clip_left(domain, start, end, r1);
718 	RB_FOREACH_FROM(entry, iommu_gas_entries_tree, nentry) {
719 		if (entry->start >= end)
720 			break;
721 		KASSERT(start <= entry->start,
722 		    ("iommu_gas_remove entry (%#jx, %#jx) start %#jx",
723 		    entry->start, entry->end, start));
724 		iommu_gas_remove_unmap(domain, entry, gc);
725 	}
726 	if (iommu_gas_remove_clip_right(domain, end, entry, *r2)) {
727 		iommu_gas_remove_unmap(domain, *r2, gc);
728 		*r2 = NULL;
729 	}
730 
731 #ifdef INVARIANTS
732 	RB_FOREACH(entry, iommu_gas_entries_tree, &domain->rb_root) {
733 		if ((entry->flags & (IOMMU_MAP_ENTRY_RMRR |
734 		    IOMMU_MAP_ENTRY_PLACE)) != 0)
735 			continue;
736 		KASSERT(entry->end <= start || entry->start >= end,
737 		    ("iommu_gas_remove leftover entry (%#jx, %#jx) range "
738 		    "(%#jx, %#jx)",
739 		    entry->start, entry->end, start, end));
740 	}
741 #endif
742 }
743 
744 static void
745 iommu_gas_remove_init(struct iommu_domain *domain,
746     struct iommu_map_entries_tailq *gc, struct iommu_map_entry **r1,
747     struct iommu_map_entry **r2)
748 {
749 	TAILQ_INIT(gc);
750 	*r1 = iommu_gas_alloc_entry(domain, IOMMU_PGF_WAITOK);
751 	*r2 = iommu_gas_alloc_entry(domain, IOMMU_PGF_WAITOK);
752 }
753 
754 static void
755 iommu_gas_remove_cleanup(struct iommu_domain *domain,
756     struct iommu_map_entries_tailq *gc, struct iommu_map_entry **r1,
757     struct iommu_map_entry **r2)
758 {
759 	if (*r1 != NULL) {
760 		iommu_gas_free_entry(*r1);
761 		*r1 = NULL;
762 	}
763 	if (*r2 != NULL) {
764 		iommu_gas_free_entry(*r2);
765 		*r2 = NULL;
766 	}
767 	iommu_domain_unload(domain, gc, true);
768 }
769 
770 /*
771  * Remove specified range from the GAS of the domain.  Note that the
772  * removal is not guaranteed to occur upon the function return, it
773  * might be finalized some time after, when hardware reports that
774  * (queued) IOTLB invalidation was performed.
775  */
776 void
777 iommu_gas_remove(struct iommu_domain *domain, iommu_gaddr_t start,
778     iommu_gaddr_t size)
779 {
780 	struct iommu_map_entry *r1, *r2;
781 	struct iommu_map_entries_tailq gc;
782 
783 	iommu_gas_remove_init(domain, &gc, &r1, &r2);
784 	IOMMU_DOMAIN_LOCK(domain);
785 	iommu_gas_remove_locked(domain, start, size, &gc, &r1, &r2);
786 	IOMMU_DOMAIN_UNLOCK(domain);
787 	iommu_gas_remove_cleanup(domain, &gc, &r1, &r2);
788 }
789 
790 int
791 iommu_gas_map(struct iommu_domain *domain,
792     const struct bus_dma_tag_common *common, iommu_gaddr_t size, int offset,
793     u_int eflags, u_int flags, vm_page_t *ma, struct iommu_map_entry **res)
794 {
795 	struct iommu_gas_match_args a;
796 	struct iommu_map_entry *entry;
797 	int error;
798 
799 	KASSERT((flags & ~(IOMMU_MF_CANWAIT | IOMMU_MF_CANSPLIT)) == 0,
800 	    ("invalid flags 0x%x", flags));
801 
802 	a.size = size;
803 	a.offset = offset;
804 	a.common = common;
805 	a.gas_flags = flags;
806 	entry = iommu_gas_alloc_entry(domain,
807 	    (flags & IOMMU_MF_CANWAIT) != 0 ? IOMMU_PGF_WAITOK : 0);
808 	if (entry == NULL)
809 		return (ENOMEM);
810 	a.entry = entry;
811 	IOMMU_DOMAIN_LOCK(domain);
812 	error = iommu_gas_find_space(domain, &a);
813 	if (error == ENOMEM) {
814 		IOMMU_DOMAIN_UNLOCK(domain);
815 		iommu_gas_free_entry(entry);
816 		return (error);
817 	}
818 #ifdef INVARIANTS
819 	if (iommu_check_free)
820 		iommu_gas_check_free(domain);
821 #endif
822 	KASSERT(error == 0,
823 	    ("unexpected error %d from iommu_gas_find_entry", error));
824 	KASSERT(entry->end < domain->end, ("allocated GPA %jx, max GPA %jx",
825 	    (uintmax_t)entry->end, (uintmax_t)domain->end));
826 	entry->flags |= eflags;
827 	IOMMU_DOMAIN_UNLOCK(domain);
828 
829 	error = domain->ops->map(domain, entry->start,
830 	    entry->end - entry->start, ma, eflags,
831 	    ((flags & IOMMU_MF_CANWAIT) != 0 ? IOMMU_PGF_WAITOK : 0));
832 	if (error == ENOMEM) {
833 		iommu_domain_unload_entry(entry, true,
834 		    (flags & IOMMU_MF_CANWAIT) != 0);
835 		return (error);
836 	}
837 	KASSERT(error == 0,
838 	    ("unexpected error %d from domain_map_buf", error));
839 
840 	*res = entry;
841 	return (0);
842 }
843 
844 int
845 iommu_gas_map_region(struct iommu_domain *domain, struct iommu_map_entry *entry,
846     u_int eflags, u_int flags, vm_page_t *ma)
847 {
848 	iommu_gaddr_t start;
849 	int error;
850 
851 	KASSERT(entry->domain == domain,
852 	    ("mismatched domain %p entry %p entry->domain %p", domain,
853 	    entry, entry->domain));
854 	KASSERT(entry->flags == 0, ("used RMRR entry %p %p %x", domain,
855 	    entry, entry->flags));
856 	KASSERT((flags & ~(IOMMU_MF_CANWAIT | IOMMU_MF_RMRR)) == 0,
857 	    ("invalid flags 0x%x", flags));
858 
859 	start = entry->start;
860 	IOMMU_DOMAIN_LOCK(domain);
861 	error = iommu_gas_alloc_region(domain, entry, flags);
862 	if (error != 0) {
863 		IOMMU_DOMAIN_UNLOCK(domain);
864 		return (error);
865 	}
866 	entry->flags |= eflags;
867 	IOMMU_DOMAIN_UNLOCK(domain);
868 	if (entry->end == entry->start)
869 		return (0);
870 
871 	error = domain->ops->map(domain, entry->start,
872 	    entry->end - entry->start, ma + OFF_TO_IDX(start - entry->start),
873 	    eflags, ((flags & IOMMU_MF_CANWAIT) != 0 ? IOMMU_PGF_WAITOK : 0));
874 	if (error == ENOMEM) {
875 		iommu_domain_unload_entry(entry, false,
876 		    (flags & IOMMU_MF_CANWAIT) != 0);
877 		return (error);
878 	}
879 	KASSERT(error == 0,
880 	    ("unexpected error %d from domain_map_buf", error));
881 
882 	return (0);
883 }
884 
885 static int
886 iommu_gas_reserve_region_locked(struct iommu_domain *domain,
887     iommu_gaddr_t start, iommu_gaddr_t end, struct iommu_map_entry *entry)
888 {
889 	int error;
890 
891 	IOMMU_DOMAIN_ASSERT_LOCKED(domain);
892 
893 	entry->start = start;
894 	entry->end = end;
895 	error = iommu_gas_alloc_region(domain, entry, IOMMU_MF_CANWAIT);
896 	if (error == 0)
897 		entry->flags |= IOMMU_MAP_ENTRY_UNMAPPED;
898 	return (error);
899 }
900 
901 int
902 iommu_gas_reserve_region(struct iommu_domain *domain, iommu_gaddr_t start,
903     iommu_gaddr_t end, struct iommu_map_entry **entry0)
904 {
905 	struct iommu_map_entry *entry;
906 	int error;
907 
908 	entry = iommu_gas_alloc_entry(domain, IOMMU_PGF_WAITOK);
909 	IOMMU_DOMAIN_LOCK(domain);
910 	error = iommu_gas_reserve_region_locked(domain, start, end, entry);
911 	IOMMU_DOMAIN_UNLOCK(domain);
912 	if (error != 0)
913 		iommu_gas_free_entry(entry);
914 	else if (entry0 != NULL)
915 		*entry0 = entry;
916 	return (error);
917 }
918 
919 /*
920  * As in iommu_gas_reserve_region, reserve [start, end), but allow for existing
921  * entries.
922  */
923 int
924 iommu_gas_reserve_region_extend(struct iommu_domain *domain,
925     iommu_gaddr_t start, iommu_gaddr_t end)
926 {
927 	struct iommu_map_entry *entry, *next, *prev, key = {};
928 	iommu_gaddr_t entry_start, entry_end;
929 	int error;
930 
931 	error = 0;
932 	entry = NULL;
933 	end = ummin(end, domain->end);
934 	while (start < end) {
935 		/* Preallocate an entry. */
936 		if (entry == NULL)
937 			entry = iommu_gas_alloc_entry(domain,
938 			    IOMMU_PGF_WAITOK);
939 		/* Calculate the free region from here to the next entry. */
940 		key.start = key.end = start;
941 		IOMMU_DOMAIN_LOCK(domain);
942 		next = RB_NFIND(iommu_gas_entries_tree, &domain->rb_root, &key);
943 		KASSERT(next != NULL, ("domain %p with end %#jx has no entry "
944 		    "after %#jx", domain, (uintmax_t)domain->end,
945 		    (uintmax_t)start));
946 		entry_end = ummin(end, next->start);
947 		prev = RB_PREV(iommu_gas_entries_tree, &domain->rb_root, next);
948 		if (prev != NULL)
949 			entry_start = ummax(start, prev->end);
950 		else
951 			entry_start = start;
952 		start = next->end;
953 		/* Reserve the region if non-empty. */
954 		if (entry_start != entry_end) {
955 			error = iommu_gas_reserve_region_locked(domain,
956 			    entry_start, entry_end, entry);
957 			if (error != 0) {
958 				IOMMU_DOMAIN_UNLOCK(domain);
959 				break;
960 			}
961 			entry = NULL;
962 		}
963 		IOMMU_DOMAIN_UNLOCK(domain);
964 	}
965 	/* Release a preallocated entry if it was not used. */
966 	if (entry != NULL)
967 		iommu_gas_free_entry(entry);
968 	return (error);
969 }
970 
971 void
972 iommu_unmap_msi(struct iommu_ctx *ctx)
973 {
974 	struct iommu_map_entry *entry;
975 	struct iommu_domain *domain;
976 
977 	domain = ctx->domain;
978 	entry = domain->msi_entry;
979 	if (entry == NULL)
980 		return;
981 
982 	domain->ops->unmap(domain, entry->start, entry->end -
983 	    entry->start, IOMMU_PGF_WAITOK);
984 
985 	iommu_gas_free_space(entry);
986 
987 	iommu_gas_free_entry(entry);
988 
989 	domain->msi_entry = NULL;
990 	domain->msi_base = 0;
991 	domain->msi_phys = 0;
992 }
993 
994 int
995 iommu_map_msi(struct iommu_ctx *ctx, iommu_gaddr_t size, int offset,
996     u_int eflags, u_int flags, vm_page_t *ma)
997 {
998 	struct iommu_domain *domain;
999 	struct iommu_map_entry *entry;
1000 	int error;
1001 
1002 	error = 0;
1003 	domain = ctx->domain;
1004 
1005 	/* Check if there is already an MSI page allocated */
1006 	IOMMU_DOMAIN_LOCK(domain);
1007 	entry = domain->msi_entry;
1008 	IOMMU_DOMAIN_UNLOCK(domain);
1009 
1010 	if (entry == NULL) {
1011 		error = iommu_gas_map(domain, &ctx->tag->common, size, offset,
1012 		    eflags, flags, ma, &entry);
1013 		IOMMU_DOMAIN_LOCK(domain);
1014 		if (error == 0) {
1015 			if (domain->msi_entry == NULL) {
1016 				MPASS(domain->msi_base == 0);
1017 				MPASS(domain->msi_phys == 0);
1018 
1019 				domain->msi_entry = entry;
1020 				domain->msi_base = entry->start;
1021 				domain->msi_phys = VM_PAGE_TO_PHYS(ma[0]);
1022 			} else {
1023 				/*
1024 				 * We lost the race and already have an
1025 				 * MSI page allocated. Free the unneeded entry.
1026 				 */
1027 				iommu_gas_free_entry(entry);
1028 			}
1029 		} else if (domain->msi_entry != NULL) {
1030 			/*
1031 			 * The allocation failed, but another succeeded.
1032 			 * Return success as there is a valid MSI page.
1033 			 */
1034 			error = 0;
1035 		}
1036 		IOMMU_DOMAIN_UNLOCK(domain);
1037 	}
1038 
1039 	return (error);
1040 }
1041 
1042 void
1043 iommu_translate_msi(struct iommu_domain *domain, uint64_t *addr)
1044 {
1045 
1046 	*addr = (*addr - domain->msi_phys) + domain->msi_base;
1047 
1048 	KASSERT(*addr >= domain->msi_entry->start,
1049 	    ("%s: Address is below the MSI entry start address (%jx < %jx)",
1050 	    __func__, (uintmax_t)*addr, (uintmax_t)domain->msi_entry->start));
1051 
1052 	KASSERT(*addr + sizeof(*addr) <= domain->msi_entry->end,
1053 	    ("%s: Address is above the MSI entry end address (%jx < %jx)",
1054 	    __func__, (uintmax_t)*addr, (uintmax_t)domain->msi_entry->end));
1055 }
1056 
1057 SYSCTL_NODE(_hw, OID_AUTO, iommu, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "");
1058 
1059 #ifdef INVARIANTS
1060 SYSCTL_INT(_hw_iommu, OID_AUTO, check_free, CTLFLAG_RWTUN,
1061     &iommu_check_free, 0,
1062     "Check the GPA RBtree for free_down and free_after validity");
1063 #endif
1064 
1065 #include "opt_ddb.h"
1066 #ifdef DDB
1067 
1068 #include <ddb/ddb.h>
1069 
1070 static void
1071 iommu_debug_dump_gas(struct iommu_domain *domain)
1072 {
1073 	struct iommu_map_entry *entry;
1074 
1075 	db_printf("iommu_domain %p tree %p iommu %p fl %#x\n", domain,
1076 	    &domain->rb_root, domain->iommu, domain->flags);
1077 	db_printf("iommu_domain %p tree %p\n", domain, &domain->rb_root);
1078 	RB_FOREACH(entry, iommu_gas_entries_tree, &domain->rb_root) {
1079 		db_printf(
1080 	    "  e %p [%#jx %#jx] fl %#x first %#jx last %#jx free_down %#jx",
1081 		    entry, (uintmax_t)entry->start, (uintmax_t)entry->end,
1082 		    entry->flags,
1083 		    (uintmax_t)entry->first, (uintmax_t)entry->last,
1084 		    (uintmax_t)entry->free_down);
1085 		if (entry == domain->start_gap)
1086 			db_printf(" start_gap");
1087 		if (entry == domain->first_place)
1088 			db_printf(" first_place");
1089 		if (entry == domain->last_place)
1090 			db_printf(" last_place");
1091 		db_printf("\n");
1092 	}
1093 }
1094 
1095 DB_SHOW_COMMAND(iommu_domain, iommu_domain_show)
1096 {
1097 	struct iommu_domain *domain;
1098 
1099 	if (!have_addr) {
1100 		db_printf("show iommu_domain addr\n");
1101 		return;
1102 	}
1103 
1104 	domain = (void *)addr;
1105 	iommu_debug_dump_gas(domain);
1106 }
1107 
1108 #endif
1109