xref: /freebsd/sys/x86/iommu/intel_ctx.c (revision 59f5f100b774de8824fb2fc1a8a11a93bbc2dafd)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2013 The FreeBSD Foundation
5  *
6  * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
7  * under sponsorship from the FreeBSD Foundation.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/malloc.h>
34 #include <sys/bus.h>
35 #include <sys/interrupt.h>
36 #include <sys/kernel.h>
37 #include <sys/ktr.h>
38 #include <sys/limits.h>
39 #include <sys/lock.h>
40 #include <sys/memdesc.h>
41 #include <sys/mutex.h>
42 #include <sys/proc.h>
43 #include <sys/rwlock.h>
44 #include <sys/rman.h>
45 #include <sys/sysctl.h>
46 #include <sys/taskqueue.h>
47 #include <sys/tree.h>
48 #include <sys/uio.h>
49 #include <sys/vmem.h>
50 #include <vm/vm.h>
51 #include <vm/vm_extern.h>
52 #include <vm/vm_kern.h>
53 #include <vm/vm_object.h>
54 #include <vm/vm_page.h>
55 #include <vm/vm_pager.h>
56 #include <vm/vm_map.h>
57 #include <contrib/dev/acpica/include/acpi.h>
58 #include <contrib/dev/acpica/include/accommon.h>
59 #include <dev/pci/pcireg.h>
60 #include <dev/pci/pcivar.h>
61 #include <machine/atomic.h>
62 #include <machine/bus.h>
63 #include <machine/md_var.h>
64 #include <machine/specialreg.h>
65 #include <x86/include/busdma_impl.h>
66 #include <dev/iommu/busdma_iommu.h>
67 #include <x86/iommu/intel_reg.h>
68 #include <x86/iommu/x86_iommu.h>
69 #include <x86/iommu/intel_dmar.h>
70 
/*
 * malloc(9) types used by this file: M_DMAR_CTX for context structures
 * and their busdma tags, M_DMAR_DOMAIN for domain structures.
 */
static MALLOC_DEFINE(M_DMAR_CTX, "dmar_ctx", "Intel DMAR Context");
static MALLOC_DEFINE(M_DMAR_DOMAIN, "dmar_dom", "Intel DMAR Domain");

/* Forward declarations of helpers that are defined later in this file. */
static void dmar_unref_domain_locked(struct dmar_unit *dmar,
    struct dmar_domain *domain);
static void dmar_domain_destroy(struct dmar_domain *domain);

static void dmar_free_ctx_locked(struct dmar_unit *dmar, struct dmar_ctx *ctx);
static void dmar_free_ctx(struct dmar_ctx *ctx);
80 
81 static void
82 dmar_ensure_ctx_page(struct dmar_unit *dmar, int bus)
83 {
84 	struct sf_buf *sf;
85 	dmar_root_entry_t *re;
86 	vm_page_t ctxm;
87 
88 	/*
89 	 * Allocated context page must be linked.
90 	 */
91 	ctxm = iommu_pgalloc(dmar->ctx_obj, 1 + bus, IOMMU_PGF_NOALLOC);
92 	if (ctxm != NULL)
93 		return;
94 
95 	/*
96 	 * Page not present, allocate and link.  Note that other
97 	 * thread might execute this sequence in parallel.  This
98 	 * should be safe, because the context entries written by both
99 	 * threads are equal.
100 	 */
101 	TD_PREP_PINNED_ASSERT;
102 	ctxm = iommu_pgalloc(dmar->ctx_obj, 1 + bus, IOMMU_PGF_ZERO |
103 	    IOMMU_PGF_WAITOK);
104 	re = iommu_map_pgtbl(dmar->ctx_obj, 0, IOMMU_PGF_NOALLOC, &sf);
105 	re += bus;
106 	dmar_pte_store(&re->r1, DMAR_ROOT_R1_P | (DMAR_ROOT_R1_CTP_MASK &
107 	    VM_PAGE_TO_PHYS(ctxm)));
108 	dmar_flush_root_to_ram(dmar, re);
109 	iommu_unmap_pgtbl(sf);
110 	TD_PINNED_ASSERT;
111 }
112 
113 static dmar_ctx_entry_t *
114 dmar_map_ctx_entry(struct dmar_ctx *ctx, struct sf_buf **sfp)
115 {
116 	struct dmar_unit *dmar;
117 	dmar_ctx_entry_t *ctxp;
118 
119 	dmar = CTX2DMAR(ctx);
120 
121 	ctxp = iommu_map_pgtbl(dmar->ctx_obj, 1 + PCI_RID2BUS(ctx->context.rid),
122 	    IOMMU_PGF_NOALLOC | IOMMU_PGF_WAITOK, sfp);
123 	ctxp += ctx->context.rid & 0xff;
124 	return (ctxp);
125 }
126 
127 static void
128 ctx_id_entry_init_one(dmar_ctx_entry_t *ctxp, struct dmar_domain *domain,
129     vm_page_t ctx_root)
130 {
131 	/*
132 	 * For update due to move, the store is not atomic.  It is
133 	 * possible that DMAR read upper doubleword, while low
134 	 * doubleword is not yet updated.  The domain id is stored in
135 	 * the upper doubleword, while the table pointer in the lower.
136 	 *
137 	 * There is no good solution, for the same reason it is wrong
138 	 * to clear P bit in the ctx entry for update.
139 	 */
140 	dmar_pte_store1(&ctxp->ctx2, DMAR_CTX2_DID(domain->domain) |
141 	    domain->awlvl);
142 	if (ctx_root == NULL) {
143 		dmar_pte_store1(&ctxp->ctx1, DMAR_CTX1_T_PASS | DMAR_CTX1_P);
144 	} else {
145 		dmar_pte_store1(&ctxp->ctx1, DMAR_CTX1_T_UNTR |
146 		    (DMAR_CTX1_ASR_MASK & VM_PAGE_TO_PHYS(ctx_root)) |
147 		    DMAR_CTX1_P);
148 	}
149 }
150 
151 static void
152 ctx_id_entry_init(struct dmar_ctx *ctx, dmar_ctx_entry_t *ctxp, bool move,
153     int busno)
154 {
155 	struct dmar_unit *unit;
156 	struct dmar_domain *domain;
157 	vm_page_t ctx_root;
158 	int i;
159 
160 	domain = CTX2DOM(ctx);
161 	unit = DOM2DMAR(domain);
162 	KASSERT(move || (ctxp->ctx1 == 0 && ctxp->ctx2 == 0),
163 	    ("dmar%d: initialized ctx entry %d:%d:%d 0x%jx 0x%jx",
164 	    unit->iommu.unit, busno, pci_get_slot(ctx->context.tag->owner),
165 	    pci_get_function(ctx->context.tag->owner),
166 	    ctxp->ctx1, ctxp->ctx2));
167 
168 	if ((domain->iodom.flags & IOMMU_DOMAIN_IDMAP) != 0 &&
169 	    (unit->hw_ecap & DMAR_ECAP_PT) != 0) {
170 		KASSERT(domain->pgtbl_obj == NULL,
171 		    ("ctx %p non-null pgtbl_obj", ctx));
172 		ctx_root = NULL;
173 	} else {
174 		ctx_root = iommu_pgalloc(domain->pgtbl_obj, 0,
175 		    IOMMU_PGF_NOALLOC);
176 	}
177 
178 	if (iommu_is_buswide_ctx(DMAR2IOMMU(unit), busno)) {
179 		MPASS(!move);
180 		for (i = 0; i <= PCI_BUSMAX; i++) {
181 			ctx_id_entry_init_one(&ctxp[i], domain, ctx_root);
182 		}
183 	} else {
184 		ctx_id_entry_init_one(ctxp, domain, ctx_root);
185 	}
186 	dmar_flush_ctx_to_ram(unit, ctxp);
187 }
188 
189 static int
190 dmar_flush_for_ctx_entry(struct dmar_unit *dmar, bool force)
191 {
192 	int error;
193 
194 	/*
195 	 * If dmar declares Caching Mode as Set, follow 11.5 "Caching
196 	 * Mode Consideration" and do the (global) invalidation of the
197 	 * negative TLB entries.
198 	 */
199 	if ((dmar->hw_cap & DMAR_CAP_CM) == 0 && !force)
200 		return (0);
201 	if (dmar->qi_enabled) {
202 		dmar_qi_invalidate_ctx_glob_locked(dmar);
203 		if ((dmar->hw_ecap & DMAR_ECAP_DI) != 0 || force)
204 			dmar_qi_invalidate_iotlb_glob_locked(dmar);
205 		return (0);
206 	}
207 	error = dmar_inv_ctx_glob(dmar);
208 	if (error == 0 && ((dmar->hw_ecap & DMAR_ECAP_DI) != 0 || force))
209 		error = dmar_inv_iotlb_glob(dmar);
210 	return (error);
211 }
212 
213 static int
214 domain_init_rmrr(struct dmar_domain *domain, device_t dev, int bus,
215     int slot, int func, int dev_domain, int dev_busno,
216     const void *dev_path, int dev_path_len)
217 {
218 	struct iommu_map_entries_tailq rmrr_entries;
219 	struct iommu_map_entry *entry, *entry1;
220 	vm_page_t *ma;
221 	iommu_gaddr_t start, end;
222 	vm_pindex_t size, i;
223 	int error, error1;
224 
225 	if (!dmar_rmrr_enable)
226 		return (0);
227 
228 	error = 0;
229 	TAILQ_INIT(&rmrr_entries);
230 	dmar_dev_parse_rmrr(domain, dev_domain, dev_busno, dev_path,
231 	    dev_path_len, &rmrr_entries);
232 	TAILQ_FOREACH_SAFE(entry, &rmrr_entries, dmamap_link, entry1) {
233 		/*
234 		 * VT-d specification requires that the start of an
235 		 * RMRR entry is 4k-aligned.  Buggy BIOSes put
236 		 * anything into the start and end fields.  Truncate
237 		 * and round as neccesary.
238 		 *
239 		 * We also allow the overlapping RMRR entries, see
240 		 * iommu_gas_alloc_region().
241 		 */
242 		start = entry->start;
243 		end = entry->end;
244 		if (bootverbose)
245 			printf("dmar%d ctx pci%d:%d:%d RMRR [%#jx, %#jx]\n",
246 			    domain->iodom.iommu->unit, bus, slot, func,
247 			    (uintmax_t)start, (uintmax_t)end);
248 		entry->start = trunc_page(start);
249 		entry->end = round_page(end);
250 		if (entry->start == entry->end) {
251 			/* Workaround for some AMI (?) BIOSes */
252 			if (bootverbose) {
253 				if (dev != NULL)
254 					device_printf(dev, "");
255 				printf("pci%d:%d:%d ", bus, slot, func);
256 				printf("BIOS bug: dmar%d RMRR "
257 				    "region (%jx, %jx) corrected\n",
258 				    domain->iodom.iommu->unit, start, end);
259 			}
260 			entry->end += IOMMU_PAGE_SIZE * 0x20;
261 		}
262 		size = OFF_TO_IDX(entry->end - entry->start);
263 		ma = malloc(sizeof(vm_page_t) * size, M_TEMP, M_WAITOK);
264 		for (i = 0; i < size; i++) {
265 			ma[i] = vm_page_getfake(entry->start + PAGE_SIZE * i,
266 			    VM_MEMATTR_DEFAULT);
267 		}
268 		error1 = iommu_gas_map_region(DOM2IODOM(domain), entry,
269 		    IOMMU_MAP_ENTRY_READ | IOMMU_MAP_ENTRY_WRITE,
270 		    IOMMU_MF_CANWAIT | IOMMU_MF_RMRR, ma);
271 		/*
272 		 * Non-failed RMRR entries are owned by context rb
273 		 * tree.  Get rid of the failed entry, but do not stop
274 		 * the loop.  Rest of the parsed RMRR entries are
275 		 * loaded and removed on the context destruction.
276 		 */
277 		if (error1 == 0 && entry->end != entry->start) {
278 			IOMMU_LOCK(domain->iodom.iommu);
279 			domain->refs++; /* XXXKIB prevent free */
280 			domain->iodom.flags |= IOMMU_DOMAIN_RMRR;
281 			IOMMU_UNLOCK(domain->iodom.iommu);
282 		} else {
283 			if (error1 != 0) {
284 				if (dev != NULL)
285 					device_printf(dev, "");
286 				printf("pci%d:%d:%d ", bus, slot, func);
287 				printf(
288 			    "dmar%d failed to map RMRR region (%jx, %jx) %d\n",
289 				    domain->iodom.iommu->unit, start, end,
290 				    error1);
291 				error = error1;
292 			}
293 			TAILQ_REMOVE(&rmrr_entries, entry, dmamap_link);
294 			iommu_gas_free_entry(entry);
295 		}
296 		for (i = 0; i < size; i++)
297 			vm_page_putfake(ma[i]);
298 		free(ma, M_TEMP);
299 	}
300 	return (error);
301 }
302 
303 /*
304  * PCI memory address space is shared between memory-mapped devices (MMIO) and
305  * host memory (which may be remapped by an IOMMU).  Device accesses to an
306  * address within a memory aperture in a PCIe root port will be treated as
307  * peer-to-peer and not forwarded to an IOMMU.  To avoid this, reserve the
308  * address space of the root port's memory apertures in the address space used
309  * by the IOMMU for remapping.
310  */
311 static int
312 dmar_reserve_pci_regions(struct dmar_domain *domain, device_t dev)
313 {
314 	struct iommu_domain *iodom;
315 	device_t root;
316 	uint32_t val;
317 	uint64_t base, limit;
318 	int error;
319 
320 	iodom = DOM2IODOM(domain);
321 
322 	root = pci_find_pcie_root_port(dev);
323 	if (root == NULL)
324 		return (0);
325 
326 	/* Disable downstream memory */
327 	base = PCI_PPBMEMBASE(0, pci_read_config(root, PCIR_MEMBASE_1, 2));
328 	limit = PCI_PPBMEMLIMIT(0, pci_read_config(root, PCIR_MEMLIMIT_1, 2));
329 	error = iommu_gas_reserve_region_extend(iodom, base, limit + 1);
330 	if (bootverbose || error != 0)
331 		device_printf(dev, "DMAR reserve [%#jx-%#jx] (error %d)\n",
332 		    base, limit + 1, error);
333 	if (error != 0)
334 		return (error);
335 
336 	/* Disable downstream prefetchable memory */
337 	val = pci_read_config(root, PCIR_PMBASEL_1, 2);
338 	if (val != 0 || pci_read_config(root, PCIR_PMLIMITL_1, 2) != 0) {
339 		if ((val & PCIM_BRPM_MASK) == PCIM_BRPM_64) {
340 			base = PCI_PPBMEMBASE(
341 			    pci_read_config(root, PCIR_PMBASEH_1, 4),
342 			    val);
343 			limit = PCI_PPBMEMLIMIT(
344 			    pci_read_config(root, PCIR_PMLIMITH_1, 4),
345 			    pci_read_config(root, PCIR_PMLIMITL_1, 2));
346 		} else {
347 			base = PCI_PPBMEMBASE(0, val);
348 			limit = PCI_PPBMEMLIMIT(0,
349 			    pci_read_config(root, PCIR_PMLIMITL_1, 2));
350 		}
351 		error = iommu_gas_reserve_region_extend(iodom, base,
352 		    limit + 1);
353 		if (bootverbose || error != 0)
354 			device_printf(dev, "DMAR reserve [%#jx-%#jx] "
355 			    "(error %d)\n", base, limit + 1, error);
356 		if (error != 0)
357 			return (error);
358 	}
359 
360 	return (error);
361 }
362 
363 static struct dmar_domain *
364 dmar_domain_alloc(struct dmar_unit *dmar, bool id_mapped)
365 {
366 	struct iommu_domain *iodom;
367 	struct iommu_unit *unit;
368 	struct dmar_domain *domain;
369 	int error, id, mgaw;
370 
371 	id = alloc_unr(dmar->domids);
372 	if (id == -1)
373 		return (NULL);
374 	domain = malloc(sizeof(*domain), M_DMAR_DOMAIN, M_WAITOK | M_ZERO);
375 	iodom = DOM2IODOM(domain);
376 	unit = DMAR2IOMMU(dmar);
377 	domain->domain = id;
378 	LIST_INIT(&domain->contexts);
379 	iommu_domain_init(unit, iodom, &dmar_domain_map_ops);
380 
381 	domain->dmar = dmar;
382 
383 	/*
384 	 * For now, use the maximal usable physical address of the
385 	 * installed memory to calculate the mgaw on id_mapped domain.
386 	 * It is useful for the identity mapping, and less so for the
387 	 * virtualized bus address space.
388 	 */
389 	domain->iodom.end = id_mapped ? ptoa(Maxmem) : BUS_SPACE_MAXADDR;
390 	mgaw = dmar_maxaddr2mgaw(dmar, domain->iodom.end, !id_mapped);
391 	error = domain_set_agaw(domain, mgaw);
392 	if (error != 0)
393 		goto fail;
394 	if (!id_mapped)
395 		/* Use all supported address space for remapping. */
396 		domain->iodom.end = 1ULL << (domain->agaw - 1);
397 
398 	iommu_gas_init_domain(DOM2IODOM(domain));
399 
400 	if (id_mapped) {
401 		if ((dmar->hw_ecap & DMAR_ECAP_PT) == 0) {
402 			domain->pgtbl_obj = dmar_get_idmap_pgtbl(domain,
403 			    domain->iodom.end);
404 		}
405 		domain->iodom.flags |= IOMMU_DOMAIN_IDMAP;
406 	} else {
407 		error = dmar_domain_alloc_pgtbl(domain);
408 		if (error != 0)
409 			goto fail;
410 		/* Disable local apic region access */
411 		error = iommu_gas_reserve_region(iodom, 0xfee00000,
412 		    0xfeefffff + 1, &iodom->msi_entry);
413 		if (error != 0)
414 			goto fail;
415 	}
416 	return (domain);
417 
418 fail:
419 	dmar_domain_destroy(domain);
420 	return (NULL);
421 }
422 
423 static struct dmar_ctx *
424 dmar_ctx_alloc(struct dmar_domain *domain, uint16_t rid)
425 {
426 	struct dmar_ctx *ctx;
427 
428 	ctx = malloc(sizeof(*ctx), M_DMAR_CTX, M_WAITOK | M_ZERO);
429 	ctx->context.domain = DOM2IODOM(domain);
430 	ctx->context.tag = malloc(sizeof(struct bus_dma_tag_iommu),
431 	    M_DMAR_CTX, M_WAITOK | M_ZERO);
432 	ctx->context.rid = rid;
433 	ctx->refs = 1;
434 	return (ctx);
435 }
436 
437 static void
438 dmar_ctx_link(struct dmar_ctx *ctx)
439 {
440 	struct dmar_domain *domain;
441 
442 	domain = CTX2DOM(ctx);
443 	IOMMU_ASSERT_LOCKED(domain->iodom.iommu);
444 	KASSERT(domain->refs >= domain->ctx_cnt,
445 	    ("dom %p ref underflow %d %d", domain, domain->refs,
446 	    domain->ctx_cnt));
447 	domain->refs++;
448 	domain->ctx_cnt++;
449 	LIST_INSERT_HEAD(&domain->contexts, ctx, link);
450 }
451 
452 static void
453 dmar_ctx_unlink(struct dmar_ctx *ctx)
454 {
455 	struct dmar_domain *domain;
456 
457 	domain = CTX2DOM(ctx);
458 	IOMMU_ASSERT_LOCKED(domain->iodom.iommu);
459 	KASSERT(domain->refs > 0,
460 	    ("domain %p ctx dtr refs %d", domain, domain->refs));
461 	KASSERT(domain->ctx_cnt >= domain->refs,
462 	    ("domain %p ctx dtr refs %d ctx_cnt %d", domain,
463 	    domain->refs, domain->ctx_cnt));
464 	domain->refs--;
465 	domain->ctx_cnt--;
466 	LIST_REMOVE(ctx, link);
467 }
468 
469 static void
470 dmar_domain_destroy(struct dmar_domain *domain)
471 {
472 	struct iommu_domain *iodom;
473 	struct dmar_unit *dmar;
474 
475 	iodom = DOM2IODOM(domain);
476 
477 	KASSERT(TAILQ_EMPTY(&domain->iodom.unload_entries),
478 	    ("unfinished unloads %p", domain));
479 	KASSERT(LIST_EMPTY(&domain->contexts),
480 	    ("destroying dom %p with contexts", domain));
481 	KASSERT(domain->ctx_cnt == 0,
482 	    ("destroying dom %p with ctx_cnt %d", domain, domain->ctx_cnt));
483 	KASSERT(domain->refs == 0,
484 	    ("destroying dom %p with refs %d", domain, domain->refs));
485 	if ((domain->iodom.flags & IOMMU_DOMAIN_GAS_INITED) != 0) {
486 		DMAR_DOMAIN_LOCK(domain);
487 		iommu_gas_fini_domain(iodom);
488 		DMAR_DOMAIN_UNLOCK(domain);
489 	}
490 	if ((domain->iodom.flags & IOMMU_DOMAIN_PGTBL_INITED) != 0) {
491 		if (domain->pgtbl_obj != NULL)
492 			DMAR_DOMAIN_PGLOCK(domain);
493 		dmar_domain_free_pgtbl(domain);
494 	}
495 	iommu_domain_fini(iodom);
496 	dmar = DOM2DMAR(domain);
497 	free_unr(dmar->domids, domain->domain);
498 	free(domain, M_DMAR_DOMAIN);
499 }
500 
501 static struct dmar_ctx *
502 dmar_get_ctx_for_dev1(struct dmar_unit *dmar, device_t dev, uint16_t rid,
503     int dev_domain, int dev_busno, const void *dev_path, int dev_path_len,
504     bool id_mapped, bool rmrr_init)
505 {
506 	struct dmar_domain *domain, *domain1;
507 	struct dmar_ctx *ctx, *ctx1;
508 	struct iommu_unit *unit __diagused;
509 	dmar_ctx_entry_t *ctxp;
510 	struct sf_buf *sf;
511 	int bus, slot, func, error;
512 	bool enable;
513 
514 	if (dev != NULL) {
515 		bus = pci_get_bus(dev);
516 		slot = pci_get_slot(dev);
517 		func = pci_get_function(dev);
518 	} else {
519 		bus = PCI_RID2BUS(rid);
520 		slot = PCI_RID2SLOT(rid);
521 		func = PCI_RID2FUNC(rid);
522 	}
523 	enable = false;
524 	TD_PREP_PINNED_ASSERT;
525 	unit = DMAR2IOMMU(dmar);
526 	DMAR_LOCK(dmar);
527 	KASSERT(!iommu_is_buswide_ctx(unit, bus) || (slot == 0 && func == 0),
528 	    ("iommu%d pci%d:%d:%d get_ctx for buswide", dmar->iommu.unit, bus,
529 	    slot, func));
530 	ctx = dmar_find_ctx_locked(dmar, rid);
531 	error = 0;
532 	if (ctx == NULL) {
533 		/*
534 		 * Perform the allocations which require sleep or have
535 		 * higher chance to succeed if the sleep is allowed.
536 		 */
537 		DMAR_UNLOCK(dmar);
538 		dmar_ensure_ctx_page(dmar, PCI_RID2BUS(rid));
539 		domain1 = dmar_domain_alloc(dmar, id_mapped);
540 		if (domain1 == NULL) {
541 			TD_PINNED_ASSERT;
542 			return (NULL);
543 		}
544 		if (!id_mapped) {
545 			error = domain_init_rmrr(domain1, dev, bus,
546 			    slot, func, dev_domain, dev_busno, dev_path,
547 			    dev_path_len);
548 			if (error == 0 && dev != NULL)
549 				error = dmar_reserve_pci_regions(domain1, dev);
550 			if (error != 0) {
551 				dmar_domain_destroy(domain1);
552 				TD_PINNED_ASSERT;
553 				return (NULL);
554 			}
555 		}
556 		ctx1 = dmar_ctx_alloc(domain1, rid);
557 		ctxp = dmar_map_ctx_entry(ctx1, &sf);
558 		DMAR_LOCK(dmar);
559 
560 		/*
561 		 * Recheck the contexts, other thread might have
562 		 * already allocated needed one.
563 		 */
564 		ctx = dmar_find_ctx_locked(dmar, rid);
565 		if (ctx == NULL) {
566 			domain = domain1;
567 			ctx = ctx1;
568 			dmar_ctx_link(ctx);
569 			ctx->context.tag->owner = dev;
570 			iommu_device_tag_init(CTX2IOCTX(ctx), dev);
571 
572 			/*
573 			 * This is the first activated context for the
574 			 * DMAR unit.  Enable the translation after
575 			 * everything is set up.
576 			 */
577 			if (LIST_EMPTY(&dmar->domains))
578 				enable = true;
579 			LIST_INSERT_HEAD(&dmar->domains, domain, link);
580 			ctx_id_entry_init(ctx, ctxp, false, bus);
581 			if (dev != NULL) {
582 				device_printf(dev,
583 			    "dmar%d pci%d:%d:%d:%d rid %x domain %d mgaw %d "
584 				    "agaw %d %s-mapped\n",
585 				    dmar->iommu.unit, dmar->segment, bus, slot,
586 				    func, rid, domain->domain, domain->mgaw,
587 				    domain->agaw, id_mapped ? "id" : "re");
588 			}
589 			iommu_unmap_pgtbl(sf);
590 		} else {
591 			iommu_unmap_pgtbl(sf);
592 			dmar_domain_destroy(domain1);
593 			/* Nothing needs to be done to destroy ctx1. */
594 			free(ctx1, M_DMAR_CTX);
595 			domain = CTX2DOM(ctx);
596 			ctx->refs++; /* tag referenced us */
597 		}
598 	} else {
599 		domain = CTX2DOM(ctx);
600 		if (ctx->context.tag->owner == NULL)
601 			ctx->context.tag->owner = dev;
602 		ctx->refs++; /* tag referenced us */
603 	}
604 
605 	error = dmar_flush_for_ctx_entry(dmar, enable);
606 	if (error != 0) {
607 		dmar_free_ctx_locked(dmar, ctx);
608 		TD_PINNED_ASSERT;
609 		return (NULL);
610 	}
611 
612 	/*
613 	 * The dmar lock was potentially dropped between check for the
614 	 * empty context list and now.  Recheck the state of GCMD_TE
615 	 * to avoid unneeded command.
616 	 */
617 	if (enable && !rmrr_init && (dmar->hw_gcmd & DMAR_GCMD_TE) == 0) {
618 		error = dmar_disable_protected_regions(dmar);
619 		if (error != 0)
620 			printf("dmar%d: Failed to disable protected regions\n",
621 			    dmar->iommu.unit);
622 		error = dmar_enable_translation(dmar);
623 		if (error == 0) {
624 			if (bootverbose) {
625 				printf("dmar%d: enabled translation\n",
626 				    dmar->iommu.unit);
627 			}
628 		} else {
629 			printf("dmar%d: enabling translation failed, "
630 			    "error %d\n", dmar->iommu.unit, error);
631 			dmar_free_ctx_locked(dmar, ctx);
632 			TD_PINNED_ASSERT;
633 			return (NULL);
634 		}
635 	}
636 	DMAR_UNLOCK(dmar);
637 	TD_PINNED_ASSERT;
638 	return (ctx);
639 }
640 
641 struct dmar_ctx *
642 dmar_get_ctx_for_dev(struct dmar_unit *dmar, device_t dev, uint16_t rid,
643     bool id_mapped, bool rmrr_init)
644 {
645 	int dev_domain, dev_path_len, dev_busno;
646 
647 	dev_domain = pci_get_domain(dev);
648 	dev_path_len = dmar_dev_depth(dev);
649 	ACPI_DMAR_PCI_PATH dev_path[dev_path_len];
650 	dmar_dev_path(dev, &dev_busno, dev_path, dev_path_len);
651 	return (dmar_get_ctx_for_dev1(dmar, dev, rid, dev_domain, dev_busno,
652 	    dev_path, dev_path_len, id_mapped, rmrr_init));
653 }
654 
655 struct dmar_ctx *
656 dmar_get_ctx_for_devpath(struct dmar_unit *dmar, uint16_t rid,
657     int dev_domain, int dev_busno,
658     const void *dev_path, int dev_path_len,
659     bool id_mapped, bool rmrr_init)
660 {
661 
662 	return (dmar_get_ctx_for_dev1(dmar, NULL, rid, dev_domain, dev_busno,
663 	    dev_path, dev_path_len, id_mapped, rmrr_init));
664 }
665 
666 int
667 dmar_move_ctx_to_domain(struct dmar_domain *domain, struct dmar_ctx *ctx)
668 {
669 	struct dmar_unit *dmar;
670 	struct dmar_domain *old_domain;
671 	dmar_ctx_entry_t *ctxp;
672 	struct sf_buf *sf;
673 	int error;
674 
675 	dmar = domain->dmar;
676 	old_domain = CTX2DOM(ctx);
677 	if (domain == old_domain)
678 		return (0);
679 	KASSERT(old_domain->iodom.iommu == domain->iodom.iommu,
680 	    ("domain %p %u moving between dmars %u %u", domain,
681 	    domain->domain, old_domain->iodom.iommu->unit,
682 	    domain->iodom.iommu->unit));
683 	TD_PREP_PINNED_ASSERT;
684 
685 	ctxp = dmar_map_ctx_entry(ctx, &sf);
686 	DMAR_LOCK(dmar);
687 	dmar_ctx_unlink(ctx);
688 	ctx->context.domain = &domain->iodom;
689 	dmar_ctx_link(ctx);
690 	ctx_id_entry_init(ctx, ctxp, true, PCI_BUSMAX + 100);
691 	iommu_unmap_pgtbl(sf);
692 	error = dmar_flush_for_ctx_entry(dmar, true);
693 	/* If flush failed, rolling back would not work as well. */
694 	printf("dmar%d rid %x domain %d->%d %s-mapped\n",
695 	    dmar->iommu.unit, ctx->context.rid, old_domain->domain,
696 	    domain->domain, (domain->iodom.flags & IOMMU_DOMAIN_IDMAP) != 0 ?
697 	    "id" : "re");
698 	dmar_unref_domain_locked(dmar, old_domain);
699 	TD_PINNED_ASSERT;
700 	return (error);
701 }
702 
/*
 * Drop one reference on the domain.
 *
 * Must be called with the DMAR lock held; the lock is released on every
 * path before returning.  When other references remain, only the count
 * is decremented.  Otherwise the domain is removed from the unit's list,
 * its unload task is drained and the domain is destroyed.
 */
static void
dmar_unref_domain_locked(struct dmar_unit *dmar, struct dmar_domain *domain)
{

	DMAR_ASSERT_LOCKED(dmar);
	KASSERT(domain->refs >= 1,
	    ("dmar %d domain %p refs %u", dmar->iommu.unit, domain,
	    domain->refs));
	/* The caller's reference is in addition to those of the contexts. */
	KASSERT(domain->refs > domain->ctx_cnt,
	    ("dmar %d domain %p refs %d ctx_cnt %d", dmar->iommu.unit, domain,
	    domain->refs, domain->ctx_cnt));

	if (domain->refs > 1) {
		domain->refs--;
		DMAR_UNLOCK(dmar);
		return;
	}

	KASSERT((domain->iodom.flags & IOMMU_DOMAIN_RMRR) == 0,
	    ("lost ref on RMRR domain %p", domain));

	/* Last reference: unlink while still holding the lock. */
	LIST_REMOVE(domain, link);
	DMAR_UNLOCK(dmar);

	/* Wait for the unload task before the domain is destroyed. */
	taskqueue_drain(dmar->iommu.delayed_taskqueue,
	    &domain->iodom.unload_task);
	dmar_domain_destroy(domain);
}
731 
/*
 * Drop one reference on the context.
 *
 * Must be called with the DMAR lock held; the lock is released on every
 * path before returning.  When the last reference goes away, the
 * hardware context entry is cleared, the caches are invalidated, the
 * context and its tag are freed and the reference on the domain is
 * dropped.
 */
static void
dmar_free_ctx_locked(struct dmar_unit *dmar, struct dmar_ctx *ctx)
{
	struct sf_buf *sf;
	dmar_ctx_entry_t *ctxp;
	struct dmar_domain *domain;

	DMAR_ASSERT_LOCKED(dmar);
	KASSERT(ctx->refs >= 1,
	    ("dmar %p ctx %p refs %u", dmar, ctx, ctx->refs));

	/*
	 * If our reference is not last, only the dereference should
	 * be performed.
	 */
	if (ctx->refs > 1) {
		ctx->refs--;
		DMAR_UNLOCK(dmar);
		return;
	}

	KASSERT((ctx->context.flags & IOMMU_CTX_DISABLED) == 0,
	    ("lost ref on disabled ctx %p", ctx));

	/*
	 * Otherwise, the context entry must be cleared before the
	 * page table is destroyed.  The mapping of the context
	 * entries page could require sleep, unlock the dmar.
	 */
	DMAR_UNLOCK(dmar);
	TD_PREP_PINNED_ASSERT;
	ctxp = dmar_map_ctx_entry(ctx, &sf);
	DMAR_LOCK(dmar);
	KASSERT(ctx->refs >= 1,
	    ("dmar %p ctx %p refs %u", dmar, ctx, ctx->refs));

	/*
	 * Other thread might have referenced the context, in which
	 * case again only the dereference should be performed.
	 */
	if (ctx->refs > 1) {
		ctx->refs--;
		DMAR_UNLOCK(dmar);
		iommu_unmap_pgtbl(sf);
		TD_PINNED_ASSERT;
		return;
	}

	KASSERT((ctx->context.flags & IOMMU_CTX_DISABLED) == 0,
	    ("lost ref on disabled ctx %p", ctx));

	/*
	 * Clear the context pointer and flush the caches.
	 * XXXKIB: cannot do this if any RMRR entries are still present.
	 */
	dmar_pte_clear(&ctxp->ctx1);
	ctxp->ctx2 = 0;
	dmar_flush_ctx_to_ram(dmar, ctxp);
	/* Return values of the invalidations are not checked here. */
	dmar_inv_ctx_glob(dmar);
	if ((dmar->hw_ecap & DMAR_ECAP_DI) != 0) {
		if (dmar->qi_enabled)
			dmar_qi_invalidate_iotlb_glob_locked(dmar);
		else
			dmar_inv_iotlb_glob(dmar);
	}
	iommu_unmap_pgtbl(sf);
	/* Remember the domain, ctx is freed before it is unreferenced. */
	domain = CTX2DOM(ctx);
	dmar_ctx_unlink(ctx);
	free(ctx->context.tag, M_DMAR_CTX);
	free(ctx, M_DMAR_CTX);
	/* Releases the DMAR lock. */
	dmar_unref_domain_locked(dmar, domain);
	TD_PINNED_ASSERT;
}
805 
/*
 * Unlocked variant of dmar_free_ctx_locked(): take the DMAR lock of the
 * context's unit and let dmar_free_ctx_locked() drop the reference and
 * the lock.
 */
static void
dmar_free_ctx(struct dmar_ctx *ctx)
{
	struct dmar_unit *unit;

	unit = CTX2DMAR(ctx);
	DMAR_LOCK(unit);
	/* The callee releases the lock on every path. */
	dmar_free_ctx_locked(unit, ctx);
}
815 
816 /*
817  * Returns with the domain locked.
818  */
819 struct dmar_ctx *
820 dmar_find_ctx_locked(struct dmar_unit *dmar, uint16_t rid)
821 {
822 	struct dmar_domain *domain;
823 	struct dmar_ctx *ctx;
824 
825 	DMAR_ASSERT_LOCKED(dmar);
826 
827 	LIST_FOREACH(domain, &dmar->domains, link) {
828 		LIST_FOREACH(ctx, &domain->contexts, link) {
829 			if (ctx->context.rid == rid)
830 				return (ctx);
831 		}
832 	}
833 	return (NULL);
834 }
835 
836 /*
837  * If the given value for "free" is true, then the caller must not be using
838  * the entry's dmamap_link field.
839  */
840 void
841 dmar_domain_unload_entry(struct iommu_map_entry *entry, bool free,
842     bool cansleep)
843 {
844 	struct dmar_domain *domain;
845 	struct dmar_unit *unit;
846 
847 	domain = IODOM2DOM(entry->domain);
848 	unit = DOM2DMAR(domain);
849 
850 	/*
851 	 * If "free" is false, then the IOTLB invalidation must be performed
852 	 * synchronously.  Otherwise, the caller might free the entry before
853 	 * dmar_qi_task() is finished processing it.
854 	 */
855 	if (unit->qi_enabled) {
856 		if (free) {
857 			DMAR_LOCK(unit);
858 			iommu_qi_invalidate_locked(&domain->iodom, entry,
859 			    true);
860 			DMAR_UNLOCK(unit);
861 		} else {
862 			iommu_qi_invalidate_sync(&domain->iodom, entry->start,
863 			    entry->end - entry->start, cansleep);
864 			iommu_domain_free_entry(entry, false);
865 		}
866 	} else {
867 		dmar_flush_iotlb_sync(domain, entry->start, entry->end -
868 		    entry->start);
869 		iommu_domain_free_entry(entry, free);
870 	}
871 }
872 
873 static bool
874 dmar_domain_unload_emit_wait(struct dmar_domain *domain,
875     struct iommu_map_entry *entry)
876 {
877 
878 	if (TAILQ_NEXT(entry, dmamap_link) == NULL)
879 		return (true);
880 	return (domain->batch_no++ % iommu_qi_batch_coalesce == 0);
881 }
882 
883 void
884 dmar_domain_unload(struct iommu_domain *iodom,
885     struct iommu_map_entries_tailq *entries, bool cansleep)
886 {
887 	struct dmar_domain *domain;
888 	struct dmar_unit *unit;
889 	struct iommu_map_entry *entry, *entry1;
890 	int error __diagused;
891 
892 	domain = IODOM2DOM(iodom);
893 	unit = DOM2DMAR(domain);
894 
895 	TAILQ_FOREACH_SAFE(entry, entries, dmamap_link, entry1) {
896 		KASSERT((entry->flags & IOMMU_MAP_ENTRY_MAP) != 0,
897 		    ("not mapped entry %p %p", domain, entry));
898 		error = iodom->ops->unmap(iodom, entry,
899 		    cansleep ? IOMMU_PGF_WAITOK : 0);
900 		KASSERT(error == 0, ("unmap %p error %d", domain, error));
901 		if (!unit->qi_enabled) {
902 			dmar_flush_iotlb_sync(domain, entry->start,
903 			    entry->end - entry->start);
904 			TAILQ_REMOVE(entries, entry, dmamap_link);
905 			iommu_domain_free_entry(entry, true);
906 		}
907 	}
908 	if (TAILQ_EMPTY(entries))
909 		return;
910 
911 	KASSERT(unit->qi_enabled, ("loaded entry left"));
912 	DMAR_LOCK(unit);
913 	while ((entry = TAILQ_FIRST(entries)) != NULL) {
914 		TAILQ_REMOVE(entries, entry, dmamap_link);
915 		iommu_qi_invalidate_locked(&domain->iodom, entry,
916 		    dmar_domain_unload_emit_wait(domain, entry));
917 	}
918 	DMAR_UNLOCK(unit);
919 }
920 
921 struct iommu_ctx *
922 dmar_get_ctx(struct iommu_unit *iommu, device_t dev, uint16_t rid,
923     bool id_mapped, bool rmrr_init)
924 {
925 	struct dmar_unit *dmar;
926 	struct dmar_ctx *ret;
927 
928 	dmar = IOMMU2DMAR(iommu);
929 	ret = dmar_get_ctx_for_dev(dmar, dev, rid, id_mapped, rmrr_init);
930 	return (CTX2IOCTX(ret));
931 }
932 
/*
 * Generic-iommu entry point for releasing a context with the DMAR lock
 * held: convert both arguments to their DMAR types and call
 * dmar_free_ctx_locked(), which releases the lock.
 */
void
dmar_free_ctx_locked_method(struct iommu_unit *iommu,
    struct iommu_ctx *context)
{
	struct dmar_ctx *dctx;
	struct dmar_unit *unit;

	unit = IOMMU2DMAR(iommu);
	dctx = IOCTX2CTX(context);
	dmar_free_ctx_locked(unit, dctx);
}
944 
/*
 * Generic-iommu entry point for releasing a context without the DMAR
 * lock held: convert to the DMAR context and call dmar_free_ctx().
 */
void
dmar_free_ctx_method(struct iommu_ctx *context)
{
	struct dmar_ctx *dctx;

	dctx = IOCTX2CTX(context);
	dmar_free_ctx(dctx);
}
953