xref: /freebsd/sys/amd64/vmm/intel/vtd.c (revision d7d962ead0b6e5e8a39202d0590022082bf5bfb6)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  * $FreeBSD$
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include <sys/param.h>
35 #include <sys/kernel.h>
36 #include <sys/systm.h>
37 #include <sys/malloc.h>
38 
39 #include <vm/vm.h>
40 #include <vm/pmap.h>
41 
42 #include <dev/pci/pcireg.h>
43 
44 #include <machine/vmparam.h>
45 #include <contrib/dev/acpica/include/acpi.h>
46 
47 #include "io/iommu.h"
48 
49 /*
50  * Documented in the "Intel Virtualization Technology for Directed I/O",
51  * Architecture Spec, September 2008.
52  */
53 
/*
 * Bit 0 of a DRHD entry's Flags field (INCLUDE_PCI_ALL): evaluates to 1 when
 * the bit is set, 0 otherwise.
 */
#define VTD_DRHD_INCLUDE_PCI_ALL(Flags)  ((Flags) & 0x1)
55 
56 /* Section 10.4 "Register Descriptions" */
/*
 * Leading part of a remapping unit's memory-mapped register file.
 *
 * The structure is overlaid directly on the unit's registers, so the order
 * and width of the members determine the register offsets and must not be
 * changed.  All members are volatile because every access is a device
 * register access.
 */
struct vtdmap {
	volatile uint32_t	version;	/* 0x00 */
	volatile uint32_t	res0;		/* 0x04: reserved */
	volatile uint64_t	cap;		/* 0x08: see VTD_CAP_*() */
	volatile uint64_t	ext_cap;	/* 0x10: see VTD_ECAP_*() */
	volatile uint32_t	gcr;		/* 0x18: command, VTD_GCR_* */
	volatile uint32_t	gsr;		/* 0x1c: status, VTD_GSR_* */
	volatile uint64_t	rta;		/* 0x20: root table address */
	volatile uint64_t	ccr;		/* 0x28: context cmd, VTD_CCR_* */
};
67 
/* Field extractors for the 'cap' register. */
#define	VTD_CAP_SAGAW(cap)	(((cap) >> 8) & 0x1F)	/* supported AGAWs */
#define	VTD_CAP_ND(cap)		((cap) & 0x7)		/* number of domains */
#define	VTD_CAP_CM(cap)		(((cap) >> 7) & 0x1)	/* caching mode */
#define	VTD_CAP_SPS(cap)	(((cap) >> 34) & 0xF)	/* super page sizes */
#define	VTD_CAP_RWBF(cap)	(((cap) >> 4) & 0x1)	/* needs wbuf flush */

/* Field extractors for the 'ext_cap' register. */
#define	VTD_ECAP_DI(ecap)	(((ecap) >> 2) & 0x1)
#define	VTD_ECAP_COHERENCY(ecap) ((ecap) & 0x1)
/* Offset of the IOTLB registers, expressed in 16-byte units. */
#define	VTD_ECAP_IRO(ecap)	(((ecap) >> 8) & 0x3FF)

/* Command bits written to 'gcr'. */
#define	VTD_GCR_WBF		(1 << 27)
#define	VTD_GCR_SRTP		(1 << 30)
#define	VTD_GCR_TE		(1U << 31)

/* Status bits read back from 'gsr'; same positions as the commands above. */
#define	VTD_GSR_WBFS		(1 << 27)
#define	VTD_GSR_RTPS		(1 << 30)
#define	VTD_GSR_TES		(1U << 31)

/* Context command register ('ccr'). */
#define	VTD_CCR_ICC		(1UL << 63)	/* invalidate context cache */
#define	VTD_CCR_CIRG_GLOBAL	(1UL << 61)	/* global invalidation */

/* IOTLB invalidate register. */
#define	VTD_IIR_IVT		(1UL << 63)	/* invalidation IOTLB */
#define	VTD_IIR_IIRG_GLOBAL	(1ULL << 60)	/* global IOTLB invalidation */
#define	VTD_IIR_IIRG_DOMAIN	(2ULL << 60)	/* domain IOTLB invalidation */
#define	VTD_IIR_IIRG_PAGE	(3ULL << 60)	/* page IOTLB invalidation */
#define	VTD_IIR_DRAIN_READS	(1ULL << 49)	/* drain pending DMA reads */
#define	VTD_IIR_DRAIN_WRITES	(1ULL << 48)	/* drain pending DMA writes */
#define	VTD_IIR_DOMAIN_P	32

/* Root-table and context-table entry bits. */
#define	VTD_ROOT_PRESENT	0x1
#define	VTD_CTX_PRESENT		0x1
#define	VTD_CTX_TT_ALL		(1UL << 2)

/* Page table entry bits and the mask selecting the physical address. */
#define	VTD_PTE_RD		(1UL << 0)
#define	VTD_PTE_WR		(1UL << 1)
#define	VTD_PTE_SUPERPAGE	(1UL << 7)
#define	VTD_PTE_ADDR_M		(0x000FFFFFFFFFF000UL)

/*
 * Index of the first 64-bit word of a context entry inside a per-bus
 * context table: the low byte of the RID selects the entry and every entry
 * occupies two words.
 */
#define VTD_RID2IDX(rid)	(((rid) & 0xff) * 2)
107 
108 struct domain {
109 	uint64_t	*ptp;		/* first level page table page */
110 	int		pt_levels;	/* number of page table levels */
111 	int		addrwidth;	/* 'AW' field in context entry */
112 	int		spsmask;	/* supported super page sizes */
113 	u_int		id;		/* domain id */
114 	vm_paddr_t	maxaddr;	/* highest address to be mapped */
115 	SLIST_ENTRY(domain) next;
116 };
117 
118 static SLIST_HEAD(, domain) domhead;
119 
120 #define	DRHD_MAX_UNITS	8
121 static ACPI_DMAR_HARDWARE_UNIT	*drhds[DRHD_MAX_UNITS];
122 static int			drhd_num;
123 static struct vtdmap		*vtdmaps[DRHD_MAX_UNITS];
124 static int			max_domains;
125 typedef int			(*drhd_ident_func_t)(void);
126 
127 static uint64_t root_table[PAGE_SIZE / sizeof(uint64_t)] __aligned(4096);
128 static uint64_t ctx_tables[256][PAGE_SIZE / sizeof(uint64_t)] __aligned(4096);
129 
130 static MALLOC_DEFINE(M_VTD, "vtd", "vtd");
131 
132 static int
133 vtd_max_domains(struct vtdmap *vtdmap)
134 {
135 	int nd;
136 
137 	nd = VTD_CAP_ND(vtdmap->cap);
138 
139 	switch (nd) {
140 	case 0:
141 		return (16);
142 	case 1:
143 		return (64);
144 	case 2:
145 		return (256);
146 	case 3:
147 		return (1024);
148 	case 4:
149 		return (4 * 1024);
150 	case 5:
151 		return (16 * 1024);
152 	case 6:
153 		return (64 * 1024);
154 	default:
155 		panic("vtd_max_domains: invalid value of nd (0x%0x)", nd);
156 	}
157 }
158 
159 static u_int
160 domain_id(void)
161 {
162 	u_int id;
163 	struct domain *dom;
164 
165 	/* Skip domain id 0 - it is reserved when Caching Mode field is set */
166 	for (id = 1; id < max_domains; id++) {
167 		SLIST_FOREACH(dom, &domhead, next) {
168 			if (dom->id == id)
169 				break;
170 		}
171 		if (dom == NULL)
172 			break;		/* found it */
173 	}
174 
175 	if (id >= max_domains)
176 		panic("domain ids exhausted");
177 
178 	return (id);
179 }
180 
181 static struct vtdmap *
182 vtd_device_scope(uint16_t rid)
183 {
184 	int i, remaining, pathremaining;
185 	char *end, *pathend;
186 	struct vtdmap *vtdmap;
187 	ACPI_DMAR_HARDWARE_UNIT *drhd;
188 	ACPI_DMAR_DEVICE_SCOPE *device_scope;
189 	ACPI_DMAR_PCI_PATH *path;
190 
191 	for (i = 0; i < drhd_num; i++) {
192 		drhd = drhds[i];
193 
194 		if (VTD_DRHD_INCLUDE_PCI_ALL(drhd->Flags)) {
195 			/*
196 			 * From Intel VT-d arch spec, version 3.0:
197 			 * If a DRHD structure with INCLUDE_PCI_ALL flag Set is reported
198 			 * for a Segment, it must be enumerated by BIOS after all other
199 			 * DRHD structures for the same Segment.
200 			 */
201 			vtdmap = vtdmaps[i];
202 			return(vtdmap);
203 		}
204 
205 		end = (char *)drhd + drhd->Header.Length;
206 		remaining = drhd->Header.Length - sizeof(ACPI_DMAR_HARDWARE_UNIT);
207 		while (remaining > sizeof(ACPI_DMAR_DEVICE_SCOPE)) {
208 			device_scope = (ACPI_DMAR_DEVICE_SCOPE *)(end - remaining);
209 			remaining -= device_scope->Length;
210 
211 			switch (device_scope->EntryType){
212 				/* 0x01 and 0x02 are PCI device entries */
213 				case 0x01:
214 				case 0x02:
215 					break;
216 				default:
217 					continue;
218 			}
219 
220 			if (PCI_RID2BUS(rid) != device_scope->Bus)
221 				continue;
222 
223 			pathend = (char *)device_scope + device_scope->Length;
224 			pathremaining = device_scope->Length - sizeof(ACPI_DMAR_DEVICE_SCOPE);
225 			while (pathremaining >= sizeof(ACPI_DMAR_PCI_PATH)) {
226 				path = (ACPI_DMAR_PCI_PATH *)(pathend - pathremaining);
227 				pathremaining -= sizeof(ACPI_DMAR_PCI_PATH);
228 
229 				if (PCI_RID2SLOT(rid) != path->Device)
230 					continue;
231 				if (PCI_RID2FUNC(rid) != path->Function)
232 					continue;
233 
234 				vtdmap = vtdmaps[i];
235 				return (vtdmap);
236 			}
237 		}
238 	}
239 
240 	/* No matching scope */
241 	return (NULL);
242 }
243 
244 static void
245 vtd_wbflush(struct vtdmap *vtdmap)
246 {
247 
248 	if (VTD_ECAP_COHERENCY(vtdmap->ext_cap) == 0)
249 		pmap_invalidate_cache();
250 
251 	if (VTD_CAP_RWBF(vtdmap->cap)) {
252 		vtdmap->gcr = VTD_GCR_WBF;
253 		while ((vtdmap->gsr & VTD_GSR_WBFS) != 0)
254 			;
255 	}
256 }
257 
258 static void
259 vtd_ctx_global_invalidate(struct vtdmap *vtdmap)
260 {
261 
262 	vtdmap->ccr = VTD_CCR_ICC | VTD_CCR_CIRG_GLOBAL;
263 	while ((vtdmap->ccr & VTD_CCR_ICC) != 0)
264 		;
265 }
266 
267 static void
268 vtd_iotlb_global_invalidate(struct vtdmap *vtdmap)
269 {
270 	int offset;
271 	volatile uint64_t *iotlb_reg, val;
272 
273 	vtd_wbflush(vtdmap);
274 
275 	offset = VTD_ECAP_IRO(vtdmap->ext_cap) * 16;
276 	iotlb_reg = (volatile uint64_t *)((caddr_t)vtdmap + offset + 8);
277 
278 	*iotlb_reg =  VTD_IIR_IVT | VTD_IIR_IIRG_GLOBAL |
279 		      VTD_IIR_DRAIN_READS | VTD_IIR_DRAIN_WRITES;
280 
281 	while (1) {
282 		val = *iotlb_reg;
283 		if ((val & VTD_IIR_IVT) == 0)
284 			break;
285 	}
286 }
287 
288 static void
289 vtd_translation_enable(struct vtdmap *vtdmap)
290 {
291 
292 	vtdmap->gcr = VTD_GCR_TE;
293 	while ((vtdmap->gsr & VTD_GSR_TES) == 0)
294 		;
295 }
296 
297 static void
298 vtd_translation_disable(struct vtdmap *vtdmap)
299 {
300 
301 	vtdmap->gcr = 0;
302 	while ((vtdmap->gsr & VTD_GSR_TES) != 0)
303 		;
304 }
305 
306 static int
307 vtd_init(void)
308 {
309 	int i, units, remaining, tmp;
310 	struct vtdmap *vtdmap;
311 	vm_paddr_t ctx_paddr;
312 	char *end, envname[32];
313 	unsigned long mapaddr;
314 	ACPI_STATUS status;
315 	ACPI_TABLE_DMAR *dmar;
316 	ACPI_DMAR_HEADER *hdr;
317 	ACPI_DMAR_HARDWARE_UNIT *drhd;
318 
319 	/*
320 	 * Allow the user to override the ACPI DMAR table by specifying the
321 	 * physical address of each remapping unit.
322 	 *
323 	 * The following example specifies two remapping units at
324 	 * physical addresses 0xfed90000 and 0xfeda0000 respectively.
325 	 * set vtd.regmap.0.addr=0xfed90000
326 	 * set vtd.regmap.1.addr=0xfeda0000
327 	 */
328 	for (units = 0; units < DRHD_MAX_UNITS; units++) {
329 		snprintf(envname, sizeof(envname), "vtd.regmap.%d.addr", units);
330 		if (getenv_ulong(envname, &mapaddr) == 0)
331 			break;
332 		vtdmaps[units] = (struct vtdmap *)PHYS_TO_DMAP(mapaddr);
333 	}
334 
335 	if (units > 0)
336 		goto skip_dmar;
337 
338 	/* Search for DMAR table. */
339 	status = AcpiGetTable(ACPI_SIG_DMAR, 0, (ACPI_TABLE_HEADER **)&dmar);
340 	if (ACPI_FAILURE(status))
341 		return (ENXIO);
342 
343 	end = (char *)dmar + dmar->Header.Length;
344 	remaining = dmar->Header.Length - sizeof(ACPI_TABLE_DMAR);
345 	while (remaining > sizeof(ACPI_DMAR_HEADER)) {
346 		hdr = (ACPI_DMAR_HEADER *)(end - remaining);
347 		if (hdr->Length > remaining)
348 			break;
349 		/*
350 		 * From Intel VT-d arch spec, version 1.3:
351 		 * BIOS implementations must report mapping structures
352 		 * in numerical order, i.e. All remapping structures of
353 		 * type 0 (DRHD) enumerated before remapping structures of
354 		 * type 1 (RMRR) and so forth.
355 		 */
356 		if (hdr->Type != ACPI_DMAR_TYPE_HARDWARE_UNIT)
357 			break;
358 
359 		drhd = (ACPI_DMAR_HARDWARE_UNIT *)hdr;
360 		drhds[units] = drhd;
361 		vtdmaps[units] = (struct vtdmap *)PHYS_TO_DMAP(drhd->Address);
362 		if (++units >= DRHD_MAX_UNITS)
363 			break;
364 		remaining -= hdr->Length;
365 	}
366 
367 	if (units <= 0)
368 		return (ENXIO);
369 
370 skip_dmar:
371 	drhd_num = units;
372 
373 	max_domains = 64 * 1024; /* maximum valid value */
374 	for (i = 0; i < drhd_num; i++){
375 		vtdmap = vtdmaps[i];
376 
377 		if (VTD_CAP_CM(vtdmap->cap) != 0)
378 			panic("vtd_init: invalid caching mode");
379 
380 		/* take most compatible (minimum) value */
381 		if ((tmp = vtd_max_domains(vtdmap)) < max_domains)
382 			max_domains = tmp;
383 	}
384 
385 	/*
386 	 * Set up the root-table to point to the context-entry tables
387 	 */
388 	for (i = 0; i < 256; i++) {
389 		ctx_paddr = vtophys(ctx_tables[i]);
390 		if (ctx_paddr & PAGE_MASK)
391 			panic("ctx table (0x%0lx) not page aligned", ctx_paddr);
392 
393 		root_table[i * 2] = ctx_paddr | VTD_ROOT_PRESENT;
394 	}
395 
396 	return (0);
397 }
398 
/*
 * 'cleanup' hook of the iommu operations table.  Nothing is released here.
 */
static void
vtd_cleanup(void)
{
	/* Intentionally empty. */
}
403 
404 static void
405 vtd_enable(void)
406 {
407 	int i;
408 	struct vtdmap *vtdmap;
409 
410 	for (i = 0; i < drhd_num; i++) {
411 		vtdmap = vtdmaps[i];
412 		vtd_wbflush(vtdmap);
413 
414 		/* Update the root table address */
415 		vtdmap->rta = vtophys(root_table);
416 		vtdmap->gcr = VTD_GCR_SRTP;
417 		while ((vtdmap->gsr & VTD_GSR_RTPS) == 0)
418 			;
419 
420 		vtd_ctx_global_invalidate(vtdmap);
421 		vtd_iotlb_global_invalidate(vtdmap);
422 
423 		vtd_translation_enable(vtdmap);
424 	}
425 }
426 
427 static void
428 vtd_disable(void)
429 {
430 	int i;
431 	struct vtdmap *vtdmap;
432 
433 	for (i = 0; i < drhd_num; i++) {
434 		vtdmap = vtdmaps[i];
435 		vtd_translation_disable(vtdmap);
436 	}
437 }
438 
439 static void
440 vtd_add_device(void *arg, uint16_t rid)
441 {
442 	int idx;
443 	uint64_t *ctxp;
444 	struct domain *dom = arg;
445 	vm_paddr_t pt_paddr;
446 	struct vtdmap *vtdmap;
447 	uint8_t bus;
448 
449 	bus = PCI_RID2BUS(rid);
450 	ctxp = ctx_tables[bus];
451 	pt_paddr = vtophys(dom->ptp);
452 	idx = VTD_RID2IDX(rid);
453 
454 	if (ctxp[idx] & VTD_CTX_PRESENT) {
455 		panic("vtd_add_device: device %x is already owned by "
456 		      "domain %d", rid,
457 		      (uint16_t)(ctxp[idx + 1] >> 8));
458 	}
459 
460 	if ((vtdmap = vtd_device_scope(rid)) == NULL)
461 		panic("vtd_add_device: device %x is not in scope for "
462 		      "any DMA remapping unit", rid);
463 
464 	/*
465 	 * Order is important. The 'present' bit is set only after all fields
466 	 * of the context pointer are initialized.
467 	 */
468 	ctxp[idx + 1] = dom->addrwidth | (dom->id << 8);
469 
470 	if (VTD_ECAP_DI(vtdmap->ext_cap))
471 		ctxp[idx] = VTD_CTX_TT_ALL;
472 	else
473 		ctxp[idx] = 0;
474 
475 	ctxp[idx] |= pt_paddr | VTD_CTX_PRESENT;
476 
477 	/*
478 	 * 'Not Present' entries are not cached in either the Context Cache
479 	 * or in the IOTLB, so there is no need to invalidate either of them.
480 	 */
481 }
482 
483 static void
484 vtd_remove_device(void *arg, uint16_t rid)
485 {
486 	int i, idx;
487 	uint64_t *ctxp;
488 	struct vtdmap *vtdmap;
489 	uint8_t bus;
490 
491 	bus = PCI_RID2BUS(rid);
492 	ctxp = ctx_tables[bus];
493 	idx = VTD_RID2IDX(rid);
494 
495 	/*
496 	 * Order is important. The 'present' bit is must be cleared first.
497 	 */
498 	ctxp[idx] = 0;
499 	ctxp[idx + 1] = 0;
500 
501 	/*
502 	 * Invalidate the Context Cache and the IOTLB.
503 	 *
504 	 * XXX use device-selective invalidation for Context Cache
505 	 * XXX use domain-selective invalidation for IOTLB
506 	 */
507 	for (i = 0; i < drhd_num; i++) {
508 		vtdmap = vtdmaps[i];
509 		vtd_ctx_global_invalidate(vtdmap);
510 		vtd_iotlb_global_invalidate(vtdmap);
511 	}
512 }
513 
514 #define	CREATE_MAPPING	0
515 #define	REMOVE_MAPPING	1
516 
517 static uint64_t
518 vtd_update_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len,
519 		   int remove)
520 {
521 	struct domain *dom;
522 	int i, spshift, ptpshift, ptpindex, nlevels;
523 	uint64_t spsize, *ptp;
524 
525 	dom = arg;
526 	ptpindex = 0;
527 	ptpshift = 0;
528 
529 	KASSERT(gpa + len > gpa, ("%s: invalid gpa range %#lx/%#lx", __func__,
530 	    gpa, len));
531 	KASSERT(gpa + len <= dom->maxaddr, ("%s: gpa range %#lx/%#lx beyond "
532 	    "domain maxaddr %#lx", __func__, gpa, len, dom->maxaddr));
533 
534 	if (gpa & PAGE_MASK)
535 		panic("vtd_create_mapping: unaligned gpa 0x%0lx", gpa);
536 
537 	if (hpa & PAGE_MASK)
538 		panic("vtd_create_mapping: unaligned hpa 0x%0lx", hpa);
539 
540 	if (len & PAGE_MASK)
541 		panic("vtd_create_mapping: unaligned len 0x%0lx", len);
542 
543 	/*
544 	 * Compute the size of the mapping that we can accommodate.
545 	 *
546 	 * This is based on three factors:
547 	 * - supported super page size
548 	 * - alignment of the region starting at 'gpa' and 'hpa'
549 	 * - length of the region 'len'
550 	 */
551 	spshift = 48;
552 	for (i = 3; i >= 0; i--) {
553 		spsize = 1UL << spshift;
554 		if ((dom->spsmask & (1 << i)) != 0 &&
555 		    (gpa & (spsize - 1)) == 0 &&
556 		    (hpa & (spsize - 1)) == 0 &&
557 		    (len >= spsize)) {
558 			break;
559 		}
560 		spshift -= 9;
561 	}
562 
563 	ptp = dom->ptp;
564 	nlevels = dom->pt_levels;
565 	while (--nlevels >= 0) {
566 		ptpshift = 12 + nlevels * 9;
567 		ptpindex = (gpa >> ptpshift) & 0x1FF;
568 
569 		/* We have reached the leaf mapping */
570 		if (spshift >= ptpshift) {
571 			break;
572 		}
573 
574 		/*
575 		 * We are working on a non-leaf page table page.
576 		 *
577 		 * Create a downstream page table page if necessary and point
578 		 * to it from the current page table.
579 		 */
580 		if (ptp[ptpindex] == 0) {
581 			void *nlp = malloc(PAGE_SIZE, M_VTD, M_WAITOK | M_ZERO);
582 			ptp[ptpindex] = vtophys(nlp)| VTD_PTE_RD | VTD_PTE_WR;
583 		}
584 
585 		ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & VTD_PTE_ADDR_M);
586 	}
587 
588 	if ((gpa & ((1UL << ptpshift) - 1)) != 0)
589 		panic("gpa 0x%lx and ptpshift %d mismatch", gpa, ptpshift);
590 
591 	/*
592 	 * Update the 'gpa' -> 'hpa' mapping
593 	 */
594 	if (remove) {
595 		ptp[ptpindex] = 0;
596 	} else {
597 		ptp[ptpindex] = hpa | VTD_PTE_RD | VTD_PTE_WR;
598 
599 		if (nlevels > 0)
600 			ptp[ptpindex] |= VTD_PTE_SUPERPAGE;
601 	}
602 
603 	return (1UL << ptpshift);
604 }
605 
606 static uint64_t
607 vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len)
608 {
609 
610 	return (vtd_update_mapping(arg, gpa, hpa, len, CREATE_MAPPING));
611 }
612 
613 static uint64_t
614 vtd_remove_mapping(void *arg, vm_paddr_t gpa, uint64_t len)
615 {
616 
617 	return (vtd_update_mapping(arg, gpa, 0, len, REMOVE_MAPPING));
618 }
619 
620 static void
621 vtd_invalidate_tlb(void *dom)
622 {
623 	int i;
624 	struct vtdmap *vtdmap;
625 
626 	/*
627 	 * Invalidate the IOTLB.
628 	 * XXX use domain-selective invalidation for IOTLB
629 	 */
630 	for (i = 0; i < drhd_num; i++) {
631 		vtdmap = vtdmaps[i];
632 		vtd_iotlb_global_invalidate(vtdmap);
633 	}
634 }
635 
636 static void *
637 vtd_create_domain(vm_paddr_t maxaddr)
638 {
639 	struct domain *dom;
640 	vm_paddr_t addr;
641 	int tmp, i, gaw, agaw, sagaw, res, pt_levels, addrwidth;
642 	struct vtdmap *vtdmap;
643 
644 	if (drhd_num <= 0)
645 		panic("vtd_create_domain: no dma remapping hardware available");
646 
647 	/*
648 	 * Calculate AGAW.
649 	 * Section 3.4.2 "Adjusted Guest Address Width", Architecture Spec.
650 	 */
651 	addr = 0;
652 	for (gaw = 0; addr < maxaddr; gaw++)
653 		addr = 1ULL << gaw;
654 
655 	res = (gaw - 12) % 9;
656 	if (res == 0)
657 		agaw = gaw;
658 	else
659 		agaw = gaw + 9 - res;
660 
661 	if (agaw > 64)
662 		agaw = 64;
663 
664 	/*
665 	 * Select the smallest Supported AGAW and the corresponding number
666 	 * of page table levels.
667 	 */
668 	pt_levels = 2;
669 	sagaw = 30;
670 	addrwidth = 0;
671 
672 	tmp = ~0;
673 	for (i = 0; i < drhd_num; i++) {
674 		vtdmap = vtdmaps[i];
675 		/* take most compatible value */
676 		tmp &= VTD_CAP_SAGAW(vtdmap->cap);
677 	}
678 
679 	for (i = 0; i < 5; i++) {
680 		if ((tmp & (1 << i)) != 0 && sagaw >= agaw)
681 			break;
682 		pt_levels++;
683 		addrwidth++;
684 		sagaw += 9;
685 		if (sagaw > 64)
686 			sagaw = 64;
687 	}
688 
689 	if (i >= 5) {
690 		panic("vtd_create_domain: SAGAW 0x%x does not support AGAW %d",
691 		      tmp, agaw);
692 	}
693 
694 	dom = malloc(sizeof(struct domain), M_VTD, M_ZERO | M_WAITOK);
695 	dom->pt_levels = pt_levels;
696 	dom->addrwidth = addrwidth;
697 	dom->id = domain_id();
698 	dom->maxaddr = maxaddr;
699 	dom->ptp = malloc(PAGE_SIZE, M_VTD, M_ZERO | M_WAITOK);
700 	if ((uintptr_t)dom->ptp & PAGE_MASK)
701 		panic("vtd_create_domain: ptp (%p) not page aligned", dom->ptp);
702 
703 #ifdef notyet
704 	/*
705 	 * XXX superpage mappings for the iommu do not work correctly.
706 	 *
707 	 * By default all physical memory is mapped into the host_domain.
708 	 * When a VM is allocated wired memory the pages belonging to it
709 	 * are removed from the host_domain and added to the vm's domain.
710 	 *
711 	 * If the page being removed was mapped using a superpage mapping
712 	 * in the host_domain then we need to demote the mapping before
713 	 * removing the page.
714 	 *
715 	 * There is not any code to deal with the demotion at the moment
716 	 * so we disable superpage mappings altogether.
717 	 */
718 	dom->spsmask = ~0;
719 	for (i = 0; i < drhd_num; i++) {
720 		vtdmap = vtdmaps[i];
721 		/* take most compatible value */
722 		dom->spsmask &= VTD_CAP_SPS(vtdmap->cap);
723 	}
724 #endif
725 
726 	SLIST_INSERT_HEAD(&domhead, dom, next);
727 
728 	return (dom);
729 }
730 
731 static void
732 vtd_free_ptp(uint64_t *ptp, int level)
733 {
734 	int i;
735 	uint64_t *nlp;
736 
737 	if (level > 1) {
738 		for (i = 0; i < 512; i++) {
739 			if ((ptp[i] & (VTD_PTE_RD | VTD_PTE_WR)) == 0)
740 				continue;
741 			if ((ptp[i] & VTD_PTE_SUPERPAGE) != 0)
742 				continue;
743 			nlp = (uint64_t *)PHYS_TO_DMAP(ptp[i] & VTD_PTE_ADDR_M);
744 			vtd_free_ptp(nlp, level - 1);
745 		}
746 	}
747 
748 	bzero(ptp, PAGE_SIZE);
749 	free(ptp, M_VTD);
750 }
751 
752 static void
753 vtd_destroy_domain(void *arg)
754 {
755 	struct domain *dom;
756 
757 	dom = arg;
758 
759 	SLIST_REMOVE(&domhead, dom, domain, next);
760 	vtd_free_ptp(dom->ptp, dom->pt_levels);
761 	free(dom, M_VTD);
762 }
763 
764 const struct iommu_ops iommu_ops_intel = {
765 	.init = vtd_init,
766 	.cleanup = vtd_cleanup,
767 	.enable = vtd_enable,
768 	.disable = vtd_disable,
769 	.create_domain = vtd_create_domain,
770 	.destroy_domain = vtd_destroy_domain,
771 	.create_mapping = vtd_create_mapping,
772 	.remove_mapping = vtd_remove_mapping,
773 	.add_device = vtd_add_device,
774 	.remove_device = vtd_remove_device,
775 	.invalidate_tlb = vtd_invalidate_tlb,
776 };
777