xref: /freebsd/sys/amd64/vmm/intel/vtd.c (revision db33c6f3ae9d1231087710068ee4ea5398aacca7)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/param.h>
30 #include <sys/kernel.h>
31 #include <sys/systm.h>
32 #include <sys/malloc.h>
33 
34 #include <vm/vm.h>
35 #include <vm/pmap.h>
36 
37 #include <dev/pci/pcireg.h>
38 
39 #include <machine/vmparam.h>
40 #include <contrib/dev/acpica/include/acpi.h>
41 
42 #include "io/iommu.h"
43 
44 /*
45  * Documented in the "Intel Virtualization Technology for Directed I/O",
46  * Architecture Spec, September 2008.
47  */
48 
/*
 * Extract bit 0 of a DRHD 'Flags' field.  vtd_device_scope() treats a unit
 * with this bit set as matching every PCI device.
 */
#define	VTD_DRHD_INCLUDE_PCI_ALL(Flags)	((Flags) & 0x1)
50 
/*
 * Leading portion of a remapping unit's register block, see Section 10.4
 * "Register Descriptions".
 *
 * A pointer to this structure is formed directly from the unit's register
 * base address, so every field is volatile and the field order and sizes
 * must not change.
 */
struct vtdmap {
	volatile uint32_t	version;	/* offset 0x00 */
	volatile uint32_t	res0;		/* offset 0x04, unused here */
	volatile uint64_t	cap;		/* offset 0x08, VTD_CAP_* */
	volatile uint64_t	ext_cap;	/* offset 0x10, VTD_ECAP_* */
	volatile uint32_t	gcr;		/* offset 0x18, VTD_GCR_* */
	volatile uint32_t	gsr;		/* offset 0x1c, VTD_GSR_* */
	volatile uint64_t	rta;		/* offset 0x20, root table */
	volatile uint64_t	ccr;		/* offset 0x28, VTD_CCR_* */
};
62 
/* Field extractors for the 'cap' register. */
#define	VTD_CAP_ND(cap)		((cap) & 0x7)
#define	VTD_CAP_RWBF(cap)	(((cap) >> 4) & 0x1)
#define	VTD_CAP_CM(cap)		(((cap) >> 7) & 0x1)
#define	VTD_CAP_SAGAW(cap)	(((cap) >> 8) & 0x1F)
#define	VTD_CAP_SPS(cap)	(((cap) >> 34) & 0xF)

/* Field extractors for the 'ext_cap' register. */
#define	VTD_ECAP_COHERENCY(ecap) ((ecap) & 0x1)
#define	VTD_ECAP_DI(ecap)	(((ecap) >> 2) & 0x1)
#define	VTD_ECAP_IRO(ecap)	(((ecap) >> 8) & 0x3FF)

/* Command bits written to 'gcr'. */
#define	VTD_GCR_WBF		(1 << 27)
#define	VTD_GCR_SRTP		(1 << 30)
#define	VTD_GCR_TE		(1U << 31)

/* Status bits read back from 'gsr'; same bit positions as the commands. */
#define	VTD_GSR_WBFS		(1 << 27)
#define	VTD_GSR_RTPS		(1 << 30)
#define	VTD_GSR_TES		(1U << 31)

/* Bits of the 'ccr' register. */
#define	VTD_CCR_ICC		(1UL << 63)	/* invalidate context cache */
#define	VTD_CCR_CIRG_GLOBAL	(1UL << 61)	/* global invalidation */

/* Bits of the IOTLB invalidate register. */
#define	VTD_IIR_IVT		(1UL << 63)	/* invalidation IOTLB */
#define	VTD_IIR_IIRG_GLOBAL	(1ULL << 60)	/* global IOTLB invalidation */
#define	VTD_IIR_IIRG_DOMAIN	(2ULL << 60)	/* domain IOTLB invalidation */
#define	VTD_IIR_IIRG_PAGE	(3ULL << 60)	/* page IOTLB invalidation */
#define	VTD_IIR_DRAIN_READS	(1ULL << 49)	/* drain pending DMA reads */
#define	VTD_IIR_DRAIN_WRITES	(1ULL << 48)	/* drain pending DMA writes */
#define	VTD_IIR_DOMAIN_P	32

/* Root-table and context-table entry bits. */
#define	VTD_ROOT_PRESENT	0x1
#define	VTD_CTX_PRESENT		0x1
#define	VTD_CTX_TT_ALL		(1UL << 2)

/* Page table entry bits and the mask selecting the address they carry. */
#define	VTD_PTE_RD		(1UL << 0)
#define	VTD_PTE_WR		(1UL << 1)
#define	VTD_PTE_SUPERPAGE	(1UL << 7)
#define	VTD_PTE_ADDR_M		(0x000FFFFFFFFFF000UL)

/*
 * Index of a device's context entry within its bus' context table: the low
 * byte of the RID selects the entry and each entry spans two 64-bit words.
 */
#define	VTD_RID2IDX(rid)	(((rid) & 0xff) * 2)
102 
103 struct domain {
104 	uint64_t	*ptp;		/* first level page table page */
105 	int		pt_levels;	/* number of page table levels */
106 	int		addrwidth;	/* 'AW' field in context entry */
107 	int		spsmask;	/* supported super page sizes */
108 	u_int		id;		/* domain id */
109 	vm_paddr_t	maxaddr;	/* highest address to be mapped */
110 	SLIST_ENTRY(domain) next;
111 };
112 
113 static SLIST_HEAD(, domain) domhead;
114 
115 #define	DRHD_MAX_UNITS	16
116 static ACPI_DMAR_HARDWARE_UNIT	*drhds[DRHD_MAX_UNITS];
117 static int			drhd_num;
118 static struct vtdmap		*vtdmaps[DRHD_MAX_UNITS];
119 static int			max_domains;
120 typedef int			(*drhd_ident_func_t)(void);
121 
122 static uint64_t root_table[PAGE_SIZE / sizeof(uint64_t)] __aligned(4096);
123 static uint64_t ctx_tables[256][PAGE_SIZE / sizeof(uint64_t)] __aligned(4096);
124 
125 static MALLOC_DEFINE(M_VTD, "vtd", "vtd");
126 
127 static int
128 vtd_max_domains(struct vtdmap *vtdmap)
129 {
130 	int nd;
131 
132 	nd = VTD_CAP_ND(vtdmap->cap);
133 
134 	switch (nd) {
135 	case 0:
136 		return (16);
137 	case 1:
138 		return (64);
139 	case 2:
140 		return (256);
141 	case 3:
142 		return (1024);
143 	case 4:
144 		return (4 * 1024);
145 	case 5:
146 		return (16 * 1024);
147 	case 6:
148 		return (64 * 1024);
149 	default:
150 		panic("vtd_max_domains: invalid value of nd (0x%0x)", nd);
151 	}
152 }
153 
154 static u_int
155 domain_id(void)
156 {
157 	u_int id;
158 	struct domain *dom;
159 
160 	/* Skip domain id 0 - it is reserved when Caching Mode field is set */
161 	for (id = 1; id < max_domains; id++) {
162 		SLIST_FOREACH(dom, &domhead, next) {
163 			if (dom->id == id)
164 				break;
165 		}
166 		if (dom == NULL)
167 			break;		/* found it */
168 	}
169 
170 	if (id >= max_domains)
171 		panic("domain ids exhausted");
172 
173 	return (id);
174 }
175 
176 static struct vtdmap *
177 vtd_device_scope(uint16_t rid)
178 {
179 	int i, remaining, pathremaining;
180 	char *end, *pathend;
181 	struct vtdmap *vtdmap;
182 	ACPI_DMAR_HARDWARE_UNIT *drhd;
183 	ACPI_DMAR_DEVICE_SCOPE *device_scope;
184 	ACPI_DMAR_PCI_PATH *path;
185 
186 	for (i = 0; i < drhd_num; i++) {
187 		drhd = drhds[i];
188 
189 		if (VTD_DRHD_INCLUDE_PCI_ALL(drhd->Flags)) {
190 			/*
191 			 * From Intel VT-d arch spec, version 3.0:
192 			 * If a DRHD structure with INCLUDE_PCI_ALL flag Set is reported
193 			 * for a Segment, it must be enumerated by BIOS after all other
194 			 * DRHD structures for the same Segment.
195 			 */
196 			vtdmap = vtdmaps[i];
197 			return(vtdmap);
198 		}
199 
200 		end = (char *)drhd + drhd->Header.Length;
201 		remaining = drhd->Header.Length - sizeof(ACPI_DMAR_HARDWARE_UNIT);
202 		while (remaining > sizeof(ACPI_DMAR_DEVICE_SCOPE)) {
203 			device_scope = (ACPI_DMAR_DEVICE_SCOPE *)(end - remaining);
204 			remaining -= device_scope->Length;
205 
206 			switch (device_scope->EntryType){
207 				/* 0x01 and 0x02 are PCI device entries */
208 				case 0x01:
209 				case 0x02:
210 					break;
211 				default:
212 					continue;
213 			}
214 
215 			if (PCI_RID2BUS(rid) != device_scope->Bus)
216 				continue;
217 
218 			pathend = (char *)device_scope + device_scope->Length;
219 			pathremaining = device_scope->Length - sizeof(ACPI_DMAR_DEVICE_SCOPE);
220 			while (pathremaining >= sizeof(ACPI_DMAR_PCI_PATH)) {
221 				path = (ACPI_DMAR_PCI_PATH *)(pathend - pathremaining);
222 				pathremaining -= sizeof(ACPI_DMAR_PCI_PATH);
223 
224 				if (PCI_RID2SLOT(rid) != path->Device)
225 					continue;
226 				if (PCI_RID2FUNC(rid) != path->Function)
227 					continue;
228 
229 				vtdmap = vtdmaps[i];
230 				return (vtdmap);
231 			}
232 		}
233 	}
234 
235 	/* No matching scope */
236 	return (NULL);
237 }
238 
239 static void
240 vtd_wbflush(struct vtdmap *vtdmap)
241 {
242 
243 	if (VTD_ECAP_COHERENCY(vtdmap->ext_cap) == 0)
244 		pmap_invalidate_cache();
245 
246 	if (VTD_CAP_RWBF(vtdmap->cap)) {
247 		vtdmap->gcr = VTD_GCR_WBF;
248 		while ((vtdmap->gsr & VTD_GSR_WBFS) != 0)
249 			;
250 	}
251 }
252 
253 static void
254 vtd_ctx_global_invalidate(struct vtdmap *vtdmap)
255 {
256 
257 	vtdmap->ccr = VTD_CCR_ICC | VTD_CCR_CIRG_GLOBAL;
258 	while ((vtdmap->ccr & VTD_CCR_ICC) != 0)
259 		;
260 }
261 
262 static void
263 vtd_iotlb_global_invalidate(struct vtdmap *vtdmap)
264 {
265 	int offset;
266 	volatile uint64_t *iotlb_reg, val;
267 
268 	vtd_wbflush(vtdmap);
269 
270 	offset = VTD_ECAP_IRO(vtdmap->ext_cap) * 16;
271 	iotlb_reg = (volatile uint64_t *)((caddr_t)vtdmap + offset + 8);
272 
273 	*iotlb_reg =  VTD_IIR_IVT | VTD_IIR_IIRG_GLOBAL |
274 		      VTD_IIR_DRAIN_READS | VTD_IIR_DRAIN_WRITES;
275 
276 	while (1) {
277 		val = *iotlb_reg;
278 		if ((val & VTD_IIR_IVT) == 0)
279 			break;
280 	}
281 }
282 
283 static void
284 vtd_translation_enable(struct vtdmap *vtdmap)
285 {
286 
287 	vtdmap->gcr = VTD_GCR_TE;
288 	while ((vtdmap->gsr & VTD_GSR_TES) == 0)
289 		;
290 }
291 
292 static void
293 vtd_translation_disable(struct vtdmap *vtdmap)
294 {
295 
296 	vtdmap->gcr = 0;
297 	while ((vtdmap->gsr & VTD_GSR_TES) != 0)
298 		;
299 }
300 
301 static int
302 vtd_init(void)
303 {
304 	int i, units, remaining, tmp;
305 	struct vtdmap *vtdmap;
306 	vm_paddr_t ctx_paddr;
307 	char *end, envname[32];
308 	unsigned long mapaddr;
309 	ACPI_STATUS status;
310 	ACPI_TABLE_DMAR *dmar;
311 	ACPI_DMAR_HEADER *hdr;
312 	ACPI_DMAR_HARDWARE_UNIT *drhd;
313 
314 	/*
315 	 * Allow the user to override the ACPI DMAR table by specifying the
316 	 * physical address of each remapping unit.
317 	 *
318 	 * The following example specifies two remapping units at
319 	 * physical addresses 0xfed90000 and 0xfeda0000 respectively.
320 	 * set vtd.regmap.0.addr=0xfed90000
321 	 * set vtd.regmap.1.addr=0xfeda0000
322 	 */
323 	for (units = 0; units < DRHD_MAX_UNITS; units++) {
324 		snprintf(envname, sizeof(envname), "vtd.regmap.%d.addr", units);
325 		if (getenv_ulong(envname, &mapaddr) == 0)
326 			break;
327 		vtdmaps[units] = (struct vtdmap *)PHYS_TO_DMAP(mapaddr);
328 	}
329 
330 	if (units > 0)
331 		goto skip_dmar;
332 
333 	/* Search for DMAR table. */
334 	status = AcpiGetTable(ACPI_SIG_DMAR, 0, (ACPI_TABLE_HEADER **)&dmar);
335 	if (ACPI_FAILURE(status))
336 		return (ENXIO);
337 
338 	end = (char *)dmar + dmar->Header.Length;
339 	remaining = dmar->Header.Length - sizeof(ACPI_TABLE_DMAR);
340 	while (remaining > sizeof(ACPI_DMAR_HEADER)) {
341 		hdr = (ACPI_DMAR_HEADER *)(end - remaining);
342 		if (hdr->Length > remaining)
343 			break;
344 		/*
345 		 * From Intel VT-d arch spec, version 1.3:
346 		 * BIOS implementations must report mapping structures
347 		 * in numerical order, i.e. All remapping structures of
348 		 * type 0 (DRHD) enumerated before remapping structures of
349 		 * type 1 (RMRR) and so forth.
350 		 */
351 		if (hdr->Type != ACPI_DMAR_TYPE_HARDWARE_UNIT)
352 			break;
353 
354 		drhd = (ACPI_DMAR_HARDWARE_UNIT *)hdr;
355 		drhds[units] = drhd;
356 		vtdmaps[units] = (struct vtdmap *)PHYS_TO_DMAP(drhd->Address);
357 		if (++units >= DRHD_MAX_UNITS)
358 			break;
359 		remaining -= hdr->Length;
360 	}
361 
362 	if (units <= 0)
363 		return (ENXIO);
364 
365 skip_dmar:
366 	drhd_num = units;
367 
368 	max_domains = 64 * 1024; /* maximum valid value */
369 	for (i = 0; i < drhd_num; i++){
370 		vtdmap = vtdmaps[i];
371 
372 		if (VTD_CAP_CM(vtdmap->cap) != 0)
373 			panic("vtd_init: invalid caching mode");
374 
375 		/* take most compatible (minimum) value */
376 		if ((tmp = vtd_max_domains(vtdmap)) < max_domains)
377 			max_domains = tmp;
378 	}
379 
380 	/*
381 	 * Set up the root-table to point to the context-entry tables
382 	 */
383 	for (i = 0; i < 256; i++) {
384 		ctx_paddr = vtophys(ctx_tables[i]);
385 		if (ctx_paddr & PAGE_MASK)
386 			panic("ctx table (0x%0lx) not page aligned", ctx_paddr);
387 
388 		root_table[i * 2] = ctx_paddr | VTD_ROOT_PRESENT;
389 	}
390 
391 	return (0);
392 }
393 
/*
 * Cleanup hook for the iommu_ops table.  It does nothing: vtd_init() only
 * fills in statically allocated tables, so there is nothing to release.
 */
static void
vtd_cleanup(void)
{
}
398 
399 static void
400 vtd_enable(void)
401 {
402 	int i;
403 	struct vtdmap *vtdmap;
404 
405 	for (i = 0; i < drhd_num; i++) {
406 		vtdmap = vtdmaps[i];
407 		vtd_wbflush(vtdmap);
408 
409 		/* Update the root table address */
410 		vtdmap->rta = vtophys(root_table);
411 		vtdmap->gcr = VTD_GCR_SRTP;
412 		while ((vtdmap->gsr & VTD_GSR_RTPS) == 0)
413 			;
414 
415 		vtd_ctx_global_invalidate(vtdmap);
416 		vtd_iotlb_global_invalidate(vtdmap);
417 
418 		vtd_translation_enable(vtdmap);
419 	}
420 }
421 
422 static void
423 vtd_disable(void)
424 {
425 	int i;
426 	struct vtdmap *vtdmap;
427 
428 	for (i = 0; i < drhd_num; i++) {
429 		vtdmap = vtdmaps[i];
430 		vtd_translation_disable(vtdmap);
431 	}
432 }
433 
434 static void
435 vtd_add_device(void *arg, uint16_t rid)
436 {
437 	int idx;
438 	uint64_t *ctxp;
439 	struct domain *dom = arg;
440 	vm_paddr_t pt_paddr;
441 	struct vtdmap *vtdmap;
442 	uint8_t bus;
443 
444 	KASSERT(dom != NULL, ("domain is NULL"));
445 
446 	bus = PCI_RID2BUS(rid);
447 	ctxp = ctx_tables[bus];
448 	pt_paddr = vtophys(dom->ptp);
449 	idx = VTD_RID2IDX(rid);
450 
451 	if (ctxp[idx] & VTD_CTX_PRESENT) {
452 		panic("vtd_add_device: device %x is already owned by "
453 		      "domain %d", rid,
454 		      (uint16_t)(ctxp[idx + 1] >> 8));
455 	}
456 
457 	if ((vtdmap = vtd_device_scope(rid)) == NULL)
458 		panic("vtd_add_device: device %x is not in scope for "
459 		      "any DMA remapping unit", rid);
460 
461 	/*
462 	 * Order is important. The 'present' bit is set only after all fields
463 	 * of the context pointer are initialized.
464 	 */
465 	ctxp[idx + 1] = dom->addrwidth | (dom->id << 8);
466 
467 	if (VTD_ECAP_DI(vtdmap->ext_cap))
468 		ctxp[idx] = VTD_CTX_TT_ALL;
469 	else
470 		ctxp[idx] = 0;
471 
472 	ctxp[idx] |= pt_paddr | VTD_CTX_PRESENT;
473 
474 	/*
475 	 * 'Not Present' entries are not cached in either the Context Cache
476 	 * or in the IOTLB, so there is no need to invalidate either of them.
477 	 */
478 }
479 
480 static void
481 vtd_remove_device(void *arg, uint16_t rid)
482 {
483 	int i, idx;
484 	uint64_t *ctxp;
485 	struct vtdmap *vtdmap;
486 	uint8_t bus;
487 
488 	bus = PCI_RID2BUS(rid);
489 	ctxp = ctx_tables[bus];
490 	idx = VTD_RID2IDX(rid);
491 
492 	/*
493 	 * Order is important. The 'present' bit is must be cleared first.
494 	 */
495 	ctxp[idx] = 0;
496 	ctxp[idx + 1] = 0;
497 
498 	/*
499 	 * Invalidate the Context Cache and the IOTLB.
500 	 *
501 	 * XXX use device-selective invalidation for Context Cache
502 	 * XXX use domain-selective invalidation for IOTLB
503 	 */
504 	for (i = 0; i < drhd_num; i++) {
505 		vtdmap = vtdmaps[i];
506 		vtd_ctx_global_invalidate(vtdmap);
507 		vtd_iotlb_global_invalidate(vtdmap);
508 	}
509 }
510 
511 #define	CREATE_MAPPING	0
512 #define	REMOVE_MAPPING	1
513 
514 static uint64_t
515 vtd_update_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len,
516 		   int remove)
517 {
518 	struct domain *dom;
519 	int i, spshift, ptpshift, ptpindex, nlevels;
520 	uint64_t spsize, *ptp;
521 
522 	dom = arg;
523 	ptpindex = 0;
524 	ptpshift = 0;
525 
526 	KASSERT(gpa + len > gpa, ("%s: invalid gpa range %#lx/%#lx", __func__,
527 	    gpa, len));
528 	KASSERT(gpa + len <= dom->maxaddr, ("%s: gpa range %#lx/%#lx beyond "
529 	    "domain maxaddr %#lx", __func__, gpa, len, dom->maxaddr));
530 
531 	if (gpa & PAGE_MASK)
532 		panic("vtd_create_mapping: unaligned gpa 0x%0lx", gpa);
533 
534 	if (hpa & PAGE_MASK)
535 		panic("vtd_create_mapping: unaligned hpa 0x%0lx", hpa);
536 
537 	if (len & PAGE_MASK)
538 		panic("vtd_create_mapping: unaligned len 0x%0lx", len);
539 
540 	/*
541 	 * Compute the size of the mapping that we can accommodate.
542 	 *
543 	 * This is based on three factors:
544 	 * - supported super page size
545 	 * - alignment of the region starting at 'gpa' and 'hpa'
546 	 * - length of the region 'len'
547 	 */
548 	spshift = 48;
549 	for (i = 3; i >= 0; i--) {
550 		spsize = 1UL << spshift;
551 		if ((dom->spsmask & (1 << i)) != 0 &&
552 		    (gpa & (spsize - 1)) == 0 &&
553 		    (hpa & (spsize - 1)) == 0 &&
554 		    (len >= spsize)) {
555 			break;
556 		}
557 		spshift -= 9;
558 	}
559 
560 	ptp = dom->ptp;
561 	nlevels = dom->pt_levels;
562 	while (--nlevels >= 0) {
563 		ptpshift = 12 + nlevels * 9;
564 		ptpindex = (gpa >> ptpshift) & 0x1FF;
565 
566 		/* We have reached the leaf mapping */
567 		if (spshift >= ptpshift) {
568 			break;
569 		}
570 
571 		/*
572 		 * We are working on a non-leaf page table page.
573 		 *
574 		 * Create a downstream page table page if necessary and point
575 		 * to it from the current page table.
576 		 */
577 		if (ptp[ptpindex] == 0) {
578 			void *nlp = malloc(PAGE_SIZE, M_VTD, M_WAITOK | M_ZERO);
579 			ptp[ptpindex] = vtophys(nlp)| VTD_PTE_RD | VTD_PTE_WR;
580 		}
581 
582 		ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & VTD_PTE_ADDR_M);
583 	}
584 
585 	if ((gpa & ((1UL << ptpshift) - 1)) != 0)
586 		panic("gpa 0x%lx and ptpshift %d mismatch", gpa, ptpshift);
587 
588 	/*
589 	 * Update the 'gpa' -> 'hpa' mapping
590 	 */
591 	if (remove) {
592 		ptp[ptpindex] = 0;
593 	} else {
594 		ptp[ptpindex] = hpa | VTD_PTE_RD | VTD_PTE_WR;
595 
596 		if (nlevels > 0)
597 			ptp[ptpindex] |= VTD_PTE_SUPERPAGE;
598 	}
599 
600 	return (1UL << ptpshift);
601 }
602 
603 static uint64_t
604 vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len)
605 {
606 
607 	return (vtd_update_mapping(arg, gpa, hpa, len, CREATE_MAPPING));
608 }
609 
610 static uint64_t
611 vtd_remove_mapping(void *arg, vm_paddr_t gpa, uint64_t len)
612 {
613 
614 	return (vtd_update_mapping(arg, gpa, 0, len, REMOVE_MAPPING));
615 }
616 
617 static void
618 vtd_invalidate_tlb(void *dom)
619 {
620 	int i;
621 	struct vtdmap *vtdmap;
622 
623 	/*
624 	 * Invalidate the IOTLB.
625 	 * XXX use domain-selective invalidation for IOTLB
626 	 */
627 	for (i = 0; i < drhd_num; i++) {
628 		vtdmap = vtdmaps[i];
629 		vtd_iotlb_global_invalidate(vtdmap);
630 	}
631 }
632 
633 static void *
634 vtd_create_domain(vm_paddr_t maxaddr)
635 {
636 	struct domain *dom;
637 	vm_paddr_t addr;
638 	int tmp, i, gaw, agaw, sagaw, res, pt_levels, addrwidth;
639 	struct vtdmap *vtdmap;
640 
641 	if (drhd_num <= 0)
642 		panic("vtd_create_domain: no dma remapping hardware available");
643 
644 	/*
645 	 * Calculate AGAW.
646 	 * Section 3.4.2 "Adjusted Guest Address Width", Architecture Spec.
647 	 */
648 	addr = 0;
649 	for (gaw = 0; addr < maxaddr; gaw++)
650 		addr = 1ULL << gaw;
651 
652 	res = (gaw - 12) % 9;
653 	if (res == 0)
654 		agaw = gaw;
655 	else
656 		agaw = gaw + 9 - res;
657 
658 	if (agaw > 64)
659 		agaw = 64;
660 
661 	/*
662 	 * Select the smallest Supported AGAW and the corresponding number
663 	 * of page table levels.
664 	 */
665 	pt_levels = 2;
666 	sagaw = 30;
667 	addrwidth = 0;
668 
669 	tmp = ~0;
670 	for (i = 0; i < drhd_num; i++) {
671 		vtdmap = vtdmaps[i];
672 		/* take most compatible value */
673 		tmp &= VTD_CAP_SAGAW(vtdmap->cap);
674 	}
675 
676 	for (i = 0; i < 5; i++) {
677 		if ((tmp & (1 << i)) != 0 && sagaw >= agaw)
678 			break;
679 		pt_levels++;
680 		addrwidth++;
681 		sagaw += 9;
682 		if (sagaw > 64)
683 			sagaw = 64;
684 	}
685 
686 	if (i >= 5) {
687 		panic("vtd_create_domain: SAGAW 0x%x does not support AGAW %d",
688 		      tmp, agaw);
689 	}
690 
691 	dom = malloc(sizeof(struct domain), M_VTD, M_ZERO | M_WAITOK);
692 	dom->pt_levels = pt_levels;
693 	dom->addrwidth = addrwidth;
694 	dom->id = domain_id();
695 	dom->maxaddr = maxaddr;
696 	dom->ptp = malloc(PAGE_SIZE, M_VTD, M_ZERO | M_WAITOK);
697 	if ((uintptr_t)dom->ptp & PAGE_MASK)
698 		panic("vtd_create_domain: ptp (%p) not page aligned", dom->ptp);
699 
700 #ifdef notyet
701 	/*
702 	 * XXX superpage mappings for the iommu do not work correctly.
703 	 *
704 	 * By default all physical memory is mapped into the host_domain.
705 	 * When a VM is allocated wired memory the pages belonging to it
706 	 * are removed from the host_domain and added to the vm's domain.
707 	 *
708 	 * If the page being removed was mapped using a superpage mapping
709 	 * in the host_domain then we need to demote the mapping before
710 	 * removing the page.
711 	 *
712 	 * There is not any code to deal with the demotion at the moment
713 	 * so we disable superpage mappings altogether.
714 	 */
715 	dom->spsmask = ~0;
716 	for (i = 0; i < drhd_num; i++) {
717 		vtdmap = vtdmaps[i];
718 		/* take most compatible value */
719 		dom->spsmask &= VTD_CAP_SPS(vtdmap->cap);
720 	}
721 #endif
722 
723 	SLIST_INSERT_HEAD(&domhead, dom, next);
724 
725 	return (dom);
726 }
727 
728 static void
729 vtd_free_ptp(uint64_t *ptp, int level)
730 {
731 	int i;
732 	uint64_t *nlp;
733 
734 	if (level > 1) {
735 		for (i = 0; i < 512; i++) {
736 			if ((ptp[i] & (VTD_PTE_RD | VTD_PTE_WR)) == 0)
737 				continue;
738 			if ((ptp[i] & VTD_PTE_SUPERPAGE) != 0)
739 				continue;
740 			nlp = (uint64_t *)PHYS_TO_DMAP(ptp[i] & VTD_PTE_ADDR_M);
741 			vtd_free_ptp(nlp, level - 1);
742 		}
743 	}
744 
745 	bzero(ptp, PAGE_SIZE);
746 	free(ptp, M_VTD);
747 }
748 
749 static void
750 vtd_destroy_domain(void *arg)
751 {
752 	struct domain *dom;
753 
754 	dom = arg;
755 
756 	SLIST_REMOVE(&domhead, dom, domain, next);
757 	vtd_free_ptp(dom->ptp, dom->pt_levels);
758 	free(dom, M_VTD);
759 }
760 
761 const struct iommu_ops iommu_ops_intel = {
762 	.init = vtd_init,
763 	.cleanup = vtd_cleanup,
764 	.enable = vtd_enable,
765 	.disable = vtd_disable,
766 	.create_domain = vtd_create_domain,
767 	.destroy_domain = vtd_destroy_domain,
768 	.create_mapping = vtd_create_mapping,
769 	.remove_mapping = vtd_remove_mapping,
770 	.add_device = vtd_add_device,
771 	.remove_device = vtd_remove_device,
772 	.invalidate_tlb = vtd_invalidate_tlb,
773 };
774