xref: /illumos-gate/usr/src/uts/intel/io/vmm/intel/vtd.c (revision ba5ca68405ba4441c86a6cfc87f4ddcb3565c81d)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  * $FreeBSD$
29  */
30 /*
31  * This file and its contents are supplied under the terms of the
32  * Common Development and Distribution License ("CDDL"), version 1.0.
33  * You may only use this file in accordance with the terms of version
34  * 1.0 of the CDDL.
35  *
36  * A full copy of the text of the CDDL should have accompanied this
37  * source.  A copy of the CDDL is also available via the Internet at
38  * http://www.illumos.org/license/CDDL.
39  *
40  * Copyright 2018 Joyent, Inc.
41  * Copyright 2022 Oxide Computer Company
42  */
43 
44 #include <sys/cdefs.h>
45 __FBSDID("$FreeBSD$");
46 
47 #include <sys/param.h>
48 #include <sys/kernel.h>
49 #include <sys/systm.h>
50 #include <sys/kmem.h>
51 
52 #include <dev/pci/pcireg.h>
53 
54 #include <machine/vmparam.h>
55 #include <sys/vmm_vm.h>
56 
57 #include <contrib/dev/acpica/include/acpi.h>
58 
59 #include <sys/sunndi.h>
60 
61 #include "io/iommu.h"
62 
63 /*
64  * Documented in the "Intel Virtualization Technology for Directed I/O",
65  * Architecture Spec, September 2008.
66  */
67 
68 #define	VTD_DRHD_INCLUDE_PCI_ALL(Flags)  (((Flags) >> 0) & 0x1)
69 
/* Section 10.4 "Register Descriptions" */
struct vtdmap {
	volatile uint32_t	version;	/* architecture version */
	volatile uint32_t	res0;		/* reserved */
	volatile uint64_t	cap;		/* capability register */
	volatile uint64_t	ext_cap;	/* extended capability register */
	volatile uint32_t	gcr;		/* global command register */
	volatile uint32_t	gsr;		/* global status register */
	volatile uint64_t	rta;		/* root table address register */
	volatile uint64_t	ccr;		/* context command register */
};
81 
82 #define	VTD_CAP_SAGAW(cap)	(((cap) >> 8) & 0x1F)
83 #define	VTD_CAP_ND(cap)		((cap) & 0x7)
84 #define	VTD_CAP_CM(cap)		(((cap) >> 7) & 0x1)
85 #define	VTD_CAP_SPS(cap)	(((cap) >> 34) & 0xF)
86 #define	VTD_CAP_RWBF(cap)	(((cap) >> 4) & 0x1)
87 
88 #define	VTD_ECAP_DI(ecap)	(((ecap) >> 2) & 0x1)
89 #define	VTD_ECAP_COHERENCY(ecap) ((ecap) & 0x1)
90 #define	VTD_ECAP_IRO(ecap)	(((ecap) >> 8) & 0x3FF)
91 
92 #define	VTD_GCR_WBF		(1 << 27)
93 #define	VTD_GCR_SRTP		(1 << 30)
94 #define	VTD_GCR_TE		(1U << 31)
95 
96 #define	VTD_GSR_WBFS		(1 << 27)
97 #define	VTD_GSR_RTPS		(1 << 30)
98 #define	VTD_GSR_TES		(1U << 31)
99 
100 #define	VTD_CCR_ICC		(1UL << 63)	/* invalidate context cache */
101 #define	VTD_CCR_CIRG_GLOBAL	(1UL << 61)	/* global invalidation */
102 
103 #define	VTD_IIR_IVT		(1UL << 63)	/* invalidation IOTLB */
104 #define	VTD_IIR_IIRG_GLOBAL	(1ULL << 60)	/* global IOTLB invalidation */
105 #define	VTD_IIR_IIRG_DOMAIN	(2ULL << 60)	/* domain IOTLB invalidation */
106 #define	VTD_IIR_IIRG_PAGE	(3ULL << 60)	/* page IOTLB invalidation */
107 #define	VTD_IIR_DRAIN_READS	(1ULL << 49)	/* drain pending DMA reads */
108 #define	VTD_IIR_DRAIN_WRITES	(1ULL << 48)	/* drain pending DMA writes */
109 #define	VTD_IIR_DOMAIN_P	32
110 
111 #define	VTD_ROOT_PRESENT	0x1
112 #define	VTD_CTX_PRESENT		0x1
113 #define	VTD_CTX_TT_ALL		(1UL << 2)
114 
115 #define	VTD_PTE_RD		(1UL << 0)
116 #define	VTD_PTE_WR		(1UL << 1)
117 #define	VTD_PTE_SUPERPAGE	(1UL << 7)
118 #define	VTD_PTE_ADDR_M		(0x000FFFFFFFFFF000UL)
119 
120 #define	VTD_RID2IDX(rid)	(((rid) & 0xff) * 2)
121 
/*
 * A DMA remapping domain: one set of page tables that one or more
 * devices are translated through.  Tracked on the global 'domhead' list.
 */
struct domain {
	uint64_t	*ptp;		/* first level page table page */
	int		pt_levels;	/* number of page table levels */
	int		addrwidth;	/* 'AW' field in context entry */
	int		spsmask;	/* supported super page sizes */
	uint_t		id;		/* domain id */
	vm_paddr_t	maxaddr;	/* highest address to be mapped */
	SLIST_ENTRY(domain) next;
};
131 
/* All remapping domains currently allocated by vtd_create_domain(). */
static SLIST_HEAD(, domain) domhead;

/* Maximum number of DMA remapping hardware units (DRHDs) supported. */
#define	DRHD_MAX_UNITS	8
static ACPI_DMAR_HARDWARE_UNIT	*drhds[DRHD_MAX_UNITS];	/* DMAR entries */
static int			drhd_num;	/* units found by vtd_init() */
static struct vtdmap		*vtdmaps[DRHD_MAX_UNITS]; /* mapped registers */
static int			max_domains;	/* min of units' domain caps */
typedef int			(*drhd_ident_func_t)(void);
static dev_info_t		*vtddips[DRHD_MAX_UNITS]; /* devinfo per unit */

/* Shared root table and one context table per PCI bus (256 buses). */
static uint64_t root_table[PAGE_SIZE / sizeof (uint64_t)] __aligned(4096);
static uint64_t ctx_tables[256][PAGE_SIZE / sizeof (uint64_t)] __aligned(4096);
144 
145 static int
146 vtd_max_domains(struct vtdmap *vtdmap)
147 {
148 	int nd;
149 
150 	nd = VTD_CAP_ND(vtdmap->cap);
151 
152 	switch (nd) {
153 	case 0:
154 		return (16);
155 	case 1:
156 		return (64);
157 	case 2:
158 		return (256);
159 	case 3:
160 		return (1024);
161 	case 4:
162 		return (4 * 1024);
163 	case 5:
164 		return (16 * 1024);
165 	case 6:
166 		return (64 * 1024);
167 	default:
168 		panic("vtd_max_domains: invalid value of nd (0x%0x)", nd);
169 	}
170 }
171 
172 static uint_t
173 domain_id(void)
174 {
175 	uint_t id;
176 	struct domain *dom;
177 
178 	/* Skip domain id 0 - it is reserved when Caching Mode field is set */
179 	for (id = 1; id < max_domains; id++) {
180 		SLIST_FOREACH(dom, &domhead, next) {
181 			if (dom->id == id)
182 				break;
183 		}
184 		if (dom == NULL)
185 			break;		/* found it */
186 	}
187 
188 	if (id >= max_domains)
189 		panic("domain ids exhausted");
190 
191 	return (id);
192 }
193 
/*
 * Find the DMA remapping unit whose device scope covers PCI routing
 * id 'rid'.  A unit flagged INCLUDE_PCI_ALL matches any device;
 * otherwise each unit's ACPI device-scope entries are walked for an
 * exact bus/device/function match.
 *
 * Returns the matching unit's register mapping, or NULL if no unit
 * claims the device.
 */
static struct vtdmap *
vtd_device_scope(uint16_t rid)
{
	int i, remaining, pathrem;
	char *end, *pathend;
	struct vtdmap *vtdmap;
	ACPI_DMAR_HARDWARE_UNIT *drhd;
	ACPI_DMAR_DEVICE_SCOPE *device_scope;
	ACPI_DMAR_PCI_PATH *path;

	for (i = 0; i < drhd_num; i++) {
		drhd = drhds[i];

		if (VTD_DRHD_INCLUDE_PCI_ALL(drhd->Flags)) {
			/*
			 * From Intel VT-d arch spec, version 3.0:
			 * If a DRHD structure with INCLUDE_PCI_ALL flag Set is
			 * reported for a Segment, it must be enumerated by BIOS
			 * after all other DRHD structures for the same Segment.
			 */
			vtdmap = vtdmaps[i];
			return (vtdmap);
		}

		/* Walk the variable-length device-scope entries. */
		end = (char *)drhd + drhd->Header.Length;
		remaining = drhd->Header.Length -
		    sizeof (ACPI_DMAR_HARDWARE_UNIT);
		while (remaining > sizeof (ACPI_DMAR_DEVICE_SCOPE)) {
			device_scope =
			    (ACPI_DMAR_DEVICE_SCOPE *)(end - remaining);
			/*
			 * NOTE(review): assumes firmware reports a non-zero
			 * Length for every scope entry; a zero Length would
			 * loop here forever.  Confirm whether hardening
			 * against malformed tables is desired.
			 */
			remaining -= device_scope->Length;

			switch (device_scope->EntryType) {
				/* 0x01 and 0x02 are PCI device entries */
				case 0x01:
				case 0x02:
					break;
				default:
					continue;
			}

			if (PCI_RID2BUS(rid) != device_scope->Bus)
				continue;

			/* Compare the trailing PCI path entries. */
			pathend = (char *)device_scope + device_scope->Length;
			pathrem = device_scope->Length -
			    sizeof (ACPI_DMAR_DEVICE_SCOPE);
			while (pathrem >= sizeof (ACPI_DMAR_PCI_PATH)) {
				path = (ACPI_DMAR_PCI_PATH *)
				    (pathend - pathrem);
				pathrem -= sizeof (ACPI_DMAR_PCI_PATH);

				if (PCI_RID2SLOT(rid) != path->Device)
					continue;
				if (PCI_RID2FUNC(rid) != path->Function)
					continue;

				vtdmap = vtdmaps[i];
				return (vtdmap);
			}
		}
	}

	/* No matching scope */
	return (NULL);
}
260 
/*
 * Make CPU writes to the in-memory translation tables visible to the
 * remapping hardware: flush CPU caches if the unit is not coherent,
 * and issue a write-buffer flush if the hardware requires it (RWBF).
 */
static void
vtd_wbflush(struct vtdmap *vtdmap)
{

	if (VTD_ECAP_COHERENCY(vtdmap->ext_cap) == 0)
		invalidate_cache_all();

	if (VTD_CAP_RWBF(vtdmap->cap)) {
		/* Trigger the flush; hardware clears WBFS on completion. */
		vtdmap->gcr = VTD_GCR_WBF;
		while ((vtdmap->gsr & VTD_GSR_WBFS) != 0)
			;
	}
}
274 
/*
 * Issue a global context-cache invalidation and busy-wait until the
 * hardware clears the ICC bit, indicating completion.
 */
static void
vtd_ctx_global_invalidate(struct vtdmap *vtdmap)
{

	vtdmap->ccr = VTD_CCR_ICC | VTD_CCR_CIRG_GLOBAL;
	while ((vtdmap->ccr & VTD_CCR_ICC) != 0)
		;
}
283 
/*
 * Perform a global IOTLB invalidation on one remapping unit, draining
 * pending DMA reads and writes first.  The IOTLB register group lives
 * at an offset given by the extended capability register's IRO field
 * (in 16-byte units); the invalidate command register is 8 bytes into
 * that group.
 */
static void
vtd_iotlb_global_invalidate(struct vtdmap *vtdmap)
{
	int offset;
	volatile uint64_t *iotlb_reg, val;

	/* Ensure table updates are visible before invalidating. */
	vtd_wbflush(vtdmap);

	offset = VTD_ECAP_IRO(vtdmap->ext_cap) * 16;
	iotlb_reg = (volatile uint64_t *)((caddr_t)vtdmap + offset + 8);

	*iotlb_reg =  VTD_IIR_IVT | VTD_IIR_IIRG_GLOBAL |
	    VTD_IIR_DRAIN_READS | VTD_IIR_DRAIN_WRITES;

	/* Hardware clears the IVT bit when the invalidation is done. */
	while (1) {
		val = *iotlb_reg;
		if ((val & VTD_IIR_IVT) == 0)
			break;
	}
}
304 
/*
 * Turn DMA translation on and busy-wait until the hardware reports it
 * active (TES set in the global status register).
 */
static void
vtd_translation_enable(struct vtdmap *vtdmap)
{

	vtdmap->gcr = VTD_GCR_TE;
	while ((vtdmap->gsr & VTD_GSR_TES) == 0)
		;
}
313 
/*
 * Turn DMA translation off and busy-wait until the hardware reports
 * it inactive (TES clear in the global status register).
 */
static void
vtd_translation_disable(struct vtdmap *vtdmap)
{

	vtdmap->gcr = 0;
	while ((vtdmap->gsr & VTD_GSR_TES) != 0)
		;
}
322 
/*
 * Map the register page of the VT-d unit described by devinfo node
 * 'dip'.  On success the DDI access handle is stored as the node's
 * driver-private data (released later by vtd_unmap()) and a pointer
 * to the mapped registers is returned; NULL on failure.
 */
static void *
vtd_map(dev_info_t *dip)
{
	caddr_t regs;
	ddi_acc_handle_t hdl;
	int error;

	/* Strictly-ordered, non-byteswapped access for device registers. */
	static ddi_device_acc_attr_t regs_attr = {
		DDI_DEVICE_ATTR_V0,
		DDI_NEVERSWAP_ACC,
		DDI_STRICTORDER_ACC,
	};

	error = ddi_regs_map_setup(dip, 0, &regs, 0, PAGE_SIZE, &regs_attr,
	    &hdl);

	if (error != DDI_SUCCESS)
		return (NULL);

	ddi_set_driver_private(dip, hdl);

	return (regs);
}
346 
347 static void
348 vtd_unmap(dev_info_t *dip)
349 {
350 	ddi_acc_handle_t hdl = ddi_get_driver_private(dip);
351 
352 	if (hdl != NULL)
353 		ddi_regs_map_free(&hdl);
354 }
355 
/*
 * Locate or construct a devinfo node for VT-d unit number 'unit'.
 *
 * If an existing "vtd" node with this unit number is found under the
 * root nexus it is reused; otherwise a new child node is created with
 * a 'reg' property (and matching parent-private regspec) describing
 * the unit's register page at drhd->Address, so ddi_regs_map_setup()
 * can map it later.
 */
static dev_info_t *
vtd_get_dip(ACPI_DMAR_HARDWARE_UNIT *drhd, int unit)
{
	dev_info_t *dip;
	struct ddi_parent_private_data *pdptr;
	struct regspec reg;
	int circ;

	/*
	 * Try to find an existing devinfo node for this vtd unit.
	 */
	ndi_devi_enter(ddi_root_node(), &circ);
	dip = ddi_find_devinfo("vtd", unit, 0);
	ndi_devi_exit(ddi_root_node(), circ);

	if (dip != NULL)
		return (dip);

	/*
	 * None found, construct a devinfo node for this vtd unit.
	 */
	dip = ddi_add_child(ddi_root_node(), "vtd",
	    DEVI_SID_NODEID, unit);

	reg.regspec_bustype = 0;
	reg.regspec_addr = drhd->Address;
	reg.regspec_size = PAGE_SIZE;

	/*
	 * update the reg properties
	 *
	 *   reg property will be used for register
	 *   set access
	 *
	 * refer to the bus_map of root nexus driver
	 * I/O or memory mapping:
	 *
	 * <bustype=0, addr=x, len=x>: memory
	 * <bustype=1, addr=x, len=x>: i/o
	 * <bustype>1, addr=0, len=x>: x86-compatibility i/o
	 */
	(void) ndi_prop_update_int_array(DDI_DEV_T_NONE,
	    dip, "reg", (int *)&reg,
	    sizeof (struct regspec) / sizeof (int));

	/*
	 * This is an artificially constructed dev_info, and we
	 * need to set a few more things to be able to use it
	 * for ddi_dma_alloc_handle/free_handle.
	 */
	ddi_set_driver(dip, ddi_get_driver(ddi_root_node()));
	DEVI(dip)->devi_bus_dma_allochdl =
	    DEVI(ddi_get_driver((ddi_root_node())));

	/*
	 * Parent-private data carries the regspec the root nexus uses
	 * when mapping this node's registers.  It is not freed here or
	 * in vtd_cleanup(); the node (and this data) is retained as
	 * reusable system state.
	 */
	pdptr = kmem_zalloc(sizeof (struct ddi_parent_private_data)
	    + sizeof (struct regspec), KM_SLEEP);
	pdptr->par_nreg = 1;
	pdptr->par_reg = (struct regspec *)(pdptr + 1);
	pdptr->par_reg->regspec_bustype = 0;
	pdptr->par_reg->regspec_addr = drhd->Address;
	pdptr->par_reg->regspec_size = PAGE_SIZE;
	ddi_set_parent_data(dip, pdptr);

	return (dip);
}
421 
/*
 * Discover the DMA remapping hardware units (DRHDs) from the ACPI
 * DMAR table, map their register pages, compute the most-compatible
 * maximum domain count, and wire the shared root table up to the
 * per-bus context tables.
 *
 * Returns 0 on success, or ENXIO if no usable hardware is found (or,
 * on illumos, if a unit's registers cannot be mapped).
 */
static int
vtd_init(void)
{
	int i, units, remaining, tmp;
	struct vtdmap *vtdmap;
	vm_paddr_t ctx_paddr;
	char *end;
#ifdef __FreeBSD__
	char envname[32];
	unsigned long mapaddr;
#endif
	ACPI_STATUS status;
	ACPI_TABLE_DMAR *dmar;
	ACPI_DMAR_HEADER *hdr;
	ACPI_DMAR_HARDWARE_UNIT *drhd;

#ifdef __FreeBSD__
	/*
	 * Allow the user to override the ACPI DMAR table by specifying the
	 * physical address of each remapping unit.
	 *
	 * The following example specifies two remapping units at
	 * physical addresses 0xfed90000 and 0xfeda0000 respectively.
	 * set vtd.regmap.0.addr=0xfed90000
	 * set vtd.regmap.1.addr=0xfeda0000
	 */
	for (units = 0; units < DRHD_MAX_UNITS; units++) {
		snprintf(envname, sizeof (envname), "vtd.regmap.%d.addr",
		    units);
		if (getenv_ulong(envname, &mapaddr) == 0)
			break;
		vtdmaps[units] = (struct vtdmap *)PHYS_TO_DMAP(mapaddr);
	}

	if (units > 0)
		goto skip_dmar;
#else
	units = 0;
#endif
	/* Search for DMAR table. */
	status = AcpiGetTable(ACPI_SIG_DMAR, 0, (ACPI_TABLE_HEADER **)&dmar);
	if (ACPI_FAILURE(status))
		return (ENXIO);

	/* Walk the DMAR remapping structures, collecting DRHD entries. */
	end = (char *)dmar + dmar->Header.Length;
	remaining = dmar->Header.Length - sizeof (ACPI_TABLE_DMAR);
	while (remaining > sizeof (ACPI_DMAR_HEADER)) {
		hdr = (ACPI_DMAR_HEADER *)(end - remaining);
		if (hdr->Length > remaining)
			break;
		/*
		 * From Intel VT-d arch spec, version 1.3:
		 * BIOS implementations must report mapping structures
		 * in numerical order, i.e. All remapping structures of
		 * type 0 (DRHD) enumerated before remapping structures of
		 * type 1 (RMRR) and so forth.
		 */
		if (hdr->Type != ACPI_DMAR_TYPE_HARDWARE_UNIT)
			break;

		drhd = (ACPI_DMAR_HARDWARE_UNIT *)hdr;
		drhds[units] = drhd;
#ifdef __FreeBSD__
		vtdmaps[units] = (struct vtdmap *)PHYS_TO_DMAP(drhd->Address);
#else
		/* Map this unit's registers via a constructed devinfo. */
		vtddips[units] = vtd_get_dip(drhd, units);
		vtdmaps[units] = (struct vtdmap *)vtd_map(vtddips[units]);
		if (vtdmaps[units] == NULL)
			goto fail;
#endif
		if (++units >= DRHD_MAX_UNITS)
			break;
		remaining -= hdr->Length;
	}

	if (units <= 0)
		return (ENXIO);

#ifdef __FreeBSD__
skip_dmar:
#endif
	drhd_num = units;

	max_domains = 64 * 1024; /* maximum valid value */
	for (i = 0; i < drhd_num; i++) {
		vtdmap = vtdmaps[i];

		/* Units operating in Caching Mode are not supported. */
		if (VTD_CAP_CM(vtdmap->cap) != 0)
			panic("vtd_init: invalid caching mode");

		/* take most compatible (minimum) value */
		if ((tmp = vtd_max_domains(vtdmap)) < max_domains)
			max_domains = tmp;
	}

	/*
	 * Set up the root-table to point to the context-entry tables
	 */
	for (i = 0; i < 256; i++) {
		ctx_paddr = vtophys(ctx_tables[i]);
		if (ctx_paddr & PAGE_MASK)
			panic("ctx table (0x%0lx) not page aligned", ctx_paddr);

		root_table[i * 2] = ctx_paddr | VTD_ROOT_PRESENT;
	}

	return (0);

#ifndef __FreeBSD__
fail:
	/*
	 * Unmap everything mapped so far.  The '<=' bound is deliberate:
	 * index 'units' is the unit whose vtd_map() failed; its devinfo
	 * exists but carries no access handle, so vtd_unmap() is a no-op
	 * for it.
	 */
	for (i = 0; i <= units; i++)
		vtd_unmap(vtddips[i]);
	return (ENXIO);
#endif
}
537 
538 static void
539 vtd_cleanup(void)
540 {
541 #ifndef __FreeBSD__
542 	int i;
543 
544 	KASSERT(SLIST_EMPTY(&domhead), ("domain list not empty"));
545 
546 	bzero(root_table, sizeof (root_table));
547 
548 	for (i = 0; i <= drhd_num; i++) {
549 		vtdmaps[i] = NULL;
550 		/*
551 		 * Unmap the vtd registers. Note that the devinfo nodes
552 		 * themselves aren't removed, they are considered system state
553 		 * and can be reused when the module is reloaded.
554 		 */
555 		if (vtddips[i] != NULL)
556 			vtd_unmap(vtddips[i]);
557 	}
558 #endif
559 }
560 
/*
 * Program every remapping unit with the shared root table address and
 * turn DMA translation on.  The context cache and IOTLB are
 * invalidated after the root table pointer is latched so no stale
 * translations survive.
 */
static void
vtd_enable(void)
{
	int i;
	struct vtdmap *vtdmap;

	for (i = 0; i < drhd_num; i++) {
		vtdmap = vtdmaps[i];
		vtd_wbflush(vtdmap);

		/* Update the root table address */
		vtdmap->rta = vtophys(root_table);
		vtdmap->gcr = VTD_GCR_SRTP;
		/* Wait for hardware to latch the new root table pointer. */
		while ((vtdmap->gsr & VTD_GSR_RTPS) == 0)
			;

		vtd_ctx_global_invalidate(vtdmap);
		vtd_iotlb_global_invalidate(vtdmap);

		vtd_translation_enable(vtdmap);
	}
}
583 
584 static void
585 vtd_disable(void)
586 {
587 	int i;
588 	struct vtdmap *vtdmap;
589 
590 	for (i = 0; i < drhd_num; i++) {
591 		vtdmap = vtdmaps[i];
592 		vtd_translation_disable(vtdmap);
593 	}
594 }
595 
/*
 * Bind device 'rid' (PCI bus/device/function routing id) to remapping
 * domain 'arg' by installing its context entry in the per-bus context
 * table.  Panics if the entry is already present (device owned by
 * another domain) or if no remapping unit's device scope covers it.
 */
static void
vtd_add_device(void *arg, uint16_t rid)
{
	int idx;
	uint64_t *ctxp;
	struct domain *dom = arg;
	vm_paddr_t pt_paddr;
	struct vtdmap *vtdmap;
	uint8_t bus;

	bus = PCI_RID2BUS(rid);
	ctxp = ctx_tables[bus];
	pt_paddr = vtophys(dom->ptp);
	idx = VTD_RID2IDX(rid);

	if (ctxp[idx] & VTD_CTX_PRESENT) {
		panic("vtd_add_device: device %x is already owned by "
		    "domain %d", rid, (uint16_t)(ctxp[idx + 1] >> 8));
	}

	if ((vtdmap = vtd_device_scope(rid)) == NULL)
		panic("vtd_add_device: device %x is not in scope for "
		    "any DMA remapping unit", rid);

	/*
	 * Order is important. The 'present' bit is set only after all fields
	 * of the context pointer are initialized.
	 */
	/* High qword: address width (AW) and domain id (bits 8 and up). */
	ctxp[idx + 1] = dom->addrwidth | (dom->id << 8);

	/* Translation type depends on the unit's DI extended capability. */
	if (VTD_ECAP_DI(vtdmap->ext_cap))
		ctxp[idx] = VTD_CTX_TT_ALL;
	else
		ctxp[idx] = 0;

	/* Low qword: page-table root and, last of all, the present bit. */
	ctxp[idx] |= pt_paddr | VTD_CTX_PRESENT;

	/*
	 * 'Not Present' entries are not cached in either the Context Cache
	 * or in the IOTLB, so there is no need to invalidate either of them.
	 */
}
638 
/*
 * Detach device 'rid' from its domain: clear its context entry (the
 * 'present' bit first) and then invalidate the context cache and the
 * IOTLB on every remapping unit so no cached translation survives.
 */
static void
vtd_remove_device(void *arg, uint16_t rid)
{
	int i, idx;
	uint64_t *ctxp;
	struct vtdmap *vtdmap;
	uint8_t bus;

	bus = PCI_RID2BUS(rid);
	ctxp = ctx_tables[bus];
	idx = VTD_RID2IDX(rid);

	/*
	 * Order is important. The 'present' bit must be cleared first.
	 */
	ctxp[idx] = 0;
	ctxp[idx + 1] = 0;

	/*
	 * Invalidate the Context Cache and the IOTLB.
	 *
	 * XXX use device-selective invalidation for Context Cache
	 * XXX use domain-selective invalidation for IOTLB
	 */
	for (i = 0; i < drhd_num; i++) {
		vtdmap = vtdmaps[i];
		vtd_ctx_global_invalidate(vtdmap);
		vtd_iotlb_global_invalidate(vtdmap);
	}
}
669 
670 #define	CREATE_MAPPING	0
671 #define	REMOVE_MAPPING	1
672 
/*
 * Install or remove a single page-table entry translating 'gpa' in
 * domain 'arg'.  The largest (super)page size permitted by the
 * domain's supported sizes, the gpa/hpa alignment, and 'len' is used.
 * Returns the number of bytes covered by the updated entry; callers
 * invoke this repeatedly to cover a larger range.
 */
static uint64_t
vtd_update_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len,
    int remove)
{
	struct domain *dom;
	int i, spshift, ptpshift, ptpindex, nlevels;
	uint64_t spsize, *ptp;

	dom = arg;
	ptpindex = 0;
	ptpshift = 0;

	KASSERT(gpa + len > gpa, ("%s: invalid gpa range %lx/%lx", __func__,
	    gpa, len));
	KASSERT(gpa + len <= dom->maxaddr, ("%s: gpa range %lx/%lx beyond "
	    "domain maxaddr %lx", __func__, gpa, len, dom->maxaddr));

	if (gpa & PAGE_MASK)
		panic("vtd_create_mapping: unaligned gpa 0x%0lx", gpa);

	if (hpa & PAGE_MASK)
		panic("vtd_create_mapping: unaligned hpa 0x%0lx", hpa);

	if (len & PAGE_MASK)
		panic("vtd_create_mapping: unaligned len 0x%0lx", len);

	/*
	 * Compute the size of the mapping that we can accommodate.
	 *
	 * This is based on three factors:
	 * - supported super page size
	 * - alignment of the region starting at 'gpa' and 'hpa'
	 * - length of the region 'len'
	 */
	/*
	 * Try the largest supported size first, stepping down 9 bits
	 * (512x) per iteration; if nothing fits, spshift ends at 12 (4KB).
	 */
	spshift = 48;
	for (i = 3; i >= 0; i--) {
		spsize = 1UL << spshift;
		if ((dom->spsmask & (1 << i)) != 0 &&
		    (gpa & (spsize - 1)) == 0 &&
		    (hpa & (spsize - 1)) == 0 &&
		    (len >= spsize)) {
			break;
		}
		spshift -= 9;
	}

	/* Walk from the root down to the level the chosen size lives at. */
	ptp = dom->ptp;
	nlevels = dom->pt_levels;
	while (--nlevels >= 0) {
		ptpshift = 12 + nlevels * 9;
		ptpindex = (gpa >> ptpshift) & 0x1FF;

		/* We have reached the leaf mapping */
		if (spshift >= ptpshift) {
			break;
		}

		/*
		 * We are working on a non-leaf page table page.
		 *
		 * Create a downstream page table page if necessary and point
		 * to it from the current page table.
		 */
		if (ptp[ptpindex] == 0) {
			void *nlp = vmm_ptp_alloc();
			ptp[ptpindex] = vtophys(nlp)| VTD_PTE_RD | VTD_PTE_WR;
		}

		ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & VTD_PTE_ADDR_M);
	}

	if ((gpa & ((1UL << ptpshift) - 1)) != 0)
		panic("gpa 0x%lx and ptpshift %d mismatch", gpa, ptpshift);

	/*
	 * Update the 'gpa' -> 'hpa' mapping
	 */
	if (remove) {
		ptp[ptpindex] = 0;
	} else {
		ptp[ptpindex] = hpa | VTD_PTE_RD | VTD_PTE_WR;

		/* nlevels > 0 means we stopped above the 4KB level. */
		if (nlevels > 0)
			ptp[ptpindex] |= VTD_PTE_SUPERPAGE;
	}

	return (1UL << ptpshift);
}
761 
762 static uint64_t
763 vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len)
764 {
765 
766 	return (vtd_update_mapping(arg, gpa, hpa, len, CREATE_MAPPING));
767 }
768 
769 static uint64_t
770 vtd_remove_mapping(void *arg, vm_paddr_t gpa, uint64_t len)
771 {
772 
773 	return (vtd_update_mapping(arg, gpa, 0, len, REMOVE_MAPPING));
774 }
775 
776 static void
777 vtd_invalidate_tlb(void *dom)
778 {
779 	int i;
780 	struct vtdmap *vtdmap;
781 
782 	/*
783 	 * Invalidate the IOTLB.
784 	 * XXX use domain-selective invalidation for IOTLB
785 	 */
786 	for (i = 0; i < drhd_num; i++) {
787 		vtdmap = vtdmaps[i];
788 		vtd_iotlb_global_invalidate(vtdmap);
789 	}
790 }
791 
792 static void *
793 vtd_create_domain(vm_paddr_t maxaddr)
794 {
795 	struct domain *dom;
796 	vm_paddr_t addr;
797 	int tmp, i, gaw, agaw, sagaw, res, pt_levels, addrwidth;
798 	struct vtdmap *vtdmap;
799 
800 	if (drhd_num <= 0)
801 		panic("vtd_create_domain: no dma remapping hardware available");
802 
803 	/*
804 	 * Calculate AGAW.
805 	 * Section 3.4.2 "Adjusted Guest Address Width", Architecture Spec.
806 	 */
807 	addr = 0;
808 	for (gaw = 0; addr < maxaddr; gaw++)
809 		addr = 1ULL << gaw;
810 
811 	res = (gaw - 12) % 9;
812 	if (res == 0)
813 		agaw = gaw;
814 	else
815 		agaw = gaw + 9 - res;
816 
817 	if (agaw > 64)
818 		agaw = 64;
819 
820 	/*
821 	 * Select the smallest Supported AGAW and the corresponding number
822 	 * of page table levels.
823 	 */
824 	pt_levels = 2;
825 	sagaw = 30;
826 	addrwidth = 0;
827 
828 	tmp = ~0;
829 	for (i = 0; i < drhd_num; i++) {
830 		vtdmap = vtdmaps[i];
831 		/* take most compatible value */
832 		tmp &= VTD_CAP_SAGAW(vtdmap->cap);
833 	}
834 
835 	for (i = 0; i < 5; i++) {
836 		if ((tmp & (1 << i)) != 0 && sagaw >= agaw)
837 			break;
838 		pt_levels++;
839 		addrwidth++;
840 		sagaw += 9;
841 		if (sagaw > 64)
842 			sagaw = 64;
843 	}
844 
845 	if (i >= 5) {
846 		panic("vtd_create_domain: SAGAW 0x%x does not support AGAW %d",
847 		    tmp, agaw);
848 	}
849 
850 	dom = kmem_zalloc(sizeof (struct domain), KM_SLEEP);
851 	dom->pt_levels = pt_levels;
852 	dom->addrwidth = addrwidth;
853 	dom->id = domain_id();
854 	dom->maxaddr = maxaddr;
855 	dom->ptp = vmm_ptp_alloc();
856 	if ((uintptr_t)dom->ptp & PAGE_MASK)
857 		panic("vtd_create_domain: ptp (%p) not page aligned", dom->ptp);
858 
859 #ifdef __FreeBSD__
860 #ifdef notyet
861 	/*
862 	 * XXX superpage mappings for the iommu do not work correctly.
863 	 *
864 	 * By default all physical memory is mapped into the host_domain.
865 	 * When a VM is allocated wired memory the pages belonging to it
866 	 * are removed from the host_domain and added to the vm's domain.
867 	 *
868 	 * If the page being removed was mapped using a superpage mapping
869 	 * in the host_domain then we need to demote the mapping before
870 	 * removing the page.
871 	 *
872 	 * There is not any code to deal with the demotion at the moment
873 	 * so we disable superpage mappings altogether.
874 	 */
875 	dom->spsmask = ~0;
876 	for (i = 0; i < drhd_num; i++) {
877 		vtdmap = vtdmaps[i];
878 		/* take most compatible value */
879 		dom->spsmask &= VTD_CAP_SPS(vtdmap->cap);
880 	}
881 #endif
882 #else
883 	/*
884 	 * On illumos we decidedly do not remove memory mapped to a VM's domain
885 	 * from the host_domain, so we don't have to deal with page demotion and
886 	 * can just use large pages.
887 	 *
888 	 * Since VM memory is currently allocated as 4k pages and mapped into
889 	 * the VM domain page by page, the use of large pages is essentially
890 	 * limited to the host_domain.
891 	 */
892 	dom->spsmask = VTD_CAP_SPS(vtdmap->cap);
893 #endif
894 
895 	SLIST_INSERT_HEAD(&domhead, dom, next);
896 
897 	return (dom);
898 }
899 
900 static void
901 vtd_free_ptp(uint64_t *ptp, int level)
902 {
903 	int i;
904 	uint64_t *nlp;
905 
906 	if (level > 1) {
907 		for (i = 0; i < 512; i++) {
908 			if ((ptp[i] & (VTD_PTE_RD | VTD_PTE_WR)) == 0)
909 				continue;
910 			if ((ptp[i] & VTD_PTE_SUPERPAGE) != 0)
911 				continue;
912 			nlp = (uint64_t *)PHYS_TO_DMAP(ptp[i] & VTD_PTE_ADDR_M);
913 			vtd_free_ptp(nlp, level - 1);
914 		}
915 	}
916 
917 	vmm_ptp_free(ptp);
918 }
919 
920 static void
921 vtd_destroy_domain(void *arg)
922 {
923 	struct domain *dom;
924 
925 	dom = arg;
926 
927 	SLIST_REMOVE(&domhead, dom, domain, next);
928 	vtd_free_ptp(dom->ptp, dom->pt_levels);
929 	kmem_free(dom, sizeof (*dom));
930 }
931 
/*
 * VT-d backend entry points consumed by the generic vmm iommu layer
 * (see io/iommu.h).
 */
const struct iommu_ops vmm_iommu_ops = {
	.init = vtd_init,
	.cleanup = vtd_cleanup,
	.enable = vtd_enable,
	.disable = vtd_disable,
	.create_domain = vtd_create_domain,
	.destroy_domain = vtd_destroy_domain,
	.create_mapping = vtd_create_mapping,
	.remove_mapping = vtd_remove_mapping,
	.add_device = vtd_add_device,
	.remove_device = vtd_remove_device,
	.invalidate_tlb = vtd_invalidate_tlb,
};
945 
946 
/* Loadable-module linkage: a misc module with no device entry points. */
static struct modlmisc modlmisc = {
	&mod_miscops,
	"bhyve vmm vtd",
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modlmisc,
	NULL
};
957 
/* Kernel module load entry point. */
int
_init(void)
{
	return (mod_install(&modlinkage));
}
963 
/* Kernel module unload entry point. */
int
_fini(void)
{
	return (mod_remove(&modlinkage));
}
969 
/* Kernel module information entry point. */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
975