xref: /freebsd/usr.sbin/bhyve/pci_passthru.c (revision 1e4896b176ff664dc9c2fce5426bf2fdf8017a7d)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  * $FreeBSD$
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include <sys/param.h>
35 #ifndef WITHOUT_CAPSICUM
36 #include <sys/capsicum.h>
37 #endif
38 #include <sys/types.h>
39 #include <sys/mman.h>
40 #include <sys/pciio.h>
41 #include <sys/ioctl.h>
42 
43 #include <dev/io/iodev.h>
44 #include <dev/pci/pcireg.h>
45 
46 #include <machine/iodev.h>
47 
48 #ifndef WITHOUT_CAPSICUM
49 #include <capsicum_helpers.h>
50 #endif
51 #include <stdio.h>
52 #include <stdlib.h>
53 #include <string.h>
54 #include <err.h>
55 #include <errno.h>
56 #include <fcntl.h>
57 #include <sysexits.h>
58 #include <unistd.h>
59 
60 #include <machine/vmm.h>
61 #include <vmmapi.h>
62 #include "pci_emul.h"
63 #include "mem.h"
64 
/*
 * Device nodes used by this file: PCI config space access, port I/O and
 * physical memory.  Each path can be overridden at build time.
 */
#ifndef _PATH_DEVPCI
#define	_PATH_DEVPCI	"/dev/pci"
#endif

#ifndef	_PATH_DEVIO
#define	_PATH_DEVIO	"/dev/io"
#endif

#ifndef _PATH_MEM
#define	_PATH_MEM	"/dev/mem"
#endif

/* Enables crafting an MSI capability for devices that do not have one. */
#define	LEGACY_SUPPORT	1

/* Number of MSI-X table entries encoded in a message control word. */
#define MSIX_TABLE_COUNT(ctrl) (((ctrl) & PCIM_MSIXCTRL_TABLE_SIZE) + 1)
/* Number of config-space bytes treated as the MSI-X capability. */
#define MSIX_CAPLEN 12

/* Descriptors opened on first use by passthru_init(); -1 means "not open". */
static int pcifd = -1;
static int iofd = -1;
static int memfd = -1;
85 
86 struct passthru_softc {
87 	struct pci_devinst *psc_pi;
88 	struct pcibar psc_bar[PCI_BARMAX + 1];
89 	struct {
90 		int		capoff;
91 		int		msgctrl;
92 		int		emulated;
93 	} psc_msi;
94 	struct {
95 		int		capoff;
96 	} psc_msix;
97 	struct pcisel psc_sel;
98 };
99 
100 static int
101 msi_caplen(int msgctrl)
102 {
103 	int len;
104 
105 	len = 10;		/* minimum length of msi capability */
106 
107 	if (msgctrl & PCIM_MSICTRL_64BIT)
108 		len += 4;
109 
110 #if 0
111 	/*
112 	 * Ignore the 'mask' and 'pending' bits in the MSI capability.
113 	 * We'll let the guest manipulate them directly.
114 	 */
115 	if (msgctrl & PCIM_MSICTRL_VECTOR)
116 		len += 10;
117 #endif
118 
119 	return (len);
120 }
121 
122 static uint32_t
123 read_config(const struct pcisel *sel, long reg, int width)
124 {
125 	struct pci_io pi;
126 
127 	bzero(&pi, sizeof(pi));
128 	pi.pi_sel = *sel;
129 	pi.pi_reg = reg;
130 	pi.pi_width = width;
131 
132 	if (ioctl(pcifd, PCIOCREAD, &pi) < 0)
133 		return (0);				/* XXX */
134 	else
135 		return (pi.pi_data);
136 }
137 
138 static void
139 write_config(const struct pcisel *sel, long reg, int width, uint32_t data)
140 {
141 	struct pci_io pi;
142 
143 	bzero(&pi, sizeof(pi));
144 	pi.pi_sel = *sel;
145 	pi.pi_reg = reg;
146 	pi.pi_width = width;
147 	pi.pi_data = data;
148 
149 	(void)ioctl(pcifd, PCIOCWRITE, &pi);		/* XXX */
150 }
151 
#ifdef LEGACY_SUPPORT
/*
 * Build an MSI capability for 'msgnum' messages that links to 'nextptr'
 * and store it byte by byte in the emulated config space of 'pi'.
 *
 * Returns the config-space offset at which the capability was placed.
 */
static int
passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr)
{
	struct msicap msicap;
	const u_char *src;
	size_t n;
	int capoff;

	pci_populate_msicap(&msicap, msgnum, nextptr);

	/*
	 * XXX
	 * The capability is placed at the very end of the 256-byte config
	 * space.  This is wrong because it could shadow something useful
	 * to the device.
	 */
	capoff = 256 - roundup(sizeof(msicap), 4);
	src = (const u_char *)&msicap;
	for (n = 0; n < sizeof(msicap); n++)
		pci_set_cfgdata8(pi, capoff + (int)n, src[n]);

	return (capoff);
}
#endif	/* LEGACY_SUPPORT */
176 
177 static int
178 cfginitmsi(struct passthru_softc *sc)
179 {
180 	int i, ptr, capptr, cap, sts, caplen, table_size;
181 	uint32_t u32;
182 	struct pcisel sel;
183 	struct pci_devinst *pi;
184 	struct msixcap msixcap;
185 	uint32_t *msixcap_ptr;
186 
187 	pi = sc->psc_pi;
188 	sel = sc->psc_sel;
189 
190 	/*
191 	 * Parse the capabilities and cache the location of the MSI
192 	 * and MSI-X capabilities.
193 	 */
194 	sts = read_config(&sel, PCIR_STATUS, 2);
195 	if (sts & PCIM_STATUS_CAPPRESENT) {
196 		ptr = read_config(&sel, PCIR_CAP_PTR, 1);
197 		while (ptr != 0 && ptr != 0xff) {
198 			cap = read_config(&sel, ptr + PCICAP_ID, 1);
199 			if (cap == PCIY_MSI) {
200 				/*
201 				 * Copy the MSI capability into the config
202 				 * space of the emulated pci device
203 				 */
204 				sc->psc_msi.capoff = ptr;
205 				sc->psc_msi.msgctrl = read_config(&sel,
206 								  ptr + 2, 2);
207 				sc->psc_msi.emulated = 0;
208 				caplen = msi_caplen(sc->psc_msi.msgctrl);
209 				capptr = ptr;
210 				while (caplen > 0) {
211 					u32 = read_config(&sel, capptr, 4);
212 					pci_set_cfgdata32(pi, capptr, u32);
213 					caplen -= 4;
214 					capptr += 4;
215 				}
216 			} else if (cap == PCIY_MSIX) {
217 				/*
218 				 * Copy the MSI-X capability
219 				 */
220 				sc->psc_msix.capoff = ptr;
221 				caplen = 12;
222 				msixcap_ptr = (uint32_t*) &msixcap;
223 				capptr = ptr;
224 				while (caplen > 0) {
225 					u32 = read_config(&sel, capptr, 4);
226 					*msixcap_ptr = u32;
227 					pci_set_cfgdata32(pi, capptr, u32);
228 					caplen -= 4;
229 					capptr += 4;
230 					msixcap_ptr++;
231 				}
232 			}
233 			ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1);
234 		}
235 	}
236 
237 	if (sc->psc_msix.capoff != 0) {
238 		pi->pi_msix.pba_bar =
239 		    msixcap.pba_info & PCIM_MSIX_BIR_MASK;
240 		pi->pi_msix.pba_offset =
241 		    msixcap.pba_info & ~PCIM_MSIX_BIR_MASK;
242 		pi->pi_msix.table_bar =
243 		    msixcap.table_info & PCIM_MSIX_BIR_MASK;
244 		pi->pi_msix.table_offset =
245 		    msixcap.table_info & ~PCIM_MSIX_BIR_MASK;
246 		pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl);
247 		pi->pi_msix.pba_size = PBA_SIZE(pi->pi_msix.table_count);
248 
249 		/* Allocate the emulated MSI-X table array */
250 		table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
251 		pi->pi_msix.table = calloc(1, table_size);
252 
253 		/* Mask all table entries */
254 		for (i = 0; i < pi->pi_msix.table_count; i++) {
255 			pi->pi_msix.table[i].vector_control |=
256 						PCIM_MSIX_VCTRL_MASK;
257 		}
258 	}
259 
260 #ifdef LEGACY_SUPPORT
261 	/*
262 	 * If the passthrough device does not support MSI then craft a
263 	 * MSI capability for it. We link the new MSI capability at the
264 	 * head of the list of capabilities.
265 	 */
266 	if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) {
267 		int origptr, msiptr;
268 		origptr = read_config(&sel, PCIR_CAP_PTR, 1);
269 		msiptr = passthru_add_msicap(pi, 1, origptr);
270 		sc->psc_msi.capoff = msiptr;
271 		sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2);
272 		sc->psc_msi.emulated = 1;
273 		pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr);
274 	}
275 #endif
276 
277 	/* Make sure one of the capabilities is present */
278 	if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0)
279 		return (-1);
280 	else
281 		return (0);
282 }
283 
284 static uint64_t
285 msix_table_read(struct passthru_softc *sc, uint64_t offset, int size)
286 {
287 	struct pci_devinst *pi;
288 	struct msix_table_entry *entry;
289 	uint8_t *src8;
290 	uint16_t *src16;
291 	uint32_t *src32;
292 	uint64_t *src64;
293 	uint64_t data;
294 	size_t entry_offset;
295 	int index;
296 
297 	pi = sc->psc_pi;
298 	if (pi->pi_msix.pba_page != NULL && offset >= pi->pi_msix.pba_offset &&
299 	    offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) {
300 		switch(size) {
301 		case 1:
302 			src8 = (uint8_t *)(pi->pi_msix.pba_page + offset -
303 			    pi->pi_msix.pba_page_offset);
304 			data = *src8;
305 			break;
306 		case 2:
307 			src16 = (uint16_t *)(pi->pi_msix.pba_page + offset -
308 			    pi->pi_msix.pba_page_offset);
309 			data = *src16;
310 			break;
311 		case 4:
312 			src32 = (uint32_t *)(pi->pi_msix.pba_page + offset -
313 			    pi->pi_msix.pba_page_offset);
314 			data = *src32;
315 			break;
316 		case 8:
317 			src64 = (uint64_t *)(pi->pi_msix.pba_page + offset -
318 			    pi->pi_msix.pba_page_offset);
319 			data = *src64;
320 			break;
321 		default:
322 			return (-1);
323 		}
324 		return (data);
325 	}
326 
327 	if (offset < pi->pi_msix.table_offset)
328 		return (-1);
329 
330 	offset -= pi->pi_msix.table_offset;
331 	index = offset / MSIX_TABLE_ENTRY_SIZE;
332 	if (index >= pi->pi_msix.table_count)
333 		return (-1);
334 
335 	entry = &pi->pi_msix.table[index];
336 	entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
337 
338 	switch(size) {
339 	case 1:
340 		src8 = (uint8_t *)((void *)entry + entry_offset);
341 		data = *src8;
342 		break;
343 	case 2:
344 		src16 = (uint16_t *)((void *)entry + entry_offset);
345 		data = *src16;
346 		break;
347 	case 4:
348 		src32 = (uint32_t *)((void *)entry + entry_offset);
349 		data = *src32;
350 		break;
351 	case 8:
352 		src64 = (uint64_t *)((void *)entry + entry_offset);
353 		data = *src64;
354 		break;
355 	default:
356 		return (-1);
357 	}
358 
359 	return (data);
360 }
361 
362 static void
363 msix_table_write(struct vmctx *ctx, int vcpu, struct passthru_softc *sc,
364 		 uint64_t offset, int size, uint64_t data)
365 {
366 	struct pci_devinst *pi;
367 	struct msix_table_entry *entry;
368 	uint8_t *dest8;
369 	uint16_t *dest16;
370 	uint32_t *dest32;
371 	uint64_t *dest64;
372 	size_t entry_offset;
373 	uint32_t vector_control;
374 	int index;
375 
376 	pi = sc->psc_pi;
377 	if (pi->pi_msix.pba_page != NULL && offset >= pi->pi_msix.pba_offset &&
378 	    offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) {
379 		switch(size) {
380 		case 1:
381 			dest8 = (uint8_t *)(pi->pi_msix.pba_page + offset -
382 			    pi->pi_msix.pba_page_offset);
383 			*dest8 = data;
384 			break;
385 		case 2:
386 			dest16 = (uint16_t *)(pi->pi_msix.pba_page + offset -
387 			    pi->pi_msix.pba_page_offset);
388 			*dest16 = data;
389 			break;
390 		case 4:
391 			dest32 = (uint32_t *)(pi->pi_msix.pba_page + offset -
392 			    pi->pi_msix.pba_page_offset);
393 			*dest32 = data;
394 			break;
395 		case 8:
396 			dest64 = (uint64_t *)(pi->pi_msix.pba_page + offset -
397 			    pi->pi_msix.pba_page_offset);
398 			*dest64 = data;
399 			break;
400 		default:
401 			break;
402 		}
403 		return;
404 	}
405 
406 	if (offset < pi->pi_msix.table_offset)
407 		return;
408 
409 	offset -= pi->pi_msix.table_offset;
410 	index = offset / MSIX_TABLE_ENTRY_SIZE;
411 	if (index >= pi->pi_msix.table_count)
412 		return;
413 
414 	entry = &pi->pi_msix.table[index];
415 	entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
416 
417 	/* Only 4 byte naturally-aligned writes are supported */
418 	assert(size == 4);
419 	assert(entry_offset % 4 == 0);
420 
421 	vector_control = entry->vector_control;
422 	dest32 = (uint32_t *)((void *)entry + entry_offset);
423 	*dest32 = data;
424 	/* If MSI-X hasn't been enabled, do nothing */
425 	if (pi->pi_msix.enabled) {
426 		/* If the entry is masked, don't set it up */
427 		if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 ||
428 		    (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
429 			(void)vm_setup_pptdev_msix(ctx, vcpu,
430 			    sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
431 			    sc->psc_sel.pc_func, index, entry->addr,
432 			    entry->msg_data, entry->vector_control);
433 		}
434 	}
435 }
436 
437 static int
438 init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base)
439 {
440 	int b, s, f;
441 	int error, idx;
442 	size_t len, remaining;
443 	uint32_t table_size, table_offset;
444 	uint32_t pba_size, pba_offset;
445 	vm_paddr_t start;
446 	struct pci_devinst *pi = sc->psc_pi;
447 
448 	assert(pci_msix_table_bar(pi) >= 0 && pci_msix_pba_bar(pi) >= 0);
449 
450 	b = sc->psc_sel.pc_bus;
451 	s = sc->psc_sel.pc_dev;
452 	f = sc->psc_sel.pc_func;
453 
454 	/*
455 	 * If the MSI-X table BAR maps memory intended for
456 	 * other uses, it is at least assured that the table
457 	 * either resides in its own page within the region,
458 	 * or it resides in a page shared with only the PBA.
459 	 */
460 	table_offset = rounddown2(pi->pi_msix.table_offset, 4096);
461 
462 	table_size = pi->pi_msix.table_offset - table_offset;
463 	table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
464 	table_size = roundup2(table_size, 4096);
465 
466 	idx = pi->pi_msix.table_bar;
467 	start = pi->pi_bar[idx].addr;
468 	remaining = pi->pi_bar[idx].size;
469 
470 	if (pi->pi_msix.pba_bar == pi->pi_msix.table_bar) {
471 		pba_offset = pi->pi_msix.pba_offset;
472 		pba_size = pi->pi_msix.pba_size;
473 		if (pba_offset >= table_offset + table_size ||
474 		    table_offset >= pba_offset + pba_size) {
475 			/*
476 			 * If the PBA does not share a page with the MSI-x
477 			 * tables, no PBA emulation is required.
478 			 */
479 			pi->pi_msix.pba_page = NULL;
480 			pi->pi_msix.pba_page_offset = 0;
481 		} else {
482 			/*
483 			 * The PBA overlaps with either the first or last
484 			 * page of the MSI-X table region.  Map the
485 			 * appropriate page.
486 			 */
487 			if (pba_offset <= table_offset)
488 				pi->pi_msix.pba_page_offset = table_offset;
489 			else
490 				pi->pi_msix.pba_page_offset = table_offset +
491 				    table_size - 4096;
492 			pi->pi_msix.pba_page = mmap(NULL, 4096, PROT_READ |
493 			    PROT_WRITE, MAP_SHARED, memfd, start +
494 			    pi->pi_msix.pba_page_offset);
495 			if (pi->pi_msix.pba_page == MAP_FAILED) {
496 				warn(
497 			    "Failed to map PBA page for MSI-X on %d/%d/%d",
498 				    b, s, f);
499 				return (-1);
500 			}
501 		}
502 	}
503 
504 	/* Map everything before the MSI-X table */
505 	if (table_offset > 0) {
506 		len = table_offset;
507 		error = vm_map_pptdev_mmio(ctx, b, s, f, start, len, base);
508 		if (error)
509 			return (error);
510 
511 		base += len;
512 		start += len;
513 		remaining -= len;
514 	}
515 
516 	/* Skip the MSI-X table */
517 	base += table_size;
518 	start += table_size;
519 	remaining -= table_size;
520 
521 	/* Map everything beyond the end of the MSI-X table */
522 	if (remaining > 0) {
523 		len = remaining;
524 		error = vm_map_pptdev_mmio(ctx, b, s, f, start, len, base);
525 		if (error)
526 			return (error);
527 	}
528 
529 	return (0);
530 }
531 
532 static int
533 cfginitbar(struct vmctx *ctx, struct passthru_softc *sc)
534 {
535 	int i, error;
536 	struct pci_devinst *pi;
537 	struct pci_bar_io bar;
538 	enum pcibar_type bartype;
539 	uint64_t base, size;
540 
541 	pi = sc->psc_pi;
542 
543 	/*
544 	 * Initialize BAR registers
545 	 */
546 	for (i = 0; i <= PCI_BARMAX; i++) {
547 		bzero(&bar, sizeof(bar));
548 		bar.pbi_sel = sc->psc_sel;
549 		bar.pbi_reg = PCIR_BAR(i);
550 
551 		if (ioctl(pcifd, PCIOCGETBAR, &bar) < 0)
552 			continue;
553 
554 		if (PCI_BAR_IO(bar.pbi_base)) {
555 			bartype = PCIBAR_IO;
556 			base = bar.pbi_base & PCIM_BAR_IO_BASE;
557 		} else {
558 			switch (bar.pbi_base & PCIM_BAR_MEM_TYPE) {
559 			case PCIM_BAR_MEM_64:
560 				bartype = PCIBAR_MEM64;
561 				break;
562 			default:
563 				bartype = PCIBAR_MEM32;
564 				break;
565 			}
566 			base = bar.pbi_base & PCIM_BAR_MEM_BASE;
567 		}
568 		size = bar.pbi_length;
569 
570 		if (bartype != PCIBAR_IO) {
571 			if (((base | size) & PAGE_MASK) != 0) {
572 				warnx("passthru device %d/%d/%d BAR %d: "
573 				    "base %#lx or size %#lx not page aligned\n",
574 				    sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
575 				    sc->psc_sel.pc_func, i, base, size);
576 				return (-1);
577 			}
578 		}
579 
580 		/* Cache information about the "real" BAR */
581 		sc->psc_bar[i].type = bartype;
582 		sc->psc_bar[i].size = size;
583 		sc->psc_bar[i].addr = base;
584 
585 		/* Allocate the BAR in the guest I/O or MMIO space */
586 		error = pci_emul_alloc_bar(pi, i, bartype, size);
587 		if (error)
588 			return (-1);
589 
590 		/* The MSI-X table needs special handling */
591 		if (i == pci_msix_table_bar(pi)) {
592 			error = init_msix_table(ctx, sc, base);
593 			if (error)
594 				return (-1);
595 		} else if (bartype != PCIBAR_IO) {
596 			/* Map the physical BAR in the guest MMIO space */
597 			error = vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
598 				sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
599 				pi->pi_bar[i].addr, pi->pi_bar[i].size, base);
600 			if (error)
601 				return (-1);
602 		}
603 
604 		/*
605 		 * 64-bit BAR takes up two slots so skip the next one.
606 		 */
607 		if (bartype == PCIBAR_MEM64) {
608 			i++;
609 			assert(i <= PCI_BARMAX);
610 			sc->psc_bar[i].type = PCIBAR_MEMHI64;
611 		}
612 	}
613 	return (0);
614 }
615 
616 static int
617 cfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func)
618 {
619 	int error;
620 	struct passthru_softc *sc;
621 
622 	error = 1;
623 	sc = pi->pi_arg;
624 
625 	bzero(&sc->psc_sel, sizeof(struct pcisel));
626 	sc->psc_sel.pc_bus = bus;
627 	sc->psc_sel.pc_dev = slot;
628 	sc->psc_sel.pc_func = func;
629 
630 	if (cfginitmsi(sc) != 0) {
631 		warnx("failed to initialize MSI for PCI %d/%d/%d",
632 		    bus, slot, func);
633 		goto done;
634 	}
635 
636 	if (cfginitbar(ctx, sc) != 0) {
637 		warnx("failed to initialize BARs for PCI %d/%d/%d",
638 		    bus, slot, func);
639 		goto done;
640 	}
641 
642 	pci_set_cfgdata16(pi, PCIR_COMMAND, read_config(&sc->psc_sel,
643 	    PCIR_COMMAND, 2));
644 
645 	error = 0;				/* success */
646 done:
647 	return (error);
648 }
649 
650 static int
651 passthru_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
652 {
653 	int bus, slot, func, error, memflags;
654 	struct passthru_softc *sc;
655 #ifndef WITHOUT_CAPSICUM
656 	cap_rights_t rights;
657 	cap_ioctl_t pci_ioctls[] = { PCIOCREAD, PCIOCWRITE, PCIOCGETBAR };
658 	cap_ioctl_t io_ioctls[] = { IODEV_PIO };
659 #endif
660 
661 	sc = NULL;
662 	error = 1;
663 
664 #ifndef WITHOUT_CAPSICUM
665 	cap_rights_init(&rights, CAP_IOCTL, CAP_READ, CAP_WRITE);
666 #endif
667 
668 	memflags = vm_get_memflags(ctx);
669 	if (!(memflags & VM_MEM_F_WIRED)) {
670 		warnx("passthru requires guest memory to be wired");
671 		return (error);
672 	}
673 
674 	if (pcifd < 0) {
675 		pcifd = open(_PATH_DEVPCI, O_RDWR, 0);
676 		if (pcifd < 0) {
677 			warn("failed to open %s", _PATH_DEVPCI);
678 			return (error);
679 		}
680 	}
681 
682 #ifndef WITHOUT_CAPSICUM
683 	if (caph_rights_limit(pcifd, &rights) == -1)
684 		errx(EX_OSERR, "Unable to apply rights for sandbox");
685 	if (caph_ioctls_limit(pcifd, pci_ioctls, nitems(pci_ioctls)) == -1)
686 		errx(EX_OSERR, "Unable to apply rights for sandbox");
687 #endif
688 
689 	if (iofd < 0) {
690 		iofd = open(_PATH_DEVIO, O_RDWR, 0);
691 		if (iofd < 0) {
692 			warn("failed to open %s", _PATH_DEVIO);
693 			return (error);
694 		}
695 	}
696 
697 #ifndef WITHOUT_CAPSICUM
698 	if (caph_rights_limit(iofd, &rights) == -1)
699 		errx(EX_OSERR, "Unable to apply rights for sandbox");
700 	if (caph_ioctls_limit(iofd, io_ioctls, nitems(io_ioctls)) == -1)
701 		errx(EX_OSERR, "Unable to apply rights for sandbox");
702 #endif
703 
704 	if (memfd < 0) {
705 		memfd = open(_PATH_MEM, O_RDWR, 0);
706 		if (memfd < 0) {
707 			warn("failed to open %s", _PATH_MEM);
708 			return (error);
709 		}
710 	}
711 
712 #ifndef WITHOUT_CAPSICUM
713 	cap_rights_clear(&rights, CAP_IOCTL);
714 	cap_rights_set(&rights, CAP_MMAP_RW);
715 	if (caph_rights_limit(memfd, &rights) == -1)
716 		errx(EX_OSERR, "Unable to apply rights for sandbox");
717 #endif
718 
719 	if (opts == NULL ||
720 	    sscanf(opts, "%d/%d/%d", &bus, &slot, &func) != 3) {
721 		warnx("invalid passthru options");
722 		return (error);
723 	}
724 
725 	if (vm_assign_pptdev(ctx, bus, slot, func) != 0) {
726 		warnx("PCI device at %d/%d/%d is not using the ppt(4) driver",
727 		    bus, slot, func);
728 		goto done;
729 	}
730 
731 	sc = calloc(1, sizeof(struct passthru_softc));
732 
733 	pi->pi_arg = sc;
734 	sc->psc_pi = pi;
735 
736 	/* initialize config space */
737 	error = cfginit(ctx, pi, bus, slot, func);
738 done:
739 	if (error) {
740 		free(sc);
741 		vm_unassign_pptdev(ctx, bus, slot, func);
742 	}
743 	return (error);
744 }
745 
746 static int
747 bar_access(int coff)
748 {
749 	if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1))
750 		return (1);
751 	else
752 		return (0);
753 }
754 
755 static int
756 msicap_access(struct passthru_softc *sc, int coff)
757 {
758 	int caplen;
759 
760 	if (sc->psc_msi.capoff == 0)
761 		return (0);
762 
763 	caplen = msi_caplen(sc->psc_msi.msgctrl);
764 
765 	if (coff >= sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen)
766 		return (1);
767 	else
768 		return (0);
769 }
770 
771 static int
772 msixcap_access(struct passthru_softc *sc, int coff)
773 {
774 	if (sc->psc_msix.capoff == 0)
775 		return (0);
776 
777 	return (coff >= sc->psc_msix.capoff &&
778 	        coff < sc->psc_msix.capoff + MSIX_CAPLEN);
779 }
780 
781 static int
782 passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
783 		 int coff, int bytes, uint32_t *rv)
784 {
785 	struct passthru_softc *sc;
786 
787 	sc = pi->pi_arg;
788 
789 	/*
790 	 * PCI BARs and MSI capability is emulated.
791 	 */
792 	if (bar_access(coff) || msicap_access(sc, coff))
793 		return (-1);
794 
795 #ifdef LEGACY_SUPPORT
796 	/*
797 	 * Emulate PCIR_CAP_PTR if this device does not support MSI capability
798 	 * natively.
799 	 */
800 	if (sc->psc_msi.emulated) {
801 		if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4)
802 			return (-1);
803 	}
804 #endif
805 
806 	/*
807 	 * Emulate the command register.  If a single read reads both the
808 	 * command and status registers, read the status register from the
809 	 * device's config space.
810 	 */
811 	if (coff == PCIR_COMMAND) {
812 		if (bytes <= 2)
813 			return (-1);
814 		*rv = read_config(&sc->psc_sel, PCIR_STATUS, 2) << 16 |
815 		    pci_get_cfgdata16(pi, PCIR_COMMAND);
816 		return (0);
817 	}
818 
819 	/* Everything else just read from the device's config space */
820 	*rv = read_config(&sc->psc_sel, coff, bytes);
821 
822 	return (0);
823 }
824 
825 static int
826 passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
827 		  int coff, int bytes, uint32_t val)
828 {
829 	int error, msix_table_entries, i;
830 	struct passthru_softc *sc;
831 	uint16_t cmd_old;
832 
833 	sc = pi->pi_arg;
834 
835 	/*
836 	 * PCI BARs are emulated
837 	 */
838 	if (bar_access(coff))
839 		return (-1);
840 
841 	/*
842 	 * MSI capability is emulated
843 	 */
844 	if (msicap_access(sc, coff)) {
845 		pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msi.capoff,
846 		    PCIY_MSI);
847 		error = vm_setup_pptdev_msi(ctx, vcpu, sc->psc_sel.pc_bus,
848 			sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
849 			pi->pi_msi.addr, pi->pi_msi.msg_data,
850 			pi->pi_msi.maxmsgnum);
851 		if (error != 0)
852 			err(1, "vm_setup_pptdev_msi");
853 		return (0);
854 	}
855 
856 	if (msixcap_access(sc, coff)) {
857 		pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msix.capoff,
858 		    PCIY_MSIX);
859 		if (pi->pi_msix.enabled) {
860 			msix_table_entries = pi->pi_msix.table_count;
861 			for (i = 0; i < msix_table_entries; i++) {
862 				error = vm_setup_pptdev_msix(ctx, vcpu,
863 				    sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
864 				    sc->psc_sel.pc_func, i,
865 				    pi->pi_msix.table[i].addr,
866 				    pi->pi_msix.table[i].msg_data,
867 				    pi->pi_msix.table[i].vector_control);
868 
869 				if (error)
870 					err(1, "vm_setup_pptdev_msix");
871 			}
872 		}
873 		return (0);
874 	}
875 
876 #ifdef LEGACY_SUPPORT
877 	/*
878 	 * If this device does not support MSI natively then we cannot let
879 	 * the guest disable legacy interrupts from the device. It is the
880 	 * legacy interrupt that is triggering the virtual MSI to the guest.
881 	 */
882 	if (sc->psc_msi.emulated && pci_msi_enabled(pi)) {
883 		if (coff == PCIR_COMMAND && bytes == 2)
884 			val &= ~PCIM_CMD_INTxDIS;
885 	}
886 #endif
887 
888 	write_config(&sc->psc_sel, coff, bytes, val);
889 	if (coff == PCIR_COMMAND) {
890 		cmd_old = pci_get_cfgdata16(pi, PCIR_COMMAND);
891 		if (bytes == 1)
892 			pci_set_cfgdata8(pi, PCIR_COMMAND, val);
893 		else if (bytes == 2)
894 			pci_set_cfgdata16(pi, PCIR_COMMAND, val);
895 		pci_emul_cmd_changed(pi, cmd_old);
896 	}
897 
898 	return (0);
899 }
900 
901 static void
902 passthru_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
903 	       uint64_t offset, int size, uint64_t value)
904 {
905 	struct passthru_softc *sc;
906 	struct iodev_pio_req pio;
907 
908 	sc = pi->pi_arg;
909 
910 	if (baridx == pci_msix_table_bar(pi)) {
911 		msix_table_write(ctx, vcpu, sc, offset, size, value);
912 	} else {
913 		assert(pi->pi_bar[baridx].type == PCIBAR_IO);
914 		bzero(&pio, sizeof(struct iodev_pio_req));
915 		pio.access = IODEV_PIO_WRITE;
916 		pio.port = sc->psc_bar[baridx].addr + offset;
917 		pio.width = size;
918 		pio.val = value;
919 
920 		(void)ioctl(iofd, IODEV_PIO, &pio);
921 	}
922 }
923 
924 static uint64_t
925 passthru_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
926 	      uint64_t offset, int size)
927 {
928 	struct passthru_softc *sc;
929 	struct iodev_pio_req pio;
930 	uint64_t val;
931 
932 	sc = pi->pi_arg;
933 
934 	if (baridx == pci_msix_table_bar(pi)) {
935 		val = msix_table_read(sc, offset, size);
936 	} else {
937 		assert(pi->pi_bar[baridx].type == PCIBAR_IO);
938 		bzero(&pio, sizeof(struct iodev_pio_req));
939 		pio.access = IODEV_PIO_READ;
940 		pio.port = sc->psc_bar[baridx].addr + offset;
941 		pio.width = size;
942 		pio.val = 0;
943 
944 		(void)ioctl(iofd, IODEV_PIO, &pio);
945 
946 		val = pio.val;
947 	}
948 
949 	return (val);
950 }
951 
952 struct pci_devemu passthru = {
953 	.pe_emu		= "passthru",
954 	.pe_init	= passthru_init,
955 	.pe_cfgwrite	= passthru_cfgwrite,
956 	.pe_cfgread	= passthru_cfgread,
957 	.pe_barwrite 	= passthru_write,
958 	.pe_barread    	= passthru_read,
959 };
960 PCI_EMUL_SET(passthru);
961