xref: /freebsd/usr.sbin/bhyve/pci_passthru.c (revision 5dae51da3da0cc94d17bd67b308fad304ebec7e0)
1 /*-
2  * Copyright (c) 2011 NetApp, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  * $FreeBSD$
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/types.h>
34 #include <sys/mman.h>
35 #include <sys/pciio.h>
36 #include <sys/ioctl.h>
37 
38 #include <dev/io/iodev.h>
39 #include <dev/pci/pcireg.h>
40 
41 #include <machine/iodev.h>
42 
43 #include <stdio.h>
44 #include <stdlib.h>
45 #include <string.h>
46 #include <err.h>
47 #include <fcntl.h>
48 #include <unistd.h>
49 
50 #include <machine/vmm.h>
51 #include <vmmapi.h>
52 #include "pci_emul.h"
53 #include "mem.h"
54 
/* Device node used for the PCI config-space and BAR ioctls in this file. */
#ifndef _PATH_DEVPCI
#define	_PATH_DEVPCI	"/dev/pci"
#endif

/* Device node used for the IODEV_PIO port I/O ioctls in this file. */
#ifndef	_PATH_DEVIO
#define	_PATH_DEVIO	"/dev/io"
#endif

/* Device node that init_msix_table() mmaps to reach the MSI-X PBA page. */
#ifndef _PATH_MEM
#define	_PATH_MEM	"/dev/mem"
#endif

/*
 * When defined, a device without a native MSI capability gets a crafted
 * one (see passthru_add_msicap() and the LEGACY_SUPPORT sections below).
 */
#define	LEGACY_SUPPORT	1

/* Number of MSI-X table entries: the table-size field of 'ctrl' plus one. */
#define MSIX_TABLE_COUNT(ctrl) (((ctrl) & PCIM_MSIXCTRL_TABLE_SIZE) + 1)
/* Number of config-space bytes treated as the MSI-X capability. */
#define MSIX_CAPLEN 12

/*
 * Descriptors for the device nodes above. They are shared by all passthru
 * instances and opened on first use by passthru_init(); -1 means "not open".
 */
static int pcifd = -1;
static int iofd = -1;
static int memfd = -1;
75 
/*
 * Per-device state for one passed-through PCI function. Allocated by
 * passthru_init() and reachable from the emulated device via pi->pi_arg.
 */
struct passthru_softc {
	struct pci_devinst *psc_pi;	/* back pointer to the emulated device */
	/* Type, size and address of the real BARs, cached by cfginitbar() */
	struct pcibar psc_bar[PCI_BARMAX + 1];
	struct {
		int		capoff;		/* config offset of MSI cap, 0 if none */
		int		msgctrl;	/* MSI message control value */
		int		emulated;	/* 1 if the MSI cap was crafted */
	} psc_msi;
	struct {
		int		capoff;		/* config offset of MSI-X cap, 0 if none */
	} psc_msix;
	struct pcisel psc_sel;		/* bus/slot/function of the host device */
};
89 
90 static int
91 msi_caplen(int msgctrl)
92 {
93 	int len;
94 
95 	len = 10;		/* minimum length of msi capability */
96 
97 	if (msgctrl & PCIM_MSICTRL_64BIT)
98 		len += 4;
99 
100 #if 0
101 	/*
102 	 * Ignore the 'mask' and 'pending' bits in the MSI capability.
103 	 * We'll let the guest manipulate them directly.
104 	 */
105 	if (msgctrl & PCIM_MSICTRL_VECTOR)
106 		len += 10;
107 #endif
108 
109 	return (len);
110 }
111 
112 static uint32_t
113 read_config(const struct pcisel *sel, long reg, int width)
114 {
115 	struct pci_io pi;
116 
117 	bzero(&pi, sizeof(pi));
118 	pi.pi_sel = *sel;
119 	pi.pi_reg = reg;
120 	pi.pi_width = width;
121 
122 	if (ioctl(pcifd, PCIOCREAD, &pi) < 0)
123 		return (0);				/* XXX */
124 	else
125 		return (pi.pi_data);
126 }
127 
128 static void
129 write_config(const struct pcisel *sel, long reg, int width, uint32_t data)
130 {
131 	struct pci_io pi;
132 
133 	bzero(&pi, sizeof(pi));
134 	pi.pi_sel = *sel;
135 	pi.pi_reg = reg;
136 	pi.pi_width = width;
137 	pi.pi_data = data;
138 
139 	(void)ioctl(pcifd, PCIOCWRITE, &pi);		/* XXX */
140 }
141 
#ifdef LEGACY_SUPPORT
/*
 * Craft an MSI capability advertising 'msgnum' messages and chained to
 * 'nextptr', store it in the emulated config space of 'pi' and return the
 * config offset at which it was placed.
 */
static int
passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr)
{
	struct msicap msicap;
	u_char *bytes;
	int capoff, n;

	pci_populate_msicap(&msicap, msgnum, nextptr);

	/*
	 * XXX
	 * The capability is placed at the very end of the 256 byte config
	 * space (after rounding its size up to a multiple of 4).  This is
	 * wrong because it could shadow something useful to the device.
	 */
	bytes = (u_char *)&msicap;
	capoff = 256 - roundup(sizeof(msicap), 4);

	n = 0;
	while (n < sizeof(msicap)) {
		pci_set_cfgdata8(pi, capoff + n, bytes[n]);
		n++;
	}

	return (capoff);
}
#endif	/* LEGACY_SUPPORT */
166 
167 static int
168 cfginitmsi(struct passthru_softc *sc)
169 {
170 	int i, ptr, capptr, cap, sts, caplen, table_size;
171 	uint32_t u32;
172 	struct pcisel sel;
173 	struct pci_devinst *pi;
174 	struct msixcap msixcap;
175 	uint32_t *msixcap_ptr;
176 
177 	pi = sc->psc_pi;
178 	sel = sc->psc_sel;
179 
180 	/*
181 	 * Parse the capabilities and cache the location of the MSI
182 	 * and MSI-X capabilities.
183 	 */
184 	sts = read_config(&sel, PCIR_STATUS, 2);
185 	if (sts & PCIM_STATUS_CAPPRESENT) {
186 		ptr = read_config(&sel, PCIR_CAP_PTR, 1);
187 		while (ptr != 0 && ptr != 0xff) {
188 			cap = read_config(&sel, ptr + PCICAP_ID, 1);
189 			if (cap == PCIY_MSI) {
190 				/*
191 				 * Copy the MSI capability into the config
192 				 * space of the emulated pci device
193 				 */
194 				sc->psc_msi.capoff = ptr;
195 				sc->psc_msi.msgctrl = read_config(&sel,
196 								  ptr + 2, 2);
197 				sc->psc_msi.emulated = 0;
198 				caplen = msi_caplen(sc->psc_msi.msgctrl);
199 				capptr = ptr;
200 				while (caplen > 0) {
201 					u32 = read_config(&sel, capptr, 4);
202 					pci_set_cfgdata32(pi, capptr, u32);
203 					caplen -= 4;
204 					capptr += 4;
205 				}
206 			} else if (cap == PCIY_MSIX) {
207 				/*
208 				 * Copy the MSI-X capability
209 				 */
210 				sc->psc_msix.capoff = ptr;
211 				caplen = 12;
212 				msixcap_ptr = (uint32_t*) &msixcap;
213 				capptr = ptr;
214 				while (caplen > 0) {
215 					u32 = read_config(&sel, capptr, 4);
216 					*msixcap_ptr = u32;
217 					pci_set_cfgdata32(pi, capptr, u32);
218 					caplen -= 4;
219 					capptr += 4;
220 					msixcap_ptr++;
221 				}
222 			}
223 			ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1);
224 		}
225 	}
226 
227 	if (sc->psc_msix.capoff != 0) {
228 		pi->pi_msix.pba_bar =
229 		    msixcap.pba_info & PCIM_MSIX_BIR_MASK;
230 		pi->pi_msix.pba_offset =
231 		    msixcap.pba_info & ~PCIM_MSIX_BIR_MASK;
232 		pi->pi_msix.table_bar =
233 		    msixcap.table_info & PCIM_MSIX_BIR_MASK;
234 		pi->pi_msix.table_offset =
235 		    msixcap.table_info & ~PCIM_MSIX_BIR_MASK;
236 		pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl);
237 		pi->pi_msix.pba_size = PBA_SIZE(pi->pi_msix.table_count);
238 
239 		/* Allocate the emulated MSI-X table array */
240 		table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
241 		pi->pi_msix.table = calloc(1, table_size);
242 
243 		/* Mask all table entries */
244 		for (i = 0; i < pi->pi_msix.table_count; i++) {
245 			pi->pi_msix.table[i].vector_control |=
246 						PCIM_MSIX_VCTRL_MASK;
247 		}
248 	}
249 
250 #ifdef LEGACY_SUPPORT
251 	/*
252 	 * If the passthrough device does not support MSI then craft a
253 	 * MSI capability for it. We link the new MSI capability at the
254 	 * head of the list of capabilities.
255 	 */
256 	if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) {
257 		int origptr, msiptr;
258 		origptr = read_config(&sel, PCIR_CAP_PTR, 1);
259 		msiptr = passthru_add_msicap(pi, 1, origptr);
260 		sc->psc_msi.capoff = msiptr;
261 		sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2);
262 		sc->psc_msi.emulated = 1;
263 		pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr);
264 	}
265 #endif
266 
267 	/* Make sure one of the capabilities is present */
268 	if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0)
269 		return (-1);
270 	else
271 		return (0);
272 }
273 
274 static uint64_t
275 msix_table_read(struct passthru_softc *sc, uint64_t offset, int size)
276 {
277 	struct pci_devinst *pi;
278 	struct msix_table_entry *entry;
279 	uint8_t *src8;
280 	uint16_t *src16;
281 	uint32_t *src32;
282 	uint64_t *src64;
283 	uint64_t data;
284 	size_t entry_offset;
285 	int index;
286 
287 	pi = sc->psc_pi;
288 	if (offset >= pi->pi_msix.pba_offset &&
289 	    offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) {
290 		switch(size) {
291 		case 1:
292 			src8 = (uint8_t *)(pi->pi_msix.pba_page + offset -
293 			    pi->pi_msix.pba_page_offset);
294 			data = *src8;
295 			break;
296 		case 2:
297 			src16 = (uint16_t *)(pi->pi_msix.pba_page + offset -
298 			    pi->pi_msix.pba_page_offset);
299 			data = *src16;
300 			break;
301 		case 4:
302 			src32 = (uint32_t *)(pi->pi_msix.pba_page + offset -
303 			    pi->pi_msix.pba_page_offset);
304 			data = *src32;
305 			break;
306 		case 8:
307 			src64 = (uint64_t *)(pi->pi_msix.pba_page + offset -
308 			    pi->pi_msix.pba_page_offset);
309 			data = *src64;
310 			break;
311 		default:
312 			return (-1);
313 		}
314 		return (data);
315 	}
316 
317 	if (offset < pi->pi_msix.table_offset)
318 		return (-1);
319 
320 	offset -= pi->pi_msix.table_offset;
321 	index = offset / MSIX_TABLE_ENTRY_SIZE;
322 	if (index >= pi->pi_msix.table_count)
323 		return (-1);
324 
325 	entry = &pi->pi_msix.table[index];
326 	entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
327 
328 	switch(size) {
329 	case 1:
330 		src8 = (uint8_t *)((void *)entry + entry_offset);
331 		data = *src8;
332 		break;
333 	case 2:
334 		src16 = (uint16_t *)((void *)entry + entry_offset);
335 		data = *src16;
336 		break;
337 	case 4:
338 		src32 = (uint32_t *)((void *)entry + entry_offset);
339 		data = *src32;
340 		break;
341 	case 8:
342 		src64 = (uint64_t *)((void *)entry + entry_offset);
343 		data = *src64;
344 		break;
345 	default:
346 		return (-1);
347 	}
348 
349 	return (data);
350 }
351 
352 static void
353 msix_table_write(struct vmctx *ctx, int vcpu, struct passthru_softc *sc,
354 		 uint64_t offset, int size, uint64_t data)
355 {
356 	struct pci_devinst *pi;
357 	struct msix_table_entry *entry;
358 	uint8_t *dest8;
359 	uint16_t *dest16;
360 	uint32_t *dest32;
361 	uint64_t *dest64;
362 	size_t entry_offset;
363 	uint32_t vector_control;
364 	int index;
365 
366 	pi = sc->psc_pi;
367 	if (offset >= pi->pi_msix.pba_offset &&
368 	    offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) {
369 		switch(size) {
370 		case 1:
371 			dest8 = (uint8_t *)(pi->pi_msix.pba_page + offset -
372 			    pi->pi_msix.pba_page_offset);
373 			*dest8 = data;
374 			break;
375 		case 2:
376 			dest16 = (uint16_t *)(pi->pi_msix.pba_page + offset -
377 			    pi->pi_msix.pba_page_offset);
378 			*dest16 = data;
379 			break;
380 		case 4:
381 			dest32 = (uint32_t *)(pi->pi_msix.pba_page + offset -
382 			    pi->pi_msix.pba_page_offset);
383 			*dest32 = data;
384 			break;
385 		case 8:
386 			dest64 = (uint64_t *)(pi->pi_msix.pba_page + offset -
387 			    pi->pi_msix.pba_page_offset);
388 			*dest64 = data;
389 			break;
390 		default:
391 			break;
392 		}
393 		return;
394 	}
395 
396 	if (offset < pi->pi_msix.table_offset)
397 		return;
398 
399 	offset -= pi->pi_msix.table_offset;
400 	index = offset / MSIX_TABLE_ENTRY_SIZE;
401 	if (index >= pi->pi_msix.table_count)
402 		return;
403 
404 	entry = &pi->pi_msix.table[index];
405 	entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
406 
407 	/* Only 4 byte naturally-aligned writes are supported */
408 	assert(size == 4);
409 	assert(entry_offset % 4 == 0);
410 
411 	vector_control = entry->vector_control;
412 	dest32 = (uint32_t *)((void *)entry + entry_offset);
413 	*dest32 = data;
414 	/* If MSI-X hasn't been enabled, do nothing */
415 	if (pi->pi_msix.enabled) {
416 		/* If the entry is masked, don't set it up */
417 		if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 ||
418 		    (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
419 			(void)vm_setup_pptdev_msix(ctx, vcpu,
420 			    sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
421 			    sc->psc_sel.pc_func, index, entry->addr,
422 			    entry->msg_data, entry->vector_control);
423 		}
424 	}
425 }
426 
427 static int
428 init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base)
429 {
430 	int b, s, f;
431 	int error, idx;
432 	size_t len, remaining;
433 	uint32_t table_size, table_offset;
434 	uint32_t pba_size, pba_offset;
435 	vm_paddr_t start;
436 	struct pci_devinst *pi = sc->psc_pi;
437 
438 	assert(pci_msix_table_bar(pi) >= 0 && pci_msix_pba_bar(pi) >= 0);
439 
440 	b = sc->psc_sel.pc_bus;
441 	s = sc->psc_sel.pc_dev;
442 	f = sc->psc_sel.pc_func;
443 
444 	/*
445 	 * If the MSI-X table BAR maps memory intended for
446 	 * other uses, it is at least assured that the table
447 	 * either resides in its own page within the region,
448 	 * or it resides in a page shared with only the PBA.
449 	 */
450 	table_offset = rounddown2(pi->pi_msix.table_offset, 4096);
451 
452 	table_size = pi->pi_msix.table_offset - table_offset;
453 	table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
454 	table_size = roundup2(table_size, 4096);
455 
456 	idx = pi->pi_msix.table_bar;
457 	start = pi->pi_bar[idx].addr;
458 	remaining = pi->pi_bar[idx].size;
459 
460 	if (pi->pi_msix.pba_bar == pi->pi_msix.table_bar) {
461 		pba_offset = pi->pi_msix.pba_offset;
462 		pba_size = pi->pi_msix.pba_size;
463 		if (pba_offset >= table_offset + table_size ||
464 		    table_offset >= pba_offset + pba_size) {
465 			/*
466 			 * If the PBA does not share a page with the MSI-x
467 			 * tables, no PBA emulation is required.
468 			 */
469 			pi->pi_msix.pba_page = NULL;
470 			pi->pi_msix.pba_page_offset = 0;
471 		} else {
472 			/*
473 			 * The PBA overlaps with either the first or last
474 			 * page of the MSI-X table region.  Map the
475 			 * appropriate page.
476 			 */
477 			if (pba_offset <= table_offset)
478 				pi->pi_msix.pba_page_offset = table_offset;
479 			else
480 				pi->pi_msix.pba_page_offset = table_offset +
481 				    table_size - 4096;
482 			pi->pi_msix.pba_page = mmap(NULL, 4096, PROT_READ |
483 			    PROT_WRITE, MAP_SHARED, memfd, start +
484 			    pi->pi_msix.pba_page_offset);
485 			if (pi->pi_msix.pba_page == MAP_FAILED) {
486 				warn(
487 			    "Failed to map PBA page for MSI-X on %d/%d/%d",
488 				    b, s, f);
489 				return (-1);
490 			}
491 		}
492 	}
493 
494 	/* Map everything before the MSI-X table */
495 	if (table_offset > 0) {
496 		len = table_offset;
497 		error = vm_map_pptdev_mmio(ctx, b, s, f, start, len, base);
498 		if (error)
499 			return (error);
500 
501 		base += len;
502 		start += len;
503 		remaining -= len;
504 	}
505 
506 	/* Skip the MSI-X table */
507 	base += table_size;
508 	start += table_size;
509 	remaining -= table_size;
510 
511 	/* Map everything beyond the end of the MSI-X table */
512 	if (remaining > 0) {
513 		len = remaining;
514 		error = vm_map_pptdev_mmio(ctx, b, s, f, start, len, base);
515 		if (error)
516 			return (error);
517 	}
518 
519 	return (0);
520 }
521 
522 static int
523 cfginitbar(struct vmctx *ctx, struct passthru_softc *sc)
524 {
525 	int i, error;
526 	struct pci_devinst *pi;
527 	struct pci_bar_io bar;
528 	enum pcibar_type bartype;
529 	uint64_t base, size;
530 
531 	pi = sc->psc_pi;
532 
533 	/*
534 	 * Initialize BAR registers
535 	 */
536 	for (i = 0; i <= PCI_BARMAX; i++) {
537 		bzero(&bar, sizeof(bar));
538 		bar.pbi_sel = sc->psc_sel;
539 		bar.pbi_reg = PCIR_BAR(i);
540 
541 		if (ioctl(pcifd, PCIOCGETBAR, &bar) < 0)
542 			continue;
543 
544 		if (PCI_BAR_IO(bar.pbi_base)) {
545 			bartype = PCIBAR_IO;
546 			base = bar.pbi_base & PCIM_BAR_IO_BASE;
547 		} else {
548 			switch (bar.pbi_base & PCIM_BAR_MEM_TYPE) {
549 			case PCIM_BAR_MEM_64:
550 				bartype = PCIBAR_MEM64;
551 				break;
552 			default:
553 				bartype = PCIBAR_MEM32;
554 				break;
555 			}
556 			base = bar.pbi_base & PCIM_BAR_MEM_BASE;
557 		}
558 		size = bar.pbi_length;
559 
560 		if (bartype != PCIBAR_IO) {
561 			if (((base | size) & PAGE_MASK) != 0) {
562 				warnx("passthru device %d/%d/%d BAR %d: "
563 				    "base %#lx or size %#lx not page aligned\n",
564 				    sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
565 				    sc->psc_sel.pc_func, i, base, size);
566 				return (-1);
567 			}
568 		}
569 
570 		/* Cache information about the "real" BAR */
571 		sc->psc_bar[i].type = bartype;
572 		sc->psc_bar[i].size = size;
573 		sc->psc_bar[i].addr = base;
574 
575 		/* Allocate the BAR in the guest I/O or MMIO space */
576 		error = pci_emul_alloc_pbar(pi, i, base, bartype, size);
577 		if (error)
578 			return (-1);
579 
580 		/* The MSI-X table needs special handling */
581 		if (i == pci_msix_table_bar(pi)) {
582 			error = init_msix_table(ctx, sc, base);
583 			if (error)
584 				return (-1);
585 		} else if (bartype != PCIBAR_IO) {
586 			/* Map the physical BAR in the guest MMIO space */
587 			error = vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
588 				sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
589 				pi->pi_bar[i].addr, pi->pi_bar[i].size, base);
590 			if (error)
591 				return (-1);
592 		}
593 
594 		/*
595 		 * 64-bit BAR takes up two slots so skip the next one.
596 		 */
597 		if (bartype == PCIBAR_MEM64) {
598 			i++;
599 			assert(i <= PCI_BARMAX);
600 			sc->psc_bar[i].type = PCIBAR_MEMHI64;
601 		}
602 	}
603 	return (0);
604 }
605 
606 static int
607 cfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func)
608 {
609 	int error;
610 	struct passthru_softc *sc;
611 
612 	error = 1;
613 	sc = pi->pi_arg;
614 
615 	bzero(&sc->psc_sel, sizeof(struct pcisel));
616 	sc->psc_sel.pc_bus = bus;
617 	sc->psc_sel.pc_dev = slot;
618 	sc->psc_sel.pc_func = func;
619 
620 	if (cfginitmsi(sc) != 0) {
621 		warnx("failed to initialize MSI for PCI %d/%d/%d",
622 		    bus, slot, func);
623 		goto done;
624 	}
625 
626 	if (cfginitbar(ctx, sc) != 0) {
627 		warnx("failed to initialize BARs for PCI %d/%d/%d",
628 		    bus, slot, func);
629 		goto done;
630 	}
631 
632 	error = 0;				/* success */
633 done:
634 	return (error);
635 }
636 
637 static int
638 passthru_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
639 {
640 	int bus, slot, func, error, memflags;
641 	struct passthru_softc *sc;
642 
643 	sc = NULL;
644 	error = 1;
645 
646 	memflags = vm_get_memflags(ctx);
647 	if (!(memflags & VM_MEM_F_WIRED)) {
648 		warnx("passthru requires guest memory to be wired");
649 		goto done;
650 	}
651 
652 	if (pcifd < 0) {
653 		pcifd = open(_PATH_DEVPCI, O_RDWR, 0);
654 		if (pcifd < 0) {
655 			warn("failed to open %s", _PATH_DEVPCI);
656 			goto done;
657 		}
658 	}
659 
660 	if (iofd < 0) {
661 		iofd = open(_PATH_DEVIO, O_RDWR, 0);
662 		if (iofd < 0) {
663 			warn("failed to open %s", _PATH_DEVIO);
664 			goto done;
665 		}
666 	}
667 
668 	if (memfd < 0) {
669 		memfd = open(_PATH_MEM, O_RDWR, 0);
670 		if (memfd < 0) {
671 			warn("failed to open %s", _PATH_MEM);
672 			goto done;
673 		}
674 	}
675 
676 	if (opts == NULL ||
677 	    sscanf(opts, "%d/%d/%d", &bus, &slot, &func) != 3) {
678 		warnx("invalid passthru options");
679 		goto done;
680 	}
681 
682 	if (vm_assign_pptdev(ctx, bus, slot, func) != 0) {
683 		warnx("PCI device at %d/%d/%d is not using the ppt(4) driver",
684 		    bus, slot, func);
685 		goto done;
686 	}
687 
688 	sc = calloc(1, sizeof(struct passthru_softc));
689 
690 	pi->pi_arg = sc;
691 	sc->psc_pi = pi;
692 
693 	/* initialize config space */
694 	if ((error = cfginit(ctx, pi, bus, slot, func)) != 0)
695 		goto done;
696 
697 	error = 0;		/* success */
698 done:
699 	if (error) {
700 		free(sc);
701 		vm_unassign_pptdev(ctx, bus, slot, func);
702 	}
703 	return (error);
704 }
705 
706 static int
707 bar_access(int coff)
708 {
709 	if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1))
710 		return (1);
711 	else
712 		return (0);
713 }
714 
715 static int
716 msicap_access(struct passthru_softc *sc, int coff)
717 {
718 	int caplen;
719 
720 	if (sc->psc_msi.capoff == 0)
721 		return (0);
722 
723 	caplen = msi_caplen(sc->psc_msi.msgctrl);
724 
725 	if (coff >= sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen)
726 		return (1);
727 	else
728 		return (0);
729 }
730 
731 static int
732 msixcap_access(struct passthru_softc *sc, int coff)
733 {
734 	if (sc->psc_msix.capoff == 0)
735 		return (0);
736 
737 	return (coff >= sc->psc_msix.capoff &&
738 	        coff < sc->psc_msix.capoff + MSIX_CAPLEN);
739 }
740 
741 static int
742 passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
743 		 int coff, int bytes, uint32_t *rv)
744 {
745 	struct passthru_softc *sc;
746 
747 	sc = pi->pi_arg;
748 
749 	/*
750 	 * PCI BARs and MSI capability is emulated.
751 	 */
752 	if (bar_access(coff) || msicap_access(sc, coff))
753 		return (-1);
754 
755 #ifdef LEGACY_SUPPORT
756 	/*
757 	 * Emulate PCIR_CAP_PTR if this device does not support MSI capability
758 	 * natively.
759 	 */
760 	if (sc->psc_msi.emulated) {
761 		if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4)
762 			return (-1);
763 	}
764 #endif
765 
766 	/* Everything else just read from the device's config space */
767 	*rv = read_config(&sc->psc_sel, coff, bytes);
768 
769 	return (0);
770 }
771 
772 static int
773 passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
774 		  int coff, int bytes, uint32_t val)
775 {
776 	int error, msix_table_entries, i;
777 	struct passthru_softc *sc;
778 
779 	sc = pi->pi_arg;
780 
781 	/*
782 	 * PCI BARs are emulated
783 	 */
784 	if (bar_access(coff))
785 		return (-1);
786 
787 	/*
788 	 * MSI capability is emulated
789 	 */
790 	if (msicap_access(sc, coff)) {
791 		msicap_cfgwrite(pi, sc->psc_msi.capoff, coff, bytes, val);
792 
793 		error = vm_setup_pptdev_msi(ctx, vcpu, sc->psc_sel.pc_bus,
794 			sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
795 			pi->pi_msi.addr, pi->pi_msi.msg_data,
796 			pi->pi_msi.maxmsgnum);
797 		if (error != 0)
798 			err(1, "vm_setup_pptdev_msi");
799 		return (0);
800 	}
801 
802 	if (msixcap_access(sc, coff)) {
803 		msixcap_cfgwrite(pi, sc->psc_msix.capoff, coff, bytes, val);
804 		if (pi->pi_msix.enabled) {
805 			msix_table_entries = pi->pi_msix.table_count;
806 			for (i = 0; i < msix_table_entries; i++) {
807 				error = vm_setup_pptdev_msix(ctx, vcpu,
808 				    sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
809 				    sc->psc_sel.pc_func, i,
810 				    pi->pi_msix.table[i].addr,
811 				    pi->pi_msix.table[i].msg_data,
812 				    pi->pi_msix.table[i].vector_control);
813 
814 				if (error)
815 					err(1, "vm_setup_pptdev_msix");
816 			}
817 		}
818 		return (0);
819 	}
820 
821 #ifdef LEGACY_SUPPORT
822 	/*
823 	 * If this device does not support MSI natively then we cannot let
824 	 * the guest disable legacy interrupts from the device. It is the
825 	 * legacy interrupt that is triggering the virtual MSI to the guest.
826 	 */
827 	if (sc->psc_msi.emulated && pci_msi_enabled(pi)) {
828 		if (coff == PCIR_COMMAND && bytes == 2)
829 			val &= ~PCIM_CMD_INTxDIS;
830 	}
831 #endif
832 
833 	write_config(&sc->psc_sel, coff, bytes, val);
834 
835 	return (0);
836 }
837 
838 static void
839 passthru_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
840 	       uint64_t offset, int size, uint64_t value)
841 {
842 	struct passthru_softc *sc;
843 	struct iodev_pio_req pio;
844 
845 	sc = pi->pi_arg;
846 
847 	if (baridx == pci_msix_table_bar(pi)) {
848 		msix_table_write(ctx, vcpu, sc, offset, size, value);
849 	} else {
850 		assert(pi->pi_bar[baridx].type == PCIBAR_IO);
851 		bzero(&pio, sizeof(struct iodev_pio_req));
852 		pio.access = IODEV_PIO_WRITE;
853 		pio.port = sc->psc_bar[baridx].addr + offset;
854 		pio.width = size;
855 		pio.val = value;
856 
857 		(void)ioctl(iofd, IODEV_PIO, &pio);
858 	}
859 }
860 
861 static uint64_t
862 passthru_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
863 	      uint64_t offset, int size)
864 {
865 	struct passthru_softc *sc;
866 	struct iodev_pio_req pio;
867 	uint64_t val;
868 
869 	sc = pi->pi_arg;
870 
871 	if (baridx == pci_msix_table_bar(pi)) {
872 		val = msix_table_read(sc, offset, size);
873 	} else {
874 		assert(pi->pi_bar[baridx].type == PCIBAR_IO);
875 		bzero(&pio, sizeof(struct iodev_pio_req));
876 		pio.access = IODEV_PIO_READ;
877 		pio.port = sc->psc_bar[baridx].addr + offset;
878 		pio.width = size;
879 		pio.val = 0;
880 
881 		(void)ioctl(iofd, IODEV_PIO, &pio);
882 
883 		val = pio.val;
884 	}
885 
886 	return (val);
887 }
888 
/*
 * Device emulation descriptor for the "passthru" device model, wiring the
 * init, config-space and BAR access handlers defined above.  It is
 * registered with PCI_EMUL_SET().
 */
struct pci_devemu passthru = {
	.pe_emu		= "passthru",
	.pe_init	= passthru_init,
	.pe_cfgwrite	= passthru_cfgwrite,
	.pe_cfgread	= passthru_cfgread,
	.pe_barwrite 	= passthru_write,
	.pe_barread    	= passthru_read,
};
PCI_EMUL_SET(passthru);
898