xref: /freebsd/usr.sbin/bhyve/pci_passthru.c (revision 3e41d09d08f5bfa2fc1386241f334b865d6da085)
1 /*-
2  * Copyright (c) 2011 NetApp, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  * $FreeBSD$
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/types.h>
34 #include <sys/mman.h>
35 #include <sys/pciio.h>
36 #include <sys/ioctl.h>
37 
38 #include <dev/io/iodev.h>
39 #include <dev/pci/pcireg.h>
40 
41 #include <machine/iodev.h>
42 
43 #include <stdio.h>
44 #include <stdlib.h>
45 #include <string.h>
46 #include <errno.h>
47 #include <fcntl.h>
48 #include <unistd.h>
49 
50 #include <machine/vmm.h>
51 #include <vmmapi.h>
52 #include "pci_emul.h"
53 #include "mem.h"
54 
55 #ifndef _PATH_DEVPCI
56 #define	_PATH_DEVPCI	"/dev/pci"
57 #endif
58 
59 #ifndef	_PATH_DEVIO
60 #define	_PATH_DEVIO	"/dev/io"
61 #endif
62 
63 #ifndef _PATH_MEM
64 #define	_PATH_MEM	"/dev/mem"
65 #endif
66 
67 #define	LEGACY_SUPPORT	1
68 
69 #define MSIX_TABLE_COUNT(ctrl) (((ctrl) & PCIM_MSIXCTRL_TABLE_SIZE) + 1)
70 #define MSIX_CAPLEN 12
71 
/*
 * Cached file descriptors, opened lazily on first passthru device init
 * (see passthru_init).
 */
static int pcifd = -1;		/* _PATH_DEVPCI: config-space/BAR ioctls */
static int iofd = -1;		/* _PATH_DEVIO: port I/O on behalf of the guest */
static int memfd = -1;		/* _PATH_MEM: mmap of the physical PBA page */

/* Per-device passthrough state. */
struct passthru_softc {
	struct pci_devinst *psc_pi;		/* back-pointer to emulated device */
	struct pcibar psc_bar[PCI_BARMAX + 1];	/* cached physical BARs */
	struct {
		int		capoff;		/* MSI cap offset (0 = absent) */
		int		msgctrl;	/* cached MSI message control */
		int		emulated;	/* 1 if cap was synthesized (LEGACY_SUPPORT) */
	} psc_msi;
	struct {
		int		capoff;		/* MSI-X cap offset (0 = absent) */
	} psc_msix;
	struct pcisel psc_sel;			/* bus/dev/func of physical device */
};
89 
90 static int
91 msi_caplen(int msgctrl)
92 {
93 	int len;
94 
95 	len = 10;		/* minimum length of msi capability */
96 
97 	if (msgctrl & PCIM_MSICTRL_64BIT)
98 		len += 4;
99 
100 #if 0
101 	/*
102 	 * Ignore the 'mask' and 'pending' bits in the MSI capability.
103 	 * We'll let the guest manipulate them directly.
104 	 */
105 	if (msgctrl & PCIM_MSICTRL_VECTOR)
106 		len += 10;
107 #endif
108 
109 	return (len);
110 }
111 
112 static uint32_t
113 read_config(const struct pcisel *sel, long reg, int width)
114 {
115 	struct pci_io pi;
116 
117 	bzero(&pi, sizeof(pi));
118 	pi.pi_sel = *sel;
119 	pi.pi_reg = reg;
120 	pi.pi_width = width;
121 
122 	if (ioctl(pcifd, PCIOCREAD, &pi) < 0)
123 		return (0);				/* XXX */
124 	else
125 		return (pi.pi_data);
126 }
127 
128 static void
129 write_config(const struct pcisel *sel, long reg, int width, uint32_t data)
130 {
131 	struct pci_io pi;
132 
133 	bzero(&pi, sizeof(pi));
134 	pi.pi_sel = *sel;
135 	pi.pi_reg = reg;
136 	pi.pi_width = width;
137 	pi.pi_data = data;
138 
139 	(void)ioctl(pcifd, PCIOCWRITE, &pi);		/* XXX */
140 }
141 
142 #ifdef LEGACY_SUPPORT
143 static int
144 passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr)
145 {
146 	int capoff, i;
147 	struct msicap msicap;
148 	u_char *capdata;
149 
150 	pci_populate_msicap(&msicap, msgnum, nextptr);
151 
152 	/*
153 	 * XXX
154 	 * Copy the msi capability structure in the last 16 bytes of the
155 	 * config space. This is wrong because it could shadow something
156 	 * useful to the device.
157 	 */
158 	capoff = 256 - roundup(sizeof(msicap), 4);
159 	capdata = (u_char *)&msicap;
160 	for (i = 0; i < sizeof(msicap); i++)
161 		pci_set_cfgdata8(pi, capoff + i, capdata[i]);
162 
163 	return (capoff);
164 }
165 #endif	/* LEGACY_SUPPORT */
166 
167 static int
168 cfginitmsi(struct passthru_softc *sc)
169 {
170 	int i, ptr, capptr, cap, sts, caplen, table_size;
171 	uint32_t u32;
172 	struct pcisel sel;
173 	struct pci_devinst *pi;
174 	struct msixcap msixcap;
175 	uint32_t *msixcap_ptr;
176 
177 	pi = sc->psc_pi;
178 	sel = sc->psc_sel;
179 
180 	/*
181 	 * Parse the capabilities and cache the location of the MSI
182 	 * and MSI-X capabilities.
183 	 */
184 	sts = read_config(&sel, PCIR_STATUS, 2);
185 	if (sts & PCIM_STATUS_CAPPRESENT) {
186 		ptr = read_config(&sel, PCIR_CAP_PTR, 1);
187 		while (ptr != 0 && ptr != 0xff) {
188 			cap = read_config(&sel, ptr + PCICAP_ID, 1);
189 			if (cap == PCIY_MSI) {
190 				/*
191 				 * Copy the MSI capability into the config
192 				 * space of the emulated pci device
193 				 */
194 				sc->psc_msi.capoff = ptr;
195 				sc->psc_msi.msgctrl = read_config(&sel,
196 								  ptr + 2, 2);
197 				sc->psc_msi.emulated = 0;
198 				caplen = msi_caplen(sc->psc_msi.msgctrl);
199 				capptr = ptr;
200 				while (caplen > 0) {
201 					u32 = read_config(&sel, capptr, 4);
202 					pci_set_cfgdata32(pi, capptr, u32);
203 					caplen -= 4;
204 					capptr += 4;
205 				}
206 			} else if (cap == PCIY_MSIX) {
207 				/*
208 				 * Copy the MSI-X capability
209 				 */
210 				sc->psc_msix.capoff = ptr;
211 				caplen = 12;
212 				msixcap_ptr = (uint32_t*) &msixcap;
213 				capptr = ptr;
214 				while (caplen > 0) {
215 					u32 = read_config(&sel, capptr, 4);
216 					*msixcap_ptr = u32;
217 					pci_set_cfgdata32(pi, capptr, u32);
218 					caplen -= 4;
219 					capptr += 4;
220 					msixcap_ptr++;
221 				}
222 			}
223 			ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1);
224 		}
225 	}
226 
227 	if (sc->psc_msix.capoff != 0) {
228 		pi->pi_msix.pba_bar =
229 		    msixcap.pba_info & PCIM_MSIX_BIR_MASK;
230 		pi->pi_msix.pba_offset =
231 		    msixcap.pba_info & ~PCIM_MSIX_BIR_MASK;
232 		pi->pi_msix.table_bar =
233 		    msixcap.table_info & PCIM_MSIX_BIR_MASK;
234 		pi->pi_msix.table_offset =
235 		    msixcap.table_info & ~PCIM_MSIX_BIR_MASK;
236 		pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl);
237 		pi->pi_msix.pba_size = PBA_SIZE(pi->pi_msix.table_count);
238 
239 		/* Allocate the emulated MSI-X table array */
240 		table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
241 		pi->pi_msix.table = calloc(1, table_size);
242 
243 		/* Mask all table entries */
244 		for (i = 0; i < pi->pi_msix.table_count; i++) {
245 			pi->pi_msix.table[i].vector_control |=
246 						PCIM_MSIX_VCTRL_MASK;
247 		}
248 	}
249 
250 #ifdef LEGACY_SUPPORT
251 	/*
252 	 * If the passthrough device does not support MSI then craft a
253 	 * MSI capability for it. We link the new MSI capability at the
254 	 * head of the list of capabilities.
255 	 */
256 	if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) {
257 		int origptr, msiptr;
258 		origptr = read_config(&sel, PCIR_CAP_PTR, 1);
259 		msiptr = passthru_add_msicap(pi, 1, origptr);
260 		sc->psc_msi.capoff = msiptr;
261 		sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2);
262 		sc->psc_msi.emulated = 1;
263 		pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr);
264 	}
265 #endif
266 
267 	/* Make sure one of the capabilities is present */
268 	if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0)
269 		return (-1);
270 	else
271 		return (0);
272 }
273 
/*
 * Handle a guest read of 'size' bytes at 'offset' within the BAR that
 * holds the MSI-X table.  Reads falling inside the PBA range are
 * serviced from the mmap'ed physical PBA page; reads inside the table
 * range are serviced from the emulated table array.  Returns the data,
 * or (uint64_t)-1 for an unsupported size or an offset outside both
 * ranges.
 */
static uint64_t
msix_table_read(struct passthru_softc *sc, uint64_t offset, int size)
{
	struct pci_devinst *pi;
	struct msix_table_entry *entry;
	uint8_t *src8;
	uint16_t *src16;
	uint32_t *src32;
	uint64_t *src64;
	uint64_t data;
	size_t entry_offset;
	int index;

	pi = sc->psc_pi;
	/* PBA range: read through the mapped physical page. */
	if (offset >= pi->pi_msix.pba_offset &&
	    offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) {
		switch(size) {
		case 1:
			src8 = (uint8_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			data = *src8;
			break;
		case 2:
			src16 = (uint16_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			data = *src16;
			break;
		case 4:
			src32 = (uint32_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			data = *src32;
			break;
		case 8:
			src64 = (uint64_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			data = *src64;
			break;
		default:
			return (-1);
		}
		return (data);
	}

	if (offset < pi->pi_msix.table_offset)
		return (-1);

	/* Locate the emulated table entry and the offset within it. */
	offset -= pi->pi_msix.table_offset;
	index = offset / MSIX_TABLE_ENTRY_SIZE;
	if (index >= pi->pi_msix.table_count)
		return (-1);

	entry = &pi->pi_msix.table[index];
	entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;

	/* NOTE: void-pointer arithmetic below is a GCC/Clang extension. */
	switch(size) {
	case 1:
		src8 = (uint8_t *)((void *)entry + entry_offset);
		data = *src8;
		break;
	case 2:
		src16 = (uint16_t *)((void *)entry + entry_offset);
		data = *src16;
		break;
	case 4:
		src32 = (uint32_t *)((void *)entry + entry_offset);
		data = *src32;
		break;
	case 8:
		src64 = (uint64_t *)((void *)entry + entry_offset);
		data = *src64;
		break;
	default:
		return (-1);
	}

	return (data);
}
351 
/*
 * Handle a guest write of 'size' bytes at 'offset' within the BAR that
 * holds the MSI-X table.  Writes inside the PBA range go straight to
 * the mapped physical page; writes inside the table range update the
 * emulated table and, when MSI-X is enabled and the entry is (or was)
 * unmasked, reprogram the vector in the hypervisor via
 * vm_setup_pptdev_msix().  Out-of-range writes are silently dropped.
 */
static void
msix_table_write(struct vmctx *ctx, int vcpu, struct passthru_softc *sc,
		 uint64_t offset, int size, uint64_t data)
{
	struct pci_devinst *pi;
	struct msix_table_entry *entry;
	uint8_t *dest8;
	uint16_t *dest16;
	uint32_t *dest32;
	uint64_t *dest64;
	size_t entry_offset;
	uint32_t vector_control;
	int error, index;

	pi = sc->psc_pi;
	/* PBA range: write through the mapped physical page. */
	if (offset >= pi->pi_msix.pba_offset &&
	    offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) {
		switch(size) {
		case 1:
			dest8 = (uint8_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			*dest8 = data;
			break;
		case 2:
			dest16 = (uint16_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			*dest16 = data;
			break;
		case 4:
			dest32 = (uint32_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			*dest32 = data;
			break;
		case 8:
			dest64 = (uint64_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			*dest64 = data;
			break;
		default:
			break;
		}
		return;
	}

	if (offset < pi->pi_msix.table_offset)
		return;

	/* Locate the emulated table entry being written. */
	offset -= pi->pi_msix.table_offset;
	index = offset / MSIX_TABLE_ENTRY_SIZE;
	if (index >= pi->pi_msix.table_count)
		return;

	entry = &pi->pi_msix.table[index];
	entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;

	/* Only 4 byte naturally-aligned writes are supported */
	assert(size == 4);
	assert(entry_offset % 4 == 0);

	/* Snapshot the mask state before the write so a mask->unmask
	 * (or unmask->mask) transition can be detected below. */
	vector_control = entry->vector_control;
	dest32 = (uint32_t *)((void *)entry + entry_offset);
	*dest32 = data;
	/* If MSI-X hasn't been enabled, do nothing */
	if (pi->pi_msix.enabled) {
		/* If the entry is masked, don't set it up */
		if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 ||
		    (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
			error = vm_setup_pptdev_msix(ctx, vcpu,
			    sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
			    sc->psc_sel.pc_func, index, entry->addr,
			    entry->msg_data, entry->vector_control);
		}
	}
}
426 
/*
 * Set up guest MMIO mappings for the BAR containing the MSI-X table.
 * The page(s) holding the table itself are NOT mapped through (so the
 * table accesses trap into msix_table_read/write); everything before
 * and after the table within the BAR is passed through directly.  If
 * the PBA shares a page with the table, that physical page is mmap'ed
 * from /dev/mem so PBA accesses can be emulated.  Returns 0 on
 * success, non-zero on failure.
 */
static int
init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base)
{
	int b, s, f;
	int error, idx;
	size_t len, remaining;
	uint32_t table_size, table_offset;
	uint32_t pba_size, pba_offset;
	vm_paddr_t start;
	struct pci_devinst *pi = sc->psc_pi;

	assert(pci_msix_table_bar(pi) >= 0 && pci_msix_pba_bar(pi) >= 0);

	b = sc->psc_sel.pc_bus;
	s = sc->psc_sel.pc_dev;
	f = sc->psc_sel.pc_func;

	/*
	 * If the MSI-X table BAR maps memory intended for
	 * other uses, it is at least assured that the table
	 * either resides in its own page within the region,
	 * or it resides in a page shared with only the PBA.
	 */
	/* Page-align the trapped region covering the table. */
	table_offset = rounddown2(pi->pi_msix.table_offset, 4096);

	table_size = pi->pi_msix.table_offset - table_offset;
	table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
	table_size = roundup2(table_size, 4096);

	idx = pi->pi_msix.table_bar;
	start = pi->pi_bar[idx].addr;
	remaining = pi->pi_bar[idx].size;

	if (pi->pi_msix.pba_bar == pi->pi_msix.table_bar) {
		pba_offset = pi->pi_msix.pba_offset;
		pba_size = pi->pi_msix.pba_size;
		if (pba_offset >= table_offset + table_size ||
		    table_offset >= pba_offset + pba_size) {
			/*
			 * If the PBA does not share a page with the MSI-x
			 * tables, no PBA emulation is required.
			 */
			pi->pi_msix.pba_page = NULL;
			pi->pi_msix.pba_page_offset = 0;
		} else {
			/*
			 * The PBA overlaps with either the first or last
			 * page of the MSI-X table region.  Map the
			 * appropriate page.
			 */
			if (pba_offset <= table_offset)
				pi->pi_msix.pba_page_offset = table_offset;
			else
				pi->pi_msix.pba_page_offset = table_offset +
				    table_size - 4096;
			pi->pi_msix.pba_page = mmap(NULL, 4096, PROT_READ |
			    PROT_WRITE, MAP_SHARED, memfd, start +
			    pi->pi_msix.pba_page_offset);
			if (pi->pi_msix.pba_page == MAP_FAILED) {
				printf(
		    "Failed to map PBA page for MSI-X on %d/%d/%d: %s\n",
				    b, s, f, strerror(errno));
				return (-1);
			}
		}
	}

	/* Map everything before the MSI-X table */
	if (table_offset > 0) {
		len = table_offset;
		error = vm_map_pptdev_mmio(ctx, b, s, f, start, len, base);
		if (error)
			return (error);

		base += len;
		start += len;
		remaining -= len;
	}

	/* Skip the MSI-X table */
	base += table_size;
	start += table_size;
	remaining -= table_size;

	/* Map everything beyond the end of the MSI-X table */
	if (remaining > 0) {
		len = remaining;
		error = vm_map_pptdev_mmio(ctx, b, s, f, start, len, base);
		if (error)
			return (error);
	}

	return (0);
}
521 
/*
 * Discover the physical device's BARs via PCIOCGETBAR, cache them in
 * the softc, allocate matching BARs in the guest's I/O/MMIO space, and
 * establish guest mappings.  The BAR holding the MSI-X table gets
 * special handling via init_msix_table(); other memory BARs are passed
 * through directly.  Returns 0 on success, -1 on failure.
 */
static int
cfginitbar(struct vmctx *ctx, struct passthru_softc *sc)
{
	int i, error;
	struct pci_devinst *pi;
	struct pci_bar_io bar;
	enum pcibar_type bartype;
	uint64_t base, size;

	pi = sc->psc_pi;

	/*
	 * Initialize BAR registers
	 */
	for (i = 0; i <= PCI_BARMAX; i++) {
		bzero(&bar, sizeof(bar));
		bar.pbi_sel = sc->psc_sel;
		bar.pbi_reg = PCIR_BAR(i);

		/* BAR not implemented on the device: skip it. */
		if (ioctl(pcifd, PCIOCGETBAR, &bar) < 0)
			continue;

		if (PCI_BAR_IO(bar.pbi_base)) {
			bartype = PCIBAR_IO;
			base = bar.pbi_base & PCIM_BAR_IO_BASE;
		} else {
			switch (bar.pbi_base & PCIM_BAR_MEM_TYPE) {
			case PCIM_BAR_MEM_64:
				bartype = PCIBAR_MEM64;
				break;
			default:
				bartype = PCIBAR_MEM32;
				break;
			}
			base = bar.pbi_base & PCIM_BAR_MEM_BASE;
		}
		size = bar.pbi_length;

		/* Memory BARs must be page-aligned to be mappable. */
		if (bartype != PCIBAR_IO) {
			if (((base | size) & PAGE_MASK) != 0) {
				printf("passthru device %d/%d/%d BAR %d: "
				    "base %#lx or size %#lx not page aligned\n",
				    sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
				    sc->psc_sel.pc_func, i, base, size);
				return (-1);
			}
		}

		/* Cache information about the "real" BAR */
		sc->psc_bar[i].type = bartype;
		sc->psc_bar[i].size = size;
		sc->psc_bar[i].addr = base;

		/* Allocate the BAR in the guest I/O or MMIO space */
		error = pci_emul_alloc_pbar(pi, i, base, bartype, size);
		if (error)
			return (-1);

		/* The MSI-X table needs special handling */
		if (i == pci_msix_table_bar(pi)) {
			error = init_msix_table(ctx, sc, base);
			if (error)
				return (-1);
		} else if (bartype != PCIBAR_IO) {
			/* Map the physical BAR in the guest MMIO space */
			error = vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
				sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
				pi->pi_bar[i].addr, pi->pi_bar[i].size, base);
			if (error)
				return (-1);
		}

		/*
		 * 64-bit BAR takes up two slots so skip the next one.
		 */
		if (bartype == PCIBAR_MEM64) {
			i++;
			assert(i <= PCI_BARMAX);
			sc->psc_bar[i].type = PCIBAR_MEMHI64;
		}
	}
	return (0);
}
605 
606 static int
607 cfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func)
608 {
609 	int error;
610 	struct passthru_softc *sc;
611 
612 	error = 1;
613 	sc = pi->pi_arg;
614 
615 	bzero(&sc->psc_sel, sizeof(struct pcisel));
616 	sc->psc_sel.pc_bus = bus;
617 	sc->psc_sel.pc_dev = slot;
618 	sc->psc_sel.pc_func = func;
619 
620 	if (cfginitmsi(sc) != 0)
621 		goto done;
622 
623 	if (cfginitbar(ctx, sc) != 0)
624 		goto done;
625 
626 	error = 0;				/* success */
627 done:
628 	return (error);
629 }
630 
631 static int
632 passthru_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
633 {
634 	int bus, slot, func, error, memflags;
635 	struct passthru_softc *sc;
636 
637 	sc = NULL;
638 	error = 1;
639 
640 	memflags = vm_get_memflags(ctx);
641 	if (!(memflags & VM_MEM_F_WIRED)) {
642 		fprintf(stderr, "passthru requires guest memory to be wired\n");
643 		goto done;
644 	}
645 
646 	if (pcifd < 0) {
647 		pcifd = open(_PATH_DEVPCI, O_RDWR, 0);
648 		if (pcifd < 0)
649 			goto done;
650 	}
651 
652 	if (iofd < 0) {
653 		iofd = open(_PATH_DEVIO, O_RDWR, 0);
654 		if (iofd < 0)
655 			goto done;
656 	}
657 
658 	if (memfd < 0) {
659 		memfd = open(_PATH_MEM, O_RDWR, 0);
660 		if (memfd < 0)
661 			goto done;
662 	}
663 
664 	if (opts == NULL ||
665 	    sscanf(opts, "%d/%d/%d", &bus, &slot, &func) != 3)
666 		goto done;
667 
668 	if (vm_assign_pptdev(ctx, bus, slot, func) != 0)
669 		goto done;
670 
671 	sc = calloc(1, sizeof(struct passthru_softc));
672 
673 	pi->pi_arg = sc;
674 	sc->psc_pi = pi;
675 
676 	/* initialize config space */
677 	if ((error = cfginit(ctx, pi, bus, slot, func)) != 0)
678 		goto done;
679 
680 	error = 0;		/* success */
681 done:
682 	if (error) {
683 		free(sc);
684 		vm_unassign_pptdev(ctx, bus, slot, func);
685 	}
686 	return (error);
687 }
688 
689 static int
690 bar_access(int coff)
691 {
692 	if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1))
693 		return (1);
694 	else
695 		return (0);
696 }
697 
698 static int
699 msicap_access(struct passthru_softc *sc, int coff)
700 {
701 	int caplen;
702 
703 	if (sc->psc_msi.capoff == 0)
704 		return (0);
705 
706 	caplen = msi_caplen(sc->psc_msi.msgctrl);
707 
708 	if (coff >= sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen)
709 		return (1);
710 	else
711 		return (0);
712 }
713 
714 static int
715 msixcap_access(struct passthru_softc *sc, int coff)
716 {
717 	if (sc->psc_msix.capoff == 0)
718 		return (0);
719 
720 	return (coff >= sc->psc_msix.capoff &&
721 	        coff < sc->psc_msix.capoff + MSIX_CAPLEN);
722 }
723 
724 static int
725 passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
726 		 int coff, int bytes, uint32_t *rv)
727 {
728 	struct passthru_softc *sc;
729 
730 	sc = pi->pi_arg;
731 
732 	/*
733 	 * PCI BARs and MSI capability is emulated.
734 	 */
735 	if (bar_access(coff) || msicap_access(sc, coff))
736 		return (-1);
737 
738 #ifdef LEGACY_SUPPORT
739 	/*
740 	 * Emulate PCIR_CAP_PTR if this device does not support MSI capability
741 	 * natively.
742 	 */
743 	if (sc->psc_msi.emulated) {
744 		if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4)
745 			return (-1);
746 	}
747 #endif
748 
749 	/* Everything else just read from the device's config space */
750 	*rv = read_config(&sc->psc_sel, coff, bytes);
751 
752 	return (0);
753 }
754 
/*
 * Config-space write handler.  BAR writes are deferred to the generic
 * emulation (-1).  Writes to the MSI/MSI-X capabilities update the
 * emulated capability and reprogram the hypervisor's interrupt
 * routing; any other write is passed straight to the physical device.
 * Returns 0 when the write was handled here.
 */
static int
passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
		  int coff, int bytes, uint32_t val)
{
	int error, msix_table_entries, i;
	struct passthru_softc *sc;

	sc = pi->pi_arg;

	/*
	 * PCI BARs are emulated
	 */
	if (bar_access(coff))
		return (-1);

	/*
	 * MSI capability is emulated
	 */
	if (msicap_access(sc, coff)) {
		msicap_cfgwrite(pi, sc->psc_msi.capoff, coff, bytes, val);

		/* Push the (possibly changed) MSI state to the hypervisor. */
		error = vm_setup_pptdev_msi(ctx, vcpu, sc->psc_sel.pc_bus,
			sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
			pi->pi_msi.addr, pi->pi_msi.msg_data,
			pi->pi_msi.maxmsgnum);
		if (error != 0) {
			/* Fatal: cannot leave interrupts half-configured. */
			printf("vm_setup_pptdev_msi error %d\r\n", errno);
			exit(1);
		}
		return (0);
	}

	if (msixcap_access(sc, coff)) {
		msixcap_cfgwrite(pi, sc->psc_msix.capoff, coff, bytes, val);
		/* If MSI-X was (re)enabled, program every table entry. */
		if (pi->pi_msix.enabled) {
			msix_table_entries = pi->pi_msix.table_count;
			for (i = 0; i < msix_table_entries; i++) {
				error = vm_setup_pptdev_msix(ctx, vcpu,
				    sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
				    sc->psc_sel.pc_func, i,
				    pi->pi_msix.table[i].addr,
				    pi->pi_msix.table[i].msg_data,
				    pi->pi_msix.table[i].vector_control);

				if (error) {
					printf("vm_setup_pptdev_msix error "
					    "%d\r\n", errno);
					exit(1);
				}
			}
		}
		return (0);
	}

#ifdef LEGACY_SUPPORT
	/*
	 * If this device does not support MSI natively then we cannot let
	 * the guest disable legacy interrupts from the device. It is the
	 * legacy interrupt that is triggering the virtual MSI to the guest.
	 */
	if (sc->psc_msi.emulated && pci_msi_enabled(pi)) {
		if (coff == PCIR_COMMAND && bytes == 2)
			val &= ~PCIM_CMD_INTxDIS;
	}
#endif

	/* Everything else goes straight to the physical device. */
	write_config(&sc->psc_sel, coff, bytes, val);

	return (0);
}
825 
826 static void
827 passthru_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
828 	       uint64_t offset, int size, uint64_t value)
829 {
830 	struct passthru_softc *sc;
831 	struct iodev_pio_req pio;
832 
833 	sc = pi->pi_arg;
834 
835 	if (baridx == pci_msix_table_bar(pi)) {
836 		msix_table_write(ctx, vcpu, sc, offset, size, value);
837 	} else {
838 		assert(pi->pi_bar[baridx].type == PCIBAR_IO);
839 		bzero(&pio, sizeof(struct iodev_pio_req));
840 		pio.access = IODEV_PIO_WRITE;
841 		pio.port = sc->psc_bar[baridx].addr + offset;
842 		pio.width = size;
843 		pio.val = value;
844 
845 		(void)ioctl(iofd, IODEV_PIO, &pio);
846 	}
847 }
848 
849 static uint64_t
850 passthru_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
851 	      uint64_t offset, int size)
852 {
853 	struct passthru_softc *sc;
854 	struct iodev_pio_req pio;
855 	uint64_t val;
856 
857 	sc = pi->pi_arg;
858 
859 	if (baridx == pci_msix_table_bar(pi)) {
860 		val = msix_table_read(sc, offset, size);
861 	} else {
862 		assert(pi->pi_bar[baridx].type == PCIBAR_IO);
863 		bzero(&pio, sizeof(struct iodev_pio_req));
864 		pio.access = IODEV_PIO_READ;
865 		pio.port = sc->psc_bar[baridx].addr + offset;
866 		pio.width = size;
867 		pio.val = 0;
868 
869 		(void)ioctl(iofd, IODEV_PIO, &pio);
870 
871 		val = pio.val;
872 	}
873 
874 	return (val);
875 }
876 
/* Registration of the "passthru" device model with the PCI emulation core. */
struct pci_devemu passthru = {
	.pe_emu		= "passthru",
	.pe_init	= passthru_init,
	.pe_cfgwrite	= passthru_cfgwrite,
	.pe_cfgread	= passthru_cfgread,
	.pe_barwrite 	= passthru_write,
	.pe_barread    	= passthru_read,
};
PCI_EMUL_SET(passthru);
886