xref: /illumos-gate/usr/src/cmd/bhyve/pci_passthru.c (revision 3df2e8b2fd61f45437285750d2880d6416a9200c)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  * $FreeBSD$
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include <sys/param.h>
35 #ifndef WITHOUT_CAPSICUM
36 #include <sys/capsicum.h>
37 #endif
38 #include <sys/types.h>
39 #include <sys/mman.h>
40 #include <sys/pciio.h>
41 #include <sys/ioctl.h>
42 
43 #include <sys/pci.h>
44 
45 #include <dev/io/iodev.h>
46 #include <dev/pci/pcireg.h>
47 
48 #include <machine/iodev.h>
49 
50 #ifndef WITHOUT_CAPSICUM
51 #include <capsicum_helpers.h>
52 #endif
53 #include <stdio.h>
54 #include <stdlib.h>
55 #include <string.h>
56 #include <err.h>
57 #include <errno.h>
58 #include <fcntl.h>
59 #include <sysexits.h>
60 #include <unistd.h>
61 
62 #include <machine/vmm.h>
63 #include <vmmapi.h>
64 #include <sys/ppt_dev.h>
65 
66 #include "config.h"
67 #include "debug.h"
68 #include "pci_emul.h"
69 #include "mem.h"
70 
/*
 * Build the code paths that synthesize an MSI capability for devices that
 * do not advertise one.  Only tested with #ifdef, so the value is irrelevant.
 */
#define	LEGACY_SUPPORT	1

/* Entry count encoded in an MSI-X message control word (field holds N - 1). */
#define	MSIX_TABLE_COUNT(msgctrl)	\
	(((msgctrl) & PCIM_MSIXCTRL_TABLE_SIZE) + 1)

/* Number of config-space bytes treated as the MSI-X capability. */
#define	MSIX_CAPLEN	12
75 
76 struct passthru_softc {
77 	struct pci_devinst *psc_pi;
78 	struct pcibar psc_bar[PCI_BARMAX + 1];
79 	struct {
80 		int		capoff;
81 		int		msgctrl;
82 		int		emulated;
83 	} psc_msi;
84 	struct {
85 		int		capoff;
86 	} psc_msix;
87 	int pptfd;
88 	int msi_limit;
89 	int msix_limit;
90 };
91 
92 static int
93 msi_caplen(int msgctrl)
94 {
95 	int len;
96 
97 	len = 10;		/* minimum length of msi capability */
98 
99 	if (msgctrl & PCIM_MSICTRL_64BIT)
100 		len += 4;
101 
102 #if 0
103 	/*
104 	 * Ignore the 'mask' and 'pending' bits in the MSI capability.
105 	 * We'll let the guest manipulate them directly.
106 	 */
107 	if (msgctrl & PCIM_MSICTRL_VECTOR)
108 		len += 10;
109 #endif
110 
111 	return (len);
112 }
113 
114 static uint32_t
115 read_config(const struct passthru_softc *sc, long reg, int width)
116 {
117 	struct ppt_cfg_io pi;
118 
119 	pi.pci_off = reg;
120 	pi.pci_width = width;
121 
122 	if (ioctl(sc->pptfd, PPT_CFG_READ, &pi) != 0) {
123 		return (0);
124 	}
125 	return (pi.pci_data);
126 }
127 
128 static void
129 write_config(const struct passthru_softc *sc, long reg, int width,
130     uint32_t data)
131 {
132 	struct ppt_cfg_io pi;
133 
134 	pi.pci_off = reg;
135 	pi.pci_width = width;
136 	pi.pci_data = data;
137 
138 	(void) ioctl(sc->pptfd, PPT_CFG_WRITE, &pi);
139 }
140 
141 static int
142 passthru_get_bar(struct passthru_softc *sc, int bar, enum pcibar_type *type,
143     uint64_t *base, uint64_t *size)
144 {
145 	struct ppt_bar_query pb;
146 
147 	pb.pbq_baridx = bar;
148 
149 	if (ioctl(sc->pptfd, PPT_BAR_QUERY, &pb) != 0) {
150 		return (-1);
151 	}
152 
153 	switch (pb.pbq_type) {
154 	case PCI_ADDR_IO:
155 		*type = PCIBAR_IO;
156 		break;
157 	case PCI_ADDR_MEM32:
158 		*type = PCIBAR_MEM32;
159 		break;
160 	case PCI_ADDR_MEM64:
161 		*type = PCIBAR_MEM64;
162 		break;
163 	default:
164 		err(1, "unrecognized BAR type: %u\n", pb.pbq_type);
165 		break;
166 	}
167 
168 	*base = pb.pbq_base;
169 	*size = pb.pbq_size;
170 	return (0);
171 }
172 
/*
 * Open the ppt device node at 'path' for reading and writing.
 *
 * On success the descriptor is stored in *pptfdp and 0 is returned.  On
 * failure the errno value from open() is returned and *pptfdp is not written.
 */
static int
passthru_dev_open(const char *path, int *pptfdp)
{
	const int fd = open(path, O_RDWR);

	if (fd < 0)
		return (errno);

	/* XXX: verify fd with ioctl? */
	*pptfdp = fd;
	return (0);
}
186 
187 #ifdef LEGACY_SUPPORT
188 static int
189 passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr)
190 {
191 	int capoff, i;
192 	struct msicap msicap;
193 	u_char *capdata;
194 
195 	pci_populate_msicap(&msicap, msgnum, nextptr);
196 
197 	/*
198 	 * XXX
199 	 * Copy the msi capability structure in the last 16 bytes of the
200 	 * config space. This is wrong because it could shadow something
201 	 * useful to the device.
202 	 */
203 	capoff = 256 - roundup(sizeof(msicap), 4);
204 	capdata = (u_char *)&msicap;
205 	for (i = 0; i < sizeof(msicap); i++)
206 		pci_set_cfgdata8(pi, capoff + i, capdata[i]);
207 
208 	return (capoff);
209 }
210 #endif	/* LEGACY_SUPPORT */
211 
212 static void
213 passthru_intr_limit(struct passthru_softc *sc, struct msixcap *msixcap)
214 {
215 	struct pci_devinst *pi = sc->psc_pi;
216 	int off;
217 
218 	/* Reduce the number of MSI vectors if higher than OS limit */
219 	if ((off = sc->psc_msi.capoff) != 0 && sc->msi_limit != -1) {
220 		int msi_limit, mmc;
221 
222 		msi_limit =
223 		    sc->msi_limit > 16 ? PCIM_MSICTRL_MMC_32 :
224 		    sc->msi_limit > 8 ? PCIM_MSICTRL_MMC_16 :
225 		    sc->msi_limit > 4 ? PCIM_MSICTRL_MMC_8 :
226 		    sc->msi_limit > 2 ? PCIM_MSICTRL_MMC_4 :
227 		    sc->msi_limit > 1 ? PCIM_MSICTRL_MMC_2 :
228 		    PCIM_MSICTRL_MMC_1;
229 		mmc = sc->psc_msi.msgctrl & PCIM_MSICTRL_MMC_MASK;
230 
231 		if (mmc > msi_limit) {
232 			sc->psc_msi.msgctrl &= ~PCIM_MSICTRL_MMC_MASK;
233 			sc->psc_msi.msgctrl |= msi_limit;
234 			pci_set_cfgdata16(pi, off + 2, sc->psc_msi.msgctrl);
235 		}
236 	}
237 
238 	/* Reduce the number of MSI-X vectors if higher than OS limit */
239 	if ((off = sc->psc_msix.capoff) != 0 && sc->msix_limit != -1) {
240 		if (MSIX_TABLE_COUNT(msixcap->msgctrl) > sc->msix_limit) {
241 			msixcap->msgctrl &= ~PCIM_MSIXCTRL_TABLE_SIZE;
242 			msixcap->msgctrl |= sc->msix_limit - 1;
243 			pci_set_cfgdata16(pi, off + 2, msixcap->msgctrl);
244 		}
245 	}
246 }
247 
248 static int
249 cfginitmsi(struct passthru_softc *sc)
250 {
251 	int i, ptr, capptr, cap, sts, caplen, table_size;
252 	uint32_t u32;
253 	struct pci_devinst *pi = sc->psc_pi;
254 	struct msixcap msixcap;
255 	uint32_t *msixcap_ptr;
256 
257 	/*
258 	 * Parse the capabilities and cache the location of the MSI
259 	 * and MSI-X capabilities.
260 	 */
261 	sts = read_config(sc, PCIR_STATUS, 2);
262 	if (sts & PCIM_STATUS_CAPPRESENT) {
263 		ptr = read_config(sc, PCIR_CAP_PTR, 1);
264 		while (ptr != 0 && ptr != 0xff) {
265 			cap = read_config(sc, ptr + PCICAP_ID, 1);
266 			if (cap == PCIY_MSI) {
267 				/*
268 				 * Copy the MSI capability into the config
269 				 * space of the emulated pci device
270 				 */
271 				sc->psc_msi.capoff = ptr;
272 				sc->psc_msi.msgctrl = read_config(sc,
273 				    ptr + 2, 2);
274 				sc->psc_msi.emulated = 0;
275 				caplen = msi_caplen(sc->psc_msi.msgctrl);
276 				capptr = ptr;
277 				while (caplen > 0) {
278 					u32 = read_config(sc, capptr, 4);
279 					pci_set_cfgdata32(pi, capptr, u32);
280 					caplen -= 4;
281 					capptr += 4;
282 				}
283 			} else if (cap == PCIY_MSIX) {
284 				/*
285 				 * Copy the MSI-X capability
286 				 */
287 				sc->psc_msix.capoff = ptr;
288 				caplen = 12;
289 				msixcap_ptr = (uint32_t*) &msixcap;
290 				capptr = ptr;
291 				while (caplen > 0) {
292 					u32 = read_config(sc, capptr, 4);
293 					*msixcap_ptr = u32;
294 					pci_set_cfgdata32(pi, capptr, u32);
295 					caplen -= 4;
296 					capptr += 4;
297 					msixcap_ptr++;
298 				}
299 			}
300 			ptr = read_config(sc, ptr + PCICAP_NEXTPTR, 1);
301 		}
302 	}
303 
304 	passthru_intr_limit(sc, &msixcap);
305 
306 	if (sc->psc_msix.capoff != 0) {
307 		pi->pi_msix.pba_bar =
308 		    msixcap.pba_info & PCIM_MSIX_BIR_MASK;
309 		pi->pi_msix.pba_offset =
310 		    msixcap.pba_info & ~PCIM_MSIX_BIR_MASK;
311 		pi->pi_msix.table_bar =
312 		    msixcap.table_info & PCIM_MSIX_BIR_MASK;
313 		pi->pi_msix.table_offset =
314 		    msixcap.table_info & ~PCIM_MSIX_BIR_MASK;
315 		pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl);
316 		pi->pi_msix.pba_size = PBA_SIZE(pi->pi_msix.table_count);
317 
318 		/* Allocate the emulated MSI-X table array */
319 		table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
320 		pi->pi_msix.table = calloc(1, table_size);
321 
322 		/* Mask all table entries */
323 		for (i = 0; i < pi->pi_msix.table_count; i++) {
324 			pi->pi_msix.table[i].vector_control |=
325 						PCIM_MSIX_VCTRL_MASK;
326 		}
327 	}
328 
329 #ifdef LEGACY_SUPPORT
330 	/*
331 	 * If the passthrough device does not support MSI then craft a
332 	 * MSI capability for it. We link the new MSI capability at the
333 	 * head of the list of capabilities.
334 	 */
335 	if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) {
336 		int origptr, msiptr;
337 		origptr = read_config(sc, PCIR_CAP_PTR, 1);
338 		msiptr = passthru_add_msicap(pi, 1, origptr);
339 		sc->psc_msi.capoff = msiptr;
340 		sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2);
341 		sc->psc_msi.emulated = 1;
342 		pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr);
343 	}
344 #endif
345 
346 	/* Make sure one of the capabilities is present */
347 	if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0)
348 		return (-1);
349 	else
350 		return (0);
351 }
352 
353 static uint64_t
354 msix_table_read(struct passthru_softc *sc, uint64_t offset, int size)
355 {
356 	struct pci_devinst *pi;
357 	struct msix_table_entry *entry;
358 	uint8_t *src8;
359 	uint16_t *src16;
360 	uint32_t *src32;
361 	uint64_t *src64;
362 	uint64_t data;
363 	size_t entry_offset;
364 	uint32_t table_offset;
365 	int index, table_count;
366 
367 	pi = sc->psc_pi;
368 
369 	table_offset = pi->pi_msix.table_offset;
370 	table_count = pi->pi_msix.table_count;
371 	if (offset < table_offset ||
372 	    offset >= table_offset + table_count * MSIX_TABLE_ENTRY_SIZE) {
373 		switch (size) {
374 		case 1:
375 			src8 = (uint8_t *)(pi->pi_msix.mapped_addr + offset);
376 			data = *src8;
377 			break;
378 		case 2:
379 			src16 = (uint16_t *)(pi->pi_msix.mapped_addr + offset);
380 			data = *src16;
381 			break;
382 		case 4:
383 			src32 = (uint32_t *)(pi->pi_msix.mapped_addr + offset);
384 			data = *src32;
385 			break;
386 		case 8:
387 			src64 = (uint64_t *)(pi->pi_msix.mapped_addr + offset);
388 			data = *src64;
389 			break;
390 		default:
391 			return (-1);
392 		}
393 		return (data);
394 	}
395 
396 	offset -= table_offset;
397 	index = offset / MSIX_TABLE_ENTRY_SIZE;
398 	assert(index < table_count);
399 
400 	entry = &pi->pi_msix.table[index];
401 	entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
402 
403 	switch (size) {
404 	case 1:
405 		src8 = (uint8_t *)((uint8_t *)entry + entry_offset);
406 		data = *src8;
407 		break;
408 	case 2:
409 		src16 = (uint16_t *)((uint8_t *)entry + entry_offset);
410 		data = *src16;
411 		break;
412 	case 4:
413 		src32 = (uint32_t *)((uint8_t *)entry + entry_offset);
414 		data = *src32;
415 		break;
416 	case 8:
417 		src64 = (uint64_t *)((uint8_t *)entry + entry_offset);
418 		data = *src64;
419 		break;
420 	default:
421 		return (-1);
422 	}
423 
424 	return (data);
425 }
426 
427 static void
428 msix_table_write(struct vmctx *ctx, int vcpu, struct passthru_softc *sc,
429 		 uint64_t offset, int size, uint64_t data)
430 {
431 	struct pci_devinst *pi;
432 	struct msix_table_entry *entry;
433 	uint8_t *dest8;
434 	uint16_t *dest16;
435 	uint32_t *dest32;
436 	uint64_t *dest64;
437 	size_t entry_offset;
438 	uint32_t table_offset, vector_control;
439 	int index, table_count;
440 
441 	pi = sc->psc_pi;
442 
443 	table_offset = pi->pi_msix.table_offset;
444 	table_count = pi->pi_msix.table_count;
445 	if (offset < table_offset ||
446 	    offset >= table_offset + table_count * MSIX_TABLE_ENTRY_SIZE) {
447 		switch (size) {
448 		case 1:
449 			dest8 = (uint8_t *)(pi->pi_msix.mapped_addr + offset);
450 			*dest8 = data;
451 			break;
452 		case 2:
453 			dest16 = (uint16_t *)(pi->pi_msix.mapped_addr + offset);
454 			*dest16 = data;
455 			break;
456 		case 4:
457 			dest32 = (uint32_t *)(pi->pi_msix.mapped_addr + offset);
458 			*dest32 = data;
459 			break;
460 		case 8:
461 			dest64 = (uint64_t *)(pi->pi_msix.mapped_addr + offset);
462 			*dest64 = data;
463 			break;
464 		}
465 		return;
466 	}
467 
468 	offset -= table_offset;
469 	index = offset / MSIX_TABLE_ENTRY_SIZE;
470 	assert(index < table_count);
471 
472 	entry = &pi->pi_msix.table[index];
473 	entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
474 
475 	/* Only 4 byte naturally-aligned writes are supported */
476 	assert(size == 4);
477 	assert(entry_offset % 4 == 0);
478 
479 	vector_control = entry->vector_control;
480 	dest32 = (uint32_t *)((void *)entry + entry_offset);
481 	*dest32 = data;
482 	/* If MSI-X hasn't been enabled, do nothing */
483 	if (pi->pi_msix.enabled) {
484 		/* If the entry is masked, don't set it up */
485 		if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 ||
486 		    (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
487 			(void) vm_setup_pptdev_msix(ctx, vcpu, sc->pptfd,
488 			    index, entry->addr, entry->msg_data,
489 			    entry->vector_control);
490 		}
491 	}
492 }
493 
494 static int
495 init_msix_table(struct vmctx *ctx, struct passthru_softc *sc)
496 {
497 	struct pci_devinst *pi = sc->psc_pi;
498 	uint32_t table_size, table_offset;
499 	int i;
500 
501 	i = pci_msix_table_bar(pi);
502 	assert(i >= 0);
503 
504         /*
505          * Map the region of the BAR containing the MSI-X table.  This is
506          * necessary for two reasons:
507          * 1. The PBA may reside in the first or last page containing the MSI-X
508          *    table.
509          * 2. While PCI devices are not supposed to use the page(s) containing
510          *    the MSI-X table for other purposes, some do in practice.
511          */
512 
513 	/*
514 	 * Mapping pptfd provides access to the BAR containing the MSI-X
515 	 * table. See ppt_devmap() in usr/src/uts/i86pc/io/vmm/io/ppt.c
516 	 *
517 	 * This maps the whole BAR and then mprotect(PROT_NONE) is used below
518 	 * to prevent access to pages that don't contain the MSI-X table.
519 	 * When porting this, it was tempting to just map the MSI-X table pages
520 	 * but that would mean updating everywhere that assumes that
521 	 * pi->pi_msix.mapped_addr points to the start of the BAR. For now,
522 	 * keep closer to upstream.
523 	 */
524 	pi->pi_msix.mapped_size = sc->psc_bar[i].size;
525 	pi->pi_msix.mapped_addr = (uint8_t *)mmap(NULL, pi->pi_msix.mapped_size,
526 	    PROT_READ | PROT_WRITE, MAP_SHARED, sc->pptfd, 0);
527 	if (pi->pi_msix.mapped_addr == MAP_FAILED) {
528 		warn("Failed to map MSI-X table BAR on %d", sc->pptfd);
529 		return (-1);
530 	}
531 
532 	table_offset = rounddown2(pi->pi_msix.table_offset, 4096);
533 
534 	table_size = pi->pi_msix.table_offset - table_offset;
535 	table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
536 	table_size = roundup2(table_size, 4096);
537 
538 	/*
539 	 * Unmap any pages not containing the table, we do not need to emulate
540 	 * accesses to them.  Avoid releasing address space to help ensure that
541 	 * a buggy out-of-bounds access causes a crash.
542 	 */
543 	if (table_offset != 0)
544 		if (mprotect((caddr_t)pi->pi_msix.mapped_addr, table_offset,
545 		    PROT_NONE) != 0)
546 			warn("Failed to unmap MSI-X table BAR region");
547 	if (table_offset + table_size != pi->pi_msix.mapped_size)
548 		if (mprotect((caddr_t)
549 		    pi->pi_msix.mapped_addr + table_offset + table_size,
550 		    pi->pi_msix.mapped_size - (table_offset + table_size),
551 		    PROT_NONE) != 0)
552 			warn("Failed to unmap MSI-X table BAR region");
553 
554 	return (0);
555 }
556 
557 static int
558 cfginitbar(struct vmctx *ctx, struct passthru_softc *sc)
559 {
560 	struct pci_devinst *pi = sc->psc_pi;
561 	uint_t i;
562 
563 	/*
564 	 * Initialize BAR registers
565 	 */
566 	for (i = 0; i <= PCI_BARMAX; i++) {
567 		enum pcibar_type bartype;
568 		uint64_t base, size;
569 		int error;
570 
571 		if (passthru_get_bar(sc, i, &bartype, &base, &size) != 0) {
572 			continue;
573 		}
574 
575 		if (bartype != PCIBAR_IO) {
576 			if (((base | size) & PAGE_MASK) != 0) {
577 				warnx("passthru device %d BAR %d: "
578 				    "base %#lx or size %#lx not page aligned\n",
579 				    sc->pptfd, i, base, size);
580 				return (-1);
581 			}
582 		}
583 
584 		/* Cache information about the "real" BAR */
585 		sc->psc_bar[i].type = bartype;
586 		sc->psc_bar[i].size = size;
587 		sc->psc_bar[i].addr = base;
588 		sc->psc_bar[i].lobits = 0;
589 
590 		/* Allocate the BAR in the guest I/O or MMIO space */
591 		error = pci_emul_alloc_bar(pi, i, bartype, size);
592 		if (error)
593 			return (-1);
594 
595 		/* Use same lobits as physical bar */
596 		uint8_t lobits = read_config(sc, PCIR_BAR(i), 0x01);
597 		if (bartype == PCIBAR_MEM32 || bartype == PCIBAR_MEM64) {
598 			lobits &= ~PCIM_BAR_MEM_BASE;
599 		} else {
600 			lobits &= ~PCIM_BAR_IO_BASE;
601 		}
602 		sc->psc_bar[i].lobits = lobits;
603 		pi->pi_bar[i].lobits = lobits;
604 
605 		/*
606 		 * 64-bit BAR takes up two slots so skip the next one.
607 		 */
608 		if (bartype == PCIBAR_MEM64) {
609 			i++;
610 			assert(i <= PCI_BARMAX);
611 			sc->psc_bar[i].type = PCIBAR_MEMHI64;
612 		}
613 	}
614 	return (0);
615 }
616 
617 static int
618 cfginit(struct vmctx *ctx, struct passthru_softc *sc)
619 {
620 	struct pci_devinst *pi = sc->psc_pi;
621 	int error;
622 
623 	if (cfginitmsi(sc) != 0) {
624 		warnx("failed to initialize MSI for PCI %d", sc->pptfd);
625 		return (-1);
626 	}
627 
628 	if (cfginitbar(ctx, sc) != 0) {
629 		warnx("failed to initialize BARs for PCI %d", sc->pptfd);
630 		return (-1);
631 	}
632 
633 	write_config(sc, PCIR_COMMAND, 2, pci_get_cfgdata16(pi, PCIR_COMMAND));
634 
635 	/*
636 	* We need to do this after PCIR_COMMAND got possibly updated, e.g.,
637 	* a BAR was enabled.
638 	*/
639 	if (pci_msix_table_bar(pi) >= 0) {
640 		error = init_msix_table(ctx, sc);
641 		if (error != 0) {
642 			warnx("failed to initialize MSI-X table for PCI %d",
643 			    sc->pptfd);
644 			goto done;
645 		}
646 	}
647 
648 	error = 0;				/* success */
649 done:
650 	return (error);
651 }
652 
653 static int
654 passthru_legacy_config(nvlist_t *nvl, const char *opts)
655 {
656 	if (opts == NULL)
657 		return (0);
658 
659 	if (strncmp(opts, "/dev/ppt", 8) == 0)
660 		set_config_value_node(nvl, "path", opts);
661 
662 	return (0);
663 }
664 
665 static int
666 passthru_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
667 {
668 	int error, memflags, pptfd;
669 	struct passthru_softc *sc;
670 	const char *path;
671 
672 	pptfd = -1;
673 	sc = NULL;
674 	error = 1;
675 
676 	memflags = vm_get_memflags(ctx);
677 	if (!(memflags & VM_MEM_F_WIRED)) {
678 		warnx("passthru requires guest memory to be wired");
679 		goto done;
680 	}
681 
682 	path = get_config_value_node(nvl, "path");
683 	if (path == NULL || passthru_dev_open(path, &pptfd) != 0) {
684 		warnx("invalid passthru options");
685 		goto done;
686 	}
687 
688 	if (vm_assign_pptdev(ctx, pptfd) != 0) {
689 		warnx("PCI device at %d is not using the ppt driver", pptfd);
690 		goto done;
691 	}
692 
693 	sc = calloc(1, sizeof(struct passthru_softc));
694 
695 	pi->pi_arg = sc;
696 	sc->psc_pi = pi;
697 	sc->pptfd = pptfd;
698 
699 	if ((error = vm_get_pptdev_limits(ctx, pptfd, &sc->msi_limit,
700 	    &sc->msix_limit)) != 0)
701 		goto done;
702 
703 	/* initialize config space */
704 	error = cfginit(ctx, sc);
705 done:
706 	if (error) {
707 		free(sc);
708 		if (pptfd != -1)
709 			vm_unassign_pptdev(ctx, pptfd);
710 	}
711 	return (error);
712 }
713 
714 static int
715 bar_access(int coff)
716 {
717 	if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1))
718 		return (1);
719 	else
720 		return (0);
721 }
722 
723 static int
724 msicap_access(struct passthru_softc *sc, int coff)
725 {
726 	int caplen;
727 
728 	if (sc->psc_msi.capoff == 0)
729 		return (0);
730 
731 	caplen = msi_caplen(sc->psc_msi.msgctrl);
732 
733 	if (coff >= sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen)
734 		return (1);
735 	else
736 		return (0);
737 }
738 
739 static int
740 msixcap_access(struct passthru_softc *sc, int coff)
741 {
742 	if (sc->psc_msix.capoff == 0)
743 		return (0);
744 
745 	return (coff >= sc->psc_msix.capoff &&
746 	        coff < sc->psc_msix.capoff + MSIX_CAPLEN);
747 }
748 
749 static int
750 passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
751 		int coff, int bytes, uint32_t *rv)
752 {
753 	struct passthru_softc *sc;
754 
755 	sc = pi->pi_arg;
756 
757 	/*
758 	 * PCI BARs and MSI capability is emulated.
759 	 */
760 	if (bar_access(coff) || msicap_access(sc, coff) ||
761 	    msixcap_access(sc, coff))
762 		return (-1);
763 
764 	/*
765 	 * MSI-X is also emulated since a limit on interrupts may be imposed by
766 	 * the OS, altering the perceived register state.
767 	 */
768 	if (msixcap_access(sc, coff))
769 		return (-1);
770 
771 #ifdef LEGACY_SUPPORT
772 	/*
773 	 * Emulate PCIR_CAP_PTR if this device does not support MSI capability
774 	 * natively.
775 	 */
776 	if (sc->psc_msi.emulated) {
777 		if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4)
778 			return (-1);
779 	}
780 #endif
781 
782 	/*
783 	 * Emulate the command register.  If a single read reads both the
784 	 * command and status registers, read the status register from the
785 	 * device's config space.
786 	 */
787 	if (coff == PCIR_COMMAND) {
788 		if (bytes <= 2)
789 			return (-1);
790 		*rv = pci_get_cfgdata16(pi, PCIR_COMMAND) << 16 |
791 		    read_config(sc, PCIR_STATUS, 2);
792 		return (0);
793 	}
794 
795 	/* Everything else just read from the device's config space */
796 	*rv = read_config(sc, coff, bytes);
797 
798 	return (0);
799 }
800 
801 static int
802 passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
803 		  int coff, int bytes, uint32_t val)
804 {
805 	int error, msix_table_entries, i;
806 	struct passthru_softc *sc;
807 	uint16_t cmd_old;
808 
809 	sc = pi->pi_arg;
810 
811 	/*
812 	 * PCI BARs are emulated
813 	 */
814 	if (bar_access(coff))
815 		return (-1);
816 
817 	/*
818 	 * MSI capability is emulated
819 	 */
820 	if (msicap_access(sc, coff)) {
821 		pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msi.capoff,
822 		    PCIY_MSI);
823 		error = vm_setup_pptdev_msi(ctx, vcpu, sc->pptfd,
824 		    pi->pi_msi.addr, pi->pi_msi.msg_data, pi->pi_msi.maxmsgnum);
825 		if (error != 0)
826 			err(1, "vm_setup_pptdev_msi");
827 		return (0);
828 	}
829 
830 	if (msixcap_access(sc, coff)) {
831 		pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msix.capoff,
832 		    PCIY_MSIX);
833 		if (pi->pi_msix.enabled) {
834 			msix_table_entries = pi->pi_msix.table_count;
835 			for (i = 0; i < msix_table_entries; i++) {
836 				error = vm_setup_pptdev_msix(ctx, vcpu,
837 				    sc->pptfd, i,
838 				    pi->pi_msix.table[i].addr,
839 				    pi->pi_msix.table[i].msg_data,
840 				    pi->pi_msix.table[i].vector_control);
841 
842 				if (error)
843 					err(1, "vm_setup_pptdev_msix");
844 			}
845 		} else {
846 			error = vm_disable_pptdev_msix(ctx, sc->pptfd);
847 			if (error)
848 				err(1, "vm_disable_pptdev_msix");
849 		}
850 		return (0);
851 	}
852 
853 #ifdef LEGACY_SUPPORT
854 	/*
855 	 * If this device does not support MSI natively then we cannot let
856 	 * the guest disable legacy interrupts from the device. It is the
857 	 * legacy interrupt that is triggering the virtual MSI to the guest.
858 	 */
859 	if (sc->psc_msi.emulated && pci_msi_enabled(pi)) {
860 		if (coff == PCIR_COMMAND && bytes == 2)
861 			val &= ~PCIM_CMD_INTxDIS;
862 	}
863 #endif
864 
865 	write_config(sc, coff, bytes, val);
866 	if (coff == PCIR_COMMAND) {
867 		cmd_old = pci_get_cfgdata16(pi, PCIR_COMMAND);
868 		if (bytes == 1)
869 			pci_set_cfgdata8(pi, PCIR_COMMAND, val);
870 		else if (bytes == 2)
871 			pci_set_cfgdata16(pi, PCIR_COMMAND, val);
872 		pci_emul_cmd_changed(pi, cmd_old);
873 	}
874 
875 	return (0);
876 }
877 
878 static void
879 passthru_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
880 	       uint64_t offset, int size, uint64_t value)
881 {
882 	struct passthru_softc *sc = pi->pi_arg;
883 
884 	if (baridx == pci_msix_table_bar(pi)) {
885 		msix_table_write(ctx, vcpu, sc, offset, size, value);
886 	} else {
887 		struct ppt_bar_io pbi;
888 
889 		assert(pi->pi_bar[baridx].type == PCIBAR_IO);
890 
891 		pbi.pbi_bar = baridx;
892 		pbi.pbi_width = size;
893 		pbi.pbi_off = offset;
894 		pbi.pbi_data = value;
895 		(void) ioctl(sc->pptfd, PPT_BAR_WRITE, &pbi);
896 	}
897 }
898 
899 static uint64_t
900 passthru_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
901 	      uint64_t offset, int size)
902 {
903 	struct passthru_softc *sc = pi->pi_arg;
904 	uint64_t val;
905 
906 	if (baridx == pci_msix_table_bar(pi)) {
907 		val = msix_table_read(sc, offset, size);
908 	} else {
909 		struct ppt_bar_io pbi;
910 
911 		assert(pi->pi_bar[baridx].type == PCIBAR_IO);
912 
913 		pbi.pbi_bar = baridx;
914 		pbi.pbi_width = size;
915 		pbi.pbi_off = offset;
916 		if (ioctl(sc->pptfd, PPT_BAR_READ, &pbi) == 0) {
917 			val = pbi.pbi_data;
918 		} else {
919 			val = 0;
920 		}
921 	}
922 
923 	return (val);
924 }
925 
926 static void
927 passthru_msix_addr(struct vmctx *ctx, struct pci_devinst *pi, int baridx,
928 		   int enabled, uint64_t address)
929 {
930 	struct passthru_softc *sc;
931 	size_t remaining;
932 	uint32_t table_size, table_offset;
933 
934 	sc = pi->pi_arg;
935 	table_offset = rounddown2(pi->pi_msix.table_offset, 4096);
936 	if (table_offset > 0) {
937 		if (!enabled) {
938 			if (vm_unmap_pptdev_mmio(ctx, sc->pptfd, address,
939 			    table_offset) != 0)
940 				warnx("pci_passthru: unmap_pptdev_mmio failed");
941 		} else {
942 			if (vm_map_pptdev_mmio(ctx, sc->pptfd, address,
943 			    table_offset, sc->psc_bar[baridx].addr) != 0)
944 				warnx("pci_passthru: map_pptdev_mmio failed");
945 		}
946 	}
947 	table_size = pi->pi_msix.table_offset - table_offset;
948 	table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
949 	table_size = roundup2(table_size, 4096);
950 	remaining = pi->pi_bar[baridx].size - table_offset - table_size;
951 	if (remaining > 0) {
952 		address += table_offset + table_size;
953 		if (!enabled) {
954 			if (vm_unmap_pptdev_mmio(ctx, sc->pptfd, address,
955 			    remaining) != 0)
956 				warnx("pci_passthru: unmap_pptdev_mmio failed");
957 		} else {
958 			if (vm_map_pptdev_mmio(ctx, sc->pptfd, address,
959 			    remaining, sc->psc_bar[baridx].addr +
960 			    table_offset + table_size) != 0)
961 				warnx("pci_passthru: map_pptdev_mmio failed");
962 		}
963 	}
964 }
965 
966 static void
967 passthru_mmio_addr(struct vmctx *ctx, struct pci_devinst *pi, int baridx,
968 		   int enabled, uint64_t address)
969 {
970 	struct passthru_softc *sc;
971 
972 	sc = pi->pi_arg;
973 	if (!enabled) {
974 		if (vm_unmap_pptdev_mmio(ctx, sc->pptfd, address,
975 		    sc->psc_bar[baridx].size) != 0)
976 			warnx("pci_passthru: unmap_pptdev_mmio failed");
977 	} else {
978 		if (vm_map_pptdev_mmio(ctx, sc->pptfd, address,
979 		    sc->psc_bar[baridx].size, sc->psc_bar[baridx].addr) != 0)
980 			warnx("pci_passthru: map_pptdev_mmio failed");
981 	}
982 }
983 
984 static void
985 passthru_addr(struct vmctx *ctx, struct pci_devinst *pi, int baridx,
986 	      int enabled, uint64_t address)
987 {
988 
989 	if (pi->pi_bar[baridx].type == PCIBAR_IO)
990 		return;
991 	if (baridx == pci_msix_table_bar(pi))
992 		passthru_msix_addr(ctx, pi, baridx, enabled, address);
993 	else
994 		passthru_mmio_addr(ctx, pi, baridx, enabled, address);
995 }
996 
997 struct pci_devemu passthru = {
998 	.pe_emu		= "passthru",
999 	.pe_init	= passthru_init,
1000 	.pe_legacy_config = passthru_legacy_config,
1001 	.pe_cfgwrite	= passthru_cfgwrite,
1002 	.pe_cfgread	= passthru_cfgread,
1003 	.pe_barwrite 	= passthru_write,
1004 	.pe_barread    	= passthru_read,
1005 	.pe_baraddr	= passthru_addr,
1006 };
1007 PCI_EMUL_SET(passthru);
1008