xref: /freebsd/usr.sbin/bhyve/pci_passthru.c (revision dd41de95a84d979615a2ef11df6850622bf6184e)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  * $FreeBSD$
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include <sys/param.h>
35 #ifndef WITHOUT_CAPSICUM
36 #include <sys/capsicum.h>
37 #endif
38 #include <sys/types.h>
39 #include <sys/mman.h>
40 #include <sys/pciio.h>
41 #include <sys/ioctl.h>
42 
43 #include <dev/io/iodev.h>
44 #include <dev/pci/pcireg.h>
45 
46 #include <machine/iodev.h>
47 
48 #ifndef WITHOUT_CAPSICUM
49 #include <capsicum_helpers.h>
50 #endif
51 #include <stdio.h>
52 #include <stdlib.h>
53 #include <string.h>
54 #include <err.h>
55 #include <errno.h>
56 #include <fcntl.h>
57 #include <sysexits.h>
58 #include <unistd.h>
59 
60 #include <machine/vmm.h>
61 #include <vmmapi.h>
62 
63 #include "config.h"
64 #include "debug.h"
65 #include "pci_emul.h"
66 #include "mem.h"
67 
/* Device nodes used to reach the physical device from userspace. */
#ifndef _PATH_DEVPCI
#define	_PATH_DEVPCI	"/dev/pci"
#endif

#ifndef	_PATH_DEVIO
#define	_PATH_DEVIO	"/dev/io"
#endif

#ifndef _PATH_MEM
#define	_PATH_MEM	"/dev/mem"
#endif

/*
 * When defined, a synthetic MSI capability is crafted for devices that
 * lack one (see cfginitmsi()/passthru_add_msicap()).
 */
#define	LEGACY_SUPPORT	1

/* Number of MSI-X table entries encoded in the message-control field. */
#define MSIX_TABLE_COUNT(ctrl) (((ctrl) & PCIM_MSIXCTRL_TABLE_SIZE) + 1)
/* Length in bytes of the MSI-X capability structure. */
#define MSIX_CAPLEN 12

/* File descriptors shared by all passthru instances; opened lazily. */
static int pcifd = -1;
static int iofd = -1;
static int memfd = -1;

/* Per-device passthru state. */
struct passthru_softc {
	struct pci_devinst *psc_pi;	/* back pointer to emulated device */
	struct pcibar psc_bar[PCI_BARMAX + 1];	/* physical BAR info */
	struct {
		int		capoff;		/* MSI capability offset, 0 if none */
		int		msgctrl;	/* cached MSI message control */
		int		emulated;	/* 1 if capability was synthesized */
	} psc_msi;
	struct {
		int		capoff;		/* MSI-X capability offset, 0 if none */
	} psc_msix;
	struct pcisel psc_sel;		/* bus/slot/func of physical device */
};
102 
103 static int
104 msi_caplen(int msgctrl)
105 {
106 	int len;
107 
108 	len = 10;		/* minimum length of msi capability */
109 
110 	if (msgctrl & PCIM_MSICTRL_64BIT)
111 		len += 4;
112 
113 #if 0
114 	/*
115 	 * Ignore the 'mask' and 'pending' bits in the MSI capability.
116 	 * We'll let the guest manipulate them directly.
117 	 */
118 	if (msgctrl & PCIM_MSICTRL_VECTOR)
119 		len += 10;
120 #endif
121 
122 	return (len);
123 }
124 
125 static uint32_t
126 read_config(const struct pcisel *sel, long reg, int width)
127 {
128 	struct pci_io pi;
129 
130 	bzero(&pi, sizeof(pi));
131 	pi.pi_sel = *sel;
132 	pi.pi_reg = reg;
133 	pi.pi_width = width;
134 
135 	if (ioctl(pcifd, PCIOCREAD, &pi) < 0)
136 		return (0);				/* XXX */
137 	else
138 		return (pi.pi_data);
139 }
140 
141 static void
142 write_config(const struct pcisel *sel, long reg, int width, uint32_t data)
143 {
144 	struct pci_io pi;
145 
146 	bzero(&pi, sizeof(pi));
147 	pi.pi_sel = *sel;
148 	pi.pi_reg = reg;
149 	pi.pi_width = width;
150 	pi.pi_data = data;
151 
152 	(void)ioctl(pcifd, PCIOCWRITE, &pi);		/* XXX */
153 }
154 
#ifdef LEGACY_SUPPORT
/*
 * Synthesize an MSI capability for a device that has none, linking it
 * in front of the existing capability list ('nextptr').  Returns the
 * config-space offset at which the capability was placed.
 */
static int
passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr)
{
	struct msicap msicap;
	const u_char *bytes;
	int capoff;
	size_t i;

	pci_populate_msicap(&msicap, msgnum, nextptr);

	/*
	 * XXX
	 * Copy the msi capability structure in the last 16 bytes of the
	 * config space. This is wrong because it could shadow something
	 * useful to the device.
	 */
	capoff = 256 - roundup(sizeof(msicap), 4);
	bytes = (const u_char *)&msicap;
	for (i = 0; i < sizeof(msicap); i++)
		pci_set_cfgdata8(pi, capoff + i, bytes[i]);

	return (capoff);
}
#endif	/* LEGACY_SUPPORT */
179 
180 static int
181 cfginitmsi(struct passthru_softc *sc)
182 {
183 	int i, ptr, capptr, cap, sts, caplen, table_size;
184 	uint32_t u32;
185 	struct pcisel sel;
186 	struct pci_devinst *pi;
187 	struct msixcap msixcap;
188 	uint32_t *msixcap_ptr;
189 
190 	pi = sc->psc_pi;
191 	sel = sc->psc_sel;
192 
193 	/*
194 	 * Parse the capabilities and cache the location of the MSI
195 	 * and MSI-X capabilities.
196 	 */
197 	sts = read_config(&sel, PCIR_STATUS, 2);
198 	if (sts & PCIM_STATUS_CAPPRESENT) {
199 		ptr = read_config(&sel, PCIR_CAP_PTR, 1);
200 		while (ptr != 0 && ptr != 0xff) {
201 			cap = read_config(&sel, ptr + PCICAP_ID, 1);
202 			if (cap == PCIY_MSI) {
203 				/*
204 				 * Copy the MSI capability into the config
205 				 * space of the emulated pci device
206 				 */
207 				sc->psc_msi.capoff = ptr;
208 				sc->psc_msi.msgctrl = read_config(&sel,
209 								  ptr + 2, 2);
210 				sc->psc_msi.emulated = 0;
211 				caplen = msi_caplen(sc->psc_msi.msgctrl);
212 				capptr = ptr;
213 				while (caplen > 0) {
214 					u32 = read_config(&sel, capptr, 4);
215 					pci_set_cfgdata32(pi, capptr, u32);
216 					caplen -= 4;
217 					capptr += 4;
218 				}
219 			} else if (cap == PCIY_MSIX) {
220 				/*
221 				 * Copy the MSI-X capability
222 				 */
223 				sc->psc_msix.capoff = ptr;
224 				caplen = 12;
225 				msixcap_ptr = (uint32_t*) &msixcap;
226 				capptr = ptr;
227 				while (caplen > 0) {
228 					u32 = read_config(&sel, capptr, 4);
229 					*msixcap_ptr = u32;
230 					pci_set_cfgdata32(pi, capptr, u32);
231 					caplen -= 4;
232 					capptr += 4;
233 					msixcap_ptr++;
234 				}
235 			}
236 			ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1);
237 		}
238 	}
239 
240 	if (sc->psc_msix.capoff != 0) {
241 		pi->pi_msix.pba_bar =
242 		    msixcap.pba_info & PCIM_MSIX_BIR_MASK;
243 		pi->pi_msix.pba_offset =
244 		    msixcap.pba_info & ~PCIM_MSIX_BIR_MASK;
245 		pi->pi_msix.table_bar =
246 		    msixcap.table_info & PCIM_MSIX_BIR_MASK;
247 		pi->pi_msix.table_offset =
248 		    msixcap.table_info & ~PCIM_MSIX_BIR_MASK;
249 		pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl);
250 		pi->pi_msix.pba_size = PBA_SIZE(pi->pi_msix.table_count);
251 
252 		/* Allocate the emulated MSI-X table array */
253 		table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
254 		pi->pi_msix.table = calloc(1, table_size);
255 
256 		/* Mask all table entries */
257 		for (i = 0; i < pi->pi_msix.table_count; i++) {
258 			pi->pi_msix.table[i].vector_control |=
259 						PCIM_MSIX_VCTRL_MASK;
260 		}
261 	}
262 
263 #ifdef LEGACY_SUPPORT
264 	/*
265 	 * If the passthrough device does not support MSI then craft a
266 	 * MSI capability for it. We link the new MSI capability at the
267 	 * head of the list of capabilities.
268 	 */
269 	if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) {
270 		int origptr, msiptr;
271 		origptr = read_config(&sel, PCIR_CAP_PTR, 1);
272 		msiptr = passthru_add_msicap(pi, 1, origptr);
273 		sc->psc_msi.capoff = msiptr;
274 		sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2);
275 		sc->psc_msi.emulated = 1;
276 		pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr);
277 	}
278 #endif
279 
280 	/* Make sure one of the capabilities is present */
281 	if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0)
282 		return (-1);
283 	else
284 		return (0);
285 }
286 
287 static uint64_t
288 msix_table_read(struct passthru_softc *sc, uint64_t offset, int size)
289 {
290 	struct pci_devinst *pi;
291 	struct msix_table_entry *entry;
292 	uint8_t *src8;
293 	uint16_t *src16;
294 	uint32_t *src32;
295 	uint64_t *src64;
296 	uint64_t data;
297 	size_t entry_offset;
298 	int index;
299 
300 	pi = sc->psc_pi;
301 	if (pi->pi_msix.pba_page != NULL && offset >= pi->pi_msix.pba_offset &&
302 	    offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) {
303 		switch(size) {
304 		case 1:
305 			src8 = (uint8_t *)(pi->pi_msix.pba_page + offset -
306 			    pi->pi_msix.pba_page_offset);
307 			data = *src8;
308 			break;
309 		case 2:
310 			src16 = (uint16_t *)(pi->pi_msix.pba_page + offset -
311 			    pi->pi_msix.pba_page_offset);
312 			data = *src16;
313 			break;
314 		case 4:
315 			src32 = (uint32_t *)(pi->pi_msix.pba_page + offset -
316 			    pi->pi_msix.pba_page_offset);
317 			data = *src32;
318 			break;
319 		case 8:
320 			src64 = (uint64_t *)(pi->pi_msix.pba_page + offset -
321 			    pi->pi_msix.pba_page_offset);
322 			data = *src64;
323 			break;
324 		default:
325 			return (-1);
326 		}
327 		return (data);
328 	}
329 
330 	if (offset < pi->pi_msix.table_offset)
331 		return (-1);
332 
333 	offset -= pi->pi_msix.table_offset;
334 	index = offset / MSIX_TABLE_ENTRY_SIZE;
335 	if (index >= pi->pi_msix.table_count)
336 		return (-1);
337 
338 	entry = &pi->pi_msix.table[index];
339 	entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
340 
341 	switch(size) {
342 	case 1:
343 		src8 = (uint8_t *)((void *)entry + entry_offset);
344 		data = *src8;
345 		break;
346 	case 2:
347 		src16 = (uint16_t *)((void *)entry + entry_offset);
348 		data = *src16;
349 		break;
350 	case 4:
351 		src32 = (uint32_t *)((void *)entry + entry_offset);
352 		data = *src32;
353 		break;
354 	case 8:
355 		src64 = (uint64_t *)((void *)entry + entry_offset);
356 		data = *src64;
357 		break;
358 	default:
359 		return (-1);
360 	}
361 
362 	return (data);
363 }
364 
/*
 * Handle a guest write to the emulated MSI-X table, or to the device's
 * PBA page when the PBA shares a page with the table.  'offset' is the
 * byte offset into the BAR containing the table.  When MSI-X is enabled
 * and the written entry is (or just was) unmasked, the host interrupt
 * routing for that vector is (re)programmed via vm_setup_pptdev_msix().
 */
static void
msix_table_write(struct vmctx *ctx, int vcpu, struct passthru_softc *sc,
		 uint64_t offset, int size, uint64_t data)
{
	struct pci_devinst *pi;
	struct msix_table_entry *entry;
	uint8_t *dest8;
	uint16_t *dest16;
	uint32_t *dest32;
	uint64_t *dest64;
	size_t entry_offset;
	uint32_t vector_control;
	int index;

	pi = sc->psc_pi;
	/* PBA access: write straight through to the mapped device page. */
	if (pi->pi_msix.pba_page != NULL && offset >= pi->pi_msix.pba_offset &&
	    offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) {
		switch(size) {
		case 1:
			dest8 = (uint8_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			*dest8 = data;
			break;
		case 2:
			dest16 = (uint16_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			*dest16 = data;
			break;
		case 4:
			dest32 = (uint32_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			*dest32 = data;
			break;
		case 8:
			dest64 = (uint64_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			*dest64 = data;
			break;
		default:
			/* Unsupported access size: silently dropped. */
			break;
		}
		return;
	}

	/* Writes below the table, or past the last entry, are ignored. */
	if (offset < pi->pi_msix.table_offset)
		return;

	offset -= pi->pi_msix.table_offset;
	index = offset / MSIX_TABLE_ENTRY_SIZE;
	if (index >= pi->pi_msix.table_count)
		return;

	entry = &pi->pi_msix.table[index];
	entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;

	/* Only 4 byte naturally-aligned writes are supported */
	assert(size == 4);
	assert(entry_offset % 4 == 0);

	/* Snapshot the mask bit before the write so a transition is seen. */
	vector_control = entry->vector_control;
	dest32 = (uint32_t *)((void *)entry + entry_offset);
	*dest32 = data;
	/* If MSI-X hasn't been enabled, do nothing */
	if (pi->pi_msix.enabled) {
		/* If the entry is masked, don't set it up */
		if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 ||
		    (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
			(void)vm_setup_pptdev_msix(ctx, vcpu,
			    sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
			    sc->psc_sel.pc_func, index, entry->addr,
			    entry->msg_data, entry->vector_control);
		}
	}
}
439 
440 static int
441 init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base)
442 {
443 	int b, s, f;
444 	int idx;
445 	size_t remaining;
446 	uint32_t table_size, table_offset;
447 	uint32_t pba_size, pba_offset;
448 	vm_paddr_t start;
449 	struct pci_devinst *pi = sc->psc_pi;
450 
451 	assert(pci_msix_table_bar(pi) >= 0 && pci_msix_pba_bar(pi) >= 0);
452 
453 	b = sc->psc_sel.pc_bus;
454 	s = sc->psc_sel.pc_dev;
455 	f = sc->psc_sel.pc_func;
456 
457 	/*
458 	 * If the MSI-X table BAR maps memory intended for
459 	 * other uses, it is at least assured that the table
460 	 * either resides in its own page within the region,
461 	 * or it resides in a page shared with only the PBA.
462 	 */
463 	table_offset = rounddown2(pi->pi_msix.table_offset, 4096);
464 
465 	table_size = pi->pi_msix.table_offset - table_offset;
466 	table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
467 	table_size = roundup2(table_size, 4096);
468 
469 	idx = pi->pi_msix.table_bar;
470 	start = pi->pi_bar[idx].addr;
471 	remaining = pi->pi_bar[idx].size;
472 
473 	if (pi->pi_msix.pba_bar == pi->pi_msix.table_bar) {
474 		pba_offset = pi->pi_msix.pba_offset;
475 		pba_size = pi->pi_msix.pba_size;
476 		if (pba_offset >= table_offset + table_size ||
477 		    table_offset >= pba_offset + pba_size) {
478 			/*
479 			 * If the PBA does not share a page with the MSI-x
480 			 * tables, no PBA emulation is required.
481 			 */
482 			pi->pi_msix.pba_page = NULL;
483 			pi->pi_msix.pba_page_offset = 0;
484 		} else {
485 			/*
486 			 * The PBA overlaps with either the first or last
487 			 * page of the MSI-X table region.  Map the
488 			 * appropriate page.
489 			 */
490 			if (pba_offset <= table_offset)
491 				pi->pi_msix.pba_page_offset = table_offset;
492 			else
493 				pi->pi_msix.pba_page_offset = table_offset +
494 				    table_size - 4096;
495 			pi->pi_msix.pba_page = mmap(NULL, 4096, PROT_READ |
496 			    PROT_WRITE, MAP_SHARED, memfd, start +
497 			    pi->pi_msix.pba_page_offset);
498 			if (pi->pi_msix.pba_page == MAP_FAILED) {
499 				warn(
500 			    "Failed to map PBA page for MSI-X on %d/%d/%d",
501 				    b, s, f);
502 				return (-1);
503 			}
504 		}
505 	}
506 
507 	return (0);
508 }
509 
510 static int
511 cfginitbar(struct vmctx *ctx, struct passthru_softc *sc)
512 {
513 	int i, error;
514 	struct pci_devinst *pi;
515 	struct pci_bar_io bar;
516 	enum pcibar_type bartype;
517 	uint64_t base, size;
518 
519 	pi = sc->psc_pi;
520 
521 	/*
522 	 * Initialize BAR registers
523 	 */
524 	for (i = 0; i <= PCI_BARMAX; i++) {
525 		bzero(&bar, sizeof(bar));
526 		bar.pbi_sel = sc->psc_sel;
527 		bar.pbi_reg = PCIR_BAR(i);
528 
529 		if (ioctl(pcifd, PCIOCGETBAR, &bar) < 0)
530 			continue;
531 
532 		if (PCI_BAR_IO(bar.pbi_base)) {
533 			bartype = PCIBAR_IO;
534 			base = bar.pbi_base & PCIM_BAR_IO_BASE;
535 		} else {
536 			switch (bar.pbi_base & PCIM_BAR_MEM_TYPE) {
537 			case PCIM_BAR_MEM_64:
538 				bartype = PCIBAR_MEM64;
539 				break;
540 			default:
541 				bartype = PCIBAR_MEM32;
542 				break;
543 			}
544 			base = bar.pbi_base & PCIM_BAR_MEM_BASE;
545 		}
546 		size = bar.pbi_length;
547 
548 		if (bartype != PCIBAR_IO) {
549 			if (((base | size) & PAGE_MASK) != 0) {
550 				warnx("passthru device %d/%d/%d BAR %d: "
551 				    "base %#lx or size %#lx not page aligned\n",
552 				    sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
553 				    sc->psc_sel.pc_func, i, base, size);
554 				return (-1);
555 			}
556 		}
557 
558 		/* Cache information about the "real" BAR */
559 		sc->psc_bar[i].type = bartype;
560 		sc->psc_bar[i].size = size;
561 		sc->psc_bar[i].addr = base;
562 
563 		/* Allocate the BAR in the guest I/O or MMIO space */
564 		error = pci_emul_alloc_bar(pi, i, bartype, size);
565 		if (error)
566 			return (-1);
567 
568 		/* The MSI-X table needs special handling */
569 		if (i == pci_msix_table_bar(pi)) {
570 			error = init_msix_table(ctx, sc, base);
571 			if (error)
572 				return (-1);
573 		}
574 
575 		/*
576 		 * 64-bit BAR takes up two slots so skip the next one.
577 		 */
578 		if (bartype == PCIBAR_MEM64) {
579 			i++;
580 			assert(i <= PCI_BARMAX);
581 			sc->psc_bar[i].type = PCIBAR_MEMHI64;
582 		}
583 	}
584 	return (0);
585 }
586 
587 static int
588 cfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func)
589 {
590 	int error;
591 	struct passthru_softc *sc;
592 
593 	error = 1;
594 	sc = pi->pi_arg;
595 
596 	bzero(&sc->psc_sel, sizeof(struct pcisel));
597 	sc->psc_sel.pc_bus = bus;
598 	sc->psc_sel.pc_dev = slot;
599 	sc->psc_sel.pc_func = func;
600 
601 	if (cfginitmsi(sc) != 0) {
602 		warnx("failed to initialize MSI for PCI %d/%d/%d",
603 		    bus, slot, func);
604 		goto done;
605 	}
606 
607 	if (cfginitbar(ctx, sc) != 0) {
608 		warnx("failed to initialize BARs for PCI %d/%d/%d",
609 		    bus, slot, func);
610 		goto done;
611 	}
612 
613 	pci_set_cfgdata16(pi, PCIR_COMMAND, read_config(&sc->psc_sel,
614 	    PCIR_COMMAND, 2));
615 
616 	error = 0;				/* success */
617 done:
618 	return (error);
619 }
620 
621 static int
622 passthru_legacy_config(nvlist_t *nvl, const char *opts)
623 {
624 	char value[16];
625 	int bus, slot, func;
626 
627 	if (opts == NULL)
628 		return (0);
629 
630 	if (sscanf(opts, "%d/%d/%d", &bus, &slot, &func) != 3) {
631 		EPRINTLN("passthru: invalid options \"%s\"", opts);
632 		return (-1);
633 	}
634 
635 	snprintf(value, sizeof(value), "%d", bus);
636 	set_config_value_node(nvl, "bus", value);
637 	snprintf(value, sizeof(value), "%d", slot);
638 	set_config_value_node(nvl, "slot", value);
639 	snprintf(value, sizeof(value), "%d", func);
640 	set_config_value_node(nvl, "func", value);
641 	return (0);
642 }
643 
644 static int
645 passthru_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
646 {
647 	int bus, slot, func, error, memflags;
648 	struct passthru_softc *sc;
649 	const char *value;
650 #ifndef WITHOUT_CAPSICUM
651 	cap_rights_t rights;
652 	cap_ioctl_t pci_ioctls[] = { PCIOCREAD, PCIOCWRITE, PCIOCGETBAR };
653 	cap_ioctl_t io_ioctls[] = { IODEV_PIO };
654 #endif
655 
656 	sc = NULL;
657 	error = 1;
658 
659 #ifndef WITHOUT_CAPSICUM
660 	cap_rights_init(&rights, CAP_IOCTL, CAP_READ, CAP_WRITE);
661 #endif
662 
663 	memflags = vm_get_memflags(ctx);
664 	if (!(memflags & VM_MEM_F_WIRED)) {
665 		warnx("passthru requires guest memory to be wired");
666 		return (error);
667 	}
668 
669 	if (pcifd < 0) {
670 		pcifd = open(_PATH_DEVPCI, O_RDWR, 0);
671 		if (pcifd < 0) {
672 			warn("failed to open %s", _PATH_DEVPCI);
673 			return (error);
674 		}
675 	}
676 
677 #ifndef WITHOUT_CAPSICUM
678 	if (caph_rights_limit(pcifd, &rights) == -1)
679 		errx(EX_OSERR, "Unable to apply rights for sandbox");
680 	if (caph_ioctls_limit(pcifd, pci_ioctls, nitems(pci_ioctls)) == -1)
681 		errx(EX_OSERR, "Unable to apply rights for sandbox");
682 #endif
683 
684 	if (iofd < 0) {
685 		iofd = open(_PATH_DEVIO, O_RDWR, 0);
686 		if (iofd < 0) {
687 			warn("failed to open %s", _PATH_DEVIO);
688 			return (error);
689 		}
690 	}
691 
692 #ifndef WITHOUT_CAPSICUM
693 	if (caph_rights_limit(iofd, &rights) == -1)
694 		errx(EX_OSERR, "Unable to apply rights for sandbox");
695 	if (caph_ioctls_limit(iofd, io_ioctls, nitems(io_ioctls)) == -1)
696 		errx(EX_OSERR, "Unable to apply rights for sandbox");
697 #endif
698 
699 	if (memfd < 0) {
700 		memfd = open(_PATH_MEM, O_RDWR, 0);
701 		if (memfd < 0) {
702 			warn("failed to open %s", _PATH_MEM);
703 			return (error);
704 		}
705 	}
706 
707 #ifndef WITHOUT_CAPSICUM
708 	cap_rights_clear(&rights, CAP_IOCTL);
709 	cap_rights_set(&rights, CAP_MMAP_RW);
710 	if (caph_rights_limit(memfd, &rights) == -1)
711 		errx(EX_OSERR, "Unable to apply rights for sandbox");
712 #endif
713 
714 #define GET_INT_CONFIG(var, name) do {					\
715 	value = get_config_value_node(nvl, name);			\
716 	if (value == NULL) {						\
717 		EPRINTLN("passthru: missing required %s setting", name); \
718 		return (error);						\
719 	}								\
720 	var = atoi(value);						\
721 } while (0)
722 
723 	GET_INT_CONFIG(bus, "bus");
724 	GET_INT_CONFIG(slot, "slot");
725 	GET_INT_CONFIG(func, "func");
726 
727 	if (vm_assign_pptdev(ctx, bus, slot, func) != 0) {
728 		warnx("PCI device at %d/%d/%d is not using the ppt(4) driver",
729 		    bus, slot, func);
730 		goto done;
731 	}
732 
733 	sc = calloc(1, sizeof(struct passthru_softc));
734 
735 	pi->pi_arg = sc;
736 	sc->psc_pi = pi;
737 
738 	/* initialize config space */
739 	error = cfginit(ctx, pi, bus, slot, func);
740 done:
741 	if (error) {
742 		free(sc);
743 		vm_unassign_pptdev(ctx, bus, slot, func);
744 	}
745 	return (error);
746 }
747 
748 static int
749 bar_access(int coff)
750 {
751 	if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1))
752 		return (1);
753 	else
754 		return (0);
755 }
756 
757 static int
758 msicap_access(struct passthru_softc *sc, int coff)
759 {
760 	int caplen;
761 
762 	if (sc->psc_msi.capoff == 0)
763 		return (0);
764 
765 	caplen = msi_caplen(sc->psc_msi.msgctrl);
766 
767 	if (coff >= sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen)
768 		return (1);
769 	else
770 		return (0);
771 }
772 
773 static int
774 msixcap_access(struct passthru_softc *sc, int coff)
775 {
776 	if (sc->psc_msix.capoff == 0)
777 		return (0);
778 
779 	return (coff >= sc->psc_msix.capoff &&
780 	        coff < sc->psc_msix.capoff + MSIX_CAPLEN);
781 }
782 
/*
 * Config-space read handler.  Returns 0 with *rv filled in when the read
 * was satisfied here (forwarded to the physical device), or -1 to defer
 * to the caller's generic emulation (BARs, MSI capability, and the
 * emulated capability pointer / command register cases below).
 */
static int
passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
		 int coff, int bytes, uint32_t *rv)
{
	struct passthru_softc *sc;

	sc = pi->pi_arg;

	/*
	 * PCI BARs and MSI capability is emulated.
	 */
	if (bar_access(coff) || msicap_access(sc, coff))
		return (-1);

#ifdef LEGACY_SUPPORT
	/*
	 * Emulate PCIR_CAP_PTR if this device does not support MSI capability
	 * natively.
	 */
	if (sc->psc_msi.emulated) {
		if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4)
			return (-1);
	}
#endif

	/*
	 * Emulate the command register.  If a single read reads both the
	 * command and status registers, read the status register from the
	 * device's config space.
	 */
	if (coff == PCIR_COMMAND) {
		/* 1- or 2-byte command reads come from the emulated copy. */
		if (bytes <= 2)
			return (-1);
		/* 4-byte read: real status in the high half, emulated
		 * command in the low half. */
		*rv = read_config(&sc->psc_sel, PCIR_STATUS, 2) << 16 |
		    pci_get_cfgdata16(pi, PCIR_COMMAND);
		return (0);
	}

	/* Everything else just read from the device's config space */
	*rv = read_config(&sc->psc_sel, coff, bytes);

	return (0);
}
826 
/*
 * Config-space write handler.  Returns -1 to defer BAR writes to the
 * caller's generic emulation, 0 when the write has been handled here.
 * Writes to the MSI/MSI-X capabilities update the emulated capability
 * and (re)program the host's pptdev interrupt routing; everything else
 * is forwarded to the physical device.
 */
static int
passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
		  int coff, int bytes, uint32_t val)
{
	int error, msix_table_entries, i;
	struct passthru_softc *sc;
	uint16_t cmd_old;

	sc = pi->pi_arg;

	/*
	 * PCI BARs are emulated
	 */
	if (bar_access(coff))
		return (-1);

	/*
	 * MSI capability is emulated
	 */
	if (msicap_access(sc, coff)) {
		pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msi.capoff,
		    PCIY_MSI);
		/* Reprogram host MSI routing from the updated emulated state. */
		error = vm_setup_pptdev_msi(ctx, vcpu, sc->psc_sel.pc_bus,
			sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
			pi->pi_msi.addr, pi->pi_msi.msg_data,
			pi->pi_msi.maxmsgnum);
		if (error != 0)
			err(1, "vm_setup_pptdev_msi");
		return (0);
	}

	if (msixcap_access(sc, coff)) {
		pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msix.capoff,
		    PCIY_MSIX);
		if (pi->pi_msix.enabled) {
			/* MSI-X on: program every table entry on the host. */
			msix_table_entries = pi->pi_msix.table_count;
			for (i = 0; i < msix_table_entries; i++) {
				error = vm_setup_pptdev_msix(ctx, vcpu,
				    sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
				    sc->psc_sel.pc_func, i,
				    pi->pi_msix.table[i].addr,
				    pi->pi_msix.table[i].msg_data,
				    pi->pi_msix.table[i].vector_control);

				if (error)
					err(1, "vm_setup_pptdev_msix");
			}
		} else {
			/* MSI-X off: tear down host routing. */
			error = vm_disable_pptdev_msix(ctx, sc->psc_sel.pc_bus,
			    sc->psc_sel.pc_dev, sc->psc_sel.pc_func);
			if (error)
				err(1, "vm_disable_pptdev_msix");
		}
		return (0);
	}

#ifdef LEGACY_SUPPORT
	/*
	 * If this device does not support MSI natively then we cannot let
	 * the guest disable legacy interrupts from the device. It is the
	 * legacy interrupt that is triggering the virtual MSI to the guest.
	 */
	if (sc->psc_msi.emulated && pci_msi_enabled(pi)) {
		if (coff == PCIR_COMMAND && bytes == 2)
			val &= ~PCIM_CMD_INTxDIS;
	}
#endif

	/* Forward to the physical device, mirroring command-register
	 * changes into the emulated config space. */
	write_config(&sc->psc_sel, coff, bytes, val);
	if (coff == PCIR_COMMAND) {
		cmd_old = pci_get_cfgdata16(pi, PCIR_COMMAND);
		if (bytes == 1)
			pci_set_cfgdata8(pi, PCIR_COMMAND, val);
		else if (bytes == 2)
			pci_set_cfgdata16(pi, PCIR_COMMAND, val);
		pci_emul_cmd_changed(pi, cmd_old);
	}

	return (0);
}
907 
908 static void
909 passthru_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
910 	       uint64_t offset, int size, uint64_t value)
911 {
912 	struct passthru_softc *sc;
913 	struct iodev_pio_req pio;
914 
915 	sc = pi->pi_arg;
916 
917 	if (baridx == pci_msix_table_bar(pi)) {
918 		msix_table_write(ctx, vcpu, sc, offset, size, value);
919 	} else {
920 		assert(pi->pi_bar[baridx].type == PCIBAR_IO);
921 		bzero(&pio, sizeof(struct iodev_pio_req));
922 		pio.access = IODEV_PIO_WRITE;
923 		pio.port = sc->psc_bar[baridx].addr + offset;
924 		pio.width = size;
925 		pio.val = value;
926 
927 		(void)ioctl(iofd, IODEV_PIO, &pio);
928 	}
929 }
930 
931 static uint64_t
932 passthru_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
933 	      uint64_t offset, int size)
934 {
935 	struct passthru_softc *sc;
936 	struct iodev_pio_req pio;
937 	uint64_t val;
938 
939 	sc = pi->pi_arg;
940 
941 	if (baridx == pci_msix_table_bar(pi)) {
942 		val = msix_table_read(sc, offset, size);
943 	} else {
944 		assert(pi->pi_bar[baridx].type == PCIBAR_IO);
945 		bzero(&pio, sizeof(struct iodev_pio_req));
946 		pio.access = IODEV_PIO_READ;
947 		pio.port = sc->psc_bar[baridx].addr + offset;
948 		pio.width = size;
949 		pio.val = 0;
950 
951 		(void)ioctl(iofd, IODEV_PIO, &pio);
952 
953 		val = pio.val;
954 	}
955 
956 	return (val);
957 }
958 
/*
 * Map or unmap the guest-physical ranges of the BAR containing the MSI-X
 * table when the guest (re)programs the BAR address.  The page-aligned
 * region holding the table itself is never mapped straight through --
 * accesses to it are emulated -- so only the parts of the BAR below and
 * above that region are passed through to the device.
 */
static void
passthru_msix_addr(struct vmctx *ctx, struct pci_devinst *pi, int baridx,
		   int enabled, uint64_t address)
{
	struct passthru_softc *sc;
	size_t remaining;
	uint32_t table_size, table_offset;

	sc = pi->pi_arg;
	table_offset = rounddown2(pi->pi_msix.table_offset, 4096);
	/* Pass through the part of the BAR below the MSI-X table, if any. */
	if (table_offset > 0) {
		if (!enabled) {
			if (vm_unmap_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
						 sc->psc_sel.pc_dev,
						 sc->psc_sel.pc_func, address,
						 table_offset) != 0)
				warnx("pci_passthru: unmap_pptdev_mmio failed");
		} else {
			if (vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
					       sc->psc_sel.pc_dev,
					       sc->psc_sel.pc_func, address,
					       table_offset,
					       sc->psc_bar[baridx].addr) != 0)
				warnx("pci_passthru: map_pptdev_mmio failed");
		}
	}
	/* Page-aligned size of the table region (same math as
	 * init_msix_table()). */
	table_size = pi->pi_msix.table_offset - table_offset;
	table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
	table_size = roundup2(table_size, 4096);
	/* Pass through whatever remains of the BAR above the table. */
	remaining = pi->pi_bar[baridx].size - table_offset - table_size;
	if (remaining > 0) {
		address += table_offset + table_size;
		if (!enabled) {
			if (vm_unmap_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
						 sc->psc_sel.pc_dev,
						 sc->psc_sel.pc_func, address,
						 remaining) != 0)
				warnx("pci_passthru: unmap_pptdev_mmio failed");
		} else {
			if (vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
					       sc->psc_sel.pc_dev,
					       sc->psc_sel.pc_func, address,
					       remaining,
					       sc->psc_bar[baridx].addr +
					       table_offset + table_size) != 0)
				warnx("pci_passthru: map_pptdev_mmio failed");
		}
	}
}
1008 
1009 static void
1010 passthru_mmio_addr(struct vmctx *ctx, struct pci_devinst *pi, int baridx,
1011 		   int enabled, uint64_t address)
1012 {
1013 	struct passthru_softc *sc;
1014 
1015 	sc = pi->pi_arg;
1016 	if (!enabled) {
1017 		if (vm_unmap_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
1018 					 sc->psc_sel.pc_dev,
1019 					 sc->psc_sel.pc_func, address,
1020 					 sc->psc_bar[baridx].size) != 0)
1021 			warnx("pci_passthru: unmap_pptdev_mmio failed");
1022 	} else {
1023 		if (vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
1024 				       sc->psc_sel.pc_dev,
1025 				       sc->psc_sel.pc_func, address,
1026 				       sc->psc_bar[baridx].size,
1027 				       sc->psc_bar[baridx].addr) != 0)
1028 			warnx("pci_passthru: map_pptdev_mmio failed");
1029 	}
1030 }
1031 
/*
 * BAR address-change callback: propagate guest BAR programming to the
 * host MMIO mappings.  I/O BARs need no remapping here (port accesses
 * always go through the /dev/io ioctl path in passthru_read/write).
 */
static void
passthru_addr(struct vmctx *ctx, struct pci_devinst *pi, int baridx,
	      int enabled, uint64_t address)
{

	if (pi->pi_bar[baridx].type == PCIBAR_IO)
		return;
	/* The BAR holding the MSI-X table requires partial mapping. */
	if (baridx == pci_msix_table_bar(pi))
		passthru_msix_addr(ctx, pi, baridx, enabled, address);
	else
		passthru_mmio_addr(ctx, pi, baridx, enabled, address);
}
1044 
/*
 * Passthru device-model entry points, registered with the generic PCI
 * emulation framework via PCI_EMUL_SET().
 */
struct pci_devemu passthru = {
	.pe_emu		= "passthru",
	.pe_init	= passthru_init,
	.pe_legacy_config = passthru_legacy_config,
	.pe_cfgwrite	= passthru_cfgwrite,
	.pe_cfgread	= passthru_cfgread,
	.pe_barwrite 	= passthru_write,
	.pe_barread    	= passthru_read,
	.pe_baraddr	= passthru_addr,
};
PCI_EMUL_SET(passthru);
1056