xref: /freebsd/usr.sbin/bhyve/pci_passthru.c (revision 3ee5c55415a7b08c6c4c403cc6b96e30d768e1c9)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  * $FreeBSD$
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include <sys/param.h>
35 #ifndef WITHOUT_CAPSICUM
36 #include <sys/capsicum.h>
37 #endif
38 #include <sys/types.h>
39 #include <sys/mman.h>
40 #include <sys/pciio.h>
41 #include <sys/ioctl.h>
42 
43 #include <dev/io/iodev.h>
44 #include <dev/pci/pcireg.h>
45 
46 #include <machine/iodev.h>
47 
48 #include <stdio.h>
49 #include <stdlib.h>
50 #include <string.h>
51 #include <err.h>
52 #include <errno.h>
53 #include <fcntl.h>
54 #include <sysexits.h>
55 #include <unistd.h>
56 
57 #include <machine/vmm.h>
58 #include <vmmapi.h>
59 #include "pci_emul.h"
60 #include "mem.h"
61 
62 #ifndef _PATH_DEVPCI
63 #define	_PATH_DEVPCI	"/dev/pci"
64 #endif
65 
66 #ifndef	_PATH_DEVIO
67 #define	_PATH_DEVIO	"/dev/io"
68 #endif
69 
70 #ifndef _PATH_MEM
71 #define	_PATH_MEM	"/dev/mem"
72 #endif
73 
/*
 * When defined, the code that synthesizes an MSI capability for devices
 * lacking one (passthru_add_msicap() and its users) is compiled in.
 */
#define	LEGACY_SUPPORT	1

/* Number of MSI-X table entries encoded in the message control word. */
#define MSIX_TABLE_COUNT(ctrl) (((ctrl) & PCIM_MSIXCTRL_TABLE_SIZE) + 1)
/* Number of config-space bytes treated as the MSI-X capability. */
#define MSIX_CAPLEN 12

/*
 * Descriptors shared by every passthru instance.  Each one is opened in
 * passthru_init() the first time it is found to be negative.
 */
static int pcifd = -1;		/* _PATH_DEVPCI: config space and BAR ioctls */
static int iofd = -1;		/* _PATH_DEVIO: port I/O ioctls */
static int memfd = -1;		/* _PATH_MEM: used to mmap the PBA page */
82 
/*
 * Per-device state for a passed-through PCI function.
 */
struct passthru_softc {
	struct pci_devinst *psc_pi;	/* emulated device instance */
	/* Type, size and address of the real BARs, cached by cfginitbar() */
	struct pcibar psc_bar[PCI_BARMAX + 1];
	struct {
		int		capoff;		/* config offset, 0 if none */
		int		msgctrl;	/* message control word */
		int		emulated;	/* 1 if crafted by bhyve */
	} psc_msi;
	struct {
		int		capoff;		/* config offset, 0 if none */
	} psc_msix;
	struct pcisel psc_sel;		/* bus/slot/function of the device */
};
96 
97 static int
98 msi_caplen(int msgctrl)
99 {
100 	int len;
101 
102 	len = 10;		/* minimum length of msi capability */
103 
104 	if (msgctrl & PCIM_MSICTRL_64BIT)
105 		len += 4;
106 
107 #if 0
108 	/*
109 	 * Ignore the 'mask' and 'pending' bits in the MSI capability.
110 	 * We'll let the guest manipulate them directly.
111 	 */
112 	if (msgctrl & PCIM_MSICTRL_VECTOR)
113 		len += 10;
114 #endif
115 
116 	return (len);
117 }
118 
119 static uint32_t
120 read_config(const struct pcisel *sel, long reg, int width)
121 {
122 	struct pci_io pi;
123 
124 	bzero(&pi, sizeof(pi));
125 	pi.pi_sel = *sel;
126 	pi.pi_reg = reg;
127 	pi.pi_width = width;
128 
129 	if (ioctl(pcifd, PCIOCREAD, &pi) < 0)
130 		return (0);				/* XXX */
131 	else
132 		return (pi.pi_data);
133 }
134 
135 static void
136 write_config(const struct pcisel *sel, long reg, int width, uint32_t data)
137 {
138 	struct pci_io pi;
139 
140 	bzero(&pi, sizeof(pi));
141 	pi.pi_sel = *sel;
142 	pi.pi_reg = reg;
143 	pi.pi_width = width;
144 	pi.pi_data = data;
145 
146 	(void)ioctl(pcifd, PCIOCWRITE, &pi);		/* XXX */
147 }
148 
149 #ifdef LEGACY_SUPPORT
150 static int
151 passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr)
152 {
153 	int capoff, i;
154 	struct msicap msicap;
155 	u_char *capdata;
156 
157 	pci_populate_msicap(&msicap, msgnum, nextptr);
158 
159 	/*
160 	 * XXX
161 	 * Copy the msi capability structure in the last 16 bytes of the
162 	 * config space. This is wrong because it could shadow something
163 	 * useful to the device.
164 	 */
165 	capoff = 256 - roundup(sizeof(msicap), 4);
166 	capdata = (u_char *)&msicap;
167 	for (i = 0; i < sizeof(msicap); i++)
168 		pci_set_cfgdata8(pi, capoff + i, capdata[i]);
169 
170 	return (capoff);
171 }
172 #endif	/* LEGACY_SUPPORT */
173 
174 static int
175 cfginitmsi(struct passthru_softc *sc)
176 {
177 	int i, ptr, capptr, cap, sts, caplen, table_size;
178 	uint32_t u32;
179 	struct pcisel sel;
180 	struct pci_devinst *pi;
181 	struct msixcap msixcap;
182 	uint32_t *msixcap_ptr;
183 
184 	pi = sc->psc_pi;
185 	sel = sc->psc_sel;
186 
187 	/*
188 	 * Parse the capabilities and cache the location of the MSI
189 	 * and MSI-X capabilities.
190 	 */
191 	sts = read_config(&sel, PCIR_STATUS, 2);
192 	if (sts & PCIM_STATUS_CAPPRESENT) {
193 		ptr = read_config(&sel, PCIR_CAP_PTR, 1);
194 		while (ptr != 0 && ptr != 0xff) {
195 			cap = read_config(&sel, ptr + PCICAP_ID, 1);
196 			if (cap == PCIY_MSI) {
197 				/*
198 				 * Copy the MSI capability into the config
199 				 * space of the emulated pci device
200 				 */
201 				sc->psc_msi.capoff = ptr;
202 				sc->psc_msi.msgctrl = read_config(&sel,
203 								  ptr + 2, 2);
204 				sc->psc_msi.emulated = 0;
205 				caplen = msi_caplen(sc->psc_msi.msgctrl);
206 				capptr = ptr;
207 				while (caplen > 0) {
208 					u32 = read_config(&sel, capptr, 4);
209 					pci_set_cfgdata32(pi, capptr, u32);
210 					caplen -= 4;
211 					capptr += 4;
212 				}
213 			} else if (cap == PCIY_MSIX) {
214 				/*
215 				 * Copy the MSI-X capability
216 				 */
217 				sc->psc_msix.capoff = ptr;
218 				caplen = 12;
219 				msixcap_ptr = (uint32_t*) &msixcap;
220 				capptr = ptr;
221 				while (caplen > 0) {
222 					u32 = read_config(&sel, capptr, 4);
223 					*msixcap_ptr = u32;
224 					pci_set_cfgdata32(pi, capptr, u32);
225 					caplen -= 4;
226 					capptr += 4;
227 					msixcap_ptr++;
228 				}
229 			}
230 			ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1);
231 		}
232 	}
233 
234 	if (sc->psc_msix.capoff != 0) {
235 		pi->pi_msix.pba_bar =
236 		    msixcap.pba_info & PCIM_MSIX_BIR_MASK;
237 		pi->pi_msix.pba_offset =
238 		    msixcap.pba_info & ~PCIM_MSIX_BIR_MASK;
239 		pi->pi_msix.table_bar =
240 		    msixcap.table_info & PCIM_MSIX_BIR_MASK;
241 		pi->pi_msix.table_offset =
242 		    msixcap.table_info & ~PCIM_MSIX_BIR_MASK;
243 		pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl);
244 		pi->pi_msix.pba_size = PBA_SIZE(pi->pi_msix.table_count);
245 
246 		/* Allocate the emulated MSI-X table array */
247 		table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
248 		pi->pi_msix.table = calloc(1, table_size);
249 
250 		/* Mask all table entries */
251 		for (i = 0; i < pi->pi_msix.table_count; i++) {
252 			pi->pi_msix.table[i].vector_control |=
253 						PCIM_MSIX_VCTRL_MASK;
254 		}
255 	}
256 
257 #ifdef LEGACY_SUPPORT
258 	/*
259 	 * If the passthrough device does not support MSI then craft a
260 	 * MSI capability for it. We link the new MSI capability at the
261 	 * head of the list of capabilities.
262 	 */
263 	if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) {
264 		int origptr, msiptr;
265 		origptr = read_config(&sel, PCIR_CAP_PTR, 1);
266 		msiptr = passthru_add_msicap(pi, 1, origptr);
267 		sc->psc_msi.capoff = msiptr;
268 		sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2);
269 		sc->psc_msi.emulated = 1;
270 		pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr);
271 	}
272 #endif
273 
274 	/* Make sure one of the capabilities is present */
275 	if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0)
276 		return (-1);
277 	else
278 		return (0);
279 }
280 
281 static uint64_t
282 msix_table_read(struct passthru_softc *sc, uint64_t offset, int size)
283 {
284 	struct pci_devinst *pi;
285 	struct msix_table_entry *entry;
286 	uint8_t *src8;
287 	uint16_t *src16;
288 	uint32_t *src32;
289 	uint64_t *src64;
290 	uint64_t data;
291 	size_t entry_offset;
292 	int index;
293 
294 	pi = sc->psc_pi;
295 	if (offset >= pi->pi_msix.pba_offset &&
296 	    offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) {
297 		switch(size) {
298 		case 1:
299 			src8 = (uint8_t *)(pi->pi_msix.pba_page + offset -
300 			    pi->pi_msix.pba_page_offset);
301 			data = *src8;
302 			break;
303 		case 2:
304 			src16 = (uint16_t *)(pi->pi_msix.pba_page + offset -
305 			    pi->pi_msix.pba_page_offset);
306 			data = *src16;
307 			break;
308 		case 4:
309 			src32 = (uint32_t *)(pi->pi_msix.pba_page + offset -
310 			    pi->pi_msix.pba_page_offset);
311 			data = *src32;
312 			break;
313 		case 8:
314 			src64 = (uint64_t *)(pi->pi_msix.pba_page + offset -
315 			    pi->pi_msix.pba_page_offset);
316 			data = *src64;
317 			break;
318 		default:
319 			return (-1);
320 		}
321 		return (data);
322 	}
323 
324 	if (offset < pi->pi_msix.table_offset)
325 		return (-1);
326 
327 	offset -= pi->pi_msix.table_offset;
328 	index = offset / MSIX_TABLE_ENTRY_SIZE;
329 	if (index >= pi->pi_msix.table_count)
330 		return (-1);
331 
332 	entry = &pi->pi_msix.table[index];
333 	entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
334 
335 	switch(size) {
336 	case 1:
337 		src8 = (uint8_t *)((void *)entry + entry_offset);
338 		data = *src8;
339 		break;
340 	case 2:
341 		src16 = (uint16_t *)((void *)entry + entry_offset);
342 		data = *src16;
343 		break;
344 	case 4:
345 		src32 = (uint32_t *)((void *)entry + entry_offset);
346 		data = *src32;
347 		break;
348 	case 8:
349 		src64 = (uint64_t *)((void *)entry + entry_offset);
350 		data = *src64;
351 		break;
352 	default:
353 		return (-1);
354 	}
355 
356 	return (data);
357 }
358 
359 static void
360 msix_table_write(struct vmctx *ctx, int vcpu, struct passthru_softc *sc,
361 		 uint64_t offset, int size, uint64_t data)
362 {
363 	struct pci_devinst *pi;
364 	struct msix_table_entry *entry;
365 	uint8_t *dest8;
366 	uint16_t *dest16;
367 	uint32_t *dest32;
368 	uint64_t *dest64;
369 	size_t entry_offset;
370 	uint32_t vector_control;
371 	int index;
372 
373 	pi = sc->psc_pi;
374 	if (offset >= pi->pi_msix.pba_offset &&
375 	    offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) {
376 		switch(size) {
377 		case 1:
378 			dest8 = (uint8_t *)(pi->pi_msix.pba_page + offset -
379 			    pi->pi_msix.pba_page_offset);
380 			*dest8 = data;
381 			break;
382 		case 2:
383 			dest16 = (uint16_t *)(pi->pi_msix.pba_page + offset -
384 			    pi->pi_msix.pba_page_offset);
385 			*dest16 = data;
386 			break;
387 		case 4:
388 			dest32 = (uint32_t *)(pi->pi_msix.pba_page + offset -
389 			    pi->pi_msix.pba_page_offset);
390 			*dest32 = data;
391 			break;
392 		case 8:
393 			dest64 = (uint64_t *)(pi->pi_msix.pba_page + offset -
394 			    pi->pi_msix.pba_page_offset);
395 			*dest64 = data;
396 			break;
397 		default:
398 			break;
399 		}
400 		return;
401 	}
402 
403 	if (offset < pi->pi_msix.table_offset)
404 		return;
405 
406 	offset -= pi->pi_msix.table_offset;
407 	index = offset / MSIX_TABLE_ENTRY_SIZE;
408 	if (index >= pi->pi_msix.table_count)
409 		return;
410 
411 	entry = &pi->pi_msix.table[index];
412 	entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
413 
414 	/* Only 4 byte naturally-aligned writes are supported */
415 	assert(size == 4);
416 	assert(entry_offset % 4 == 0);
417 
418 	vector_control = entry->vector_control;
419 	dest32 = (uint32_t *)((void *)entry + entry_offset);
420 	*dest32 = data;
421 	/* If MSI-X hasn't been enabled, do nothing */
422 	if (pi->pi_msix.enabled) {
423 		/* If the entry is masked, don't set it up */
424 		if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 ||
425 		    (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
426 			(void)vm_setup_pptdev_msix(ctx, vcpu,
427 			    sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
428 			    sc->psc_sel.pc_func, index, entry->addr,
429 			    entry->msg_data, entry->vector_control);
430 		}
431 	}
432 }
433 
434 static int
435 init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base)
436 {
437 	int b, s, f;
438 	int error, idx;
439 	size_t len, remaining;
440 	uint32_t table_size, table_offset;
441 	uint32_t pba_size, pba_offset;
442 	vm_paddr_t start;
443 	struct pci_devinst *pi = sc->psc_pi;
444 
445 	assert(pci_msix_table_bar(pi) >= 0 && pci_msix_pba_bar(pi) >= 0);
446 
447 	b = sc->psc_sel.pc_bus;
448 	s = sc->psc_sel.pc_dev;
449 	f = sc->psc_sel.pc_func;
450 
451 	/*
452 	 * If the MSI-X table BAR maps memory intended for
453 	 * other uses, it is at least assured that the table
454 	 * either resides in its own page within the region,
455 	 * or it resides in a page shared with only the PBA.
456 	 */
457 	table_offset = rounddown2(pi->pi_msix.table_offset, 4096);
458 
459 	table_size = pi->pi_msix.table_offset - table_offset;
460 	table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
461 	table_size = roundup2(table_size, 4096);
462 
463 	idx = pi->pi_msix.table_bar;
464 	start = pi->pi_bar[idx].addr;
465 	remaining = pi->pi_bar[idx].size;
466 
467 	if (pi->pi_msix.pba_bar == pi->pi_msix.table_bar) {
468 		pba_offset = pi->pi_msix.pba_offset;
469 		pba_size = pi->pi_msix.pba_size;
470 		if (pba_offset >= table_offset + table_size ||
471 		    table_offset >= pba_offset + pba_size) {
472 			/*
473 			 * If the PBA does not share a page with the MSI-x
474 			 * tables, no PBA emulation is required.
475 			 */
476 			pi->pi_msix.pba_page = NULL;
477 			pi->pi_msix.pba_page_offset = 0;
478 		} else {
479 			/*
480 			 * The PBA overlaps with either the first or last
481 			 * page of the MSI-X table region.  Map the
482 			 * appropriate page.
483 			 */
484 			if (pba_offset <= table_offset)
485 				pi->pi_msix.pba_page_offset = table_offset;
486 			else
487 				pi->pi_msix.pba_page_offset = table_offset +
488 				    table_size - 4096;
489 			pi->pi_msix.pba_page = mmap(NULL, 4096, PROT_READ |
490 			    PROT_WRITE, MAP_SHARED, memfd, start +
491 			    pi->pi_msix.pba_page_offset);
492 			if (pi->pi_msix.pba_page == MAP_FAILED) {
493 				warn(
494 			    "Failed to map PBA page for MSI-X on %d/%d/%d",
495 				    b, s, f);
496 				return (-1);
497 			}
498 		}
499 	}
500 
501 	/* Map everything before the MSI-X table */
502 	if (table_offset > 0) {
503 		len = table_offset;
504 		error = vm_map_pptdev_mmio(ctx, b, s, f, start, len, base);
505 		if (error)
506 			return (error);
507 
508 		base += len;
509 		start += len;
510 		remaining -= len;
511 	}
512 
513 	/* Skip the MSI-X table */
514 	base += table_size;
515 	start += table_size;
516 	remaining -= table_size;
517 
518 	/* Map everything beyond the end of the MSI-X table */
519 	if (remaining > 0) {
520 		len = remaining;
521 		error = vm_map_pptdev_mmio(ctx, b, s, f, start, len, base);
522 		if (error)
523 			return (error);
524 	}
525 
526 	return (0);
527 }
528 
529 static int
530 cfginitbar(struct vmctx *ctx, struct passthru_softc *sc)
531 {
532 	int i, error;
533 	struct pci_devinst *pi;
534 	struct pci_bar_io bar;
535 	enum pcibar_type bartype;
536 	uint64_t base, size;
537 
538 	pi = sc->psc_pi;
539 
540 	/*
541 	 * Initialize BAR registers
542 	 */
543 	for (i = 0; i <= PCI_BARMAX; i++) {
544 		bzero(&bar, sizeof(bar));
545 		bar.pbi_sel = sc->psc_sel;
546 		bar.pbi_reg = PCIR_BAR(i);
547 
548 		if (ioctl(pcifd, PCIOCGETBAR, &bar) < 0)
549 			continue;
550 
551 		if (PCI_BAR_IO(bar.pbi_base)) {
552 			bartype = PCIBAR_IO;
553 			base = bar.pbi_base & PCIM_BAR_IO_BASE;
554 		} else {
555 			switch (bar.pbi_base & PCIM_BAR_MEM_TYPE) {
556 			case PCIM_BAR_MEM_64:
557 				bartype = PCIBAR_MEM64;
558 				break;
559 			default:
560 				bartype = PCIBAR_MEM32;
561 				break;
562 			}
563 			base = bar.pbi_base & PCIM_BAR_MEM_BASE;
564 		}
565 		size = bar.pbi_length;
566 
567 		if (bartype != PCIBAR_IO) {
568 			if (((base | size) & PAGE_MASK) != 0) {
569 				warnx("passthru device %d/%d/%d BAR %d: "
570 				    "base %#lx or size %#lx not page aligned\n",
571 				    sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
572 				    sc->psc_sel.pc_func, i, base, size);
573 				return (-1);
574 			}
575 		}
576 
577 		/* Cache information about the "real" BAR */
578 		sc->psc_bar[i].type = bartype;
579 		sc->psc_bar[i].size = size;
580 		sc->psc_bar[i].addr = base;
581 
582 		/* Allocate the BAR in the guest I/O or MMIO space */
583 		error = pci_emul_alloc_pbar(pi, i, base, bartype, size);
584 		if (error)
585 			return (-1);
586 
587 		/* The MSI-X table needs special handling */
588 		if (i == pci_msix_table_bar(pi)) {
589 			error = init_msix_table(ctx, sc, base);
590 			if (error)
591 				return (-1);
592 		} else if (bartype != PCIBAR_IO) {
593 			/* Map the physical BAR in the guest MMIO space */
594 			error = vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
595 				sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
596 				pi->pi_bar[i].addr, pi->pi_bar[i].size, base);
597 			if (error)
598 				return (-1);
599 		}
600 
601 		/*
602 		 * 64-bit BAR takes up two slots so skip the next one.
603 		 */
604 		if (bartype == PCIBAR_MEM64) {
605 			i++;
606 			assert(i <= PCI_BARMAX);
607 			sc->psc_bar[i].type = PCIBAR_MEMHI64;
608 		}
609 	}
610 	return (0);
611 }
612 
613 static int
614 cfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func)
615 {
616 	int error;
617 	struct passthru_softc *sc;
618 
619 	error = 1;
620 	sc = pi->pi_arg;
621 
622 	bzero(&sc->psc_sel, sizeof(struct pcisel));
623 	sc->psc_sel.pc_bus = bus;
624 	sc->psc_sel.pc_dev = slot;
625 	sc->psc_sel.pc_func = func;
626 
627 	if (cfginitmsi(sc) != 0) {
628 		warnx("failed to initialize MSI for PCI %d/%d/%d",
629 		    bus, slot, func);
630 		goto done;
631 	}
632 
633 	if (cfginitbar(ctx, sc) != 0) {
634 		warnx("failed to initialize BARs for PCI %d/%d/%d",
635 		    bus, slot, func);
636 		goto done;
637 	}
638 
639 	error = 0;				/* success */
640 done:
641 	return (error);
642 }
643 
644 static int
645 passthru_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
646 {
647 	int bus, slot, func, error, memflags;
648 	struct passthru_softc *sc;
649 #ifndef WITHOUT_CAPSICUM
650 	cap_rights_t rights;
651 	cap_ioctl_t pci_ioctls[] = { PCIOCREAD, PCIOCWRITE, PCIOCGETBAR };
652 	cap_ioctl_t io_ioctls[] = { IODEV_PIO };
653 #endif
654 
655 	sc = NULL;
656 	error = 1;
657 
658 #ifndef WITHOUT_CAPSICUM
659 	cap_rights_init(&rights, CAP_IOCTL, CAP_READ, CAP_WRITE);
660 #endif
661 
662 	memflags = vm_get_memflags(ctx);
663 	if (!(memflags & VM_MEM_F_WIRED)) {
664 		warnx("passthru requires guest memory to be wired");
665 		goto done;
666 	}
667 
668 	if (pcifd < 0) {
669 		pcifd = open(_PATH_DEVPCI, O_RDWR, 0);
670 		if (pcifd < 0) {
671 			warn("failed to open %s", _PATH_DEVPCI);
672 			goto done;
673 		}
674 	}
675 
676 #ifndef WITHOUT_CAPSICUM
677 	if (cap_rights_limit(pcifd, &rights) == -1 && errno != ENOSYS)
678 		errx(EX_OSERR, "Unable to apply rights for sandbox");
679 	if (cap_ioctls_limit(pcifd, pci_ioctls, nitems(pci_ioctls)) == -1 && errno != ENOSYS)
680 		errx(EX_OSERR, "Unable to apply rights for sandbox");
681 #endif
682 
683 	if (iofd < 0) {
684 		iofd = open(_PATH_DEVIO, O_RDWR, 0);
685 		if (iofd < 0) {
686 			warn("failed to open %s", _PATH_DEVIO);
687 			goto done;
688 		}
689 	}
690 
691 #ifndef WITHOUT_CAPSICUM
692 	if (cap_rights_limit(iofd, &rights) == -1 && errno != ENOSYS)
693 		errx(EX_OSERR, "Unable to apply rights for sandbox");
694 	if (cap_ioctls_limit(iofd, io_ioctls, nitems(io_ioctls)) == -1 && errno != ENOSYS)
695 		errx(EX_OSERR, "Unable to apply rights for sandbox");
696 #endif
697 
698 	if (memfd < 0) {
699 		memfd = open(_PATH_MEM, O_RDWR, 0);
700 		if (memfd < 0) {
701 			warn("failed to open %s", _PATH_MEM);
702 			goto done;
703 		}
704 	}
705 
706 #ifndef WITHOUT_CAPSICUM
707 	cap_rights_clear(&rights, CAP_IOCTL);
708 	cap_rights_set(&rights, CAP_MMAP_RW);
709 	if (cap_rights_limit(memfd, &rights) == -1 && errno != ENOSYS)
710 		errx(EX_OSERR, "Unable to apply rights for sandbox");
711 #endif
712 
713 	if (opts == NULL ||
714 	    sscanf(opts, "%d/%d/%d", &bus, &slot, &func) != 3) {
715 		warnx("invalid passthru options");
716 		goto done;
717 	}
718 
719 	if (vm_assign_pptdev(ctx, bus, slot, func) != 0) {
720 		warnx("PCI device at %d/%d/%d is not using the ppt(4) driver",
721 		    bus, slot, func);
722 		goto done;
723 	}
724 
725 	sc = calloc(1, sizeof(struct passthru_softc));
726 
727 	pi->pi_arg = sc;
728 	sc->psc_pi = pi;
729 
730 	/* initialize config space */
731 	if ((error = cfginit(ctx, pi, bus, slot, func)) != 0)
732 		goto done;
733 
734 	error = 0;		/* success */
735 done:
736 	if (error) {
737 		free(sc);
738 		vm_unassign_pptdev(ctx, bus, slot, func);
739 	}
740 	return (error);
741 }
742 
743 static int
744 bar_access(int coff)
745 {
746 	if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1))
747 		return (1);
748 	else
749 		return (0);
750 }
751 
752 static int
753 msicap_access(struct passthru_softc *sc, int coff)
754 {
755 	int caplen;
756 
757 	if (sc->psc_msi.capoff == 0)
758 		return (0);
759 
760 	caplen = msi_caplen(sc->psc_msi.msgctrl);
761 
762 	if (coff >= sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen)
763 		return (1);
764 	else
765 		return (0);
766 }
767 
768 static int
769 msixcap_access(struct passthru_softc *sc, int coff)
770 {
771 	if (sc->psc_msix.capoff == 0)
772 		return (0);
773 
774 	return (coff >= sc->psc_msix.capoff &&
775 	        coff < sc->psc_msix.capoff + MSIX_CAPLEN);
776 }
777 
778 static int
779 passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
780 		 int coff, int bytes, uint32_t *rv)
781 {
782 	struct passthru_softc *sc;
783 
784 	sc = pi->pi_arg;
785 
786 	/*
787 	 * PCI BARs and MSI capability is emulated.
788 	 */
789 	if (bar_access(coff) || msicap_access(sc, coff))
790 		return (-1);
791 
792 #ifdef LEGACY_SUPPORT
793 	/*
794 	 * Emulate PCIR_CAP_PTR if this device does not support MSI capability
795 	 * natively.
796 	 */
797 	if (sc->psc_msi.emulated) {
798 		if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4)
799 			return (-1);
800 	}
801 #endif
802 
803 	/* Everything else just read from the device's config space */
804 	*rv = read_config(&sc->psc_sel, coff, bytes);
805 
806 	return (0);
807 }
808 
809 static int
810 passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
811 		  int coff, int bytes, uint32_t val)
812 {
813 	int error, msix_table_entries, i;
814 	struct passthru_softc *sc;
815 
816 	sc = pi->pi_arg;
817 
818 	/*
819 	 * PCI BARs are emulated
820 	 */
821 	if (bar_access(coff))
822 		return (-1);
823 
824 	/*
825 	 * MSI capability is emulated
826 	 */
827 	if (msicap_access(sc, coff)) {
828 		msicap_cfgwrite(pi, sc->psc_msi.capoff, coff, bytes, val);
829 
830 		error = vm_setup_pptdev_msi(ctx, vcpu, sc->psc_sel.pc_bus,
831 			sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
832 			pi->pi_msi.addr, pi->pi_msi.msg_data,
833 			pi->pi_msi.maxmsgnum);
834 		if (error != 0)
835 			err(1, "vm_setup_pptdev_msi");
836 		return (0);
837 	}
838 
839 	if (msixcap_access(sc, coff)) {
840 		msixcap_cfgwrite(pi, sc->psc_msix.capoff, coff, bytes, val);
841 		if (pi->pi_msix.enabled) {
842 			msix_table_entries = pi->pi_msix.table_count;
843 			for (i = 0; i < msix_table_entries; i++) {
844 				error = vm_setup_pptdev_msix(ctx, vcpu,
845 				    sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
846 				    sc->psc_sel.pc_func, i,
847 				    pi->pi_msix.table[i].addr,
848 				    pi->pi_msix.table[i].msg_data,
849 				    pi->pi_msix.table[i].vector_control);
850 
851 				if (error)
852 					err(1, "vm_setup_pptdev_msix");
853 			}
854 		}
855 		return (0);
856 	}
857 
858 #ifdef LEGACY_SUPPORT
859 	/*
860 	 * If this device does not support MSI natively then we cannot let
861 	 * the guest disable legacy interrupts from the device. It is the
862 	 * legacy interrupt that is triggering the virtual MSI to the guest.
863 	 */
864 	if (sc->psc_msi.emulated && pci_msi_enabled(pi)) {
865 		if (coff == PCIR_COMMAND && bytes == 2)
866 			val &= ~PCIM_CMD_INTxDIS;
867 	}
868 #endif
869 
870 	write_config(&sc->psc_sel, coff, bytes, val);
871 
872 	return (0);
873 }
874 
875 static void
876 passthru_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
877 	       uint64_t offset, int size, uint64_t value)
878 {
879 	struct passthru_softc *sc;
880 	struct iodev_pio_req pio;
881 
882 	sc = pi->pi_arg;
883 
884 	if (baridx == pci_msix_table_bar(pi)) {
885 		msix_table_write(ctx, vcpu, sc, offset, size, value);
886 	} else {
887 		assert(pi->pi_bar[baridx].type == PCIBAR_IO);
888 		bzero(&pio, sizeof(struct iodev_pio_req));
889 		pio.access = IODEV_PIO_WRITE;
890 		pio.port = sc->psc_bar[baridx].addr + offset;
891 		pio.width = size;
892 		pio.val = value;
893 
894 		(void)ioctl(iofd, IODEV_PIO, &pio);
895 	}
896 }
897 
898 static uint64_t
899 passthru_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
900 	      uint64_t offset, int size)
901 {
902 	struct passthru_softc *sc;
903 	struct iodev_pio_req pio;
904 	uint64_t val;
905 
906 	sc = pi->pi_arg;
907 
908 	if (baridx == pci_msix_table_bar(pi)) {
909 		val = msix_table_read(sc, offset, size);
910 	} else {
911 		assert(pi->pi_bar[baridx].type == PCIBAR_IO);
912 		bzero(&pio, sizeof(struct iodev_pio_req));
913 		pio.access = IODEV_PIO_READ;
914 		pio.port = sc->psc_bar[baridx].addr + offset;
915 		pio.width = size;
916 		pio.val = 0;
917 
918 		(void)ioctl(iofd, IODEV_PIO, &pio);
919 
920 		val = pio.val;
921 	}
922 
923 	return (val);
924 }
925 
926 struct pci_devemu passthru = {
927 	.pe_emu		= "passthru",
928 	.pe_init	= passthru_init,
929 	.pe_cfgwrite	= passthru_cfgwrite,
930 	.pe_cfgread	= passthru_cfgread,
931 	.pe_barwrite 	= passthru_write,
932 	.pe_barread    	= passthru_read,
933 };
934 PCI_EMUL_SET(passthru);
935