1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2011 NetApp, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 #include <sys/cdefs.h>
30
31 #include <sys/param.h>
32 #include <sys/types.h>
33 #include <sys/mman.h>
34 #include <sys/pciio.h>
35 #include <sys/ioctl.h>
36 #include <sys/stat.h>
37
38 #include <sys/pci.h>
39
40 #include <dev/io/iodev.h>
41 #include <dev/pci/pcireg.h>
42
43 #include <machine/iodev.h>
44
#include <assert.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sysexits.h>
#include <unistd.h>
53
54 #include <machine/vmm.h>
55 #include <vmmapi.h>
56 #include <sys/ppt_dev.h>
57
58 #include "config.h"
59 #include "debug.h"
60 #include "pci_passthru.h"
61 #include "mem.h"
62
63 #define LEGACY_SUPPORT 1
64
65 #define MSIX_TABLE_COUNT(ctrl) (((ctrl) & PCIM_MSIXCTRL_TABLE_SIZE) + 1)
66 #define MSIX_CAPLEN 12
67
/*
 * Per-device state for a passed-through PCI function.  One instance is
 * allocated in passthru_init() and hung off pi->pi_arg.
 */
struct passthru_softc {
	struct pci_devinst *psc_pi;	/* back-pointer to emulated device */
	/* ROM is handled like a BAR */
	struct pcibar psc_bar[PCI_BARMAX_WITH_ROM + 1];
	struct {
		int capoff;	/* config-space offset of MSI cap (0 = none) */
		int msgctrl;	/* cached MSI message-control register */
		int emulated;	/* 1 if the MSI cap was synthesized by us */
	} psc_msi;
	struct {
		int capoff;	/* config-space offset of MSI-X cap (0 = none) */
	} psc_msix;
	int pptfd;		/* fd for the opened ppt device node */
	int msi_limit;		/* host MSI vector limit (-1 = unlimited) */
	int msix_limit;		/* host MSI-X vector limit (-1 = unlimited) */

	/* Optional per-register config-space access handlers (NULL = default). */
	cfgread_handler psc_pcir_rhandler[PCI_REGMAX + 1];
	cfgwrite_handler psc_pcir_whandler[PCI_REGMAX + 1];
};
87
88 static int
msi_caplen(int msgctrl)89 msi_caplen(int msgctrl)
90 {
91 int len;
92
93 len = 10; /* minimum length of msi capability */
94
95 if (msgctrl & PCIM_MSICTRL_64BIT)
96 len += 4;
97
98 #if 0
99 /*
100 * Ignore the 'mask' and 'pending' bits in the MSI capability.
101 * We'll let the guest manipulate them directly.
102 */
103 if (msgctrl & PCIM_MSICTRL_VECTOR)
104 len += 10;
105 #endif
106
107 return (len);
108 }
109
110 static uint32_t
passthru_read_config(const struct passthru_softc * sc,long reg,int width)111 passthru_read_config(const struct passthru_softc *sc, long reg, int width)
112 {
113 struct ppt_cfg_io pi;
114
115 pi.pci_off = reg;
116 pi.pci_width = width;
117
118 if (ioctl(sc->pptfd, PPT_CFG_READ, &pi) != 0) {
119 return (0);
120 }
121 return (pi.pci_data);
122 }
123
124 static void
passthru_write_config(const struct passthru_softc * sc,long reg,int width,uint32_t data)125 passthru_write_config(const struct passthru_softc *sc, long reg, int width,
126 uint32_t data)
127 {
128 struct ppt_cfg_io pi;
129
130 pi.pci_off = reg;
131 pi.pci_width = width;
132 pi.pci_data = data;
133
134 (void) ioctl(sc->pptfd, PPT_CFG_WRITE, &pi);
135 }
136
137 static int
passthru_get_bar(struct passthru_softc * sc,int bar,enum pcibar_type * type,uint64_t * base,uint64_t * size)138 passthru_get_bar(struct passthru_softc *sc, int bar, enum pcibar_type *type,
139 uint64_t *base, uint64_t *size)
140 {
141 struct ppt_bar_query pb;
142
143 pb.pbq_baridx = bar;
144
145 if (ioctl(sc->pptfd, PPT_BAR_QUERY, &pb) != 0) {
146 return (-1);
147 }
148
149 switch (pb.pbq_type) {
150 case PCI_ADDR_IO:
151 *type = PCIBAR_IO;
152 break;
153 case PCI_ADDR_MEM32:
154 *type = PCIBAR_MEM32;
155 break;
156 case PCI_ADDR_MEM64:
157 *type = PCIBAR_MEM64;
158 break;
159 default:
160 err(1, "unrecognized BAR type: %u\n", pb.pbq_type);
161 break;
162 }
163
164 *base = pb.pbq_base;
165 *size = pb.pbq_size;
166 return (0);
167 }
168
/*
 * Open the ppt device node at 'path' for read/write.  On success store the
 * descriptor in *pptfdp and return 0; otherwise return the open(2) errno.
 */
static int
passthru_dev_open(const char *path, int *pptfdp)
{
	int fd = open(path, O_RDWR);

	if (fd < 0)
		return (errno);

	/* XXX: verify fd with ioctl? */
	*pptfdp = fd;
	return (0);
}
182
183 #ifdef LEGACY_SUPPORT
184 static int
passthru_add_msicap(struct pci_devinst * pi,int msgnum,int nextptr)185 passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr)
186 {
187 int capoff;
188 struct msicap msicap;
189 u_char *capdata;
190
191 pci_populate_msicap(&msicap, msgnum, nextptr);
192
193 /*
194 * XXX
195 * Copy the msi capability structure in the last 16 bytes of the
196 * config space. This is wrong because it could shadow something
197 * useful to the device.
198 */
199 capoff = 256 - roundup(sizeof(msicap), 4);
200 capdata = (u_char *)&msicap;
201 for (size_t i = 0; i < sizeof(msicap); i++)
202 pci_set_cfgdata8(pi, capoff + i, capdata[i]);
203
204 return (capoff);
205 }
206 #endif /* LEGACY_SUPPORT */
207
/*
 * Clamp the MSI/MSI-X vector counts advertised to the guest so they do not
 * exceed the limits reported by the host (sc->msi_limit / sc->msix_limit;
 * -1 means no limit is imposed).  The clamped values are written back into
 * the emulated config space, and for MSI-X into *msixcap as well, which the
 * caller (cfginitmsi) then parses.
 */
static void
passthru_intr_limit(struct passthru_softc *sc, struct msixcap *msixcap)
{
	struct pci_devinst *pi = sc->psc_pi;
	int off;

	/* Reduce the number of MSI vectors if higher than OS limit */
	if ((off = sc->psc_msi.capoff) != 0 && sc->msi_limit != -1) {
		int msi_limit, mmc;

		/*
		 * MSI counts are encoded as powers of two in the MMC field;
		 * round the host limit down to the nearest encodable value.
		 */
		msi_limit =
		    sc->msi_limit > 16 ? PCIM_MSICTRL_MMC_32 :
		    sc->msi_limit > 8 ? PCIM_MSICTRL_MMC_16 :
		    sc->msi_limit > 4 ? PCIM_MSICTRL_MMC_8 :
		    sc->msi_limit > 2 ? PCIM_MSICTRL_MMC_4 :
		    sc->msi_limit > 1 ? PCIM_MSICTRL_MMC_2 :
		    PCIM_MSICTRL_MMC_1;
		mmc = sc->psc_msi.msgctrl & PCIM_MSICTRL_MMC_MASK;

		if (mmc > msi_limit) {
			sc->psc_msi.msgctrl &= ~PCIM_MSICTRL_MMC_MASK;
			sc->psc_msi.msgctrl |= msi_limit;
			/* Message-control register lives at capoff + 2. */
			pci_set_cfgdata16(pi, off + 2, sc->psc_msi.msgctrl);
		}
	}

	/* Reduce the number of MSI-X vectors if higher than OS limit */
	if ((off = sc->psc_msix.capoff) != 0 && sc->msix_limit != -1) {
		if (MSIX_TABLE_COUNT(msixcap->msgctrl) > sc->msix_limit) {
			/* MSI-X table size field is encoded as N-1. */
			msixcap->msgctrl &= ~PCIM_MSIXCTRL_TABLE_SIZE;
			msixcap->msgctrl |= sc->msix_limit - 1;
			pci_set_cfgdata16(pi, off + 2, msixcap->msgctrl);
		}
	}
}
243
244 static int
cfginitmsi(struct passthru_softc * sc)245 cfginitmsi(struct passthru_softc *sc)
246 {
247 int i, ptr, capptr, cap, sts, caplen, table_size;
248 uint32_t u32;
249 struct pci_devinst *pi = sc->psc_pi;
250 struct msixcap msixcap;
251 char *msixcap_ptr;
252
253 /*
254 * Parse the capabilities and cache the location of the MSI
255 * and MSI-X capabilities.
256 */
257 sts = passthru_read_config(sc, PCIR_STATUS, 2);
258 if (sts & PCIM_STATUS_CAPPRESENT) {
259 ptr = passthru_read_config(sc, PCIR_CAP_PTR, 1);
260 while (ptr != 0 && ptr != 0xff) {
261 cap = passthru_read_config(sc, ptr + PCICAP_ID, 1);
262 if (cap == PCIY_MSI) {
263 /*
264 * Copy the MSI capability into the config
265 * space of the emulated pci device
266 */
267 sc->psc_msi.capoff = ptr;
268 sc->psc_msi.msgctrl = passthru_read_config(sc,
269 ptr + 2, 2);
270 sc->psc_msi.emulated = 0;
271 caplen = msi_caplen(sc->psc_msi.msgctrl);
272 capptr = ptr;
273 while (caplen > 0) {
274 u32 = passthru_read_config(sc,
275 capptr, 4);
276 pci_set_cfgdata32(pi, capptr, u32);
277 caplen -= 4;
278 capptr += 4;
279 }
280 } else if (cap == PCIY_MSIX) {
281 /*
282 * Copy the MSI-X capability
283 */
284 sc->psc_msix.capoff = ptr;
285 caplen = 12;
286 msixcap_ptr = (char *)&msixcap;
287 capptr = ptr;
288 while (caplen > 0) {
289 u32 = passthru_read_config(sc,
290 capptr, 4);
291 memcpy(msixcap_ptr, &u32, 4);
292 pci_set_cfgdata32(pi, capptr, u32);
293 caplen -= 4;
294 capptr += 4;
295 msixcap_ptr += 4;
296 }
297 }
298 ptr = passthru_read_config(sc, ptr + PCICAP_NEXTPTR, 1);
299 }
300 }
301
302 passthru_intr_limit(sc, &msixcap);
303
304 if (sc->psc_msix.capoff != 0) {
305 pi->pi_msix.pba_bar =
306 msixcap.pba_info & PCIM_MSIX_BIR_MASK;
307 pi->pi_msix.pba_offset =
308 msixcap.pba_info & ~PCIM_MSIX_BIR_MASK;
309 pi->pi_msix.table_bar =
310 msixcap.table_info & PCIM_MSIX_BIR_MASK;
311 pi->pi_msix.table_offset =
312 msixcap.table_info & ~PCIM_MSIX_BIR_MASK;
313 pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl);
314 pi->pi_msix.pba_size = PBA_SIZE(pi->pi_msix.table_count);
315
316 /* Allocate the emulated MSI-X table array */
317 table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
318 pi->pi_msix.table = calloc(1, table_size);
319
320 /* Mask all table entries */
321 for (i = 0; i < pi->pi_msix.table_count; i++) {
322 pi->pi_msix.table[i].vector_control |=
323 PCIM_MSIX_VCTRL_MASK;
324 }
325 }
326
327 #ifdef LEGACY_SUPPORT
328 /*
329 * If the passthrough device does not support MSI then craft a
330 * MSI capability for it. We link the new MSI capability at the
331 * head of the list of capabilities.
332 */
333 if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) {
334 int origptr, msiptr;
335 origptr = passthru_read_config(sc, PCIR_CAP_PTR, 1);
336 msiptr = passthru_add_msicap(pi, 1, origptr);
337 sc->psc_msi.capoff = msiptr;
338 sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2);
339 sc->psc_msi.emulated = 1;
340 pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr);
341 }
342 #endif
343
344 /* Make sure one of the capabilities is present */
345 if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0)
346 return (-1);
347 else
348 return (0);
349 }
350
/*
 * Handle a read from the BAR containing the MSI-X table.  Reads that fall
 * within the table itself are served from the emulated copy in
 * pi->pi_msix.table; reads elsewhere in the BAR go straight to the device
 * mapping set up by init_msix_table().  Returns (uint64_t)-1 on an
 * unsupported access size.
 */
static uint64_t
msix_table_read(struct passthru_softc *sc, uint64_t offset, int size)
{
	struct pci_devinst *pi;
	struct msix_table_entry *entry;
	uint8_t *src8;
	uint16_t *src16;
	uint32_t *src32;
	uint64_t *src64;
	uint64_t data;
	size_t entry_offset;
	uint32_t table_offset;
	int index, table_count;

	pi = sc->psc_pi;

	table_offset = pi->pi_msix.table_offset;
	table_count = pi->pi_msix.table_count;
	/* Accesses outside the table are passed through to the device. */
	if (offset < table_offset ||
	    offset >= table_offset + table_count * MSIX_TABLE_ENTRY_SIZE) {
		switch (size) {
		case 1:
			src8 = (uint8_t *)(pi->pi_msix.mapped_addr + offset);
			data = *src8;
			break;
		case 2:
			src16 = (uint16_t *)(pi->pi_msix.mapped_addr + offset);
			data = *src16;
			break;
		case 4:
			src32 = (uint32_t *)(pi->pi_msix.mapped_addr + offset);
			data = *src32;
			break;
		case 8:
			src64 = (uint64_t *)(pi->pi_msix.mapped_addr + offset);
			data = *src64;
			break;
		default:
			return (-1);
		}
		return (data);
	}

	/* Translate the BAR offset into (table entry, offset within entry). */
	offset -= table_offset;
	index = offset / MSIX_TABLE_ENTRY_SIZE;
	assert(index < table_count);

	entry = &pi->pi_msix.table[index];
	entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;

	switch (size) {
	case 1:
		src8 = (uint8_t *)((uint8_t *)entry + entry_offset);
		data = *src8;
		break;
	case 2:
		src16 = (uint16_t *)((uint8_t *)entry + entry_offset);
		data = *src16;
		break;
	case 4:
		src32 = (uint32_t *)((uint8_t *)entry + entry_offset);
		data = *src32;
		break;
	case 8:
		src64 = (uint64_t *)((uint8_t *)entry + entry_offset);
		data = *src64;
		break;
	default:
		return (-1);
	}

	return (data);
}
424
/*
 * Handle a write to the BAR containing the MSI-X table.  Writes outside the
 * table go directly to the mapped device memory.  Writes to table entries
 * update the emulated table and, when MSI-X is enabled and the entry is
 * unmasked (either before or after the write), are pushed to the host via
 * vm_setup_pptdev_msix().
 */
static void
msix_table_write(struct vmctx *ctx, struct passthru_softc *sc,
    uint64_t offset, int size, uint64_t data)
{
	struct pci_devinst *pi;
	struct msix_table_entry *entry;
	uint8_t *dest8;
	uint16_t *dest16;
	uint32_t *dest32;
	uint64_t *dest64;
	size_t entry_offset;
	uint32_t table_offset, vector_control;
	int index, table_count;

	pi = sc->psc_pi;

	table_offset = pi->pi_msix.table_offset;
	table_count = pi->pi_msix.table_count;
	/* Accesses outside the table are passed through to the device. */
	if (offset < table_offset ||
	    offset >= table_offset + table_count * MSIX_TABLE_ENTRY_SIZE) {
		switch (size) {
		case 1:
			dest8 = (uint8_t *)(pi->pi_msix.mapped_addr + offset);
			*dest8 = data;
			break;
		case 2:
			dest16 = (uint16_t *)(pi->pi_msix.mapped_addr + offset);
			*dest16 = data;
			break;
		case 4:
			dest32 = (uint32_t *)(pi->pi_msix.mapped_addr + offset);
			*dest32 = data;
			break;
		case 8:
			dest64 = (uint64_t *)(pi->pi_msix.mapped_addr + offset);
			*dest64 = data;
			break;
		}
		return;
	}

	/* Translate the BAR offset into (table entry, offset within entry). */
	offset -= table_offset;
	index = offset / MSIX_TABLE_ENTRY_SIZE;
	assert(index < table_count);

	entry = &pi->pi_msix.table[index];
	entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;

	/* Only 4 byte naturally-aligned writes are supported */
	assert(size == 4);
	assert(entry_offset % 4 == 0);

	/* Remember the pre-write mask state to detect unmask transitions. */
	vector_control = entry->vector_control;
	dest32 = (uint32_t *)((uint8_t *)entry + entry_offset);
	*dest32 = data;
	/* If MSI-X hasn't been enabled, do nothing */
	if (pi->pi_msix.enabled) {
		/* If the entry is masked, don't set it up */
		if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 ||
		    (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
			(void) vm_setup_pptdev_msix(ctx, sc->pptfd,
			    index, entry->addr, entry->msg_data,
			    entry->vector_control);
		}
	}
}
491
/*
 * Map the BAR containing the MSI-X table into this process and restrict
 * access to the pages actually holding the table.  Returns 0 on success,
 * -1 if the mapping fails.
 */
static int
init_msix_table(struct vmctx *ctx __unused, struct passthru_softc *sc)
{
	struct pci_devinst *pi = sc->psc_pi;
	uint32_t table_size, table_offset;
	int i;

	i = pci_msix_table_bar(pi);
	assert(i >= 0);

	/*
	 * Map the region of the BAR containing the MSI-X table. This is
	 * necessary for two reasons:
	 * 1. The PBA may reside in the first or last page containing the MSI-X
	 *    table.
	 * 2. While PCI devices are not supposed to use the page(s) containing
	 *    the MSI-X table for other purposes, some do in practice.
	 */

	/*
	 * Mapping pptfd provides access to the BAR containing the MSI-X
	 * table. See ppt_devmap() in usr/src/uts/intel/io/vmm/io/ppt.c
	 *
	 * This maps the whole BAR and then mprotect(PROT_NONE) is used below
	 * to prevent access to pages that don't contain the MSI-X table.
	 * When porting this, it was tempting to just map the MSI-X table pages
	 * but that would mean updating everywhere that assumes that
	 * pi->pi_msix.mapped_addr points to the start of the BAR. For now,
	 * keep closer to upstream.
	 */
	pi->pi_msix.mapped_size = sc->psc_bar[i].size;
	pi->pi_msix.mapped_addr = (uint8_t *)mmap(NULL, pi->pi_msix.mapped_size,
	    PROT_READ | PROT_WRITE, MAP_SHARED, sc->pptfd, 0);
	if (pi->pi_msix.mapped_addr == MAP_FAILED) {
		warn("Failed to map MSI-X table BAR on %d", sc->pptfd);
		return (-1);
	}

	/* Page-aligned span of the BAR that contains the MSI-X table. */
	table_offset = rounddown2(pi->pi_msix.table_offset, 4096);

	table_size = pi->pi_msix.table_offset - table_offset;
	table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
	table_size = roundup2(table_size, 4096);

	/*
	 * Unmap any pages not containing the table, we do not need to emulate
	 * accesses to them. Avoid releasing address space to help ensure that
	 * a buggy out-of-bounds access causes a crash.
	 */
	if (table_offset != 0)
		if (mprotect((caddr_t)pi->pi_msix.mapped_addr, table_offset,
		    PROT_NONE) != 0)
			warn("Failed to unmap MSI-X table BAR region");
	if (table_offset + table_size != pi->pi_msix.mapped_size)
		if (mprotect((caddr_t)
		    pi->pi_msix.mapped_addr + table_offset + table_size,
		    pi->pi_msix.mapped_size - (table_offset + table_size),
		    PROT_NONE) != 0)
			warn("Failed to unmap MSI-X table BAR region");

	return (0);
}
554
555 static int
cfginitbar(struct vmctx * ctx __unused,struct passthru_softc * sc)556 cfginitbar(struct vmctx *ctx __unused, struct passthru_softc *sc)
557 {
558 struct pci_devinst *pi = sc->psc_pi;
559 uint_t i;
560
561 /*
562 * Initialize BAR registers
563 */
564 for (i = 0; i <= PCI_BARMAX; i++) {
565 enum pcibar_type bartype;
566 uint64_t base, size;
567 int error;
568
569 if (passthru_get_bar(sc, i, &bartype, &base, &size) != 0) {
570 continue;
571 }
572
573 if (bartype != PCIBAR_IO) {
574 if (((base | size) & PAGE_MASK) != 0) {
575 warnx("passthru device %d BAR %d: "
576 "base %#lx or size %#lx not page aligned\n",
577 sc->pptfd, i, base, size);
578 return (-1);
579 }
580 }
581
582 /* Cache information about the "real" BAR */
583 sc->psc_bar[i].type = bartype;
584 sc->psc_bar[i].size = size;
585 sc->psc_bar[i].addr = base;
586 sc->psc_bar[i].lobits = 0;
587
588 /* Allocate the BAR in the guest I/O or MMIO space */
589 error = pci_emul_alloc_bar(pi, i, bartype, size);
590 if (error)
591 return (-1);
592
593 /* Use same lobits as physical bar */
594 uint8_t lobits = passthru_read_config(sc, PCIR_BAR(i), 0x01);
595 if (bartype == PCIBAR_MEM32 || bartype == PCIBAR_MEM64) {
596 lobits &= ~PCIM_BAR_MEM_BASE;
597 } else {
598 lobits &= ~PCIM_BAR_IO_BASE;
599 }
600 sc->psc_bar[i].lobits = lobits;
601 pi->pi_bar[i].lobits = lobits;
602
603 /*
604 * 64-bit BAR takes up two slots so skip the next one.
605 */
606 if (bartype == PCIBAR_MEM64) {
607 i++;
608 assert(i <= PCI_BARMAX);
609 sc->psc_bar[i].type = PCIBAR_MEMHI64;
610 }
611 }
612 return (0);
613 }
614
/*
 * Initialize the emulated config space from the physical device: copy the
 * PCI header, set up MSI/MSI-X, BARs and the MSI-X table mapping, and
 * install the per-register config access handlers.  Returns 0 on success.
 */
static int
cfginit(struct vmctx *ctx, struct passthru_softc *sc)
{
	int error;
	struct pci_devinst *pi = sc->psc_pi;
	uint8_t intline, intpin;

	/*
	 * Copy physical PCI header to virtual config space. INTLINE and INTPIN
	 * shouldn't be aligned with their physical value and they are already
	 * set by pci_emul_init().
	 */
	intline = pci_get_cfgdata8(pi, PCIR_INTLINE);
	intpin = pci_get_cfgdata8(pi, PCIR_INTPIN);
	for (int i = 0; i <= PCIR_MAXLAT; i += 4) {
#ifdef __FreeBSD__
		pci_set_cfgdata32(pi, i, read_config(&sc->psc_sel, i, 4));
#else
		pci_set_cfgdata32(pi, i, passthru_read_config(sc, i, 4));
#endif
	}

	/* Restore the emulated INTLINE/INTPIN clobbered by the copy above. */
	pci_set_cfgdata8(pi, PCIR_INTLINE, intline);
	pci_set_cfgdata8(pi, PCIR_INTPIN, intpin);

	if (cfginitmsi(sc) != 0) {
		warnx("failed to initialize MSI for PCI %d", sc->pptfd);
		return (-1);
	}

	if (cfginitbar(ctx, sc) != 0) {
		warnx("failed to initialize BARs for PCI %d", sc->pptfd);
		return (-1);
	}

	/* Propagate the emulated command register to the physical device. */
	passthru_write_config(sc, PCIR_COMMAND, 2,
	    pci_get_cfgdata16(pi, PCIR_COMMAND));

	/*
	 * We need to do this after PCIR_COMMAND got possibly updated, e.g.,
	 * a BAR was enabled.
	 */
	if (pci_msix_table_bar(pi) >= 0) {
		error = init_msix_table(ctx, sc);
		if (error != 0) {
			warnx("failed to initialize MSI-X table for PCI %d",
			    sc->pptfd);
			goto done;
		}
	}

	/* Emulate most PCI header register. */
	if ((error = set_pcir_handler(sc, 0, PCIR_MAXLAT + 1,
	    passthru_cfgread_emulate, passthru_cfgwrite_emulate)) != 0)
		goto done;

	/* Allow access to the physical command and status register. */
	if ((error = set_pcir_handler(sc, PCIR_COMMAND, 0x04, NULL, NULL)) != 0)
		goto done;

	error = 0; /* success */
done:
	return (error);
}
679
680 int
set_pcir_handler(struct passthru_softc * sc,int reg,int len,cfgread_handler rhandler,cfgwrite_handler whandler)681 set_pcir_handler(struct passthru_softc *sc, int reg, int len,
682 cfgread_handler rhandler, cfgwrite_handler whandler)
683 {
684 if (reg > PCI_REGMAX || reg + len > PCI_REGMAX + 1)
685 return (-1);
686
687 for (int i = reg; i < reg + len; ++i) {
688 assert(sc->psc_pcir_rhandler[i] == NULL || rhandler == NULL);
689 assert(sc->psc_pcir_whandler[i] == NULL || whandler == NULL);
690 sc->psc_pcir_rhandler[i] = rhandler;
691 sc->psc_pcir_whandler[i] = whandler;
692 }
693
694 return (0);
695 }
696
697 static int
passthru_legacy_config(nvlist_t * nvl,const char * opt)698 passthru_legacy_config(nvlist_t *nvl, const char *opt)
699 {
700 char *config, *name, *tofree, *value;
701
702 if (opt == NULL)
703 return (0);
704
705 config = tofree = strdup(opt);
706 while ((name = strsep(&config, ",")) != NULL) {
707 value = strchr(name, '=');
708 if (value != NULL) {
709 *value++ = '\0';
710 set_config_value_node(nvl, name, value);
711 } else {
712 if (strncmp(name, "/dev/ppt", 8) != 0) {
713 EPRINTLN("passthru: invalid path \"%s\"", name);
714 free(tofree);
715 return (-1);
716 }
717 set_config_value_node(nvl, "path", name);
718 }
719 }
720 free(tofree);
721 return (0);
722 }
723
/*
 * Load an option ROM image from 'romfile' into a freshly allocated ROM
 * segment for the device.  A NULL romfile means no ROM and is not an error.
 * Returns 0 on success, -1 (or the allocation error) on failure.
 */
static int
passthru_init_rom(struct vmctx *const ctx __unused,
    struct passthru_softc *const sc, const char *const romfile)
{
	if (romfile == NULL) {
		return (0);
	}

	const int fd = open(romfile, O_RDONLY);
	if (fd < 0) {
		warnx("%s: can't open romfile \"%s\"", __func__, romfile);
		return (-1);
	}

	struct stat sbuf;
	if (fstat(fd, &sbuf) < 0) {
		warnx("%s: can't fstat romfile \"%s\"", __func__, romfile);
		close(fd);
		return (-1);
	}
	const uint64_t rom_size = sbuf.st_size;

	/* Map the file read-only just long enough to copy it out. */
	void *const rom_data = mmap(NULL, rom_size, PROT_READ, MAP_SHARED, fd,
	    0);
	if (rom_data == MAP_FAILED) {
		warnx("%s: unable to mmap romfile \"%s\" (%d)", __func__,
		    romfile, errno);
		close(fd);
		return (-1);
	}

	void *rom_addr;
	int error = pci_emul_alloc_rom(sc->psc_pi, rom_size, &rom_addr);
	if (error) {
		warnx("%s: failed to alloc rom segment", __func__);
		munmap(rom_data, rom_size);
		close(fd);
		return (error);
	}
	memcpy(rom_addr, rom_data, rom_size);

	/* The ROM is tracked like an extra BAR. */
	sc->psc_bar[PCI_ROM_IDX].type = PCIBAR_ROM;
	sc->psc_bar[PCI_ROM_IDX].addr = (uint64_t)rom_addr;
	sc->psc_bar[PCI_ROM_IDX].size = rom_size;

	munmap(rom_data, rom_size);
	close(fd);

	return (0);
}
774
775 static int
passthru_init(struct pci_devinst * pi,nvlist_t * nvl)776 passthru_init(struct pci_devinst *pi, nvlist_t *nvl)
777 {
778 int error, memflags, pptfd;
779 struct passthru_softc *sc;
780 const char *path;
781 struct vmctx *ctx = pi->pi_vmctx;
782
783 pptfd = -1;
784 sc = NULL;
785 error = 1;
786
787 memflags = vm_get_memflags(ctx);
788 if (!(memflags & VM_MEM_F_WIRED)) {
789 warnx("passthru requires guest memory to be wired");
790 goto done;
791 }
792
793 path = get_config_value_node(nvl, "path");
794 if (path == NULL || passthru_dev_open(path, &pptfd) != 0) {
795 warnx("invalid passthru options");
796 goto done;
797 }
798
799 if (vm_assign_pptdev(ctx, pptfd) != 0) {
800 warnx("PCI device at %d is not using the ppt driver", pptfd);
801 goto done;
802 }
803
804 sc = calloc(1, sizeof(struct passthru_softc));
805
806 pi->pi_arg = sc;
807 sc->psc_pi = pi;
808 sc->pptfd = pptfd;
809
810 if ((error = vm_get_pptdev_limits(ctx, pptfd, &sc->msi_limit,
811 &sc->msix_limit)) != 0)
812 goto done;
813
814 #ifndef __FreeBSD__
815 /*
816 * If this function uses legacy interrupt messages, then request one for
817 * the guest in case drivers expect to see it. Note that nothing in the
818 * hypervisor is currently wired up do deliver such an interrupt should
819 * the guest actually rely upon it.
820 */
821 uint8_t intpin = passthru_read_config(sc, PCIR_INTPIN, 1);
822 if (intpin > 0 && intpin < 5)
823 pci_lintr_request(sc->psc_pi);
824 #endif
825
826 /* initialize config space */
827 if ((error = cfginit(ctx, sc)) != 0)
828 goto done;
829
830 /* initialize ROM */
831 if ((error = passthru_init_rom(ctx, sc,
832 get_config_value_node(nvl, "rom"))) != 0) {
833 goto done;
834 }
835
836 done:
837 if (error) {
838 free(sc);
839 if (pptfd != -1)
840 vm_unassign_pptdev(ctx, pptfd);
841 }
842 return (error);
843 }
844
845 static int
msicap_access(struct passthru_softc * sc,int coff)846 msicap_access(struct passthru_softc *sc, int coff)
847 {
848 int caplen;
849
850 if (sc->psc_msi.capoff == 0)
851 return (0);
852
853 caplen = msi_caplen(sc->psc_msi.msgctrl);
854
855 if (coff >= sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen)
856 return (1);
857 else
858 return (0);
859 }
860
861 static int
msixcap_access(struct passthru_softc * sc,int coff)862 msixcap_access(struct passthru_softc *sc, int coff)
863 {
864 if (sc->psc_msix.capoff == 0)
865 return (0);
866
867 return (coff >= sc->psc_msix.capoff &&
868 coff < sc->psc_msix.capoff + MSIX_CAPLEN);
869 }
870
871 static int
passthru_cfgread_default(struct passthru_softc * sc,struct pci_devinst * pi __unused,int coff,int bytes,uint32_t * rv)872 passthru_cfgread_default(struct passthru_softc *sc,
873 struct pci_devinst *pi __unused, int coff, int bytes, uint32_t *rv)
874 {
875 /*
876 * MSI capability is emulated.
877 */
878 if (msicap_access(sc, coff) || msixcap_access(sc, coff))
879 return (-1);
880
881 /*
882 * MSI-X is also emulated since a limit on interrupts may be imposed by
883 * the OS, altering the perceived register state.
884 */
885 if (msixcap_access(sc, coff))
886 return (-1);
887
888 /*
889 * Emulate the command register. If a single read reads both the
890 * command and status registers, read the status register from the
891 * device's config space.
892 */
893 if (coff == PCIR_COMMAND) {
894 if (bytes <= 2)
895 return (-1);
896 *rv = passthru_read_config(sc, PCIR_STATUS, 2) << 16 |
897 pci_get_cfgdata16(pi, PCIR_COMMAND);
898 return (0);
899 }
900
901 /* Everything else just read from the device's config space */
902 *rv = passthru_read_config(sc, coff, bytes);
903
904 return (0);
905 }
906
/*
 * Read handler that always declines, so the access is handled by the
 * generic PCI emulation (used to "emulate most PCI header register" in
 * cfginit()).
 */
int
passthru_cfgread_emulate(struct passthru_softc *sc __unused,
    struct pci_devinst *pi __unused, int coff __unused, int bytes __unused,
    uint32_t *rv __unused)
{
	return (-1);
}
914
915 static int
passthru_cfgread(struct pci_devinst * pi,int coff,int bytes,uint32_t * rv)916 passthru_cfgread(struct pci_devinst *pi, int coff, int bytes, uint32_t *rv)
917 {
918 struct passthru_softc *sc;
919
920 sc = pi->pi_arg;
921
922 if (sc->psc_pcir_rhandler[coff] != NULL)
923 return (sc->psc_pcir_rhandler[coff](sc, pi, coff, bytes, rv));
924
925 return (passthru_cfgread_default(sc, pi, coff, bytes, rv));
926 }
927
/*
 * Default config-space write handler.  Emulated MSI/MSI-X capability
 * writes are intercepted and translated into host ppt ioctls; all other
 * writes are forwarded to the physical device, with the command register
 * additionally mirrored into the emulated config space.
 */
static int
passthru_cfgwrite_default(struct passthru_softc *sc, struct pci_devinst *pi,
    int coff, int bytes, uint32_t val)
{
	int error, msix_table_entries, i;
	uint16_t cmd_old;
	struct vmctx *ctx = pi->pi_vmctx;

	/*
	 * MSI capability is emulated
	 */
	if (msicap_access(sc, coff)) {
		pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msi.capoff,
		    PCIY_MSI);
		/* Push the (possibly new) MSI configuration to the host. */
		error = vm_setup_pptdev_msi(ctx, sc->pptfd,
		    pi->pi_msi.addr, pi->pi_msi.msg_data, pi->pi_msi.maxmsgnum);
		if (error != 0)
			err(1, "vm_setup_pptdev_msi");
		return (0);
	}

	if (msixcap_access(sc, coff)) {
		pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msix.capoff,
		    PCIY_MSIX);
		if (pi->pi_msix.enabled) {
			/* Re-program every table entry on the host side. */
			msix_table_entries = pi->pi_msix.table_count;
			for (i = 0; i < msix_table_entries; i++) {
				error = vm_setup_pptdev_msix(ctx,
				    sc->pptfd, i,
				    pi->pi_msix.table[i].addr,
				    pi->pi_msix.table[i].msg_data,
				    pi->pi_msix.table[i].vector_control);

				if (error)
					err(1, "vm_setup_pptdev_msix");
			}
		} else {
			error = vm_disable_pptdev_msix(ctx, sc->pptfd);
			if (error)
				err(1, "vm_disable_pptdev_msix");
		}
		return (0);
	}

#ifdef LEGACY_SUPPORT
	/*
	 * If this device does not support MSI natively then we cannot let
	 * the guest disable legacy interrupts from the device. It is the
	 * legacy interrupt that is triggering the virtual MSI to the guest.
	 */
	if (sc->psc_msi.emulated && pci_msi_enabled(pi)) {
		if (coff == PCIR_COMMAND && bytes == 2)
			val &= ~PCIM_CMD_INTxDIS;
	}
#endif

	/* Forward the write to the physical device. */
	passthru_write_config(sc, coff, bytes, val);
	if (coff == PCIR_COMMAND) {
		/* Mirror command-register writes and notify the emulation. */
		cmd_old = pci_get_cfgdata16(pi, PCIR_COMMAND);
		if (bytes == 1)
			pci_set_cfgdata8(pi, PCIR_COMMAND, val);
		else if (bytes == 2)
			pci_set_cfgdata16(pi, PCIR_COMMAND, val);
		pci_emul_cmd_changed(pi, cmd_old);
	}

	return (0);
}
996
/*
 * Write handler that always declines, so the access is handled by the
 * generic PCI emulation (used to "emulate most PCI header register" in
 * cfginit()).
 */
int
passthru_cfgwrite_emulate(struct passthru_softc *sc __unused,
    struct pci_devinst *pi __unused, int coff __unused, int bytes __unused,
    uint32_t val __unused)
{
	return (-1);
}
1004
1005 static int
passthru_cfgwrite(struct pci_devinst * pi,int coff,int bytes,uint32_t val)1006 passthru_cfgwrite(struct pci_devinst *pi, int coff, int bytes, uint32_t val)
1007 {
1008 struct passthru_softc *sc;
1009
1010 sc = pi->pi_arg;
1011
1012 if (sc->psc_pcir_whandler[coff] != NULL)
1013 return (sc->psc_pcir_whandler[coff](sc, pi, coff, bytes, val));
1014
1015 return (passthru_cfgwrite_default(sc, pi, coff, bytes, val));
1016 }
1017
/*
 * BAR write entry point.  Writes to the BAR holding the MSI-X table are
 * emulated; any other BAR reaching this path must be an I/O BAR, whose
 * accesses are forwarded to the ppt driver.
 */
static void
passthru_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size,
    uint64_t value)
{
	struct passthru_softc *sc = pi->pi_arg;
	struct vmctx *ctx = pi->pi_vmctx;

	if (baridx == pci_msix_table_bar(pi)) {
		msix_table_write(ctx, sc, offset, size, value);
	} else {
		struct ppt_bar_io pbi;

		/* Memory BARs are mapped directly, not trapped here. */
		assert(pi->pi_bar[baridx].type == PCIBAR_IO);

		pbi.pbi_bar = baridx;
		pbi.pbi_width = size;
		pbi.pbi_off = offset;
		pbi.pbi_data = value;
		(void) ioctl(sc->pptfd, PPT_BAR_WRITE, &pbi);
	}
}
1039
1040 static uint64_t
passthru_read(struct pci_devinst * pi,int baridx,uint64_t offset,int size)1041 passthru_read(struct pci_devinst *pi, int baridx, uint64_t offset, int size)
1042 {
1043 struct passthru_softc *sc = pi->pi_arg;
1044 uint64_t val;
1045
1046 if (baridx == pci_msix_table_bar(pi)) {
1047 val = msix_table_read(sc, offset, size);
1048 } else {
1049 struct ppt_bar_io pbi;
1050
1051 assert(pi->pi_bar[baridx].type == PCIBAR_IO);
1052
1053 pbi.pbi_bar = baridx;
1054 pbi.pbi_width = size;
1055 pbi.pbi_off = offset;
1056 if (ioctl(sc->pptfd, PPT_BAR_READ, &pbi) == 0) {
1057 val = pbi.pbi_data;
1058 } else {
1059 val = 0;
1060 }
1061 }
1062
1063 return (val);
1064 }
1065
/*
 * Map or unmap (per 'enabled') the portions of the MSI-X table BAR that do
 * NOT contain the table itself into the guest at 'address'.  The table
 * pages are deliberately left unmapped so guest accesses to them trap and
 * are emulated by msix_table_read/write.
 */
static void
passthru_msix_addr(struct vmctx *ctx, struct pci_devinst *pi, int baridx,
    int enabled, uint64_t address)
{
	struct passthru_softc *sc;
	size_t remaining;
	uint32_t table_size, table_offset;

	sc = pi->pi_arg;
	table_offset = rounddown2(pi->pi_msix.table_offset, 4096);
	/* Region of the BAR below the table pages, if any. */
	if (table_offset > 0) {
		if (!enabled) {
			if (vm_unmap_pptdev_mmio(ctx, sc->pptfd, address,
			    table_offset) != 0)
				warnx("pci_passthru: unmap_pptdev_mmio failed");
		} else {
			if (vm_map_pptdev_mmio(ctx, sc->pptfd, address,
			    table_offset, sc->psc_bar[baridx].addr) != 0)
				warnx("pci_passthru: map_pptdev_mmio failed");
		}
	}
	/* Page-aligned size of the table region (same math as init_msix_table). */
	table_size = pi->pi_msix.table_offset - table_offset;
	table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
	table_size = roundup2(table_size, 4096);
	/* Region of the BAR above the table pages, if any. */
	remaining = pi->pi_bar[baridx].size - table_offset - table_size;
	if (remaining > 0) {
		address += table_offset + table_size;
		if (!enabled) {
			if (vm_unmap_pptdev_mmio(ctx, sc->pptfd, address,
			    remaining) != 0)
				warnx("pci_passthru: unmap_pptdev_mmio failed");
		} else {
			if (vm_map_pptdev_mmio(ctx, sc->pptfd, address,
			    remaining, sc->psc_bar[baridx].addr +
			    table_offset + table_size) != 0)
				warnx("pci_passthru: map_pptdev_mmio failed");
		}
	}
}
1105
1106 static void
passthru_mmio_addr(struct vmctx * ctx,struct pci_devinst * pi,int baridx,int enabled,uint64_t address)1107 passthru_mmio_addr(struct vmctx *ctx, struct pci_devinst *pi, int baridx,
1108 int enabled, uint64_t address)
1109 {
1110 struct passthru_softc *sc;
1111
1112 sc = pi->pi_arg;
1113 if (!enabled) {
1114 if (vm_unmap_pptdev_mmio(ctx, sc->pptfd, address,
1115 sc->psc_bar[baridx].size) != 0)
1116 warnx("pci_passthru: unmap_pptdev_mmio failed");
1117 } else {
1118 if (vm_map_pptdev_mmio(ctx, sc->pptfd, address,
1119 sc->psc_bar[baridx].size, sc->psc_bar[baridx].addr) != 0)
1120 warnx("pci_passthru: map_pptdev_mmio failed");
1121 }
1122 }
1123
1124 static void
passthru_addr_rom(struct pci_devinst * const pi,const int idx,const int enabled)1125 passthru_addr_rom(struct pci_devinst *const pi, const int idx,
1126 const int enabled)
1127 {
1128 const uint64_t addr = pi->pi_bar[idx].addr;
1129 const uint64_t size = pi->pi_bar[idx].size;
1130
1131 if (!enabled) {
1132 if (vm_munmap_memseg(pi->pi_vmctx, addr, size) != 0) {
1133 errx(4, "%s: munmap_memseg @ [%016lx - %016lx] failed",
1134 __func__, addr, addr + size);
1135 }
1136
1137 } else {
1138 if (vm_mmap_memseg(pi->pi_vmctx, addr, VM_PCIROM,
1139 pi->pi_romoffset, size, PROT_READ | PROT_EXEC) != 0) {
1140 errx(4, "%s: mmap_memseg @ [%016lx - %016lx] failed",
1141 __func__, addr, addr + size);
1142 }
1143 }
1144 }
1145
1146 static void
passthru_addr(struct pci_devinst * pi,int baridx,int enabled,uint64_t address)1147 passthru_addr(struct pci_devinst *pi, int baridx,
1148 int enabled, uint64_t address)
1149 {
1150 struct vmctx *ctx = pi->pi_vmctx;
1151
1152 switch (pi->pi_bar[baridx].type) {
1153 case PCIBAR_IO:
1154 /* IO BARs are emulated */
1155 break;
1156 case PCIBAR_ROM:
1157 passthru_addr_rom(pi, baridx, enabled);
1158 break;
1159 case PCIBAR_MEM32:
1160 case PCIBAR_MEM64:
1161 if (baridx == pci_msix_table_bar(pi))
1162 passthru_msix_addr(ctx, pi, baridx, enabled, address);
1163 else
1164 passthru_mmio_addr(ctx, pi, baridx, enabled, address);
1165 break;
1166 default:
1167 errx(4, "%s: invalid BAR type %d", __func__,
1168 pi->pi_bar[baridx].type);
1169 }
1170 }
1171
/*
 * Emulation ops for the "passthru" device model, registered with the PCI
 * emulation framework via PCI_EMUL_SET below.
 */
static const struct pci_devemu passthru = {
	.pe_emu = "passthru",
	.pe_init = passthru_init,
	.pe_legacy_config = passthru_legacy_config,
	.pe_cfgwrite = passthru_cfgwrite,
	.pe_cfgread = passthru_cfgread,
	.pe_barwrite = passthru_write,
	.pe_barread = passthru_read,
	.pe_baraddr = passthru_addr,
};
PCI_EMUL_SET(passthru);
1183
1184 /*
1185 * This isn't the right place for these functions which, on FreeBSD, can
1186 * read or write from arbitrary devices. They are not supported on illumos;
1187 * not least because bhyve is generally run in a non-global zone which doesn't
1188 * have access to the devinfo tree.
1189 */
1190 uint32_t
read_config(const struct pcisel * sel __unused,long reg __unused,int width __unused)1191 read_config(const struct pcisel *sel __unused, long reg __unused,
1192 int width __unused)
1193 {
1194 return (-1);
1195 }
1196
1197 void
write_config(const struct pcisel * sel __unused,long reg __unused,int width __unused,uint32_t data __unused)1198 write_config(const struct pcisel *sel __unused, long reg __unused,
1199 int width __unused, uint32_t data __unused)
1200 {
1201 errx(4, "write_config() unimplemented on illumos");
1202 }
1203