1 /*
2 * Copyright (c) 2011 NetApp, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26 /*
27 * This file and its contents are supplied under the terms of the
28 * Common Development and Distribution License ("CDDL"), version 1.0.
29 * You may only use this file in accordance with the terms of version
30 * 1.0 of the CDDL.
31 *
32 * A full copy of the text of the CDDL should have accompanied this
33 * source. A copy of the CDDL is also available via the Internet at
34 * http://www.illumos.org/license/CDDL.
35 *
36 * Copyright 2015 Pluribus Networks Inc.
37 * Copyright 2019 Joyent, Inc.
38 * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
39 * Copyright 2025 Oxide Computer Company
40 */
41
42
43 #include <sys/param.h>
44 #include <sys/linker_set.h>
45 #include <sys/ioctl.h>
46 #include <sys/uio.h>
47 #include <sys/viona_io.h>
48
49 #include <errno.h>
50 #include <fcntl.h>
51 #include <stdio.h>
52 #include <stdlib.h>
53 #include <stdint.h>
54 #include <string.h>
55 #include <strings.h>
56 #include <unistd.h>
57 #include <assert.h>
58 #include <pthread.h>
59 #include <signal.h>
60 #include <stdbool.h>
61 #include <poll.h>
62 #include <libdladm.h>
63 #include <libdllink.h>
64 #include <libdlvnic.h>
65
66 #include <machine/vmm.h>
67 #include <vmmapi.h>
68
69 #include "bhyverun.h"
70 #include "config.h"
71 #include "debug.h"
72 #include "pci_emul.h"
73 #include "virtio.h"
74 #include "iov.h"
75 #include "virtio_net.h"
76
77 #define VIONA_RINGSZ 1024
78 #define VIONA_CTLQ_SIZE 64
79 #define VIONA_CTLQ_MAXSEGS 32
80
81 /*
82 * PCI config-space register offsets
83 */
84 #define VIONA_R_CFG0 24
85 #define VIONA_R_CFG1 25
86 #define VIONA_R_CFG2 26
87 #define VIONA_R_CFG3 27
88 #define VIONA_R_CFG4 28
89 #define VIONA_R_CFG5 29
90 #define VIONA_R_CFG6 30
91 #define VIONA_R_CFG7 31
92 #define VIONA_R_MAX 31
93
94 #define VIONA_REGSZ (VIONA_R_MAX + 1)
95
96 /*
97 * Queue definitions.
98 */
99 #define VIONA_RXQ 0
100 #define VIONA_TXQ 1
101 #define VIONA_CTLQ 2
102
103 #define VIONA_MAXQ 3
104
105 /*
106 * Supplementary host capabilities provided in the userspace component.
107 */
108 #define VIONA_S_HOSTCAPS_USERSPACE ( \
109 VIRTIO_NET_F_CTRL_VQ | \
110 VIRTIO_NET_F_CTRL_RX)
111
112 /*
113 * Debug printf
114 */
115 static volatile int pci_viona_debug;
116 #define DPRINTF(fmt, arg...) \
117 do { \
118 if (pci_viona_debug) { \
119 FPRINTLN(stdout, fmt, ##arg); \
120 fflush(stdout); \
121 } \
122 } while (0)
123 #define WPRINTF(fmt, arg...) FPRINTLN(stderr, fmt, ##arg)
124
125 /*
126 * Per-device softc
127 */
128 struct pci_viona_softc {
129 struct virtio_softc vsc_vs;
130 struct virtio_consts vsc_consts;
131 struct vqueue_info vsc_queues[VIONA_MAXQ];
132 pthread_mutex_t vsc_mtx;
133
134 datalink_id_t vsc_linkid;
135 int vsc_vnafd;
136
137 /* Configurable parameters */
138 char vsc_linkname[MAXLINKNAMELEN];
139 uint32_t vsc_feature_mask;
140 uint16_t vsc_vq_size;
141
142 uint8_t vsc_macaddr[6];
143 uint16_t vsc_mtu;
144
145 bool vsc_resetting;
146 bool vsc_msix_active;
147
148 viona_promisc_t vsc_promisc; /* Current promisc mode */
149 bool vsc_promisc_promisc; /* PROMISC enabled */
150 bool vsc_promisc_allmulti; /* ALLMULTI enabled */
151 bool vsc_promisc_umac; /* unicast MACs sent */
152 bool vsc_promisc_mmac; /* multicast MACs sent */
153 };
154
/*
 * Constant virtio framework hookup for the viona device.  Only the queue
 * bookkeeping helpers of the common framework are used; the register-level
 * PCI handling is implemented locally in pci_viona_read()/pci_viona_write().
 */
static struct virtio_consts viona_vi_consts = {
	.vc_name = "viona",
	.vc_nvq = VIONA_MAXQ,
	/*
	 * We use the common bhyve virtio framework so that we can call
	 * the utility functions to work with the queues handled in userspace.
	 * The framework PCI read/write functions are not used so these
	 * callbacks will not be invoked.
	 */
	.vc_cfgsize = 0,
	.vc_reset = NULL,
	.vc_qnotify = NULL,
	.vc_cfgread = NULL,
	.vc_cfgwrite = NULL,
	.vc_apply_features = NULL,
	/*
	 * The following field is populated using the response from the
	 * viona driver during initialisation, augmented with the additional
	 * capabilities emulated in userspace.  (Updated per-softc in
	 * pci_viona_read()'s VIRTIO_PCI_HOST_FEATURES handling.)
	 */
	.vc_hv_caps = 0,
};
177
178 /*
179 * Return the size of IO BAR that maps virtio header and device specific
180 * region. The size would vary depending on whether MSI-X is enabled or
181 * not.
182 */
183 static uint64_t
pci_viona_iosize(struct pci_devinst * pi)184 pci_viona_iosize(struct pci_devinst *pi)
185 {
186 if (pci_msix_enabled(pi)) {
187 return (VIONA_REGSZ);
188 } else {
189 return (VIONA_REGSZ -
190 (VIRTIO_PCI_CONFIG_OFF(1) - VIRTIO_PCI_CONFIG_OFF(0)));
191 }
192 }
193
194 static uint16_t
pci_viona_qsize(struct pci_viona_softc * sc,int qnum)195 pci_viona_qsize(struct pci_viona_softc *sc, int qnum)
196 {
197 if (qnum == VIONA_CTLQ)
198 return (VIONA_CTLQ_SIZE);
199
200 return (sc->vsc_vq_size);
201 }
202
203 static void
pci_viona_ring_reset(struct pci_viona_softc * sc,int ring)204 pci_viona_ring_reset(struct pci_viona_softc *sc, int ring)
205 {
206 assert(ring < VIONA_MAXQ);
207
208 switch (ring) {
209 case VIONA_RXQ:
210 case VIONA_TXQ:
211 break;
212 case VIONA_CTLQ:
213 default:
214 return;
215 }
216
217 for (;;) {
218 int res;
219
220 res = ioctl(sc->vsc_vnafd, VNA_IOC_RING_RESET, ring);
221 if (res == 0) {
222 break;
223 } else if (errno != EINTR) {
224 WPRINTF("ioctl viona ring %d reset failed %d",
225 ring, errno);
226 return;
227 }
228 }
229 }
230
231 static void
pci_viona_update_status(struct pci_viona_softc * sc,uint32_t value)232 pci_viona_update_status(struct pci_viona_softc *sc, uint32_t value)
233 {
234
235 if (value == 0) {
236 DPRINTF("viona: device reset requested !");
237
238 vi_reset_dev(&sc->vsc_vs);
239 pci_viona_ring_reset(sc, VIONA_RXQ);
240 pci_viona_ring_reset(sc, VIONA_TXQ);
241 }
242
243 sc->vsc_vs.vs_status = value;
244 }
245
246 static const char *
pci_viona_promisc_descr(viona_promisc_t mode)247 pci_viona_promisc_descr(viona_promisc_t mode)
248 {
249 switch (mode) {
250 case VIONA_PROMISC_NONE:
251 return ("none");
252 case VIONA_PROMISC_MULTI:
253 return ("multicast");
254 case VIONA_PROMISC_ALL:
255 return ("all");
256 default:
257 abort();
258 }
259 }
260
261 static int
pci_viona_eval_promisc(struct pci_viona_softc * sc)262 pci_viona_eval_promisc(struct pci_viona_softc *sc)
263 {
264 viona_promisc_t mode = VIONA_PROMISC_NONE;
265 int err = 0;
266
267 /*
268 * If the guest has explicitly requested promiscuous mode or has sent a
269 * non-empty unicast MAC address table, then set viona to promiscuous
270 * mode. Otherwise, if the guest has explicitly requested multicast
271 * promiscuity or has sent a non-empty multicast MAC address table,
272 * then set viona to multicast promiscuous mode.
273 */
274 if (sc->vsc_promisc_promisc || sc->vsc_promisc_umac)
275 mode = VIONA_PROMISC_ALL;
276 else if (sc->vsc_promisc_allmulti || sc->vsc_promisc_mmac)
277 mode = VIONA_PROMISC_MULTI;
278
279 if (mode != sc->vsc_promisc) {
280 DPRINTF("viona: setting promiscuous mode to %d (%s)",
281 mode, pci_viona_promisc_descr(mode));
282 DPRINTF(" promisc=%u, umac=%u, allmulti=%u, mmac=%u",
283 sc->vsc_promisc_promisc, sc->vsc_promisc_umac,
284 sc->vsc_promisc_allmulti, sc->vsc_promisc_mmac);
285
286 err = ioctl(sc->vsc_vnafd, VNA_IOC_SET_PROMISC, mode);
287 if (err == 0)
288 sc->vsc_promisc = mode;
289 else
290 WPRINTF("ioctl viona set promisc failed %d", errno);
291 }
292
293 return (err);
294 }
295
296 static uint8_t
pci_viona_control_rx(struct vqueue_info * vq,const virtio_net_ctrl_hdr_t * hdr,struct iovec * iov,size_t niov)297 pci_viona_control_rx(struct vqueue_info *vq, const virtio_net_ctrl_hdr_t *hdr,
298 struct iovec *iov, size_t niov)
299 {
300 struct pci_viona_softc *sc = (struct pci_viona_softc *)vq->vq_vs;
301 uint8_t v;
302
303 if (iov[0].iov_len != sizeof (uint8_t) || niov != 1) {
304 EPRINTLN("viona: bad control RX data");
305 return (VIRTIO_NET_CQ_ERR);
306 }
307
308 v = *(uint8_t *)iov[0].iov_base;
309
310 switch (hdr->vnch_command) {
311 case VIRTIO_NET_CTRL_RX_PROMISC:
312 DPRINTF("viona: ctrl RX promisc %d", v);
313 sc->vsc_promisc_promisc = (v != 0);
314 break;
315 case VIRTIO_NET_CTRL_RX_ALLMULTI:
316 DPRINTF("viona: ctrl RX allmulti %d", v);
317 sc->vsc_promisc_allmulti = (v != 0);
318 break;
319 default:
320 /*
321 * VIRTIO_NET_F_CTRL_RX_EXTRA was not offered so no other
322 * commands are expected.
323 */
324 EPRINTLN("viona: unrecognised RX control cmd %u",
325 hdr->vnch_command);
326 return (VIRTIO_NET_CQ_ERR);
327 }
328
329 if (pci_viona_eval_promisc(sc) == 0)
330 return (VIRTIO_NET_CQ_OK);
331 return (VIRTIO_NET_CQ_ERR);
332 }
333
334 static void
pci_viona_control_mac_dump(const char * tag,const struct iovec * iov)335 pci_viona_control_mac_dump(const char *tag, const struct iovec *iov)
336 {
337 virtio_net_ctrl_mac_t *table = (virtio_net_ctrl_mac_t *)iov->iov_base;
338 ether_addr_t *mac = &table->vncm_mac;
339
340 DPRINTF("-- %s MAC TABLE (entries: %u)", tag, table->vncm_entries);
341
342 if (table->vncm_entries * ETHERADDRL !=
343 iov->iov_len - sizeof (table->vncm_entries)) {
344 DPRINTF(" Bad table size %u", iov->iov_len);
345 return;
346 }
347
348 for (uint32_t i = 0; i < table->vncm_entries; i++) {
349 DPRINTF(" [%2d] %s", i, ether_ntoa((struct ether_addr *)mac));
350 mac++;
351 }
352 }
353
354 static uint8_t
pci_viona_control_mac(struct vqueue_info * vq,const virtio_net_ctrl_hdr_t * hdr,struct iovec * iov,size_t niov)355 pci_viona_control_mac(struct vqueue_info *vq, const virtio_net_ctrl_hdr_t *hdr,
356 struct iovec *iov, size_t niov)
357 {
358 struct pci_viona_softc *sc = (struct pci_viona_softc *)vq->vq_vs;
359
360 switch (hdr->vnch_command) {
361 case VIRTIO_NET_CTRL_MAC_TABLE_SET: {
362 virtio_net_ctrl_mac_t *table;
363
364 DPRINTF("viona: ctrl MAC table set");
365
366 if (niov != 2) {
367 EPRINTLN("viona: bad control MAC data");
368 return (VIRTIO_NET_CQ_ERR);
369 }
370
371 /*
372 * We advertise VIRTIO_NET_F_CTRL_RX and therefore need to
373 * accept VIRTIO_NET_CTRL_MAC, but we don't support passing
374 * changes in the MAC address lists down to viona.
375 * Instead, we set flags to indicate if the guest has sent
376 * any MAC addresses for each table, and use these to determine
377 * the resulting promiscuous mode, see pci_viona_eval_promisc()
378 * above.
379 */
380
381 /* Unicast MAC table */
382 table = (virtio_net_ctrl_mac_t *)iov[0].iov_base;
383 sc->vsc_promisc_umac = (table->vncm_entries != 0);
384 if (pci_viona_debug)
385 pci_viona_control_mac_dump("UNICAST", &iov[0]);
386
387 /* Multicast MAC table */
388 table = (virtio_net_ctrl_mac_t *)iov[1].iov_base;
389 sc->vsc_promisc_mmac = (table->vncm_entries != 0);
390 if (pci_viona_debug)
391 pci_viona_control_mac_dump("MULTICAST", &iov[1]);
392
393 break;
394 }
395 case VIRTIO_NET_CTRL_MAC_ADDR_SET:
396 /* disallow setting the primary filter MAC address */
397 DPRINTF("viona: ctrl MAC addr set %d", niov);
398 return (VIRTIO_NET_CQ_ERR);
399 default:
400 EPRINTLN("viona: unrecognised MAC control cmd %u",
401 hdr->vnch_command);
402 return (VIRTIO_NET_CQ_ERR);
403 }
404
405 if (pci_viona_eval_promisc(sc) == 0)
406 return (VIRTIO_NET_CQ_OK);
407 return (VIRTIO_NET_CQ_ERR);
408 }
409
/*
 * Process a single control-queue descriptor chain: validate its layout,
 * dispatch by control class, write the acknowledgement byte, and return the
 * chain to the guest.  Malformed chains are returned without processing.
 */
static void
pci_viona_control(struct vqueue_info *vq)
{
	struct iovec iov[VIONA_CTLQ_MAXSEGS + 1];
	const virtio_net_ctrl_hdr_t *hdr;
	struct iovec *siov = iov;
	struct vi_req req = { 0 };
	uint8_t *ackp;
	size_t nsiov;
	uint32_t len;
	int n;

	n = vq_getchain(vq, iov, VIONA_CTLQ_MAXSEGS, &req);

	assert(n >= 1 && n <= VIONA_CTLQ_MAXSEGS);

	/*
	 * Since we have not negotiated VIRTIO_F_ANY_LAYOUT, we expect the
	 * control message to be laid out in at least three descriptors as
	 * follows:
	 *	header - sizeof (virtio_net_ctrl_hdr_t)
	 *	data[] - at least one descriptor, varying size
	 *	ack    - uint8_t, flagged as writable
	 * Check the incoming message to make sure it matches this layout and
	 * drop the entire chain if not.
	 */
	if (n < 3 || req.writable != 1 || req.readable + 1 != n ||
	    iov[req.readable].iov_len != sizeof (uint8_t)) {
		EPRINTLN("viona: bad control chain, len=%d, w=%d, r=%d",
		    n, req.writable, req.readable);
		goto drop;
	}

	/* The header pointer is only dereferenced after the length check. */
	hdr = (const virtio_net_ctrl_hdr_t *)iov[0].iov_base;
	if (iov[0].iov_len < sizeof (virtio_net_ctrl_hdr_t)) {
		EPRINTLN("viona: control header too short: %u", iov[0].iov_len);
		goto drop;
	}

	/*
	 * Writable iovecs start at iov[req.readable], and we've already
	 * checked that there is only one writable, it's at the end, and the
	 * right size; it's the acknowledgement byte.
	 */
	ackp = (uint8_t *)iov[req.readable].iov_base;

	/* The data descriptors sit between the header and the ack byte. */
	siov = &iov[1];
	nsiov = n - 2;

	switch (hdr->vnch_class) {
	case VIRTIO_NET_CTRL_RX:
		*ackp = pci_viona_control_rx(vq, hdr, siov, nsiov);
		break;
	case VIRTIO_NET_CTRL_MAC:
		*ackp = pci_viona_control_mac(vq, hdr, siov, nsiov);
		break;
	default:
		EPRINTLN("viona: unrecognised control class %u, cmd %u",
		    hdr->vnch_class, hdr->vnch_command);
		*ackp = VIRTIO_NET_CQ_ERR;
		break;
	}

drop:
	/* Release the chain, reporting the total length of its buffers. */
	len = 0;
	for (uint_t i = 0; i < n; i++)
		len += iov[i].iov_len;

	vq_relchain(vq, req.idx, len);
}
480
/*
 * Drain the userspace-emulated control queue, suppressing guest kicks while
 * processing, then notify the guest of completions.
 */
static void
pci_viona_process_ctrlq(struct vqueue_info *vq)
{
	bool again = true;

	while (again) {
		vq_kick_disable(vq);

		while (vq_has_descs(vq))
			pci_viona_control(vq);

		vq_kick_enable(vq);

		/*
		 * One more check in case a late addition raced with
		 * re-enabling kicks. Note that vq_kick_enable() includes a
		 * memory barrier.
		 */
		again = vq_has_descs(vq);
	}

	vq_endchains(vq, /* used_all_avail= */1);
}
504
/*
 * Worker thread: waits (POLLRDBAND on the viona fd) for ring interrupt
 * notifications from the in-kernel driver and forwards each to the guest as
 * an MSI-X message or, if MSI-X is not enabled, via the legacy INTx line.
 * Runs for the lifetime of the device; exits only on a hard poll() error.
 */
static void *
pci_viona_poll_thread(void *param)
{
	struct pci_viona_softc *sc = param;
	pollfd_t pollset;
	const int fd = sc->vsc_vnafd;

	pollset.fd = fd;
	pollset.events = POLLRDBAND;

	for (;;) {
		if (poll(&pollset, 1, -1) < 0) {
			/* Transient errors: retry the poll. */
			if (errno == EINTR || errno == EAGAIN) {
				continue;
			} else {
				WPRINTF("pci_viona_poll_thread poll() error %d",
				    errno);
				break;
			}
		}
		if (pollset.revents & POLLRDBAND) {
			vioc_intr_poll_t vip;
			uint_t i;
			int res;
			bool assert_lintr = false;
			const bool do_msix = pci_msix_enabled(sc->vsc_vs.vs_pi);

			/* Fetch per-ring pending-interrupt status. */
			res = ioctl(fd, VNA_IOC_INTR_POLL, &vip);
			for (i = 0; res > 0 && i < VIONA_VQ_MAX; i++) {
				if (vip.vip_status[i] == 0) {
					continue;
				}
				if (do_msix) {
					pci_generate_msix(sc->vsc_vs.vs_pi,
					    sc->vsc_queues[i].vq_msix_idx);
				} else {
					assert_lintr = true;
				}
				/*
				 * Clear the ring's pending interrupt state
				 * (VNA_IOC_RING_INTR_CLR); a failure here
				 * aborts the loop via the res > 0 condition.
				 */
				res = ioctl(fd, VNA_IOC_RING_INTR_CLR, i);
				if (res != 0) {
					WPRINTF("ioctl viona vq %d intr "
					    "clear failed %d", i, errno);
				}
			}
			if (assert_lintr) {
				/* Set ISR before asserting INTx, under lock. */
				pthread_mutex_lock(&sc->vsc_mtx);
				sc->vsc_vs.vs_isr |= VIRTIO_PCI_ISR_INTR;
				pci_lintr_assert(sc->vsc_vs.vs_pi);
				pthread_mutex_unlock(&sc->vsc_mtx);
			}
		}
	}

	pthread_exit(NULL);
}
560
561 static void
pci_viona_ring_init(struct pci_viona_softc * sc,uint64_t pfn)562 pci_viona_ring_init(struct pci_viona_softc *sc, uint64_t pfn)
563 {
564 int qnum = sc->vsc_vs.vs_curq;
565 vioc_ring_init_t vna_ri;
566 int error;
567
568 assert(qnum < VIONA_MAXQ);
569
570 if (qnum == VIONA_CTLQ) {
571 vi_vq_init(&sc->vsc_vs, pfn);
572 return;
573 }
574
575 sc->vsc_queues[qnum].vq_pfn = (pfn << VRING_PFN);
576 vna_ri.ri_index = qnum;
577 vna_ri.ri_qsize = pci_viona_qsize(sc, qnum);
578 vna_ri.ri_qaddr = (pfn << VRING_PFN);
579 error = ioctl(sc->vsc_vnafd, VNA_IOC_RING_INIT, &vna_ri);
580
581 if (error != 0) {
582 WPRINTF("ioctl viona ring %u init failed %d", qnum, errno);
583 }
584 }
585
586 static int
pci_viona_viona_init(struct vmctx * ctx,struct pci_viona_softc * sc)587 pci_viona_viona_init(struct vmctx *ctx, struct pci_viona_softc *sc)
588 {
589 vioc_create_t vna_create;
590 int error;
591
592 sc->vsc_vnafd = open("/dev/viona", O_RDWR | O_EXCL);
593 if (sc->vsc_vnafd == -1) {
594 WPRINTF("open viona ctl failed: %d", errno);
595 return (-1);
596 }
597
598 vna_create.c_linkid = sc->vsc_linkid;
599 vna_create.c_vmfd = vm_get_device_fd(ctx);
600 error = ioctl(sc->vsc_vnafd, VNA_IOC_CREATE, &vna_create);
601 if (error != 0) {
602 (void) close(sc->vsc_vnafd);
603 WPRINTF("ioctl viona create failed %d", errno);
604 return (-1);
605 }
606
607 return (0);
608 }
609
610 static int
pci_viona_legacy_config(nvlist_t * nvl,const char * opt)611 pci_viona_legacy_config(nvlist_t *nvl, const char *opt)
612 {
613 char *config, *name, *tofree, *value;
614
615 if (opt == NULL)
616 return (0);
617
618 config = tofree = strdup(opt);
619 while ((name = strsep(&config, ",")) != NULL) {
620 value = strchr(name, '=');
621 if (value != NULL) {
622 *value++ = '\0';
623 set_config_value_node(nvl, name, value);
624 } else {
625 set_config_value_node(nvl, "vnic", name);
626 }
627 }
628 free(tofree);
629 return (0);
630 }
631
632 static int
pci_viona_parse_opts(struct pci_viona_softc * sc,nvlist_t * nvl)633 pci_viona_parse_opts(struct pci_viona_softc *sc, nvlist_t *nvl)
634 {
635 const char *value;
636 int err = 0;
637
638 sc->vsc_vq_size = VIONA_RINGSZ;
639 sc->vsc_feature_mask = 0;
640 sc->vsc_linkname[0] = '\0';
641
642 value = get_config_value_node(nvl, "feature_mask");
643 if (value != NULL) {
644 long num;
645
646 errno = 0;
647 num = strtol(value, NULL, 0);
648 if (errno != 0 || num < 0) {
649 fprintf(stderr,
650 "viona: invalid mask '%s'", value);
651 } else {
652 sc->vsc_feature_mask = num;
653 }
654 }
655
656 value = get_config_value_node(nvl, "vqsize");
657 if (value != NULL) {
658 long num;
659
660 errno = 0;
661 num = strtol(value, NULL, 0);
662 if (errno != 0) {
663 fprintf(stderr,
664 "viona: invalid vsqize '%s'", value);
665 err = -1;
666 } else if (num <= 2 || num > 32768) {
667 fprintf(stderr,
668 "viona: vqsize out of range", num);
669 err = -1;
670 } else if ((1 << (ffs(num) - 1)) != num) {
671 fprintf(stderr,
672 "viona: vqsize must be power of 2", num);
673 err = -1;
674 } else {
675 sc->vsc_vq_size = num;
676 }
677 }
678
679 value = get_config_value_node(nvl, "vnic");
680 if (value == NULL) {
681 fprintf(stderr, "viona: vnic name required");
682 err = -1;
683 } else {
684 (void) strlcpy(sc->vsc_linkname, value, MAXLINKNAMELEN);
685 }
686
687 DPRINTF("viona=%p dev=%s vqsize=%x feature_mask=%x", sc,
688 sc->vsc_linkname, sc->vsc_vq_size, sc->vsc_feature_mask);
689 return (err);
690 }
691
692 static uint16_t
pci_viona_query_mtu(dladm_handle_t handle,datalink_id_t linkid)693 pci_viona_query_mtu(dladm_handle_t handle, datalink_id_t linkid)
694 {
695 char buf[DLADM_PROP_VAL_MAX];
696 char *propval = buf;
697 uint_t propcnt = 1;
698
699 if (dladm_get_linkprop(handle, linkid, DLADM_PROP_VAL_CURRENT, "mtu",
700 &propval, &propcnt) == DLADM_STATUS_OK && propcnt == 1) {
701 ulong_t parsed = strtoul(buf, NULL, 10);
702
703 /*
704 * The virtio spec notes that for devices implementing
705 * VIRTIO_NET_F_MTU, that the noted MTU MUST be between
706 * 68-65535, inclusive. Although the viona device does not
707 * offer that feature today (the reporting of the MTU to the
708 * guest), we can still use those bounds for how we configure
709 * the limits of the in-kernel emulation.
710 */
711 if (parsed >= 68 && parsed <= 65535) {
712 return (parsed);
713 }
714 }
715
716 /* Default to 1500 if query is unsuccessful */
717 return (1500);
718 }
719
720 static int
pci_viona_init(struct pci_devinst * pi,nvlist_t * nvl)721 pci_viona_init(struct pci_devinst *pi, nvlist_t *nvl)
722 {
723 dladm_handle_t handle;
724 dladm_status_t status;
725 dladm_vnic_attr_t attr;
726 char errmsg[DLADM_STRSIZE];
727 char tname[MAXCOMLEN + 1];
728 int error, i;
729 struct pci_viona_softc *sc;
730 const char *vnic;
731 pthread_t tid;
732
733 if (get_config_bool_default("viona.debug", false))
734 pci_viona_debug = 1;
735
736 vnic = get_config_value_node(nvl, "vnic");
737 if (vnic == NULL) {
738 WPRINTF("virtio-viona: vnic required");
739 return (1);
740 }
741
742 sc = malloc(sizeof (struct pci_viona_softc));
743 memset(sc, 0, sizeof (struct pci_viona_softc));
744
745 if (pci_viona_parse_opts(sc, nvl) != 0) {
746 free(sc);
747 return (1);
748 }
749
750 if ((status = dladm_open(&handle)) != DLADM_STATUS_OK) {
751 WPRINTF("could not open /dev/dld");
752 free(sc);
753 return (1);
754 }
755
756 if ((status = dladm_name2info(handle, sc->vsc_linkname, &sc->vsc_linkid,
757 NULL, NULL, NULL)) != DLADM_STATUS_OK) {
758 WPRINTF("dladm_name2info() for %s failed: %s", vnic,
759 dladm_status2str(status, errmsg));
760 dladm_close(handle);
761 free(sc);
762 return (1);
763 }
764
765 if ((status = dladm_vnic_info(handle, sc->vsc_linkid, &attr,
766 DLADM_OPT_ACTIVE)) != DLADM_STATUS_OK) {
767 WPRINTF("dladm_vnic_info() for %s failed: %s", vnic,
768 dladm_status2str(status, errmsg));
769 dladm_close(handle);
770 free(sc);
771 return (1);
772 }
773 memcpy(sc->vsc_macaddr, attr.va_mac_addr, ETHERADDRL);
774 sc->vsc_mtu = pci_viona_query_mtu(handle, sc->vsc_linkid);
775
776 dladm_close(handle);
777
778 error = pci_viona_viona_init(pi->pi_vmctx, sc);
779 if (error != 0) {
780 free(sc);
781 return (1);
782 }
783
784 if (ioctl(sc->vsc_vnafd, VNA_IOC_SET_MTU, sc->vsc_mtu) != 0) {
785 WPRINTF("error setting viona MTU(%u): %s", sc->vsc_mtu,
786 strerror(errno));
787 }
788
789 error = pthread_create(&tid, NULL, pci_viona_poll_thread, sc);
790 assert(error == 0);
791 snprintf(tname, sizeof (tname), "vionapoll:%s", vnic);
792 pthread_set_name_np(tid, tname);
793
794 /* initialize config space */
795 pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET);
796 pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
797 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
798 pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_ID_NETWORK);
799 pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);
800
801 sc->vsc_consts = viona_vi_consts;
802 pthread_mutex_init(&sc->vsc_mtx, NULL);
803
804 /*
805 * The RX and TX queues are handled in the kernel component of
806 * viona; however The control queue is emulated in userspace.
807 */
808 sc->vsc_queues[VIONA_CTLQ].vq_qsize = pci_viona_qsize(sc, VIONA_CTLQ);
809
810 vi_softc_linkup(&sc->vsc_vs, &sc->vsc_consts, sc, pi, sc->vsc_queues);
811 sc->vsc_vs.vs_mtx = &sc->vsc_mtx;
812
813 /*
814 * Guests that do not support CTRL_RX_MAC still generally need to
815 * receive multicast packets. Guests that do support this feature will
816 * end up setting this flag indirectly via messages on the control
817 * queue but it does not hurt to default to multicast promiscuity here
818 * and it is what older version of viona did.
819 */
820 sc->vsc_promisc_mmac = true;
821 pci_viona_eval_promisc(sc);
822
823 /* MSI-X support */
824 for (i = 0; i < VIONA_MAXQ; i++)
825 sc->vsc_queues[i].vq_msix_idx = VIRTIO_MSI_NO_VECTOR;
826
827 /* BAR 1 used to map MSI-X table and PBA */
828 if (pci_emul_add_msixcap(pi, VIONA_MAXQ, 1)) {
829 free(sc);
830 return (1);
831 }
832
833 /* BAR 0 for legacy-style virtio register access. */
834 error = pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VIONA_REGSZ);
835 if (error != 0) {
836 WPRINTF("could not allocate virtio BAR");
837 free(sc);
838 return (1);
839 }
840
841 /*
842 * Need a legacy interrupt for virtio compliance, even though MSI-X
843 * operation is _strongly_ suggested for adequate performance.
844 */
845 pci_lintr_request(pi);
846
847 return (0);
848 }
849
static uint64_t
viona_adjust_offset(struct pci_devinst *pi, uint64_t offset)
{
	/*
	 * Device specific offsets used by guest would change based on
	 * whether MSI-X capability is enabled or not: without MSI-X, the
	 * MSI-X registers are absent from BAR 0, so device-specific
	 * offsets shift up to compensate.
	 */
	if (pci_msix_enabled(pi) || offset < VIRTIO_PCI_CONFIG_OFF(0))
		return (offset);

	return (offset +
	    (VIRTIO_PCI_CONFIG_OFF(1) - VIRTIO_PCI_CONFIG_OFF(0)));
}
866
867 static void
pci_viona_ring_set_msix(struct pci_devinst * pi,uint_t ring)868 pci_viona_ring_set_msix(struct pci_devinst *pi, uint_t ring)
869 {
870 struct pci_viona_softc *sc = pi->pi_arg;
871 struct msix_table_entry mte;
872 uint16_t tab_index;
873 vioc_ring_msi_t vrm;
874 int res;
875
876 if (ring == VIONA_CTLQ)
877 return;
878
879 assert(ring <= VIONA_VQ_TX);
880
881 vrm.rm_index = ring;
882 vrm.rm_addr = 0;
883 vrm.rm_msg = 0;
884 tab_index = sc->vsc_queues[ring].vq_msix_idx;
885
886 if (tab_index != VIRTIO_MSI_NO_VECTOR && sc->vsc_msix_active) {
887 mte = pi->pi_msix.table[tab_index];
888 if ((mte.vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
889 vrm.rm_addr = mte.addr;
890 vrm.rm_msg = mte.msg_data;
891 }
892 }
893
894 res = ioctl(sc->vsc_vnafd, VNA_IOC_RING_SET_MSI, &vrm);
895 if (res != 0) {
896 WPRINTF("ioctl viona set_msi %d failed %d", ring, errno);
897 }
898 }
899
900 static void
pci_viona_lintrupdate(struct pci_devinst * pi)901 pci_viona_lintrupdate(struct pci_devinst *pi)
902 {
903 struct pci_viona_softc *sc = pi->pi_arg;
904 bool msix_on = false;
905
906 pthread_mutex_lock(&sc->vsc_mtx);
907 msix_on = pci_msix_enabled(pi) && (pi->pi_msix.function_mask == 0);
908 if ((sc->vsc_msix_active && !msix_on) ||
909 (msix_on && !sc->vsc_msix_active)) {
910 uint_t i;
911
912 sc->vsc_msix_active = msix_on;
913 /* Update in-kernel ring configs */
914 for (i = 0; i <= VIONA_VQ_TX; i++) {
915 pci_viona_ring_set_msix(pi, i);
916 }
917 }
918 pthread_mutex_unlock(&sc->vsc_mtx);
919 }
920
921 static void
pci_viona_msix_update(struct pci_devinst * pi,uint64_t offset)922 pci_viona_msix_update(struct pci_devinst *pi, uint64_t offset)
923 {
924 struct pci_viona_softc *sc = pi->pi_arg;
925 uint_t tab_index, i;
926
927 pthread_mutex_lock(&sc->vsc_mtx);
928 if (!sc->vsc_msix_active) {
929 pthread_mutex_unlock(&sc->vsc_mtx);
930 return;
931 }
932
933 /*
934 * Rather than update every possible MSI-X vector, cheat and use the
935 * offset to calculate the entry within the table. Since this should
936 * only be called when a write to the table succeeds, the index should
937 * be valid.
938 */
939 tab_index = offset / MSIX_TABLE_ENTRY_SIZE;
940
941 for (i = 0; i <= VIONA_VQ_TX; i++) {
942 if (sc->vsc_queues[i].vq_msix_idx != tab_index) {
943 continue;
944 }
945 pci_viona_ring_set_msix(pi, i);
946 }
947
948 pthread_mutex_unlock(&sc->vsc_mtx);
949 }
950
951 static void
pci_viona_qnotify(struct pci_viona_softc * sc,int ring)952 pci_viona_qnotify(struct pci_viona_softc *sc, int ring)
953 {
954 int error;
955
956 switch (ring) {
957 case VIONA_TXQ:
958 case VIONA_RXQ:
959 error = ioctl(sc->vsc_vnafd, VNA_IOC_RING_KICK, ring);
960 if (error != 0) {
961 WPRINTF("ioctl viona ring %d kick failed %d",
962 ring, errno);
963 }
964 break;
965 case VIONA_CTLQ: {
966 struct vqueue_info *vq = &sc->vsc_queues[VIONA_CTLQ];
967
968 if (vq_has_descs(vq))
969 pci_viona_process_ctrlq(vq);
970 break;
971 }
972 }
973 }
974
975 static void
pci_viona_baraddr(struct pci_devinst * pi,int baridx,int enabled,uint64_t address)976 pci_viona_baraddr(struct pci_devinst *pi, int baridx, int enabled,
977 uint64_t address)
978 {
979 struct pci_viona_softc *sc = pi->pi_arg;
980 uint64_t ioport;
981 int error;
982
983 if (baridx != 0)
984 return;
985
986 if (enabled == 0) {
987 error = ioctl(sc->vsc_vnafd, VNA_IOC_SET_NOTIFY_IOP, 0);
988 if (error != 0)
989 WPRINTF("uninstall ioport hook failed %d", errno);
990 return;
991 }
992
993 /*
994 * Install ioport hook for virtqueue notification.
995 * This is part of the virtio common configuration area so the
996 * address does not change with MSI-X status.
997 */
998 ioport = address + VIRTIO_PCI_QUEUE_NOTIFY;
999 error = ioctl(sc->vsc_vnafd, VNA_IOC_SET_NOTIFY_IOP, ioport);
1000 if (error != 0) {
1001 WPRINTF("install ioport hook at %x failed %d",
1002 ioport, errno);
1003 }
1004 }
1005
/*
 * Guest write handler for BAR 0 (legacy virtio registers) and the MSI-X
 * table/PBA BAR.  BAR 0 register accesses are serialized with vsc_mtx.
 */
static void
pci_viona_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size,
    uint64_t value)
{
	struct pci_viona_softc *sc = pi->pi_arg;
	void *ptr;
	int err = 0;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		/* Forward MSI-X table updates to the in-kernel rings. */
		if (pci_emul_msix_twrite(pi, offset, size, value) == 0) {
			pci_viona_msix_update(pi, offset);
		}
		return;
	}

	assert(baridx == 0);

	/* Reject accesses extending beyond the register window. */
	if (offset + size > pci_viona_iosize(pi)) {
		DPRINTF("viona_write: 2big, offset %ld size %d",
		    offset, size);
		return;
	}

	pthread_mutex_lock(&sc->vsc_mtx);

	/* Account for register shift when MSI-X is disabled. */
	offset = viona_adjust_offset(pi, offset);

	switch (offset) {
	case VIRTIO_PCI_GUEST_FEATURES:
		assert(size == 4);
		/* Strip features masked out by configuration. */
		value &= ~(sc->vsc_feature_mask);
		err = ioctl(sc->vsc_vnafd, VNA_IOC_SET_FEATURES, &value);
		if (err != 0) {
			WPRINTF("ioctl feature negotiation returned err = %d",
			    errno);
		} else {
			sc->vsc_vs.vs_negotiated_caps = value;
		}
		break;
	case VIRTIO_PCI_QUEUE_PFN:
		assert(size == 4);
		pci_viona_ring_init(sc, value);
		break;
	case VIRTIO_PCI_QUEUE_SEL:
		assert(size == 2);
		assert(value < VIONA_MAXQ);
		sc->vsc_vs.vs_curq = value;
		break;
	case VIRTIO_PCI_QUEUE_NOTIFY:
		assert(size == 2);
		assert(value < VIONA_MAXQ);
		pci_viona_qnotify(sc, value);
		break;
	case VIRTIO_PCI_STATUS:
		assert(size == 1);
		pci_viona_update_status(sc, value);
		break;
	case VIRTIO_MSI_CONFIG_VECTOR:
		assert(size == 2);
		sc->vsc_vs.vs_msix_cfg_idx = value;
		break;
	case VIRTIO_MSI_QUEUE_VECTOR:
		assert(size == 2);
		assert(sc->vsc_vs.vs_curq < VIONA_MAXQ);
		sc->vsc_queues[sc->vsc_vs.vs_curq].vq_msix_idx = value;
		/* Push the new vector assignment into the kernel. */
		pci_viona_ring_set_msix(pi, sc->vsc_vs.vs_curq);
		break;
	case VIONA_R_CFG0:
	case VIONA_R_CFG1:
	case VIONA_R_CFG2:
	case VIONA_R_CFG3:
	case VIONA_R_CFG4:
	case VIONA_R_CFG5:
		assert((size + offset) <= (VIONA_R_CFG5 + 1));
		ptr = &sc->vsc_macaddr[offset - VIONA_R_CFG0];
		/*
		 * The driver is allowed to change the MAC address
		 */
		/*
		 * NOTE(review): this single-byte store is immediately
		 * overwritten by the size-based store below; it appears
		 * redundant but is retained as-is.
		 */
		sc->vsc_macaddr[offset - VIONA_R_CFG0] = value;
		if (size == 1) {
			*(uint8_t *)ptr = value;
		} else if (size == 2) {
			*(uint16_t *)ptr = value;
		} else {
			*(uint32_t *)ptr = value;
		}
		break;
	case VIRTIO_PCI_HOST_FEATURES:
	case VIRTIO_PCI_QUEUE_NUM:
	case VIRTIO_PCI_ISR:
	case VIONA_R_CFG6:
	case VIONA_R_CFG7:
		DPRINTF("viona: write to readonly reg %ld", offset);
		break;
	default:
		DPRINTF("viona: unknown i/o write offset %ld", offset);
		value = 0;
		break;
	}

	pthread_mutex_unlock(&sc->vsc_mtx);
}
1109
/*
 * Guest read handler for BAR 0 (legacy virtio registers) and the MSI-X
 * table/PBA BAR.  Returns the register value (zero-extended to 64 bits);
 * out-of-range accesses read as 0.  BAR 0 accesses hold vsc_mtx.
 */
static uint64_t
pci_viona_read(struct pci_devinst *pi, int baridx, uint64_t offset, int size)
{
	struct pci_viona_softc *sc = pi->pi_arg;
	void *ptr;
	uint64_t value;
	int err = 0;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		return (pci_emul_msix_tread(pi, offset, size));
	}

	assert(baridx == 0);

	if (offset + size > pci_viona_iosize(pi)) {
		DPRINTF("viona_read: 2big, offset %ld size %d",
		    offset, size);
		return (0);
	}

	pthread_mutex_lock(&sc->vsc_mtx);

	/* Account for register shift when MSI-X is disabled. */
	offset = viona_adjust_offset(pi, offset);

	switch (offset) {
	case VIRTIO_PCI_HOST_FEATURES:
		assert(size == 4);
		/* Kernel-reported features plus userspace-emulated caps. */
		err = ioctl(sc->vsc_vnafd, VNA_IOC_GET_FEATURES, &value);
		if (err != 0) {
			WPRINTF("ioctl get host features returned err = %d",
			    errno);
		}
		value |= VIONA_S_HOSTCAPS_USERSPACE;
		value &= ~sc->vsc_feature_mask;
		sc->vsc_consts.vc_hv_caps = value;
		break;
	case VIRTIO_PCI_GUEST_FEATURES:
		assert(size == 4);
		value = sc->vsc_vs.vs_negotiated_caps; /* XXX never read ? */
		break;
	case VIRTIO_PCI_QUEUE_PFN:
		assert(size == 4);
		value = sc->vsc_queues[sc->vsc_vs.vs_curq].vq_pfn >> VRING_PFN;
		break;
	case VIRTIO_PCI_QUEUE_NUM:
		assert(size == 2);
		value = pci_viona_qsize(sc, sc->vsc_vs.vs_curq);
		break;
	case VIRTIO_PCI_QUEUE_SEL:
		assert(size == 2);
		value = sc->vsc_vs.vs_curq; /* XXX never read ? */
		break;
	case VIRTIO_PCI_QUEUE_NOTIFY:
		assert(size == 2);
		value = sc->vsc_vs.vs_curq; /* XXX never read ? */
		break;
	case VIRTIO_PCI_STATUS:
		assert(size == 1);
		value = sc->vsc_vs.vs_status;
		break;
	case VIRTIO_PCI_ISR:
		assert(size == 1);
		value = sc->vsc_vs.vs_isr;
		sc->vsc_vs.vs_isr = 0; /* a read clears this flag */
		/* Reading a non-zero ISR also deasserts the INTx line. */
		if (value != 0) {
			pci_lintr_deassert(pi);
		}
		break;
	case VIRTIO_MSI_CONFIG_VECTOR:
		assert(size == 2);
		value = sc->vsc_vs.vs_msix_cfg_idx;
		break;
	case VIRTIO_MSI_QUEUE_VECTOR:
		assert(size == 2);
		assert(sc->vsc_vs.vs_curq < VIONA_MAXQ);
		value = sc->vsc_queues[sc->vsc_vs.vs_curq].vq_msix_idx;
		break;
	case VIONA_R_CFG0:
	case VIONA_R_CFG1:
	case VIONA_R_CFG2:
	case VIONA_R_CFG3:
	case VIONA_R_CFG4:
	case VIONA_R_CFG5:
		/* MAC address bytes in device-specific config space. */
		assert((size + offset) <= (VIONA_R_CFG5 + 1));
		ptr = &sc->vsc_macaddr[offset - VIONA_R_CFG0];
		if (size == 1) {
			value = *(uint8_t *)ptr;
		} else if (size == 2) {
			value = *(uint16_t *)ptr;
		} else {
			value = *(uint32_t *)ptr;
		}
		break;
	case VIONA_R_CFG6:
		/* NOTE(review): 1- or 2-byte reads allowed here, not 4. */
		assert(size != 4);
		value = 0x01; /* XXX link always up */
		break;
	case VIONA_R_CFG7:
		assert(size == 1);
		value = 0; /* XXX link status in LSB */
		break;
	default:
		DPRINTF("viona: unknown i/o read offset %ld", offset);
		value = 0;
		break;
	}

	pthread_mutex_unlock(&sc->vsc_mtx);

	return (value);
}
1222
/*
 * PCI device-emulation registration: exposes this model to bhyve as
 * "virtio-net-viona" via the PCI_EMUL_SET linker set.
 */
struct pci_devemu pci_de_viona = {
	.pe_emu = "virtio-net-viona",
	.pe_init = pci_viona_init,
	.pe_legacy_config = pci_viona_legacy_config,
	.pe_barwrite = pci_viona_write,
	.pe_barread = pci_viona_read,
	.pe_baraddr = pci_viona_baraddr,
	.pe_lintrupdate = pci_viona_lintrupdate
};
PCI_EMUL_SET(pci_de_viona);
1233