xref: /illumos-gate/usr/src/cmd/bhyve/common/pci_virtio_viona.c (revision 5c4a5fe16715fb423db76577a6883b5bbecdbe45)
1 /*
2  * Copyright (c) 2011 NetApp, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 /*
27  * This file and its contents are supplied under the terms of the
28  * Common Development and Distribution License ("CDDL"), version 1.0.
29  * You may only use this file in accordance with the terms of version
30  * 1.0 of the CDDL.
31  *
32  * A full copy of the text of the CDDL should have accompanied this
33  * source.  A copy of the CDDL is also available via the Internet at
34  * http://www.illumos.org/license/CDDL.
35  *
36  * Copyright 2015 Pluribus Networks Inc.
37  * Copyright 2019 Joyent, Inc.
38  * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
39  */
40 
41 
42 #include <sys/param.h>
43 #include <sys/linker_set.h>
44 #include <sys/ioctl.h>
45 #include <sys/uio.h>
46 #include <sys/viona_io.h>
47 
48 #include <errno.h>
49 #include <fcntl.h>
50 #include <stdio.h>
51 #include <stdlib.h>
52 #include <stdint.h>
53 #include <string.h>
54 #include <strings.h>
55 #include <unistd.h>
56 #include <assert.h>
57 #include <pthread.h>
58 #include <signal.h>
59 #include <stdbool.h>
60 #include <poll.h>
61 #include <libdladm.h>
62 #include <libdllink.h>
63 #include <libdlvnic.h>
64 
65 #include <machine/vmm.h>
66 #include <vmmapi.h>
67 
68 #include "bhyverun.h"
69 #include "config.h"
70 #include "debug.h"
71 #include "pci_emul.h"
72 #include "virtio.h"
73 #include "iov.h"
74 #include "virtio_net.h"
75 
76 #define	VIONA_RINGSZ		1024
77 #define	VIONA_CTLQ_SIZE		64
78 #define	VIONA_CTLQ_MAXSEGS	32
79 
80 /*
81  * PCI config-space register offsets
82  */
83 #define	VIONA_R_CFG0	24
84 #define	VIONA_R_CFG1	25
85 #define	VIONA_R_CFG2	26
86 #define	VIONA_R_CFG3	27
87 #define	VIONA_R_CFG4	28
88 #define	VIONA_R_CFG5	29
89 #define	VIONA_R_CFG6	30
90 #define	VIONA_R_CFG7	31
91 #define	VIONA_R_MAX	31
92 
93 #define	VIONA_REGSZ	(VIONA_R_MAX + 1)
94 
95 /*
96  * Queue definitions.
97  */
98 #define	VIONA_RXQ	0
99 #define	VIONA_TXQ	1
100 #define	VIONA_CTLQ	2
101 
102 #define	VIONA_MAXQ	3
103 
104 /*
105  * Supplementary host capabilities provided in the userspace component.
106  */
107 #define	VIONA_S_HOSTCAPS_USERSPACE	(	\
108 	VIRTIO_NET_F_CTRL_VQ |			\
109 	VIRTIO_NET_F_CTRL_RX)
110 
111 /*
112  * Debug printf
113  */
114 static volatile int pci_viona_debug;
115 #define	DPRINTF(fmt, arg...) \
116 	do { \
117 		if (pci_viona_debug) { \
118 			FPRINTLN(stdout, fmt, ##arg); \
119 			fflush(stdout); \
120 		} \
121 	} while (0)
122 #define	WPRINTF(fmt, arg...) FPRINTLN(stderr, fmt, ##arg)
123 
124 /*
125  * Per-device softc
126  */
struct pci_viona_softc {
	/*
	 * Common virtio state.  This must remain the first member: the
	 * control-queue handlers recover the softc by casting the generic
	 * vq_vs pointer (see pci_viona_control_rx()).
	 */
	struct virtio_softc	vsc_vs;
	struct virtio_consts	vsc_consts;
	struct vqueue_info	vsc_queues[VIONA_MAXQ];
	pthread_mutex_t		vsc_mtx;	/* guards device state shared with the poll thread */

	datalink_id_t	vsc_linkid;	/* datalink id of the backing VNIC */
	int		vsc_vnafd;	/* fd for the viona control device */

	/* Configurable parameters */
	char		vsc_linkname[MAXLINKNAMELEN];	/* "vnic" option */
	uint32_t	vsc_feature_mask;	/* features masked out of negotiation */
	uint16_t	vsc_vq_size;		/* RX/TX ring size ("vqsize" option) */

	uint8_t		vsc_macaddr[6];		/* device MAC (guest-writable via CFG regs) */

	bool		vsc_resetting;
	bool		vsc_msix_active;	/* MSI-X enabled and not function-masked */

	viona_promisc_t	vsc_promisc;		/* Current promisc mode */
	bool		vsc_promisc_promisc;	/* PROMISC enabled */
	bool		vsc_promisc_allmulti;	/* ALLMULTI enabled */
	bool		vsc_promisc_umac;	/* unicast MACs sent */
	bool		vsc_promisc_mmac;	/* multicast MACs sent */
};
152 
/* Constants handed to the common virtio framework for this device. */
static struct virtio_consts viona_vi_consts = {
	.vc_name		= "viona",
	.vc_nvq			= VIONA_MAXQ,
	/*
	 * We use the common bhyve virtio framework so that we can call
	 * the utility functions to work with the queues handled in userspace.
	 * The framework PCI read/write functions are not used so these
	 * callbacks will not be invoked.
	 */
	.vc_cfgsize		= 0,
	.vc_reset		= NULL,
	.vc_qnotify		= NULL,
	.vc_cfgread		= NULL,
	.vc_cfgwrite		= NULL,
	.vc_apply_features	= NULL,
	/*
	 * The following field is populated using the response from the
	 * viona driver during initialisation, augmented with the additional
	 * capabilities emulated in userspace.
	 */
	.vc_hv_caps		= 0,
};
175 
176 /*
177  * Return the size of IO BAR that maps virtio header and device specific
178  * region. The size would vary depending on whether MSI-X is enabled or
179  * not.
180  */
181 static uint64_t
pci_viona_iosize(struct pci_devinst * pi)182 pci_viona_iosize(struct pci_devinst *pi)
183 {
184 	if (pci_msix_enabled(pi)) {
185 		return (VIONA_REGSZ);
186 	} else {
187 		return (VIONA_REGSZ -
188 		    (VIRTIO_PCI_CONFIG_OFF(1) - VIRTIO_PCI_CONFIG_OFF(0)));
189 	}
190 }
191 
192 static uint16_t
pci_viona_qsize(struct pci_viona_softc * sc,int qnum)193 pci_viona_qsize(struct pci_viona_softc *sc, int qnum)
194 {
195 	if (qnum == VIONA_CTLQ)
196 		return (VIONA_CTLQ_SIZE);
197 
198 	return (sc->vsc_vq_size);
199 }
200 
201 static void
pci_viona_ring_reset(struct pci_viona_softc * sc,int ring)202 pci_viona_ring_reset(struct pci_viona_softc *sc, int ring)
203 {
204 	assert(ring < VIONA_MAXQ);
205 
206 	switch (ring) {
207 	case VIONA_RXQ:
208 	case VIONA_TXQ:
209 		break;
210 	case VIONA_CTLQ:
211 	default:
212 		return;
213 	}
214 
215 	for (;;) {
216 		int res;
217 
218 		res = ioctl(sc->vsc_vnafd, VNA_IOC_RING_RESET, ring);
219 		if (res == 0) {
220 			break;
221 		} else if (errno != EINTR) {
222 			WPRINTF("ioctl viona ring %d reset failed %d",
223 			    ring, errno);
224 			return;
225 		}
226 	}
227 }
228 
229 static void
pci_viona_update_status(struct pci_viona_softc * sc,uint32_t value)230 pci_viona_update_status(struct pci_viona_softc *sc, uint32_t value)
231 {
232 
233 	if (value == 0) {
234 		DPRINTF("viona: device reset requested !");
235 
236 		vi_reset_dev(&sc->vsc_vs);
237 		pci_viona_ring_reset(sc, VIONA_RXQ);
238 		pci_viona_ring_reset(sc, VIONA_TXQ);
239 	}
240 
241 	sc->vsc_vs.vs_status = value;
242 }
243 
244 static const char *
pci_viona_promisc_descr(viona_promisc_t mode)245 pci_viona_promisc_descr(viona_promisc_t mode)
246 {
247 	switch (mode) {
248 	case VIONA_PROMISC_NONE:
249 		return ("none");
250 	case VIONA_PROMISC_MULTI:
251 		return ("multicast");
252 	case VIONA_PROMISC_ALL:
253 		return ("all");
254 	default:
255 		abort();
256 	}
257 }
258 
259 static int
pci_viona_eval_promisc(struct pci_viona_softc * sc)260 pci_viona_eval_promisc(struct pci_viona_softc *sc)
261 {
262 	viona_promisc_t mode = VIONA_PROMISC_NONE;
263 	int err = 0;
264 
265 	/*
266 	 * If the guest has explicitly requested promiscuous mode or has sent a
267 	 * non-empty unicast MAC address table, then set viona to promiscuous
268 	 * mode. Otherwise, if the guest has explicitly requested multicast
269 	 * promiscuity or has sent a non-empty multicast MAC address table,
270 	 * then set viona to multicast promiscuous mode.
271 	 */
272 	if (sc->vsc_promisc_promisc || sc->vsc_promisc_umac)
273 		mode = VIONA_PROMISC_ALL;
274 	else if (sc->vsc_promisc_allmulti || sc->vsc_promisc_mmac)
275 		mode = VIONA_PROMISC_MULTI;
276 
277 	if (mode != sc->vsc_promisc) {
278 		DPRINTF("viona: setting promiscuous mode to %d (%s)",
279 		    mode, pci_viona_promisc_descr(mode));
280 		DPRINTF("       promisc=%u, umac=%u, allmulti=%u, mmac=%u",
281 		    sc->vsc_promisc_promisc, sc->vsc_promisc_umac,
282 		    sc->vsc_promisc_allmulti, sc->vsc_promisc_mmac);
283 
284 		err = ioctl(sc->vsc_vnafd, VNA_IOC_SET_PROMISC, mode);
285 		if (err == 0)
286 			sc->vsc_promisc = mode;
287 		else
288 			WPRINTF("ioctl viona set promisc failed %d", errno);
289 	}
290 
291 	return (err);
292 }
293 
294 static uint8_t
pci_viona_control_rx(struct vqueue_info * vq,const virtio_net_ctrl_hdr_t * hdr,struct iovec * iov,size_t niov)295 pci_viona_control_rx(struct vqueue_info *vq, const virtio_net_ctrl_hdr_t *hdr,
296     struct iovec *iov, size_t niov)
297 {
298 	struct pci_viona_softc *sc = (struct pci_viona_softc *)vq->vq_vs;
299 	uint8_t v;
300 
301 	if (iov[0].iov_len != sizeof (uint8_t) || niov != 1) {
302 		EPRINTLN("viona: bad control RX data");
303 		return (VIRTIO_NET_CQ_ERR);
304 	}
305 
306 	v = *(uint8_t *)iov[0].iov_base;
307 
308 	switch (hdr->vnch_command) {
309 	case VIRTIO_NET_CTRL_RX_PROMISC:
310 		DPRINTF("viona: ctrl RX promisc %d", v);
311 		sc->vsc_promisc_promisc = (v != 0);
312 		break;
313 	case VIRTIO_NET_CTRL_RX_ALLMULTI:
314 		DPRINTF("viona: ctrl RX allmulti %d", v);
315 		sc->vsc_promisc_allmulti = (v != 0);
316 		break;
317 	default:
318 		/*
319 		 * VIRTIO_NET_F_CTRL_RX_EXTRA was not offered so no other
320 		 * commands are expected.
321 		 */
322 		EPRINTLN("viona: unrecognised RX control cmd %u",
323 		    hdr->vnch_command);
324 		return (VIRTIO_NET_CQ_ERR);
325 	}
326 
327 	if (pci_viona_eval_promisc(sc) == 0)
328 		return (VIRTIO_NET_CQ_OK);
329 	return (VIRTIO_NET_CQ_ERR);
330 }
331 
332 static void
pci_viona_control_mac_dump(const char * tag,const struct iovec * iov)333 pci_viona_control_mac_dump(const char *tag, const struct iovec *iov)
334 {
335 	virtio_net_ctrl_mac_t *table = (virtio_net_ctrl_mac_t *)iov->iov_base;
336 	ether_addr_t *mac = &table->vncm_mac;
337 
338 	DPRINTF("-- %s MAC TABLE (entries: %u)", tag, table->vncm_entries);
339 
340 	if (table->vncm_entries * ETHERADDRL !=
341 	    iov->iov_len - sizeof (table->vncm_entries)) {
342 		DPRINTF("   Bad table size %u", iov->iov_len);
343 		return;
344 	}
345 
346 	for (uint32_t i = 0; i < table->vncm_entries; i++) {
347 		DPRINTF("   [%2d] %s", i, ether_ntoa((struct ether_addr *)mac));
348 		mac++;
349 	}
350 }
351 
352 static uint8_t
pci_viona_control_mac(struct vqueue_info * vq,const virtio_net_ctrl_hdr_t * hdr,struct iovec * iov,size_t niov)353 pci_viona_control_mac(struct vqueue_info *vq, const virtio_net_ctrl_hdr_t *hdr,
354     struct iovec *iov, size_t niov)
355 {
356 	struct pci_viona_softc *sc = (struct pci_viona_softc *)vq->vq_vs;
357 
358 	switch (hdr->vnch_command) {
359 	case VIRTIO_NET_CTRL_MAC_TABLE_SET: {
360 		virtio_net_ctrl_mac_t *table;
361 
362 		DPRINTF("viona: ctrl MAC table set");
363 
364 		if (niov != 2) {
365 			EPRINTLN("viona: bad control MAC data");
366 			return (VIRTIO_NET_CQ_ERR);
367 		}
368 
369 		/*
370 		 * We advertise VIRTIO_NET_F_CTRL_RX and therefore need to
371 		 * accept VIRTIO_NET_CTRL_MAC, but we don't support passing
372 		 * changes in the MAC address lists down to viona.
373 		 * Instead, we set flags to indicate if the guest has sent
374 		 * any MAC addresses for each table, and use these to determine
375 		 * the resulting promiscuous mode, see pci_viona_eval_promisc()
376 		 * above.
377 		 */
378 
379 		/* Unicast MAC table */
380 		table = (virtio_net_ctrl_mac_t *)iov[0].iov_base;
381 		sc->vsc_promisc_umac = (table->vncm_entries != 0);
382 		if (pci_viona_debug)
383 			pci_viona_control_mac_dump("UNICAST", &iov[0]);
384 
385 		/* Multicast MAC table */
386 		table = (virtio_net_ctrl_mac_t *)iov[1].iov_base;
387 		sc->vsc_promisc_mmac = (table->vncm_entries != 0);
388 		if (pci_viona_debug)
389 			pci_viona_control_mac_dump("MULTICAST", &iov[1]);
390 
391 		break;
392 	}
393 	case VIRTIO_NET_CTRL_MAC_ADDR_SET:
394 		/* disallow setting the primary filter MAC address */
395 		DPRINTF("viona: ctrl MAC addr set %d", niov);
396 		return (VIRTIO_NET_CQ_ERR);
397 	default:
398 		EPRINTLN("viona: unrecognised MAC control cmd %u",
399 		    hdr->vnch_command);
400 		return (VIRTIO_NET_CQ_ERR);
401 	}
402 
403 	if (pci_viona_eval_promisc(sc) == 0)
404 		return (VIRTIO_NET_CQ_OK);
405 	return (VIRTIO_NET_CQ_ERR);
406 }
407 
408 static void
pci_viona_control(struct vqueue_info * vq)409 pci_viona_control(struct vqueue_info *vq)
410 {
411 	struct iovec iov[VIONA_CTLQ_MAXSEGS + 1];
412 	const virtio_net_ctrl_hdr_t *hdr;
413 	struct iovec *siov = iov;
414 	struct vi_req req = { 0 };
415 	uint8_t *ackp;
416 	size_t nsiov;
417 	uint32_t len;
418 	int n;
419 
420 	n = vq_getchain(vq, iov, VIONA_CTLQ_MAXSEGS, &req);
421 
422 	assert(n >= 1 && n <= VIONA_CTLQ_MAXSEGS);
423 
424 	/*
425 	 * Since we have not negotiated VIRTIO_F_ANY_LAYOUT, we expect the
426 	 * control message to be laid out in at least three descriptors as
427 	 * follows:
428 	 *	header		- sizeof (virtio_net_ctrl_hdr_t)
429 	 *	data[]		- at least one descriptor, varying size
430 	 *	ack		- uint8_t, flagged as writable
431 	 * Check the incoming message to make sure it matches this layout and
432 	 * drop the entire chain if not.
433 	 */
434 	if (n < 3 || req.writable != 1 || req.readable + 1 != n ||
435 	    iov[req.readable].iov_len != sizeof (uint8_t)) {
436 		EPRINTLN("viona: bad control chain, len=%d, w=%d, r=%d",
437 		    n, req.writable, req.readable);
438 		goto drop;
439 	}
440 
441 	hdr = (const virtio_net_ctrl_hdr_t *)iov[0].iov_base;
442 	if (iov[0].iov_len < sizeof (virtio_net_ctrl_hdr_t)) {
443 		EPRINTLN("viona: control header too short: %u", iov[0].iov_len);
444 		goto drop;
445 	}
446 
447 	/*
448 	 * Writable iovecs start at iov[req.readable], and we've already
449 	 * checked that there is only one writable, it's at the end, and the
450 	 * right size; it's the acknowledgement byte.
451 	 */
452 	ackp = (uint8_t *)iov[req.readable].iov_base;
453 
454 	siov = &iov[1];
455 	nsiov = n - 2;
456 
457 	switch (hdr->vnch_class) {
458 	case VIRTIO_NET_CTRL_RX:
459 		*ackp = pci_viona_control_rx(vq, hdr, siov, nsiov);
460 		break;
461 	case VIRTIO_NET_CTRL_MAC:
462 		*ackp = pci_viona_control_mac(vq, hdr, siov, nsiov);
463 		break;
464 	default:
465 		EPRINTLN("viona: unrecognised control class %u, cmd %u",
466 		    hdr->vnch_class, hdr->vnch_command);
467 		*ackp = VIRTIO_NET_CQ_ERR;
468 		break;
469 	}
470 
471 drop:
472 	len = 0;
473 	for (uint_t i = 0; i < n; i++)
474 		len += iov[i].iov_len;
475 
476 	vq_relchain(vq, req.idx, len);
477 }
478 
/*
 * Drain the control queue, suppressing guest kicks while working, and
 * signal completion interrupts once the queue is empty.
 */
static void
pci_viona_process_ctrlq(struct vqueue_info *vq)
{
	bool again = true;

	while (again) {
		vq_kick_disable(vq);

		while (vq_has_descs(vq))
			pci_viona_control(vq);

		vq_kick_enable(vq);

		/*
		 * One more check in case a late addition raced with
		 * re-enabling kicks. Note that vq_kick_enable() includes a
		 * memory barrier.
		 */
		again = vq_has_descs(vq);
	}

	vq_endchains(vq, /* used_all_avail= */1);
}
502 
/*
 * Interrupt-proxy thread: blocks in poll() on the viona control fd and,
 * when the kernel driver signals ring interrupts (POLLRDBAND), forwards
 * them to the guest as MSI-X messages or a shared legacy INTx assertion,
 * clearing each ring's pending state afterwards.
 */
static void *
pci_viona_poll_thread(void *param)
{
	struct pci_viona_softc *sc = param;
	pollfd_t pollset;
	const int fd = sc->vsc_vnafd;

	pollset.fd = fd;
	pollset.events = POLLRDBAND;

	for (;;) {
		/* Block indefinitely until the driver has something for us. */
		if (poll(&pollset, 1, -1) < 0) {
			if (errno == EINTR || errno == EAGAIN) {
				continue;
			} else {
				/* Unexpected poll failure: give up entirely. */
				WPRINTF("pci_viona_poll_thread poll() error %d",
				    errno);
				break;
			}
		}
		if (pollset.revents & POLLRDBAND) {
			vioc_intr_poll_t vip;
			uint_t i;
			int res;
			bool assert_lintr = false;
			const bool do_msix = pci_msix_enabled(sc->vsc_vs.vs_pi);

			/* Fetch the per-ring pending-interrupt status. */
			res = ioctl(fd, VNA_IOC_INTR_POLL, &vip);
			for (i = 0; res > 0 && i < VIONA_VQ_MAX; i++) {
				if (vip.vip_status[i] == 0) {
					continue;
				}
				if (do_msix) {
					pci_generate_msix(sc->vsc_vs.vs_pi,
					    sc->vsc_queues[i].vq_msix_idx);
				} else {
					/* Defer INTx until all rings are cleared. */
					assert_lintr = true;
				}
				/* Acknowledge the ring so it can interrupt again. */
				res = ioctl(fd, VNA_IOC_RING_INTR_CLR, i);
				if (res != 0) {
					WPRINTF("ioctl viona vq %d intr "
					    "clear failed %d", i, errno);
				}
			}
			if (assert_lintr) {
				/* ISR byte and INTx level are guarded by vsc_mtx. */
				pthread_mutex_lock(&sc->vsc_mtx);
				sc->vsc_vs.vs_isr |= VIRTIO_PCI_ISR_INTR;
				pci_lintr_assert(sc->vsc_vs.vs_pi);
				pthread_mutex_unlock(&sc->vsc_mtx);
			}
		}
	}

	pthread_exit(NULL);
}
558 
559 static void
pci_viona_ring_init(struct pci_viona_softc * sc,uint64_t pfn)560 pci_viona_ring_init(struct pci_viona_softc *sc, uint64_t pfn)
561 {
562 	int			qnum = sc->vsc_vs.vs_curq;
563 	vioc_ring_init_t	vna_ri;
564 	int			error;
565 
566 	assert(qnum < VIONA_MAXQ);
567 
568 	if (qnum == VIONA_CTLQ) {
569 		vi_vq_init(&sc->vsc_vs, pfn);
570 		return;
571 	}
572 
573 	sc->vsc_queues[qnum].vq_pfn = (pfn << VRING_PFN);
574 	vna_ri.ri_index = qnum;
575 	vna_ri.ri_qsize = pci_viona_qsize(sc, qnum);
576 	vna_ri.ri_qaddr = (pfn << VRING_PFN);
577 	error = ioctl(sc->vsc_vnafd, VNA_IOC_RING_INIT, &vna_ri);
578 
579 	if (error != 0) {
580 		WPRINTF("ioctl viona ring %u init failed %d", qnum, errno);
581 	}
582 }
583 
584 static int
pci_viona_viona_init(struct vmctx * ctx,struct pci_viona_softc * sc)585 pci_viona_viona_init(struct vmctx *ctx, struct pci_viona_softc *sc)
586 {
587 	vioc_create_t		vna_create;
588 	int			error;
589 
590 	sc->vsc_vnafd = open("/dev/viona", O_RDWR | O_EXCL);
591 	if (sc->vsc_vnafd == -1) {
592 		WPRINTF("open viona ctl failed: %d", errno);
593 		return (-1);
594 	}
595 
596 	vna_create.c_linkid = sc->vsc_linkid;
597 	vna_create.c_vmfd = vm_get_device_fd(ctx);
598 	error = ioctl(sc->vsc_vnafd, VNA_IOC_CREATE, &vna_create);
599 	if (error != 0) {
600 		(void) close(sc->vsc_vnafd);
601 		WPRINTF("ioctl viona create failed %d", errno);
602 		return (-1);
603 	}
604 
605 	return (0);
606 }
607 
608 static int
pci_viona_legacy_config(nvlist_t * nvl,const char * opt)609 pci_viona_legacy_config(nvlist_t *nvl, const char *opt)
610 {
611 	char *config, *name, *tofree, *value;
612 
613 	if (opt == NULL)
614 		return (0);
615 
616 	config = tofree = strdup(opt);
617 	while ((name = strsep(&config, ",")) != NULL) {
618 		value = strchr(name, '=');
619 		if (value != NULL) {
620 			*value++ = '\0';
621 			set_config_value_node(nvl, name, value);
622 		} else {
623 			set_config_value_node(nvl, "vnic", name);
624 		}
625 	}
626 	free(tofree);
627 	return (0);
628 }
629 
630 static int
pci_viona_parse_opts(struct pci_viona_softc * sc,nvlist_t * nvl)631 pci_viona_parse_opts(struct pci_viona_softc *sc, nvlist_t *nvl)
632 {
633 	const char *value;
634 	int err = 0;
635 
636 	sc->vsc_vq_size = VIONA_RINGSZ;
637 	sc->vsc_feature_mask = 0;
638 	sc->vsc_linkname[0] = '\0';
639 
640 	value = get_config_value_node(nvl, "feature_mask");
641 	if (value != NULL) {
642 		long num;
643 
644 		errno = 0;
645 		num = strtol(value, NULL, 0);
646 		if (errno != 0 || num < 0) {
647 			fprintf(stderr,
648 			    "viona: invalid mask '%s'", value);
649 		} else {
650 			sc->vsc_feature_mask = num;
651 		}
652 	}
653 
654 	value = get_config_value_node(nvl, "vqsize");
655 	if (value != NULL) {
656 		long num;
657 
658 		errno = 0;
659 		num = strtol(value, NULL, 0);
660 		if (errno != 0) {
661 			fprintf(stderr,
662 			    "viona: invalid vsqize '%s'", value);
663 			err = -1;
664 		} else if (num <= 2 || num > 32768) {
665 			fprintf(stderr,
666 			    "viona: vqsize out of range", num);
667 			err = -1;
668 		} else if ((1 << (ffs(num) - 1)) != num) {
669 			fprintf(stderr,
670 			    "viona: vqsize must be power of 2", num);
671 			err = -1;
672 		} else {
673 			sc->vsc_vq_size = num;
674 		}
675 	}
676 
677 	value = get_config_value_node(nvl, "vnic");
678 	if (value == NULL) {
679 		fprintf(stderr, "viona: vnic name required");
680 		err = -1;
681 	} else {
682 		(void) strlcpy(sc->vsc_linkname, value, MAXLINKNAMELEN);
683 	}
684 
685 	DPRINTF("viona=%p dev=%s vqsize=%x feature_mask=%x", sc,
686 	    sc->vsc_linkname, sc->vsc_vq_size, sc->vsc_feature_mask);
687 	return (err);
688 }
689 
690 static int
pci_viona_init(struct pci_devinst * pi,nvlist_t * nvl)691 pci_viona_init(struct pci_devinst *pi, nvlist_t *nvl)
692 {
693 	dladm_handle_t		handle;
694 	dladm_status_t		status;
695 	dladm_vnic_attr_t	attr;
696 	char			errmsg[DLADM_STRSIZE];
697 	char			tname[MAXCOMLEN + 1];
698 	int error, i;
699 	struct pci_viona_softc *sc;
700 	const char *vnic;
701 	pthread_t tid;
702 
703 	if (get_config_bool_default("viona.debug", false))
704 		pci_viona_debug = 1;
705 
706 	vnic = get_config_value_node(nvl, "vnic");
707 	if (vnic == NULL) {
708 		WPRINTF("virtio-viona: vnic required");
709 		return (1);
710 	}
711 
712 	sc = malloc(sizeof (struct pci_viona_softc));
713 	memset(sc, 0, sizeof (struct pci_viona_softc));
714 
715 	if (pci_viona_parse_opts(sc, nvl) != 0) {
716 		free(sc);
717 		return (1);
718 	}
719 
720 	if ((status = dladm_open(&handle)) != DLADM_STATUS_OK) {
721 		WPRINTF("could not open /dev/dld");
722 		free(sc);
723 		return (1);
724 	}
725 
726 	if ((status = dladm_name2info(handle, sc->vsc_linkname, &sc->vsc_linkid,
727 	    NULL, NULL, NULL)) != DLADM_STATUS_OK) {
728 		WPRINTF("dladm_name2info() for %s failed: %s", vnic,
729 		    dladm_status2str(status, errmsg));
730 		dladm_close(handle);
731 		free(sc);
732 		return (1);
733 	}
734 
735 	if ((status = dladm_vnic_info(handle, sc->vsc_linkid, &attr,
736 	    DLADM_OPT_ACTIVE)) != DLADM_STATUS_OK) {
737 		WPRINTF("dladm_vnic_info() for %s failed: %s", vnic,
738 		    dladm_status2str(status, errmsg));
739 		dladm_close(handle);
740 		free(sc);
741 		return (1);
742 	}
743 
744 	memcpy(sc->vsc_macaddr, attr.va_mac_addr, ETHERADDRL);
745 
746 	dladm_close(handle);
747 
748 	error = pci_viona_viona_init(pi->pi_vmctx, sc);
749 	if (error != 0) {
750 		free(sc);
751 		return (1);
752 	}
753 
754 	error = pthread_create(&tid, NULL, pci_viona_poll_thread, sc);
755 	assert(error == 0);
756 	snprintf(tname, sizeof (tname), "vionapoll:%s", vnic);
757 	pthread_set_name_np(tid, tname);
758 
759 	/* initialize config space */
760 	pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET);
761 	pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
762 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
763 	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_ID_NETWORK);
764 	pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);
765 
766 	sc->vsc_consts = viona_vi_consts;
767 	pthread_mutex_init(&sc->vsc_mtx, NULL);
768 
769 	/*
770 	 * The RX and TX queues are handled in the kernel component of
771 	 * viona; however The control queue is emulated in userspace.
772 	 */
773 	sc->vsc_queues[VIONA_CTLQ].vq_qsize = pci_viona_qsize(sc, VIONA_CTLQ);
774 
775 	vi_softc_linkup(&sc->vsc_vs, &sc->vsc_consts, sc, pi, sc->vsc_queues);
776 	sc->vsc_vs.vs_mtx = &sc->vsc_mtx;
777 
778 	/*
779 	 * Guests that do not support CTRL_RX_MAC still generally need to
780 	 * receive multicast packets. Guests that do support this feature will
781 	 * end up setting this flag indirectly via messages on the control
782 	 * queue but it does not hurt to default to multicast promiscuity here
783 	 * and it is what older version of viona did.
784 	 */
785 	sc->vsc_promisc_mmac = true;
786 	pci_viona_eval_promisc(sc);
787 
788 	/* MSI-X support */
789 	for (i = 0; i < VIONA_MAXQ; i++)
790 		sc->vsc_queues[i].vq_msix_idx = VIRTIO_MSI_NO_VECTOR;
791 
792 	/* BAR 1 used to map MSI-X table and PBA */
793 	if (pci_emul_add_msixcap(pi, VIONA_MAXQ, 1)) {
794 		free(sc);
795 		return (1);
796 	}
797 
798 	/* BAR 0 for legacy-style virtio register access. */
799 	error = pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VIONA_REGSZ);
800 	if (error != 0) {
801 		WPRINTF("could not allocate virtio BAR");
802 		free(sc);
803 		return (1);
804 	}
805 
806 	/*
807 	 * Need a legacy interrupt for virtio compliance, even though MSI-X
808 	 * operation is _strongly_ suggested for adequate performance.
809 	 */
810 	pci_lintr_request(pi);
811 
812 	return (0);
813 }
814 
static uint64_t
viona_adjust_offset(struct pci_devinst *pi, uint64_t offset)
{
	/*
	 * Device specific offsets used by guest would change based on
	 * whether MSI-X capability is enabled or not
	 */
	if (pci_msix_enabled(pi) || offset < VIRTIO_PCI_CONFIG_OFF(0))
		return (offset);

	/* MSI-X disabled: shift into the device-specific region. */
	return (offset +
	    (VIRTIO_PCI_CONFIG_OFF(1) - VIRTIO_PCI_CONFIG_OFF(0)));
}
831 
832 static void
pci_viona_ring_set_msix(struct pci_devinst * pi,uint_t ring)833 pci_viona_ring_set_msix(struct pci_devinst *pi, uint_t ring)
834 {
835 	struct pci_viona_softc *sc = pi->pi_arg;
836 	struct msix_table_entry mte;
837 	uint16_t tab_index;
838 	vioc_ring_msi_t vrm;
839 	int res;
840 
841 	if (ring == VIONA_CTLQ)
842 		return;
843 
844 	assert(ring <= VIONA_VQ_TX);
845 
846 	vrm.rm_index = ring;
847 	vrm.rm_addr = 0;
848 	vrm.rm_msg = 0;
849 	tab_index = sc->vsc_queues[ring].vq_msix_idx;
850 
851 	if (tab_index != VIRTIO_MSI_NO_VECTOR && sc->vsc_msix_active) {
852 		mte = pi->pi_msix.table[tab_index];
853 		if ((mte.vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
854 			vrm.rm_addr = mte.addr;
855 			vrm.rm_msg = mte.msg_data;
856 		}
857 	}
858 
859 	res = ioctl(sc->vsc_vnafd, VNA_IOC_RING_SET_MSI, &vrm);
860 	if (res != 0) {
861 		WPRINTF("ioctl viona set_msi %d failed %d", ring, errno);
862 	}
863 }
864 
865 static void
pci_viona_lintrupdate(struct pci_devinst * pi)866 pci_viona_lintrupdate(struct pci_devinst *pi)
867 {
868 	struct pci_viona_softc *sc = pi->pi_arg;
869 	bool msix_on = false;
870 
871 	pthread_mutex_lock(&sc->vsc_mtx);
872 	msix_on = pci_msix_enabled(pi) && (pi->pi_msix.function_mask == 0);
873 	if ((sc->vsc_msix_active && !msix_on) ||
874 	    (msix_on && !sc->vsc_msix_active)) {
875 		uint_t i;
876 
877 		sc->vsc_msix_active = msix_on;
878 		/* Update in-kernel ring configs */
879 		for (i = 0; i <= VIONA_VQ_TX; i++) {
880 			pci_viona_ring_set_msix(pi, i);
881 		}
882 	}
883 	pthread_mutex_unlock(&sc->vsc_mtx);
884 }
885 
886 static void
pci_viona_msix_update(struct pci_devinst * pi,uint64_t offset)887 pci_viona_msix_update(struct pci_devinst *pi, uint64_t offset)
888 {
889 	struct pci_viona_softc *sc = pi->pi_arg;
890 	uint_t tab_index, i;
891 
892 	pthread_mutex_lock(&sc->vsc_mtx);
893 	if (!sc->vsc_msix_active) {
894 		pthread_mutex_unlock(&sc->vsc_mtx);
895 		return;
896 	}
897 
898 	/*
899 	 * Rather than update every possible MSI-X vector, cheat and use the
900 	 * offset to calculate the entry within the table.  Since this should
901 	 * only be called when a write to the table succeeds, the index should
902 	 * be valid.
903 	 */
904 	tab_index = offset / MSIX_TABLE_ENTRY_SIZE;
905 
906 	for (i = 0; i <= VIONA_VQ_TX; i++) {
907 		if (sc->vsc_queues[i].vq_msix_idx != tab_index) {
908 			continue;
909 		}
910 		pci_viona_ring_set_msix(pi, i);
911 	}
912 
913 	pthread_mutex_unlock(&sc->vsc_mtx);
914 }
915 
916 static void
pci_viona_qnotify(struct pci_viona_softc * sc,int ring)917 pci_viona_qnotify(struct pci_viona_softc *sc, int ring)
918 {
919 	int error;
920 
921 	switch (ring) {
922 	case VIONA_TXQ:
923 	case VIONA_RXQ:
924 		error = ioctl(sc->vsc_vnafd, VNA_IOC_RING_KICK, ring);
925 		if (error != 0) {
926 			WPRINTF("ioctl viona ring %d kick failed %d",
927 			    ring, errno);
928 		}
929 		break;
930 	case VIONA_CTLQ: {
931 		struct vqueue_info *vq = &sc->vsc_queues[VIONA_CTLQ];
932 
933 		if (vq_has_descs(vq))
934 			pci_viona_process_ctrlq(vq);
935 		break;
936 	}
937 	}
938 }
939 
940 static void
pci_viona_baraddr(struct pci_devinst * pi,int baridx,int enabled,uint64_t address)941 pci_viona_baraddr(struct pci_devinst *pi, int baridx, int enabled,
942     uint64_t address)
943 {
944 	struct pci_viona_softc *sc = pi->pi_arg;
945 	uint64_t ioport;
946 	int error;
947 
948 	if (baridx != 0)
949 		return;
950 
951 	if (enabled == 0) {
952 		error = ioctl(sc->vsc_vnafd, VNA_IOC_SET_NOTIFY_IOP, 0);
953 		if (error != 0)
954 			WPRINTF("uninstall ioport hook failed %d", errno);
955 		return;
956 	}
957 
958 	/*
959 	 * Install ioport hook for virtqueue notification.
960 	 * This is part of the virtio common configuration area so the
961 	 * address does not change with MSI-X status.
962 	 */
963 	ioport = address + VIRTIO_PCI_QUEUE_NOTIFY;
964 	error = ioctl(sc->vsc_vnafd, VNA_IOC_SET_NOTIFY_IOP, ioport);
965 	if (error != 0) {
966 		WPRINTF("install ioport hook at %x failed %d",
967 		    ioport, errno);
968 	}
969 }
970 
/*
 * BAR write handler: dispatches guest writes to the legacy virtio common
 * header registers and the viona device-specific (MAC address) registers.
 * MSI-X table/PBA writes are forwarded to the PCI emulation layer.
 */
static void
pci_viona_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size,
    uint64_t value)
{
	struct pci_viona_softc *sc = pi->pi_arg;
	void *ptr;
	int err = 0;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		if (pci_emul_msix_twrite(pi, offset, size, value) == 0) {
			/* Table write succeeded: sync new vector into viona. */
			pci_viona_msix_update(pi, offset);
		}
		return;
	}

	assert(baridx == 0);

	/* Reject accesses beyond the (MSI-X-dependent) register window. */
	if (offset + size > pci_viona_iosize(pi)) {
		DPRINTF("viona_write: 2big, offset %ld size %d",
		    offset, size);
		return;
	}

	pthread_mutex_lock(&sc->vsc_mtx);

	/* Normalise for the register shift that occurs when MSI-X is off. */
	offset = viona_adjust_offset(pi, offset);

	switch (offset) {
	case VIRTIO_PCI_GUEST_FEATURES:
		assert(size == 4);
		/* Strip administratively-masked features before negotiation. */
		value &= ~(sc->vsc_feature_mask);
		err = ioctl(sc->vsc_vnafd, VNA_IOC_SET_FEATURES, &value);
		if (err != 0) {
			WPRINTF("ioctl feature negotiation returned err = %d",
			    errno);
		} else {
			sc->vsc_vs.vs_negotiated_caps = value;
		}
		break;
	case VIRTIO_PCI_QUEUE_PFN:
		assert(size == 4);
		pci_viona_ring_init(sc, value);
		break;
	case VIRTIO_PCI_QUEUE_SEL:
		assert(size == 2);
		assert(value < VIONA_MAXQ);
		sc->vsc_vs.vs_curq = value;
		break;
	case VIRTIO_PCI_QUEUE_NOTIFY:
		assert(size == 2);
		assert(value < VIONA_MAXQ);
		pci_viona_qnotify(sc, value);
		break;
	case VIRTIO_PCI_STATUS:
		assert(size == 1);
		/* A zero write triggers a full device reset. */
		pci_viona_update_status(sc, value);
		break;
	case VIRTIO_MSI_CONFIG_VECTOR:
		assert(size == 2);
		sc->vsc_vs.vs_msix_cfg_idx = value;
		break;
	case VIRTIO_MSI_QUEUE_VECTOR:
		assert(size == 2);
		assert(sc->vsc_vs.vs_curq < VIONA_MAXQ);
		sc->vsc_queues[sc->vsc_vs.vs_curq].vq_msix_idx = value;
		/* Propagate the new vector choice to the kernel driver. */
		pci_viona_ring_set_msix(pi, sc->vsc_vs.vs_curq);
		break;
	case VIONA_R_CFG0:
	case VIONA_R_CFG1:
	case VIONA_R_CFG2:
	case VIONA_R_CFG3:
	case VIONA_R_CFG4:
	case VIONA_R_CFG5:
		assert((size + offset) <= (VIONA_R_CFG5 + 1));
		ptr = &sc->vsc_macaddr[offset - VIONA_R_CFG0];
		/*
		 * The driver is allowed to change the MAC address
		 */
		/*
		 * NOTE(review): this single-byte store appears redundant —
		 * the sized store through 'ptr' below overwrites the same
		 * location; confirm before removing.
		 */
		sc->vsc_macaddr[offset - VIONA_R_CFG0] = value;
		if (size == 1) {
			*(uint8_t *)ptr = value;
		} else if (size == 2) {
			*(uint16_t *)ptr = value;
		} else {
			*(uint32_t *)ptr = value;
		}
		break;
	case VIRTIO_PCI_HOST_FEATURES:
	case VIRTIO_PCI_QUEUE_NUM:
	case VIRTIO_PCI_ISR:
	case VIONA_R_CFG6:
	case VIONA_R_CFG7:
		DPRINTF("viona: write to readonly reg %ld", offset);
		break;
	default:
		DPRINTF("viona: unknown i/o write offset %ld", offset);
		value = 0;
		break;
	}

	pthread_mutex_unlock(&sc->vsc_mtx);
}
1074 
1075 static uint64_t
pci_viona_read(struct pci_devinst * pi,int baridx,uint64_t offset,int size)1076 pci_viona_read(struct pci_devinst *pi, int baridx, uint64_t offset, int size)
1077 {
1078 	struct pci_viona_softc *sc = pi->pi_arg;
1079 	void *ptr;
1080 	uint64_t value;
1081 	int err = 0;
1082 
1083 	if (baridx == pci_msix_table_bar(pi) ||
1084 	    baridx == pci_msix_pba_bar(pi)) {
1085 		return (pci_emul_msix_tread(pi, offset, size));
1086 	}
1087 
1088 	assert(baridx == 0);
1089 
1090 	if (offset + size > pci_viona_iosize(pi)) {
1091 		DPRINTF("viona_read: 2big, offset %ld size %d",
1092 		    offset, size);
1093 		return (0);
1094 	}
1095 
1096 	pthread_mutex_lock(&sc->vsc_mtx);
1097 
1098 	offset = viona_adjust_offset(pi, offset);
1099 
1100 	switch (offset) {
1101 	case VIRTIO_PCI_HOST_FEATURES:
1102 		assert(size == 4);
1103 		err = ioctl(sc->vsc_vnafd, VNA_IOC_GET_FEATURES, &value);
1104 		if (err != 0) {
1105 			WPRINTF("ioctl get host features returned err = %d",
1106 			    errno);
1107 		}
1108 		value |= VIONA_S_HOSTCAPS_USERSPACE;
1109 		value &= ~sc->vsc_feature_mask;
1110 		sc->vsc_consts.vc_hv_caps = value;
1111 		break;
1112 	case VIRTIO_PCI_GUEST_FEATURES:
1113 		assert(size == 4);
1114 		value = sc->vsc_vs.vs_negotiated_caps; /* XXX never read ? */
1115 		break;
1116 	case VIRTIO_PCI_QUEUE_PFN:
1117 		assert(size == 4);
1118 		value = sc->vsc_queues[sc->vsc_vs.vs_curq].vq_pfn >> VRING_PFN;
1119 		break;
1120 	case VIRTIO_PCI_QUEUE_NUM:
1121 		assert(size == 2);
1122 		value = pci_viona_qsize(sc, sc->vsc_vs.vs_curq);
1123 		break;
1124 	case VIRTIO_PCI_QUEUE_SEL:
1125 		assert(size == 2);
1126 		value = sc->vsc_vs.vs_curq;  /* XXX never read ? */
1127 		break;
1128 	case VIRTIO_PCI_QUEUE_NOTIFY:
1129 		assert(size == 2);
1130 		value = sc->vsc_vs.vs_curq;  /* XXX never read ? */
1131 		break;
1132 	case VIRTIO_PCI_STATUS:
1133 		assert(size == 1);
1134 		value = sc->vsc_vs.vs_status;
1135 		break;
1136 	case VIRTIO_PCI_ISR:
1137 		assert(size == 1);
1138 		value = sc->vsc_vs.vs_isr;
1139 		sc->vsc_vs.vs_isr = 0;	/* a read clears this flag */
1140 		if (value != 0) {
1141 			pci_lintr_deassert(pi);
1142 		}
1143 		break;
1144 	case VIRTIO_MSI_CONFIG_VECTOR:
1145 		assert(size == 2);
1146 		value = sc->vsc_vs.vs_msix_cfg_idx;
1147 		break;
1148 	case VIRTIO_MSI_QUEUE_VECTOR:
1149 		assert(size == 2);
1150 		assert(sc->vsc_vs.vs_curq < VIONA_MAXQ);
1151 		value = sc->vsc_queues[sc->vsc_vs.vs_curq].vq_msix_idx;
1152 		break;
1153 	case VIONA_R_CFG0:
1154 	case VIONA_R_CFG1:
1155 	case VIONA_R_CFG2:
1156 	case VIONA_R_CFG3:
1157 	case VIONA_R_CFG4:
1158 	case VIONA_R_CFG5:
1159 		assert((size + offset) <= (VIONA_R_CFG5 + 1));
1160 		ptr = &sc->vsc_macaddr[offset - VIONA_R_CFG0];
1161 		if (size == 1) {
1162 			value = *(uint8_t *)ptr;
1163 		} else if (size == 2) {
1164 			value = *(uint16_t *)ptr;
1165 		} else {
1166 			value = *(uint32_t *)ptr;
1167 		}
1168 		break;
1169 	case VIONA_R_CFG6:
1170 		assert(size != 4);
1171 		value = 0x01;	/* XXX link always up */
1172 		break;
1173 	case VIONA_R_CFG7:
1174 		assert(size == 1);
1175 		value = 0;	/* XXX link status in LSB */
1176 		break;
1177 	default:
1178 		DPRINTF("viona: unknown i/o read offset %ld", offset);
1179 		value = 0;
1180 		break;
1181 	}
1182 
1183 	pthread_mutex_unlock(&sc->vsc_mtx);
1184 
1185 	return (value);
1186 }
1187 
/*
 * PCI device emulation descriptor for "virtio-net-viona".  It wires the
 * emulation name to its init, legacy-config, BAR read/write/address, and
 * legacy-interrupt-update callbacks; PCI_EMUL_SET() registers the
 * descriptor with the PCI emulation framework so the device can be
 * selected by name on the bhyve command line / config.
 */
struct pci_devemu pci_de_viona = {
	.pe_emu =	"virtio-net-viona",
	.pe_init =	pci_viona_init,
	.pe_legacy_config = pci_viona_legacy_config,
	.pe_barwrite =	pci_viona_write,
	.pe_barread =	pci_viona_read,
	.pe_baraddr =	pci_viona_baraddr,
	.pe_lintrupdate = pci_viona_lintrupdate
};
PCI_EMUL_SET(pci_de_viona);
1198