xref: /illumos-gate/usr/src/cmd/bhyve/pci_virtio_net.c (revision 8515d723262b57176aeeda8734edbe79fe1e7a5a)
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2013 Pluribus Networks Inc.
 * Copyright 2018 Joyent, Inc.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/linker_set.h>
#include <sys/select.h>
#include <sys/uio.h>
#include <sys/ioctl.h>
#include <net/ethernet.h>
#ifdef __FreeBSD__
#ifndef NETMAP_WITH_LIBS
#define NETMAP_WITH_LIBS
#endif
#include <net/netmap_user.h>
#endif

#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>
#include <assert.h>
#include <md5.h>
#include <pthread.h>
#include <pthread_np.h>
#include <sysexits.h>
#ifndef __FreeBSD__
#include <poll.h>
#include <libdlpi.h>
#endif

#include "bhyverun.h"
#include "debug.h"
#include "pci_emul.h"
#ifdef __FreeBSD__
#include "mevent.h"
#endif
#include "virtio.h"
#include "net_utils.h"

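/*
 * Virtqueue ring size and the maximum number of iovec segments a single
 * descriptor chain may occupy.
 */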
#define	VTNET_RINGSZ	1024

#define	VTNET_MAXSEGS	256

/*
 * Host capabilities.  Note that we only offer a few of these.
 */
#define	VIRTIO_NET_F_CSUM	(1 <<  0) /* host handles partial cksum */
#define	VIRTIO_NET_F_GUEST_CSUM	(1 <<  1) /* guest handles partial cksum */
#define	VIRTIO_NET_F_MAC	(1 <<  5) /* host supplies MAC */
#define	VIRTIO_NET_F_GSO_DEPREC	(1 <<  6) /* deprecated: host handles GSO */
#define	VIRTIO_NET_F_GUEST_TSO4	(1 <<  7) /* guest can rcv TSOv4 */
#define	VIRTIO_NET_F_GUEST_TSO6	(1 <<  8) /* guest can rcv TSOv6 */
#define	VIRTIO_NET_F_GUEST_ECN	(1 <<  9) /* guest can rcv TSO with ECN */
#define	VIRTIO_NET_F_GUEST_UFO	(1 << 10) /* guest can rcv UFO */
#define	VIRTIO_NET_F_HOST_TSO4	(1 << 11) /* host can rcv TSOv4 */
#define	VIRTIO_NET_F_HOST_TSO6	(1 << 12) /* host can rcv TSOv6 */
#define	VIRTIO_NET_F_HOST_ECN	(1 << 13) /* host can rcv TSO with ECN */
#define	VIRTIO_NET_F_HOST_UFO	(1 << 14) /* host can rcv UFO */
#define	VIRTIO_NET_F_MRG_RXBUF	(1 << 15) /* host can merge RX buffers */
#define	VIRTIO_NET_F_STATUS	(1 << 16) /* config status field available */
#define	VIRTIO_NET_F_CTRL_VQ	(1 << 17) /* control channel available */
#define	VIRTIO_NET_F_CTRL_RX	(1 << 18) /* control channel RX mode support */
#define	VIRTIO_NET_F_CTRL_VLAN	(1 << 19) /* control channel VLAN filtering */
#define	VIRTIO_NET_F_GUEST_ANNOUNCE \
				(1 << 21) /* guest can send gratuitous pkts */

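/*
 * The capabilities actually offered to the guest: a host-supplied MAC,
 * mergeable rx buffers, a config status field, notify-on-empty, and
 * indirect descriptors.  No checksum or TSO offloads are advertised.
 */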
#define	VTNET_S_HOSTCAPS      \
  ( VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | VIRTIO_NET_F_STATUS | \
    VIRTIO_F_NOTIFY_ON_EMPTY | VIRTIO_RING_F_INDIRECT_DESC)

/*
 * PCI config-space "registers"
 */
struct virtio_net_config {
	uint8_t  mac[6];
	uint16_t status;
	uint16_t max_virtqueue_pairs;
	uint16_t mtu;
} __packed;

/*
 * Queue definitions.
 */
#define	VTNET_RXQ	0
#define	VTNET_TXQ	1
#define	VTNET_CTLQ	2	/* NB: not yet supported */

#define	VTNET_MAXQ	3

/*
 * Fixed network header size
 */
struct virtio_net_rxhdr {
	uint8_t		vrh_flags;
	uint8_t		vrh_gso_type;
	uint16_t	vrh_hdr_len;
	uint16_t	vrh_gso_size;
	uint16_t	vrh_csum_start;
	uint16_t	vrh_csum_offset;
	uint16_t	vrh_bufs;
} __packed;
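
/*
 * This 12-byte layout matches the header used when VIRTIO_NET_F_MRG_RXBUF
 * has been negotiated; without that feature the final vrh_bufs field is
 * absent and the header shrinks to 10 bytes (see pci_vtnet_neg_features()).
 */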

/*
 * Debug printf
 */
static int pci_vtnet_debug;
#define	DPRINTF(params) if (pci_vtnet_debug) PRINTLN params
#define	WPRINTF(params) PRINTLN params

/*
 * Per-device softc
 */
struct pci_vtnet_softc {
	struct virtio_softc vsc_vs;
	struct vqueue_info vsc_queues[VTNET_MAXQ - 1];
	pthread_mutex_t vsc_mtx;
	struct mevent	*vsc_mevp;

#ifdef	__FreeBSD__
	int		vsc_tapfd;
#else
	dlpi_handle_t	vsc_dhp;
	int		vsc_dlpifd;
#endif
	struct nm_desc	*vsc_nmd;

	int		vsc_rx_ready;
	int		resetting;	/* protected by tx_mtx */

	uint64_t	vsc_features;	/* negotiated features */

	struct virtio_net_config vsc_config;
	struct virtio_consts vsc_consts;

	pthread_mutex_t	rx_mtx;
	int		rx_vhdrlen;
	int		rx_merge;	/* merged rx bufs in use */

	pthread_t	tx_tid;
	pthread_mutex_t	tx_mtx;
	pthread_cond_t	tx_cond;
	int		tx_in_progress;

	void (*pci_vtnet_rx)(struct pci_vtnet_softc *sc);
	void (*pci_vtnet_tx)(struct pci_vtnet_softc *sc, struct iovec *iov,
			     int iovcnt, int len);
};
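
/*
 * Lock ordering note: pci_vtnet_reset() acquires rx_mtx before tx_mtx;
 * any path that needs both must do the same.  The TX thread clears
 * tx_in_progress while it waits on tx_cond, which is what allows the
 * reset path to wait for it to quiesce.
 */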

static void pci_vtnet_reset(void *);
/* static void pci_vtnet_notify(void *, struct vqueue_info *); */
static int pci_vtnet_cfgread(void *, int, int, uint32_t *);
static int pci_vtnet_cfgwrite(void *, int, int, uint32_t);
static void pci_vtnet_neg_features(void *, uint64_t);

static struct virtio_consts vtnet_vi_consts = {
	"vtnet",		/* our name */
	VTNET_MAXQ - 1,		/* we currently support 2 virtqueues */
	sizeof(struct virtio_net_config), /* config reg size */
	pci_vtnet_reset,	/* reset */
	NULL,			/* device-wide qnotify -- not used */
	pci_vtnet_cfgread,	/* read PCI config */
	pci_vtnet_cfgwrite,	/* write PCI config */
	pci_vtnet_neg_features,	/* apply negotiated features */
	VTNET_S_HOSTCAPS,	/* our capabilities */
};

static void
pci_vtnet_reset(void *vsc)
{
	struct pci_vtnet_softc *sc = vsc;

	DPRINTF(("vtnet: device reset requested!"));

	/* Acquire the RX lock to block RX processing. */
	pthread_mutex_lock(&sc->rx_mtx);

	/* Set sc->resetting and give the TX thread a chance to stop. */
	pthread_mutex_lock(&sc->tx_mtx);
	sc->resetting = 1;
	while (sc->tx_in_progress) {
		pthread_mutex_unlock(&sc->tx_mtx);
		usleep(10000);
		pthread_mutex_lock(&sc->tx_mtx);
	}

	sc->vsc_rx_ready = 0;
	sc->rx_merge = 1;
	sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr);

	/*
	 * Now reset rings, MSI-X vectors, and negotiated capabilities.
	 * Do that with the TX lock held, since we need to reset
	 * sc->resetting.
	 */
	vi_reset_dev(&sc->vsc_vs);

	sc->resetting = 0;
	pthread_mutex_unlock(&sc->tx_mtx);
	pthread_mutex_unlock(&sc->rx_mtx);
}

/*
 * Called to send a buffer chain out to the tap device
 */
#ifdef __FreeBSD__
static void
pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
		 int len)
{
	static char pad[60]; /* all zero bytes */

	if (sc->vsc_tapfd == -1)
		return;

	/*
	 * If the length is below 60 (the minimum Ethernet frame size,
	 * excluding the FCS), pad out to that by appending an extra
	 * zeroed segment to the iov.  The caller guarantees that an
	 * extra iovec slot is always available.
	 */
	if (len < 60) {
		iov[iovcnt].iov_base = pad;
		iov[iovcnt].iov_len = 60 - len;
		iovcnt++;
	}
	(void) writev(sc->vsc_tapfd, iov, iovcnt);
}
#else
static void
pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
		 int len)
{
	int i;

	for (i = 0; i < iovcnt; i++) {
		(void) dlpi_send(sc->vsc_dhp, NULL, 0,
		    iov[i].iov_base, iov[i].iov_len, NULL);
	}
}
#endif /* __FreeBSD__ */

#ifdef __FreeBSD__
/*
 * Called when there is read activity on the tap file descriptor.
 * Each buffer posted by the guest is assumed to be able to contain
 * an entire Ethernet frame + rx header.
 *
 * MP note: the dummybuf is only used for discarding frames, so there
 * is no need for it to be per-vtnet or locked.
 */
static uint8_t dummybuf[2048];
#endif /* __FreeBSD__ */

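/*
 * Carve the virtio-net rx header off the front of the guest's descriptor
 * chain: the first tlen bytes of iov are reserved for the header, and the
 * returned iovec points at the space immediately after it, into which the
 * packet payload is read.
 */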
static __inline struct iovec *
rx_iov_trim(struct iovec *iov, int *niov, int tlen)
{
	struct iovec *riov;

	/* XXX short-cut: assume first segment is >= tlen */
	assert(iov[0].iov_len >= tlen);

	iov[0].iov_len -= tlen;
	if (iov[0].iov_len == 0) {
		assert(*niov > 1);
		*niov -= 1;
		riov = &iov[1];
	} else {
		iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen);
		riov = &iov[0];
	}

	return (riov);
}

static void
pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
{
	struct iovec iov[VTNET_MAXSEGS], *riov;
	struct vqueue_info *vq;
	void *vrx;
	int n;
#ifdef	__FreeBSD__
	int len;
#else
	size_t len;
	int ret;
#endif
	uint16_t idx;

	/*
	 * Should never be called without a valid tap fd
	 */
#ifdef	__FreeBSD__
	assert(sc->vsc_tapfd != -1);
#else
	assert(sc->vsc_dlpifd != -1);
#endif

	/*
	 * But this may be called before the rx ring has been set up.
	 */
	if (!sc->vsc_rx_ready) {
#ifdef	__FreeBSD__
		/*
		 * Drop the packet and try later.
		 */
		(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
#endif
		return;
	}

	/*
	 * Check for available rx buffers
	 */
	vq = &sc->vsc_queues[VTNET_RXQ];
	if (!vq_has_descs(vq)) {
		/*
		 * Drop the packet and try later.  Interrupt on
		 * empty, if that's negotiated.
		 */
#ifdef	__FreeBSD__
		(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
#endif
		vq_endchains(vq, 1);
		return;
	}

	do {
		/*
		 * Get descriptor chain
		 */
		n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
		assert(n >= 1 && n <= VTNET_MAXSEGS);

		/*
		 * Get a pointer to the rx header, and use the
		 * data immediately following it for the packet buffer.
		 */
		vrx = iov[0].iov_base;
		riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);
#ifdef	__FreeBSD__
		len = readv(sc->vsc_tapfd, riov, n);
#else
		len = riov[0].iov_len;
		ret = dlpi_recv(sc->vsc_dhp, NULL, NULL,
		    (uint8_t *)riov[0].iov_base, &len, 0, NULL);
		if (ret != DLPI_SUCCESS) {
			errno = EWOULDBLOCK;
			len = 0;
		}
#endif
		if (len <= 0 && errno == EWOULDBLOCK) {
			/*
			 * No more packets, but still some avail ring
			 * entries.  Interrupt if needed/appropriate.
			 */
			vq_retchains(vq, 1);
			vq_endchains(vq, 0);
			return;
		}

		/*
		 * The only valid field in the rx packet header is the
		 * number of buffers, and only if merged rx bufs were
		 * negotiated.
		 */
		memset(vrx, 0, sc->rx_vhdrlen);

		if (sc->rx_merge) {
			struct virtio_net_rxhdr *vrxh;

			vrxh = vrx;
			vrxh->vrh_bufs = 1;
		}

		/*
		 * Release this chain and handle more chains.
		 */
		vq_relchain(vq, idx, len + sc->rx_vhdrlen);
	} while (vq_has_descs(vq));

	/* Interrupt if needed, including for NOTIFY_ON_EMPTY. */
	vq_endchains(vq, 1);
}

#ifdef __FreeBSD__
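/*
 * Copy a guest TX buffer chain into a single slot of the current netmap
 * TX ring (a frame is limited to one 2048-byte netmap buffer), then kick
 * the ring with NIOCTXSYNC.  Rotates through the available TX rings when
 * the current one is full.
 */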
static __inline int
pci_vtnet_netmap_writev(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
{
	int r, i;
	int len = 0;

	for (r = nmd->cur_tx_ring; ; ) {
		struct netmap_ring *ring = NETMAP_TXRING(nmd->nifp, r);
		uint32_t cur, idx;
		char *buf;

		if (nm_ring_empty(ring)) {
			r++;
			if (r > nmd->last_tx_ring)
				r = nmd->first_tx_ring;
			if (r == nmd->cur_tx_ring)
				break;
			continue;
		}
		cur = ring->cur;
		idx = ring->slot[cur].buf_idx;
		buf = NETMAP_BUF(ring, idx);

		for (i = 0; i < iovcnt; i++) {
			if (len + iov[i].iov_len > 2048)
				break;
			memcpy(&buf[len], iov[i].iov_base, iov[i].iov_len);
			len += iov[i].iov_len;
		}
		ring->slot[cur].len = len;
		ring->head = ring->cur = nm_ring_next(ring, cur);
		nmd->cur_tx_ring = r;
		ioctl(nmd->fd, NIOCTXSYNC, NULL);
		break;
	}

	return (len);
}

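/*
 * Fill a guest RX buffer chain from the next frame in the current netmap
 * RX ring, then kick the ring with NIOCRXSYNC.  Any iovec slots left
 * unconsumed have their lengths cleared to zero.
 */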
static __inline int
pci_vtnet_netmap_readv(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
{
	int len = 0;
	int i = 0;
	int r;

	for (r = nmd->cur_rx_ring; ; ) {
		struct netmap_ring *ring = NETMAP_RXRING(nmd->nifp, r);
		uint32_t cur, idx;
		char *buf;
		size_t left;

		if (nm_ring_empty(ring)) {
			r++;
			if (r > nmd->last_rx_ring)
				r = nmd->first_rx_ring;
			if (r == nmd->cur_rx_ring)
				break;
			continue;
		}
		cur = ring->cur;
		idx = ring->slot[cur].buf_idx;
		buf = NETMAP_BUF(ring, idx);
		left = ring->slot[cur].len;

		for (i = 0; i < iovcnt && left > 0; i++) {
			if (iov[i].iov_len > left)
				iov[i].iov_len = left;
			memcpy(iov[i].iov_base, &buf[len], iov[i].iov_len);
			len += iov[i].iov_len;
			left -= iov[i].iov_len;
		}
		ring->head = ring->cur = nm_ring_next(ring, cur);
		nmd->cur_rx_ring = r;
		ioctl(nmd->fd, NIOCRXSYNC, NULL);
		break;
	}
	for (; i < iovcnt; i++)
		iov[i].iov_len = 0;

	return (len);
}

/*
 * Called to send a buffer chain out to the vale port
 */
static void
pci_vtnet_netmap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
		    int len)
{
	static char pad[60]; /* all zero bytes */

	if (sc->vsc_nmd == NULL)
		return;

	/*
	 * If the length is below 60 (the minimum Ethernet frame size,
	 * excluding the FCS), pad out to that by appending an extra
	 * zeroed segment to the iov.  The caller guarantees that an
	 * extra iovec slot is always available.
	 */
	if (len < 60) {
		iov[iovcnt].iov_base = pad;
		iov[iovcnt].iov_len = 60 - len;
		iovcnt++;
	}
	(void) pci_vtnet_netmap_writev(sc->vsc_nmd, iov, iovcnt);
}

static void
pci_vtnet_netmap_rx(struct pci_vtnet_softc *sc)
{
	struct iovec iov[VTNET_MAXSEGS], *riov;
	struct vqueue_info *vq;
	void *vrx;
	int len, n;
	uint16_t idx;

	/*
	 * Should never be called without a valid netmap descriptor
	 */
	assert(sc->vsc_nmd != NULL);

	/*
	 * But this may be called before the rx ring has been set up.
	 */
	if (!sc->vsc_rx_ready) {
		/*
		 * Drop the packet and try later.
		 */
		(void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
		return;
	}

	/*
	 * Check for available rx buffers
	 */
	vq = &sc->vsc_queues[VTNET_RXQ];
	if (!vq_has_descs(vq)) {
		/*
		 * Drop the packet and try later.  Interrupt on
		 * empty, if that's negotiated.
		 */
		(void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
		vq_endchains(vq, 1);
		return;
	}

	do {
		/*
		 * Get descriptor chain.
		 */
		n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
		assert(n >= 1 && n <= VTNET_MAXSEGS);

		/*
		 * Get a pointer to the rx header, and use the
		 * data immediately following it for the packet buffer.
		 */
		vrx = iov[0].iov_base;
		riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);

		len = pci_vtnet_netmap_readv(sc->vsc_nmd, riov, n);

		if (len == 0) {
			/*
			 * No more packets, but still some avail ring
			 * entries.  Interrupt if needed/appropriate.
			 */
			vq_retchain(vq);
			vq_endchains(vq, 0);
			return;
		}

		/*
		 * The only valid field in the rx packet header is the
		 * number of buffers, and only if merged rx bufs were
		 * negotiated.
		 */
		memset(vrx, 0, sc->rx_vhdrlen);

		if (sc->rx_merge) {
			struct virtio_net_rxhdr *vrxh;

			vrxh = vrx;
			vrxh->vrh_bufs = 1;
		}

		/*
		 * Release this chain and handle more chains.
		 */
		vq_relchain(vq, idx, len + sc->rx_vhdrlen);
	} while (vq_has_descs(vq));

	/* Interrupt if needed, including for NOTIFY_ON_EMPTY. */
	vq_endchains(vq, 1);
}
#endif /* __FreeBSD__ */

#ifdef __FreeBSD__
static void
pci_vtnet_rx_callback(int fd, enum ev_type type, void *param)
{
	struct pci_vtnet_softc *sc = param;

	pthread_mutex_lock(&sc->rx_mtx);
	sc->pci_vtnet_rx(sc);
	pthread_mutex_unlock(&sc->rx_mtx);
}
#else
static void *
pci_vtnet_poll_thread(void *param)
{
	struct pci_vtnet_softc *sc = param;
	pollfd_t pollset;

	pollset.fd = sc->vsc_dlpifd;
	pollset.events = POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND;

	for (;;) {
		if (poll(&pollset, 1, -1) < 0) {
			if (errno == EINTR)
				continue;
			fprintf(stderr, "pci_vtnet_poll_thread poll() "
			    "error %d\n", errno);
			continue;
		}
		pthread_mutex_lock(&sc->vsc_mtx);
		pci_vtnet_tap_rx(sc);
		pthread_mutex_unlock(&sc->vsc_mtx);
	}

	return (NULL);
}
#endif /* __FreeBSD__ */

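/*
 * The guest's first kick on the RX queue marks the ring as ready.  After
 * that, receive processing is driven by the backend (event loop or poll
 * thread) rather than by queue notifications, so further guest kicks on
 * this queue are suppressed.
 */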
static void
pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq)
{
	struct pci_vtnet_softc *sc = vsc;

	/*
	 * A qnotify means that the rx process can now begin.
	 */
	if (sc->vsc_rx_ready == 0) {
		sc->vsc_rx_ready = 1;
		vq_kick_disable(vq);
	}
}

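/*
 * Pull one descriptor chain off the TX queue and hand it to the backend.
 * The virtio-net header is assumed to occupy iov[0] and is skipped, since
 * no offloads are advertised; only the remaining segments carry packet
 * data.  The iov array is sized VTNET_MAXSEGS + 1 so a backend can append
 * a pad segment for short frames.
 */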
static void
pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq)
{
	struct iovec iov[VTNET_MAXSEGS + 1];
	int i, n;
	int plen, tlen;
	uint16_t idx;

	/*
	 * Obtain chain of descriptors.  The first one is
	 * really the header descriptor, so we need to sum
	 * up two lengths: packet length and transfer length.
	 */
	n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
	assert(n >= 1 && n <= VTNET_MAXSEGS);
	plen = 0;
	tlen = iov[0].iov_len;
	for (i = 1; i < n; i++) {
		plen += iov[i].iov_len;
		tlen += iov[i].iov_len;
	}

	DPRINTF(("virtio: packet send, %d bytes, %d segs", plen, n));
	sc->pci_vtnet_tx(sc, &iov[1], n - 1, plen);

	/* chain is processed, release it and set tlen */
	vq_relchain(vq, idx, tlen);
}

static void
pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq)
{
	struct pci_vtnet_softc *sc = vsc;

	/*
	 * Any ring entries to process?
	 */
	if (!vq_has_descs(vq))
		return;

	/* Signal the tx thread for processing */
	pthread_mutex_lock(&sc->tx_mtx);
	vq_kick_disable(vq);
	if (sc->tx_in_progress == 0)
		pthread_cond_signal(&sc->tx_cond);
	pthread_mutex_unlock(&sc->tx_mtx);
}

/*
 * Thread that handles processing of TX descriptors
 */
static void *
pci_vtnet_tx_thread(void *param)
{
	struct pci_vtnet_softc *sc = param;
	struct vqueue_info *vq;
	int error;

	vq = &sc->vsc_queues[VTNET_TXQ];

	/*
	 * Wait until the tx queue pointers have been initialized and
	 * the first tx has been signaled.
	 */
	pthread_mutex_lock(&sc->tx_mtx);
	error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
	assert(error == 0);

	for (;;) {
		/* note - tx mutex is locked here */
		while (sc->resetting || !vq_has_descs(vq)) {
			vq_kick_enable(vq);
			if (!sc->resetting && vq_has_descs(vq))
				break;

			sc->tx_in_progress = 0;
			error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
			assert(error == 0);
		}
		vq_kick_disable(vq);
		sc->tx_in_progress = 1;
		pthread_mutex_unlock(&sc->tx_mtx);

		do {
			/*
			 * Run through entries, placing them into
			 * iovecs and sending when an end-of-packet
			 * is found
			 */
			pci_vtnet_proctx(sc, vq);
		} while (vq_has_descs(vq));

		/*
		 * Generate an interrupt if needed.
		 */
		vq_endchains(vq, 1);

		pthread_mutex_lock(&sc->tx_mtx);
	}
	return (NULL);
}

#ifdef __FreeBSD__
static void
pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq)
{

	DPRINTF(("vtnet: control qnotify!"));
}
#endif /* __FreeBSD__ */

static void
pci_vtnet_tap_setup(struct pci_vtnet_softc *sc, char *devname)
{
	char tbuf[80];
#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
#endif
#ifndef	__FreeBSD__
	uchar_t physaddr[DLPI_PHYSADDR_MAX];
	size_t physaddrlen = DLPI_PHYSADDR_MAX;
	int error;
#endif

	strcpy(tbuf, "/dev/");
	strlcat(tbuf, devname, sizeof(tbuf));

	sc->pci_vtnet_rx = pci_vtnet_tap_rx;
	sc->pci_vtnet_tx = pci_vtnet_tap_tx;
#ifdef	__FreeBSD__
	sc->vsc_tapfd = open(tbuf, O_RDWR);
	if (sc->vsc_tapfd == -1) {
		WPRINTF(("open of tap device %s failed", tbuf));
		return;
	}

	/*
	 * Set non-blocking and register for read
	 * notifications with the event loop
	 */
	int opt = 1;
	if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) {
		WPRINTF(("tap device O_NONBLOCK failed"));
		close(sc->vsc_tapfd);
		sc->vsc_tapfd = -1;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
	if (caph_rights_limit(sc->vsc_tapfd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	sc->vsc_mevp = mevent_add(sc->vsc_tapfd,
				  EVF_READ,
				  pci_vtnet_rx_callback,
				  sc);
	if (sc->vsc_mevp == NULL) {
		WPRINTF(("Could not register event"));
		close(sc->vsc_tapfd);
		sc->vsc_tapfd = -1;
	}
#else
	if (dlpi_open(devname, &sc->vsc_dhp, DLPI_RAW) != DLPI_SUCCESS) {
		WPRINTF(("open of vnic device %s failed", devname));
	}

	if (dlpi_get_physaddr(sc->vsc_dhp, DL_CURR_PHYS_ADDR, physaddr,
	    &physaddrlen) != DLPI_SUCCESS) {
		WPRINTF(("read MAC address of vnic device %s failed",
		    devname));
	}
	if (physaddrlen != ETHERADDRL) {
		WPRINTF(("bad MAC address len %zu on vnic device %s",
		    physaddrlen, devname));
	}
	memcpy(sc->vsc_config.mac, physaddr, ETHERADDRL);

	if (dlpi_bind(sc->vsc_dhp, DLPI_ANY_SAP, NULL) != DLPI_SUCCESS) {
		WPRINTF(("bind of vnic device %s failed", devname));
	}

	if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_PHYS) != DLPI_SUCCESS) {
		WPRINTF(("enable promiscuous mode (physical) of vnic device "
		    "%s failed", devname));
	}
	if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_SAP) != DLPI_SUCCESS) {
		WPRINTF(("enable promiscuous mode (SAP) of vnic device "
		    "%s failed", devname));
	}

	sc->vsc_dlpifd = dlpi_fd(sc->vsc_dhp);

	if (fcntl(sc->vsc_dlpifd, F_SETFL, O_NONBLOCK) < 0) {
		WPRINTF(("enable O_NONBLOCK of vnic device %s failed",
		    devname));
		dlpi_close(sc->vsc_dhp);
		sc->vsc_dlpifd = -1;
	}

	error = pthread_create(NULL, NULL, pci_vtnet_poll_thread, sc);
	assert(error == 0);
#endif
}

#ifdef __FreeBSD__
static void
pci_vtnet_netmap_setup(struct pci_vtnet_softc *sc, char *ifname)
{
	sc->pci_vtnet_rx = pci_vtnet_netmap_rx;
	sc->pci_vtnet_tx = pci_vtnet_netmap_tx;

	sc->vsc_nmd = nm_open(ifname, NULL, 0, 0);
	if (sc->vsc_nmd == NULL) {
		WPRINTF(("open of netmap device %s failed", ifname));
		return;
	}

	sc->vsc_mevp = mevent_add(sc->vsc_nmd->fd,
				  EVF_READ,
				  pci_vtnet_rx_callback,
				  sc);
	if (sc->vsc_mevp == NULL) {
		WPRINTF(("Could not register event"));
		nm_close(sc->vsc_nmd);
		sc->vsc_nmd = NULL;
	}
}
#endif /* __FreeBSD__ */

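/*
 * Device-model initialization.  The opts string is whatever follows
 * "virtio-net" in the -s slot specification, e.g. (illustrative only):
 *
 *	-s 2:0,virtio-net,tap0
 *
 * The first comma-separated token names the backend device; on FreeBSD,
 * additional key=value tokens (mac=..., mtu=...) are also parsed below.
 */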
static int
pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
	struct pci_vtnet_softc *sc;
	char tname[MAXCOMLEN + 1];
#ifdef __FreeBSD__
	int mac_provided;
	int mtu_provided;
	unsigned long mtu = ETHERMTU;
#else
	int use_msix = 1;
#endif

	/*
	 * Allocate data structures for further virtio initializations.
	 * sc also contains a copy of vtnet_vi_consts, since capabilities
	 * change depending on the backend.
	 */
	sc = calloc(1, sizeof(struct pci_vtnet_softc));

	sc->vsc_consts = vtnet_vi_consts;
	pthread_mutex_init(&sc->vsc_mtx, NULL);

	sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ;
	sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq;
	sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ;
	sc->vsc_queues[VTNET_TXQ].vq_notify = pci_vtnet_ping_txq;
#ifdef notyet
	sc->vsc_queues[VTNET_CTLQ].vq_qsize = VTNET_RINGSZ;
	sc->vsc_queues[VTNET_CTLQ].vq_notify = pci_vtnet_ping_ctlq;
#endif

	/*
	 * Attempt to open the backend device and read the MAC address
	 * if specified.
	 */
#ifdef __FreeBSD__
	mac_provided = 0;
	mtu_provided = 0;
#endif
	if (opts != NULL) {
		char *optscopy;
		char *vtopts;
		int err = 0;

		/* Get the device name. */
		optscopy = vtopts = strdup(opts);
		(void) strsep(&vtopts, ",");

#ifdef __FreeBSD__
		/*
		 * Parse the list of options in the form
		 *     key1=value1,...,keyN=valueN.
		 */
		while (vtopts != NULL) {
			char *value = vtopts;
			char *key;

			key = strsep(&value, "=");
			if (value == NULL)
				break;
			vtopts = value;
			(void) strsep(&vtopts, ",");

			if (strcmp(key, "mac") == 0) {
				err = net_parsemac(value, sc->vsc_config.mac);
				if (err)
					break;
				mac_provided = 1;
			} else if (strcmp(key, "mtu") == 0) {
				err = net_parsemtu(value, &mtu);
				if (err)
					break;

				if (mtu < VTNET_MIN_MTU || mtu > VTNET_MAX_MTU) {
					err = EINVAL;
					errno = EINVAL;
					break;
				}
				mtu_provided = 1;
			}
		}
#endif

#ifndef __FreeBSD__
		/* optscopy was truncated to the device name by strsep(). */
		if (strncmp(optscopy, "tap", 3) == 0 ||
		    strncmp(optscopy, "vmnet", 5) == 0)
			pci_vtnet_tap_setup(sc, optscopy);
#endif

		free(optscopy);

		if (err) {
			free(sc);
			return (err);
		}

#ifdef __FreeBSD__
		err = netbe_init(&sc->vsc_be, opts, pci_vtnet_rx_callback,
		    sc);
		if (err) {
			free(sc);
			return (err);
		}

		sc->vsc_consts.vc_hv_caps |= VIRTIO_NET_F_MRG_RXBUF |
		    netbe_get_cap(sc->vsc_be);
#endif
	}

#ifdef __FreeBSD__
	if (!mac_provided) {
		net_genmac(pi, sc->vsc_config.mac);
	}

	sc->vsc_config.mtu = mtu;
	if (mtu_provided) {
		sc->vsc_consts.vc_hv_caps |= VIRTIO_NET_F_MTU;
	}
#endif

	/*
	 * Since we do not actually support multiqueue,
	 * set the maximum virtqueue pairs to 1.
	 */
	sc->vsc_config.max_virtqueue_pairs = 1;

	/* initialize config space */
	pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET);
	pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
	pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);

	/* Link is up if we managed to open tap device or vale port. */
#ifdef	__FreeBSD__
	sc->vsc_config.status = (opts == NULL || sc->vsc_tapfd >= 0 ||
#else
	sc->vsc_config.status = (opts == NULL || sc->vsc_dlpifd >= 0 ||
#endif
	    sc->vsc_nmd != NULL);

	/* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */
	if (vi_intr_init(&sc->vsc_vs, 1, use_msix))
		return (1);

	/* use BAR 0 to map config regs in IO space */
	vi_set_io_bar(&sc->vsc_vs, 0);

	sc->resetting = 0;

	sc->rx_merge = 1;
	sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr);
	pthread_mutex_init(&sc->rx_mtx, NULL);

	/*
	 * Initialize tx semaphore & spawn TX processing thread.
	 * As of now, only one thread for TX desc processing is
	 * spawned.
	 */
	sc->tx_in_progress = 0;
	pthread_mutex_init(&sc->tx_mtx, NULL);
	pthread_cond_init(&sc->tx_cond, NULL);
	pthread_create(&sc->tx_tid, NULL, pci_vtnet_tx_thread, (void *)sc);
	snprintf(tname, sizeof(tname), "vtnet-%d:%d tx", pi->pi_slot,
	    pi->pi_func);
	pthread_set_name_np(sc->tx_tid, tname);

	return (0);
}

static int
pci_vtnet_cfgwrite(void *vsc, int offset, int size, uint32_t value)
{
	struct pci_vtnet_softc *sc = vsc;
	void *ptr;

	if (offset < 6) {
		assert(offset + size <= 6);
		/*
		 * The driver is allowed to change the MAC address
		 */
		ptr = &sc->vsc_config.mac[offset];
		memcpy(ptr, &value, size);
	} else {
		/* silently ignore other writes */
		DPRINTF(("vtnet: write to readonly reg %d", offset));
	}

	return (0);
}

static int
pci_vtnet_cfgread(void *vsc, int offset, int size, uint32_t *retval)
{
	struct pci_vtnet_softc *sc = vsc;
	void *ptr;

	ptr = (uint8_t *)&sc->vsc_config + offset;
	memcpy(retval, ptr, size);
	return (0);
}

static void
pci_vtnet_neg_features(void *vsc, uint64_t negotiated_features)
{
	struct pci_vtnet_softc *sc = vsc;

	sc->vsc_features = negotiated_features;

	if (!(sc->vsc_features & VIRTIO_NET_F_MRG_RXBUF)) {
		sc->rx_merge = 0;
		/* non-merge rx header is 2 bytes shorter */
		sc->rx_vhdrlen -= 2;
	}
}

struct pci_devemu pci_de_vnet = {
	.pe_emu =	"virtio-net",
	.pe_init =	pci_vtnet_init,
	.pe_barwrite =	vi_pci_write,
	.pe_barread =	vi_pci_read
};
PCI_EMUL_SET(pci_de_vnet);