xref: /illumos-gate/usr/src/cmd/bhyve/pci_virtio_net.c (revision 25a9a7aaf35c7e4a2b5a57d3875af906147710d5)
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2013 Pluribus Networks Inc.
 * Copyright 2018 Joyent, Inc.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/linker_set.h>
#include <sys/select.h>
#include <sys/uio.h>
#include <sys/ioctl.h>
#include <net/ethernet.h>
#ifdef __FreeBSD__
#ifndef NETMAP_WITH_LIBS
#define NETMAP_WITH_LIBS
#endif
#include <net/netmap_user.h>
#endif

#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>
#include <assert.h>
#include <md5.h>
#include <pthread.h>
#include <pthread_np.h>
#include <sysexits.h>
#ifndef __FreeBSD__
#include <poll.h>
#include <libdlpi.h>
#endif

#include "bhyverun.h"
#include "debug.h"
#include "pci_emul.h"
#ifdef __FreeBSD__
#include "mevent.h"
#endif
#include "virtio.h"
#include "net_utils.h"

#define VTNET_RINGSZ	1024

#define VTNET_MAXSEGS	256

/*
 * Host capabilities.  Note that we only offer a few of these.
 */
#define	VIRTIO_NET_F_CSUM	(1 <<  0) /* host handles partial cksum */
#define	VIRTIO_NET_F_GUEST_CSUM	(1 <<  1) /* guest handles partial cksum */
#define	VIRTIO_NET_F_MAC	(1 <<  5) /* host supplies MAC */
#define	VIRTIO_NET_F_GSO_DEPREC	(1 <<  6) /* deprecated: host handles GSO */
#define	VIRTIO_NET_F_GUEST_TSO4	(1 <<  7) /* guest can rcv TSOv4 */
#define	VIRTIO_NET_F_GUEST_TSO6	(1 <<  8) /* guest can rcv TSOv6 */
#define	VIRTIO_NET_F_GUEST_ECN	(1 <<  9) /* guest can rcv TSO with ECN */
#define	VIRTIO_NET_F_GUEST_UFO	(1 << 10) /* guest can rcv UFO */
#define	VIRTIO_NET_F_HOST_TSO4	(1 << 11) /* host can rcv TSOv4 */
#define	VIRTIO_NET_F_HOST_TSO6	(1 << 12) /* host can rcv TSOv6 */
#define	VIRTIO_NET_F_HOST_ECN	(1 << 13) /* host can rcv TSO with ECN */
#define	VIRTIO_NET_F_HOST_UFO	(1 << 14) /* host can rcv UFO */
#define	VIRTIO_NET_F_MRG_RXBUF	(1 << 15) /* host can merge RX buffers */
#define	VIRTIO_NET_F_STATUS	(1 << 16) /* config status field available */
#define	VIRTIO_NET_F_CTRL_VQ	(1 << 17) /* control channel available */
#define	VIRTIO_NET_F_CTRL_RX	(1 << 18) /* control channel RX mode support */
#define	VIRTIO_NET_F_CTRL_VLAN	(1 << 19) /* control channel VLAN filtering */
#define	VIRTIO_NET_F_GUEST_ANNOUNCE \
				(1 << 21) /* guest can send gratuitous pkts */

#define VTNET_S_HOSTCAPS      \
  ( VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | VIRTIO_NET_F_STATUS | \
    VIRTIO_F_NOTIFY_ON_EMPTY | VIRTIO_RING_F_INDIRECT_DESC)
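
/*
 * Illustrative sketch, not part of the device model: the generic virtio
 * framework derives the negotiated feature set from the guest's request
 * and the host capability mask above, roughly as below.  The function
 * name here is hypothetical; the actual negotiation happens in virtio.c
 * when the guest writes the guest-features register.
 */
#if 0	/* example only */
static uint64_t
example_negotiate(uint64_t guest_features)
{
	/* Only features the host offers can be accepted. */
	return (guest_features & VTNET_S_HOSTCAPS);
}
#endif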

/*
 * PCI config-space "registers"
 */
struct virtio_net_config {
	uint8_t  mac[6];
	uint16_t status;
	uint16_t max_virtqueue_pairs;
	uint16_t mtu;
} __packed;

/*
 * Queue definitions.
 */
#define VTNET_RXQ	0
#define VTNET_TXQ	1
#define VTNET_CTLQ	2	/* NB: not yet supported */

#define VTNET_MAXQ	3

/*
 * Fixed network header size
 */
struct virtio_net_rxhdr {
	uint8_t		vrh_flags;
	uint8_t		vrh_gso_type;
	uint16_t	vrh_hdr_len;
	uint16_t	vrh_gso_size;
	uint16_t	vrh_csum_start;
	uint16_t	vrh_csum_offset;
	uint16_t	vrh_bufs;
} __packed;
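
/*
 * Sizing note (illustrative): with VIRTIO_NET_F_MRG_RXBUF negotiated,
 * the full 12-byte header above precedes each received packet and
 * vrh_bufs holds the number of merged buffers; without it the guest
 * expects a header that is 2 bytes shorter (no vrh_bufs), which is why
 * pci_vtnet_neg_features() shrinks rx_vhdrlen.
 */
#if 0	/* example only: the two header lengths used for rx_vhdrlen */
size_t merged_hdrlen = sizeof (struct virtio_net_rxhdr);	/* 12 bytes */
size_t legacy_hdrlen = sizeof (struct virtio_net_rxhdr) - 2;	/* 10 bytes */
#endif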

/*
 * Debug printf
 */
static int pci_vtnet_debug;
#define DPRINTF(params) if (pci_vtnet_debug) PRINTLN params
#define WPRINTF(params) PRINTLN params
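
/*
 * Usage sketch: both macros take a parenthesized printf-style argument
 * list, e.g. DPRINTF(("vtnet: rx ready %d", sc->vsc_rx_ready)).  PRINTLN
 * (from debug.h) appends the newline, and DPRINTF output only appears
 * when pci_vtnet_debug is non-zero.
 */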

/*
 * Per-device softc
 */
struct pci_vtnet_softc {
	struct virtio_softc vsc_vs;
	struct vqueue_info vsc_queues[VTNET_MAXQ - 1];
	pthread_mutex_t vsc_mtx;
	struct mevent	*vsc_mevp;

#ifdef	__FreeBSD__
	int		vsc_tapfd;
#else
	dlpi_handle_t	vsc_dhp;
	int		vsc_dlpifd;
#endif
	struct nm_desc	*vsc_nmd;

	int		vsc_rx_ready;
	bool		features_negotiated;	/* protected by rx_mtx */
	int		resetting;	/* protected by tx_mtx */

	uint64_t	vsc_features;	/* negotiated features */

	struct virtio_net_config vsc_config;
	struct virtio_consts vsc_consts;

	pthread_mutex_t	rx_mtx;
	int		rx_vhdrlen;
	int		rx_merge;	/* merged rx bufs in use */

	pthread_t	tx_tid;
	pthread_mutex_t	tx_mtx;
	pthread_cond_t	tx_cond;
	int		tx_in_progress;

	void (*pci_vtnet_rx)(struct pci_vtnet_softc *sc);
	void (*pci_vtnet_tx)(struct pci_vtnet_softc *sc, struct iovec *iov,
			     int iovcnt, int len);
};
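
/*
 * Lock ordering note: pci_vtnet_reset() takes rx_mtx before tx_mtx; any
 * other path that ever needs both locks should acquire them in the same
 * order to avoid deadlocking against a concurrent reset.
 */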

static void pci_vtnet_reset(void *);
/* static void pci_vtnet_notify(void *, struct vqueue_info *); */
static int pci_vtnet_cfgread(void *, int, int, uint32_t *);
static int pci_vtnet_cfgwrite(void *, int, int, uint32_t);
static void pci_vtnet_neg_features(void *, uint64_t);

static struct virtio_consts vtnet_vi_consts = {
	"vtnet",		/* our name */
	VTNET_MAXQ - 1,		/* we currently support 2 virtqueues */
	sizeof(struct virtio_net_config), /* config reg size */
	pci_vtnet_reset,	/* reset */
	NULL,			/* device-wide qnotify -- not used */
	pci_vtnet_cfgread,	/* read PCI config */
	pci_vtnet_cfgwrite,	/* write PCI config */
	pci_vtnet_neg_features,	/* apply negotiated features */
	VTNET_S_HOSTCAPS,	/* our capabilities */
};

static void
pci_vtnet_reset(void *vsc)
{
	struct pci_vtnet_softc *sc = vsc;

	DPRINTF(("vtnet: device reset requested !"));

	/* Acquire the RX lock to block RX processing. */
	pthread_mutex_lock(&sc->rx_mtx);

	sc->features_negotiated = false;

	/* Set sc->resetting and give a chance to the TX thread to stop. */
	pthread_mutex_lock(&sc->tx_mtx);
	sc->resetting = 1;
	while (sc->tx_in_progress) {
		pthread_mutex_unlock(&sc->tx_mtx);
		usleep(10000);
		pthread_mutex_lock(&sc->tx_mtx);
	}

	sc->vsc_rx_ready = 0;
	sc->rx_merge = 1;
	sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr);

	/*
	 * Now reset rings, MSI-X vectors, and negotiated capabilities.
	 * Do that with the TX lock held, since we need to reset
	 * sc->resetting.
	 */
	vi_reset_dev(&sc->vsc_vs);

	sc->resetting = 0;
	pthread_mutex_unlock(&sc->tx_mtx);
	pthread_mutex_unlock(&sc->rx_mtx);
}

/*
 * Called to send a buffer chain out to the tap device
 */
#ifdef __FreeBSD__
static void
pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
		 int len)
{
	static char pad[60]; /* all zero bytes */

	if (sc->vsc_tapfd == -1)
		return;

	/*
	 * If the length is < 60, pad out to that and add the
	 * extra zero'd segment to the iov. It is guaranteed that
	 * there is always an extra iov available by the caller.
	 */
	if (len < 60) {
		iov[iovcnt].iov_base = pad;
		iov[iovcnt].iov_len = 60 - len;
		iovcnt++;
	}
	(void) writev(sc->vsc_tapfd, iov, iovcnt);
}
#else
static void
pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
		 int len)
{
	int i;

	for (i = 0; i < iovcnt; i++) {
		(void) dlpi_send(sc->vsc_dhp, NULL, 0,
		    iov[i].iov_base, iov[i].iov_len, NULL);
	}
}
#endif /* __FreeBSD__ */

#ifdef __FreeBSD__
/*
 *  Called when there is read activity on the tap file descriptor.
 * Each buffer posted by the guest is assumed to be able to contain
 * an entire ethernet frame + rx header.
 *  MP note: the dummybuf is only used for discarding frames, so there
 * is no need for it to be per-vtnet or locked.
 */
static uint8_t dummybuf[2048];
#endif /* __FreeBSD__ */

static __inline struct iovec *
rx_iov_trim(struct iovec *iov, int *niov, int tlen)
{
	struct iovec *riov;

	/* XXX short-cut: assume first segment is >= tlen */
	assert(iov[0].iov_len >= tlen);

	iov[0].iov_len -= tlen;
	if (iov[0].iov_len == 0) {
		assert(*niov > 1);
		*niov -= 1;
		riov = &iov[1];
	} else {
		iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen);
		riov = &iov[0];
	}

	return (riov);
}
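
/*
 * Usage sketch (illustrative): the rx paths below carve the virtio
 * header off the front of the guest buffer chain before reading packet
 * data in behind it, along these lines.
 */
#if 0	/* example only; iov, n and sc as used in pci_vtnet_tap_rx() */
void *vrx = iov[0].iov_base;		/* rx header is written here */
struct iovec *riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);
/* riov/n now describe the payload area just past the header */
#endif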

static void
pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
{
	struct iovec iov[VTNET_MAXSEGS], *riov;
	struct vqueue_info *vq;
	void *vrx;
	int n;
#ifdef	__FreeBSD__
	int len;
#else
	size_t len;
	int ret;
#endif
	uint16_t idx;

	/*
	 * Should never be called without a valid tap fd
	 */
#ifdef	__FreeBSD__
	assert(sc->vsc_tapfd != -1);
#else
	assert(sc->vsc_dlpifd != -1);
#endif

	/* Features must be negotiated */
	if (!sc->features_negotiated) {
		return;
	}

	/*
	 * This may, however, be called before the rx ring has
	 * been set up.
	 */
	if (!sc->vsc_rx_ready) {
#ifdef	__FreeBSD__
		/*
		 * Drop the packet and try later.
		 */
		(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
#endif
		return;
	}

	/*
	 * Check for available rx buffers
	 */
	vq = &sc->vsc_queues[VTNET_RXQ];
	if (!vq_has_descs(vq)) {
		/*
		 * Drop the packet and try later.  Interrupt on
		 * empty, if that's negotiated.
		 */
#ifdef	__FreeBSD__
		(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
#endif
		vq_endchains(vq, 1);
		return;
	}

	do {
		/*
		 * Get descriptor chain
		 */
		n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
		assert(n >= 1 && n <= VTNET_MAXSEGS);

		/*
		 * Get a pointer to the rx header, and use the
		 * data immediately following it for the packet buffer.
		 */
		vrx = iov[0].iov_base;
		riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);
#ifdef	__FreeBSD__
		len = readv(sc->vsc_tapfd, riov, n);
#else
		len = riov[0].iov_len;
		ret = dlpi_recv(sc->vsc_dhp, NULL, NULL,
		    (uint8_t *)riov[0].iov_base, &len, 0, NULL);
		if (ret != DLPI_SUCCESS) {
			errno = EWOULDBLOCK;
			len = 0;
		}
#endif
		if (len <= 0 && errno == EWOULDBLOCK) {
			/*
			 * No more packets, but still some avail ring
			 * entries.  Interrupt if needed/appropriate.
			 */
			vq_retchains(vq, 1);
			vq_endchains(vq, 0);
			return;
		}

		/*
		 * The only valid field in the rx packet header is the
		 * number of buffers if merged rx bufs were negotiated.
		 */
		memset(vrx, 0, sc->rx_vhdrlen);

		if (sc->rx_merge) {
			struct virtio_net_rxhdr *vrxh;

			vrxh = vrx;
			vrxh->vrh_bufs = 1;
		}

		/*
		 * Release this chain and handle more chains.
		 */
		vq_relchain(vq, idx, len + sc->rx_vhdrlen);
	} while (vq_has_descs(vq));

	/* Interrupt if needed, including for NOTIFY_ON_EMPTY. */
	vq_endchains(vq, 1);
}

#ifdef __FreeBSD__
static __inline int
pci_vtnet_netmap_writev(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
{
	int r, i;
	int len = 0;

	for (r = nmd->cur_tx_ring; ; ) {
		struct netmap_ring *ring = NETMAP_TXRING(nmd->nifp, r);
		uint32_t cur, idx;
		char *buf;

		if (nm_ring_empty(ring)) {
			r++;
			if (r > nmd->last_tx_ring)
				r = nmd->first_tx_ring;
			if (r == nmd->cur_tx_ring)
				break;
			continue;
		}
		cur = ring->cur;
		idx = ring->slot[cur].buf_idx;
		buf = NETMAP_BUF(ring, idx);

		for (i = 0; i < iovcnt; i++) {
			if (len + iov[i].iov_len > 2048)
				break;
			memcpy(&buf[len], iov[i].iov_base, iov[i].iov_len);
			len += iov[i].iov_len;
		}
		ring->slot[cur].len = len;
		ring->head = ring->cur = nm_ring_next(ring, cur);
		nmd->cur_tx_ring = r;
		ioctl(nmd->fd, NIOCTXSYNC, NULL);
		break;
	}

	return (len);
}

static __inline int
pci_vtnet_netmap_readv(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
{
	int len = 0;
	int i = 0;
	int r;

	for (r = nmd->cur_rx_ring; ; ) {
		struct netmap_ring *ring = NETMAP_RXRING(nmd->nifp, r);
		uint32_t cur, idx;
		char *buf;
		size_t left;

		if (nm_ring_empty(ring)) {
			r++;
			if (r > nmd->last_rx_ring)
				r = nmd->first_rx_ring;
			if (r == nmd->cur_rx_ring)
				break;
			continue;
		}
		cur = ring->cur;
		idx = ring->slot[cur].buf_idx;
		buf = NETMAP_BUF(ring, idx);
		left = ring->slot[cur].len;

		for (i = 0; i < iovcnt && left > 0; i++) {
			if (iov[i].iov_len > left)
				iov[i].iov_len = left;
			memcpy(iov[i].iov_base, &buf[len], iov[i].iov_len);
			len += iov[i].iov_len;
			left -= iov[i].iov_len;
		}
		ring->head = ring->cur = nm_ring_next(ring, cur);
		nmd->cur_rx_ring = r;
		ioctl(nmd->fd, NIOCRXSYNC, NULL);
		break;
	}
	for (; i < iovcnt; i++)
		iov[i].iov_len = 0;

	return (len);
}

/*
 * Called to send a buffer chain out to the vale port
 */
static void
pci_vtnet_netmap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
		    int len)
{
	static char pad[60]; /* all zero bytes */

	if (sc->vsc_nmd == NULL)
		return;

	/*
	 * If the length is < 60, pad out to that and add the
	 * extra zero'd segment to the iov. It is guaranteed that
	 * there is always an extra iov available by the caller.
	 */
	if (len < 60) {
		iov[iovcnt].iov_base = pad;
		iov[iovcnt].iov_len = 60 - len;
		iovcnt++;
	}
	(void) pci_vtnet_netmap_writev(sc->vsc_nmd, iov, iovcnt);
}

static void
pci_vtnet_netmap_rx(struct pci_vtnet_softc *sc)
{
	struct iovec iov[VTNET_MAXSEGS], *riov;
	struct vqueue_info *vq;
	void *vrx;
	int len, n;
	uint16_t idx;

	/*
	 * Should never be called without a valid netmap descriptor
	 */
	assert(sc->vsc_nmd != NULL);

	/* Features must be negotiated */
	if (!sc->features_negotiated) {
		return;
	}

	/*
	 * This may, however, be called before the rx ring has
	 * been set up.
	 */
	if (!sc->vsc_rx_ready) {
		/*
		 * Drop the packet and try later.
		 */
		(void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
		return;
	}

	/*
	 * Check for available rx buffers
	 */
	vq = &sc->vsc_queues[VTNET_RXQ];
	if (!vq_has_descs(vq)) {
		/*
		 * Drop the packet and try later.  Interrupt on
		 * empty, if that's negotiated.
		 */
		(void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
		vq_endchains(vq, 1);
		return;
	}

	do {
		/*
		 * Get descriptor chain.
		 */
		n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
		assert(n >= 1 && n <= VTNET_MAXSEGS);

		/*
		 * Get a pointer to the rx header, and use the
		 * data immediately following it for the packet buffer.
		 */
		vrx = iov[0].iov_base;
		riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);

		len = pci_vtnet_netmap_readv(sc->vsc_nmd, riov, n);

		if (len == 0) {
			/*
			 * No more packets, but still some avail ring
			 * entries.  Interrupt if needed/appropriate.
			 */
			vq_retchain(vq);
			vq_endchains(vq, 0);
			return;
		}

		/*
		 * The only valid field in the rx packet header is the
		 * number of buffers if merged rx bufs were negotiated.
		 */
		memset(vrx, 0, sc->rx_vhdrlen);

		if (sc->rx_merge) {
			struct virtio_net_rxhdr *vrxh;

			vrxh = vrx;
			vrxh->vrh_bufs = 1;
		}

		/*
		 * Release this chain and handle more chains.
		 */
		vq_relchain(vq, idx, len + sc->rx_vhdrlen);
	} while (vq_has_descs(vq));

	/* Interrupt if needed, including for NOTIFY_ON_EMPTY. */
	vq_endchains(vq, 1);
}
#endif /* __FreeBSD__ */

#ifdef __FreeBSD__
static void
pci_vtnet_rx_callback(int fd, enum ev_type type, void *param)
{
	struct pci_vtnet_softc *sc = param;

	pthread_mutex_lock(&sc->rx_mtx);
	sc->pci_vtnet_rx(sc);
	pthread_mutex_unlock(&sc->rx_mtx);
}
#else
static void *
pci_vtnet_poll_thread(void *param)
{
	struct pci_vtnet_softc *sc = param;
	pollfd_t pollset;

	pollset.fd = sc->vsc_dlpifd;
	pollset.events = POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND;

	for (;;) {
		if (poll(&pollset, 1, -1) < 0) {
			if (errno == EINTR)
				continue;
			fprintf(stderr,
			    "pci_vtnet_poll_thread poll() error %d\n",
			    errno);
			continue;
		}
		pthread_mutex_lock(&sc->vsc_mtx);
		pci_vtnet_tap_rx(sc);
		pthread_mutex_unlock(&sc->vsc_mtx);
	}

	return (NULL);
}
#endif /* __FreeBSD__ */

static void
pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq)
{
	struct pci_vtnet_softc *sc = vsc;

	/*
	 * A qnotify means that the rx process can now begin.
	 * Enable RX only if features are negotiated.
	 */
	pthread_mutex_lock(&sc->rx_mtx);
	if (sc->vsc_rx_ready == 0 && sc->features_negotiated) {
		sc->vsc_rx_ready = 1;
		vq_kick_disable(vq);
	}
	pthread_mutex_unlock(&sc->rx_mtx);
}

static void
pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq)
{
	struct iovec iov[VTNET_MAXSEGS + 1];
	int i, n;
	int plen, tlen;
	uint16_t idx;

	/*
	 * Obtain chain of descriptors.  The first one is
	 * really the header descriptor, so we need to sum
	 * up two lengths: packet length and transfer length.
	 */
	n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
	assert(n >= 1 && n <= VTNET_MAXSEGS);
	plen = 0;
	tlen = iov[0].iov_len;
	for (i = 1; i < n; i++) {
		plen += iov[i].iov_len;
		tlen += iov[i].iov_len;
	}

	DPRINTF(("virtio: packet send, %d bytes, %d segs", plen, n));
	sc->pci_vtnet_tx(sc, &iov[1], n - 1, plen);

	/* chain is processed, release it and set tlen */
	vq_relchain(vq, idx, tlen);
}
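
/*
 * Worked example: for a chain of three descriptors sized 10 (virtio
 * header), 1400 and 100 bytes, plen = 1400 + 100 = 1500 is what the
 * backend transmits, while tlen = 10 + 1400 + 100 = 1510 is what
 * vq_relchain() reports back to the guest as consumed.
 */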

static void
pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq)
{
	struct pci_vtnet_softc *sc = vsc;

	/*
	 * Any ring entries to process?
	 */
	if (!vq_has_descs(vq))
		return;

	/* Signal the tx thread for processing */
	pthread_mutex_lock(&sc->tx_mtx);
	vq_kick_disable(vq);
	if (sc->tx_in_progress == 0)
		pthread_cond_signal(&sc->tx_cond);
	pthread_mutex_unlock(&sc->tx_mtx);
}

/*
 * Thread which will handle processing of TX desc
 */
static void *
pci_vtnet_tx_thread(void *param)
{
	struct pci_vtnet_softc *sc = param;
	struct vqueue_info *vq;
	int error;

	vq = &sc->vsc_queues[VTNET_TXQ];

	/*
	 * Wait until the tx queue pointers have been initialised and
	 * the first tx has been signaled.
	 */
	pthread_mutex_lock(&sc->tx_mtx);
	error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
	assert(error == 0);

	for (;;) {
		/* note - tx mutex is locked here */
		while (sc->resetting || !vq_has_descs(vq)) {
			vq_kick_enable(vq);
			if (!sc->resetting && vq_has_descs(vq))
				break;

			sc->tx_in_progress = 0;
			error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
			assert(error == 0);
		}
		vq_kick_disable(vq);
		sc->tx_in_progress = 1;
		pthread_mutex_unlock(&sc->tx_mtx);

		do {
			/*
			 * Run through entries, placing them into
			 * iovecs and sending when an end-of-packet
			 * is found
			 */
			pci_vtnet_proctx(sc, vq);
		} while (vq_has_descs(vq));

		/*
		 * Generate an interrupt if needed.
		 */
		vq_endchains(vq, 1);

		pthread_mutex_lock(&sc->tx_mtx);
	}
	return (NULL);
}

#ifdef __FreeBSD__
static void
pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq)
{

	DPRINTF(("vtnet: control qnotify!"));
}
#endif /* __FreeBSD__ */

static void
pci_vtnet_tap_setup(struct pci_vtnet_softc *sc, char *devname)
{
	char tbuf[80];
#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
#endif
#ifndef	__FreeBSD__
	uchar_t physaddr[DLPI_PHYSADDR_MAX];
	size_t physaddrlen = DLPI_PHYSADDR_MAX;
	int error;
#endif

	strcpy(tbuf, "/dev/");
	strlcat(tbuf, devname, sizeof(tbuf));

	sc->pci_vtnet_rx = pci_vtnet_tap_rx;
	sc->pci_vtnet_tx = pci_vtnet_tap_tx;
#ifdef	__FreeBSD__
	sc->vsc_tapfd = open(tbuf, O_RDWR);
	if (sc->vsc_tapfd == -1) {
		WPRINTF(("open of tap device %s failed", tbuf));
		return;
	}

	/*
	 * Set non-blocking and register for read
	 * notifications with the event loop
	 */
	int opt = 1;
	if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) {
		WPRINTF(("tap device O_NONBLOCK failed"));
		close(sc->vsc_tapfd);
		sc->vsc_tapfd = -1;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
	if (caph_rights_limit(sc->vsc_tapfd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	sc->vsc_mevp = mevent_add(sc->vsc_tapfd,
				  EVF_READ,
				  pci_vtnet_rx_callback,
				  sc);
	if (sc->vsc_mevp == NULL) {
		WPRINTF(("Could not register event"));
		close(sc->vsc_tapfd);
		sc->vsc_tapfd = -1;
	}
#else
	if (dlpi_open(devname, &sc->vsc_dhp, DLPI_RAW) != DLPI_SUCCESS) {
		WPRINTF(("open of vnic device %s failed", devname));
	}

	if (dlpi_get_physaddr(sc->vsc_dhp, DL_CURR_PHYS_ADDR, physaddr,
	    &physaddrlen) != DLPI_SUCCESS) {
		WPRINTF(("read MAC address of vnic device %s failed",
		    devname));
	}
	if (physaddrlen != ETHERADDRL) {
		WPRINTF(("bad MAC address len %zu on vnic device %s",
		    physaddrlen, devname));
	}
	memcpy(sc->vsc_config.mac, physaddr, ETHERADDRL);

	if (dlpi_bind(sc->vsc_dhp, DLPI_ANY_SAP, NULL) != DLPI_SUCCESS) {
		WPRINTF(("bind of vnic device %s failed", devname));
	}

	if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_PHYS) != DLPI_SUCCESS) {
		WPRINTF(("enable promiscuous mode (physical) of vnic device "
		    "%s failed", devname));
	}
	if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_SAP) != DLPI_SUCCESS) {
		WPRINTF(("enable promiscuous mode (SAP) of vnic device %s "
		    "failed", devname));
	}

	sc->vsc_dlpifd = dlpi_fd(sc->vsc_dhp);

	if (fcntl(sc->vsc_dlpifd, F_SETFL, O_NONBLOCK) < 0) {
		WPRINTF(("enable O_NONBLOCK of vnic device %s failed",
		    devname));
		dlpi_close(sc->vsc_dhp);
		sc->vsc_dlpifd = -1;
	}

	error = pthread_create(NULL, NULL, pci_vtnet_poll_thread, sc);
	assert(error == 0);
#endif
}

#ifdef __FreeBSD__
static void
pci_vtnet_netmap_setup(struct pci_vtnet_softc *sc, char *ifname)
{
	sc->pci_vtnet_rx = pci_vtnet_netmap_rx;
	sc->pci_vtnet_tx = pci_vtnet_netmap_tx;

	sc->vsc_nmd = nm_open(ifname, NULL, 0, 0);
	if (sc->vsc_nmd == NULL) {
		WPRINTF(("open of netmap device %s failed", ifname));
		return;
	}

	sc->vsc_mevp = mevent_add(sc->vsc_nmd->fd,
				  EVF_READ,
				  pci_vtnet_rx_callback,
				  sc);
	if (sc->vsc_mevp == NULL) {
		WPRINTF(("Could not register event"));
		nm_close(sc->vsc_nmd);
		sc->vsc_nmd = NULL;
	}
}
#endif /* __FreeBSD__ */

static int
pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
	struct pci_vtnet_softc *sc;
	char tname[MAXCOMLEN + 1];
#ifdef __FreeBSD__
	int mac_provided;
	int mtu_provided;
	unsigned long mtu = ETHERMTU;
#else
	int use_msix = 1;
#endif

	/*
	 * Allocate data structures for further virtio initializations.
	 * sc also contains a copy of vtnet_vi_consts, since capabilities
	 * change depending on the backend.
	 */
	sc = calloc(1, sizeof(struct pci_vtnet_softc));

	sc->vsc_consts = vtnet_vi_consts;
	pthread_mutex_init(&sc->vsc_mtx, NULL);

	sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ;
	sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq;
	sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ;
	sc->vsc_queues[VTNET_TXQ].vq_notify = pci_vtnet_ping_txq;
#ifdef notyet
	sc->vsc_queues[VTNET_CTLQ].vq_qsize = VTNET_RINGSZ;
	sc->vsc_queues[VTNET_CTLQ].vq_notify = pci_vtnet_ping_ctlq;
#endif

	/*
	 * Attempt to open the backend device and read the MAC address
	 * if specified.
	 */
#ifdef __FreeBSD__
	mac_provided = 0;
	mtu_provided = 0;
#endif
	if (opts != NULL) {
		char *optscopy;
		char *vtopts;
		int err = 0;

		/* Get the device name. */
		optscopy = vtopts = strdup(opts);
		(void) strsep(&vtopts, ",");

#ifdef __FreeBSD__
		/*
		 * Parse the list of options in the form
		 *     key1=value1,...,keyN=valueN.
		 */
		while (vtopts != NULL) {
			char *value = vtopts;
			char *key;

			key = strsep(&value, "=");
			if (value == NULL)
				break;
			vtopts = value;
			(void) strsep(&vtopts, ",");

			if (strcmp(key, "mac") == 0) {
				err = net_parsemac(value, sc->vsc_config.mac);
				if (err)
					break;
				mac_provided = 1;
			} else if (strcmp(key, "mtu") == 0) {
				err = net_parsemtu(value, &mtu);
				if (err)
					break;

				if (mtu < VTNET_MIN_MTU || mtu > VTNET_MAX_MTU) {
					err = EINVAL;
					errno = EINVAL;
					break;
				}
				mtu_provided = 1;
			}
		}
#endif

#ifndef __FreeBSD__
		/* Use the already strsep(",")-ed optscopy */
		if (strncmp(optscopy, "tap", 3) == 0 ||
		    strncmp(optscopy, "vmnet", 5) == 0)
			pci_vtnet_tap_setup(sc, optscopy);
#endif

		free(optscopy);

		if (err) {
			free(sc);
			return (err);
		}

#ifdef __FreeBSD__
		err = netbe_init(&sc->vsc_be, opts, pci_vtnet_rx_callback,
		          sc);
		if (err) {
			free(sc);
			return (err);
		}

		sc->vsc_consts.vc_hv_caps |= VIRTIO_NET_F_MRG_RXBUF |
		    netbe_get_cap(sc->vsc_be);
#endif

	}

#ifdef __FreeBSD__
	if (!mac_provided) {
		net_genmac(pi, sc->vsc_config.mac);
	}

	sc->vsc_config.mtu = mtu;
	if (mtu_provided) {
		sc->vsc_consts.vc_hv_caps |= VIRTIO_NET_F_MTU;
	}
#endif

	/*
	 * Since we do not actually support multiqueue,
	 * set the maximum virtqueue pairs to 1.
	 */
	sc->vsc_config.max_virtqueue_pairs = 1;

	/* initialize config space */
	pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET);
	pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
	pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);

	/* Link is up if we managed to open tap device or vale port. */
#ifdef	__FreeBSD__
	sc->vsc_config.status = (opts == NULL || sc->vsc_tapfd >= 0 ||
#else
	sc->vsc_config.status = (opts == NULL || sc->vsc_dlpifd >= 0 ||
#endif
	    sc->vsc_nmd != NULL);

	/* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */
	if (vi_intr_init(&sc->vsc_vs, 1, use_msix))
		return (1);

	/* use BAR 0 to map config regs in IO space */
	vi_set_io_bar(&sc->vsc_vs, 0);

	sc->resetting = 0;

	sc->rx_merge = 1;
	sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr);
	pthread_mutex_init(&sc->rx_mtx, NULL);

	/*
	 * Initialize tx semaphore & spawn TX processing thread.
	 * As of now, only one thread for TX desc processing is
	 * spawned.
	 */
	sc->tx_in_progress = 0;
	pthread_mutex_init(&sc->tx_mtx, NULL);
	pthread_cond_init(&sc->tx_cond, NULL);
	pthread_create(&sc->tx_tid, NULL, pci_vtnet_tx_thread, (void *)sc);
	snprintf(tname, sizeof(tname), "vtnet-%d:%d tx", pi->pi_slot,
	    pi->pi_func);
	pthread_set_name_np(sc->tx_tid, tname);

	return (0);
}

static int
pci_vtnet_cfgwrite(void *vsc, int offset, int size, uint32_t value)
{
	struct pci_vtnet_softc *sc = vsc;
	void *ptr;

	if (offset < 6) {
		assert(offset + size <= 6);
		/*
		 * The driver is allowed to change the MAC address
		 */
		ptr = &sc->vsc_config.mac[offset];
		memcpy(ptr, &value, size);
	} else {
		/* silently ignore other writes */
		DPRINTF(("vtnet: write to readonly reg %d", offset));
	}

	return (0);
}

static int
pci_vtnet_cfgread(void *vsc, int offset, int size, uint32_t *retval)
{
	struct pci_vtnet_softc *sc = vsc;
	void *ptr;

	ptr = (uint8_t *)&sc->vsc_config + offset;
	memcpy(retval, ptr, size);
	return (0);
}
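
/*
 * Illustrative note: config reads are satisfied directly from
 * vsc_config, so a guest fetching the 6-byte MAC typically issues
 * e.g. a 4-byte read at offset 0 (mac[0..3]) followed by a 2-byte
 * read at offset 4 (mac[4..5]).
 */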

static void
pci_vtnet_neg_features(void *vsc, uint64_t negotiated_features)
{
	struct pci_vtnet_softc *sc = vsc;

	sc->vsc_features = negotiated_features;

	if (!(sc->vsc_features & VIRTIO_NET_F_MRG_RXBUF)) {
		sc->rx_merge = 0;
		/* non-merge rx header is 2 bytes shorter */
		sc->rx_vhdrlen -= 2;
	}

	pthread_mutex_lock(&sc->rx_mtx);
	sc->features_negotiated = true;
	pthread_mutex_unlock(&sc->rx_mtx);
}

struct pci_devemu pci_de_vnet = {
	.pe_emu =	"virtio-net",
	.pe_init =	pci_vtnet_init,
	.pe_barwrite =	vi_pci_write,
	.pe_barread =	vi_pci_read
};
PCI_EMUL_SET(pci_de_vnet);