xref: /freebsd/usr.sbin/bhyve/pci_virtio_net.c (revision f0157ce528a128e2abb181a5c766033a2ce49a5f)
1 /*-
2  * Copyright (c) 2011 NetApp, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  * $FreeBSD$
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/linker_set.h>
34 #include <sys/select.h>
35 #include <sys/uio.h>
36 #include <sys/ioctl.h>
37 #include <net/ethernet.h>
38 
39 #include <errno.h>
40 #include <fcntl.h>
41 #include <stdio.h>
42 #include <stdlib.h>
43 #include <stdint.h>
44 #include <string.h>
45 #include <strings.h>
46 #include <unistd.h>
47 #include <assert.h>
48 #include <md5.h>
49 #include <pthread.h>
50 #include <pthread_np.h>
51 
52 #include "bhyverun.h"
53 #include "pci_emul.h"
54 #include "mevent.h"
55 #include "virtio.h"
56 
57 #define VTNET_RINGSZ	1024
58 
59 #define VTNET_MAXSEGS	32
60 
61 /*
62  * Device config register offsets (device-specific region of the I/O BAR)
63  */
64 #define VTNET_R_CFG0	24
65 #define VTNET_R_CFG1	25
66 #define VTNET_R_CFG2	26
67 #define VTNET_R_CFG3	27
68 #define VTNET_R_CFG4	28
69 #define VTNET_R_CFG5	29
70 #define VTNET_R_CFG6	30
71 #define VTNET_R_CFG7	31
72 #define VTNET_R_MAX	31
73 
74 #define VTNET_REGSZ	(VTNET_R_MAX + 1)
75 
76 /*
77  * Host capabilities
78  */
79 #define VTNET_S_HOSTCAPS      \
80   ( 0x00000020 |	/* host supplies MAC */ \
81     0x00008000 |	/* host can merge Rx buffers */ \
82     0x00010000 |	/* config status available */ \
83     VIRTIO_F_NOTIFY_ON_EMPTY)
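/*
 * The raw values above correspond to the virtio-net feature bits
 * VIRTIO_NET_F_MAC (1 << 5), VIRTIO_NET_F_MRG_RXBUF (1 << 15) and
 * VIRTIO_NET_F_STATUS (1 << 16) from the virtio specification; named
 * constants for them are not used in this file.
 */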
84 
85 /*
86  * Queue definitions.
87  */
88 #define VTNET_RXQ	0
89 #define VTNET_TXQ	1
90 #define VTNET_CTLQ	2
91 
92 #define VTNET_MAXQ	3
93 
94 static int use_msix = 1;
95 
96 struct vring_hqueue {
97 	/* Internal state */
98 	uint16_t	hq_size;
99 	uint16_t	hq_cur_aidx;		/* trails behind 'avail_idx' */
100 
101 	 /* Host-context pointers to the queue */
102 	struct virtio_desc *hq_dtable;
103 	uint16_t	*hq_avail_flags;
104 	uint16_t	*hq_avail_idx;		/* monotonically increasing */
105 	uint16_t	*hq_avail_ring;
106 
107 	uint16_t	*hq_used_flags;
108 	uint16_t	*hq_used_idx;		/* monotonically increasing */
109 	struct virtio_used *hq_used_ring;
110 };
111 
112 /*
113  * Fixed-size RX header that precedes each frame delivered to the guest
114  */
115 struct virtio_net_rxhdr {
116 	uint8_t		vrh_flags;
117 	uint8_t		vrh_gso_type;
118 	uint16_t	vrh_hdr_len;
119 	uint16_t	vrh_gso_size;
120 	uint16_t	vrh_csum_start;
121 	uint16_t	vrh_csum_offset;
122 	uint16_t	vrh_bufs;
123 } __packed;
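/*
 * This layout matches the 12-byte mergeable-rx-buffers variant of the
 * virtio net header (virtio_net_hdr_mrg_rxbuf); vrh_bufs is the
 * num_buffers field, which is only present when VIRTIO_NET_F_MRG_RXBUF
 * has been negotiated.
 */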
124 
125 /*
126  * Debug printf
127  */
128 static int pci_vtnet_debug;
129 #define DPRINTF(params) do { if (pci_vtnet_debug) printf params; } while (0)
130 #define WPRINTF(params) printf params
131 
132 /*
133  * Per-device softc
134  */
135 struct pci_vtnet_softc {
136 	struct pci_devinst *vsc_pi;
137 	pthread_mutex_t vsc_mtx;
138 	struct mevent	*vsc_mevp;
139 
140 	int		vsc_curq;
141 	int		vsc_status;
142 	int		vsc_isr;
143 	int		vsc_tapfd;
144 	int		vsc_rx_ready;
145 	int		resetting;
146 
147 	uint32_t	vsc_features;
148 	uint8_t		vsc_macaddr[6];
149 
150 	uint64_t	vsc_pfn[VTNET_MAXQ];
151 	struct	vring_hqueue vsc_hq[VTNET_MAXQ];
152 	uint16_t	vsc_msix_table_idx[VTNET_MAXQ];
153 
154 	pthread_mutex_t	rx_mtx;
155 	int		rx_in_progress;
156 
157 	pthread_t 	tx_tid;
158 	pthread_mutex_t	tx_mtx;
159 	pthread_cond_t	tx_cond;
160 	int		tx_in_progress;
161 };
162 #define	vtnet_ctx(sc)		((sc)->vsc_pi->pi_vmctx)
163 #define	notify_on_empty(sc)	((sc)->vsc_features & VIRTIO_F_NOTIFY_ON_EMPTY)
164 
165 /*
166  * Return the size of the I/O BAR that maps the virtio header and the
167  * device-specific region. The size varies depending on whether MSI-X
168  * is enabled.
169  */
170 static uint64_t
171 pci_vtnet_iosize(struct pci_devinst *pi)
172 {
173 	if (pci_msix_enabled(pi))
174 		return (VTNET_REGSZ);
175 	else
176 		return (VTNET_REGSZ - (VTCFG_R_CFG1 - VTCFG_R_MSIX));
177 }
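/*
 * Rough sketch of the resulting sizes, assuming the legacy virtio register
 * constants used by bhyve (VTCFG_R_MSIX == 20, VTCFG_R_CFG1 == 24): the
 * BAR spans VTNET_REGSZ == 32 bytes with MSI-X enabled, and
 * 32 - (24 - 20) == 28 bytes without it.
 */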
178 
179 /*
180  * Return the number of available descriptors in the vring taking care
181  * of the 16-bit index wraparound.
182  */
183 static int
184 hq_num_avail(struct vring_hqueue *hq)
185 {
186 	uint16_t ndesc;
187 
188 	/*
189 	 * We're just computing (a-b) mod 2^16
190 	 *
191 	 * The only glitch here is that in standard C,
192 	 * uint16_t promotes to (signed) int when int has
193 	 * more than 16 bits (pretty much always now), so
194 	 * we have to force it back to unsigned.
195 	 */
196 	ndesc = (unsigned)*hq->hq_avail_idx - (unsigned)hq->hq_cur_aidx;
197 
198 	assert(ndesc <= hq->hq_size);
199 
200 	return (ndesc);
201 }
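/*
 * Worked example of the wraparound arithmetic above: if the guest's avail
 * index has wrapped to 0x0002 while hq_cur_aidx is still 0xfffe, then
 * (0x0002 - 0xfffe) mod 2^16 == 4, i.e. four descriptors are available.
 */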
202 
203 static uint16_t
204 pci_vtnet_qsize(int qnum)
205 {
206 	/* XXX no ctl queue currently */
207 	if (qnum == VTNET_CTLQ) {
208 		return (0);
209 	}
210 
211 	/* XXX fixed currently. Maybe different for tx/rx/ctl */
212 	return (VTNET_RINGSZ);
213 }
214 
215 static void
216 pci_vtnet_ring_reset(struct pci_vtnet_softc *sc, int ring)
217 {
218 	struct vring_hqueue *hq;
219 
220 	assert(ring < VTNET_MAXQ);
221 
222 	hq = &sc->vsc_hq[ring];
223 
224 	/*
225 	 * Reset all soft state
226 	 */
227 	hq->hq_cur_aidx = 0;
228 }
229 
230 /*
231  * If the transmit thread is active then stall until it is done.
232  */
233 static void
234 pci_vtnet_txwait(struct pci_vtnet_softc *sc)
235 {
236 
237 	pthread_mutex_lock(&sc->tx_mtx);
238 	while (sc->tx_in_progress) {
239 		pthread_mutex_unlock(&sc->tx_mtx);
240 		usleep(10000);
241 		pthread_mutex_lock(&sc->tx_mtx);
242 	}
243 	pthread_mutex_unlock(&sc->tx_mtx);
244 }
245 
246 /*
247  * If the receive thread is active then stall until it is done.
248  */
249 static void
250 pci_vtnet_rxwait(struct pci_vtnet_softc *sc)
251 {
252 
253 	pthread_mutex_lock(&sc->rx_mtx);
254 	while (sc->rx_in_progress) {
255 		pthread_mutex_unlock(&sc->rx_mtx);
256 		usleep(10000);
257 		pthread_mutex_lock(&sc->rx_mtx);
258 	}
259 	pthread_mutex_unlock(&sc->rx_mtx);
260 }
261 
262 static void
263 pci_vtnet_update_status(struct pci_vtnet_softc *sc, uint32_t value)
264 {
265 	int i;
266 
267 	if (value == 0) {
268 		DPRINTF(("vtnet: device reset requested !\n"));
269 
270 		sc->resetting = 1;
271 
272 		/*
273 		 * Wait for the transmit and receive threads to finish their
274 		 * processing.
275 		 */
276 		pci_vtnet_txwait(sc);
277 		pci_vtnet_rxwait(sc);
278 
279 		sc->vsc_rx_ready = 0;
280 		pci_vtnet_ring_reset(sc, VTNET_RXQ);
281 		pci_vtnet_ring_reset(sc, VTNET_TXQ);
282 
283 		for (i = 0; i < VTNET_MAXQ; i++)
284 			sc->vsc_msix_table_idx[i] = VIRTIO_MSI_NO_VECTOR;
285 
286 		sc->vsc_isr = 0;
287 		sc->vsc_features = 0;
288 
289 		sc->resetting = 0;
290 	}
291 
292 	sc->vsc_status = value;
293 }
294 
295 static void
296 vtnet_generate_interrupt(struct pci_vtnet_softc *sc, int qidx)
297 {
298 
299 	if (use_msix) {
300 		pci_generate_msix(sc->vsc_pi, sc->vsc_msix_table_idx[qidx]);
301 	} else {
302 		sc->vsc_isr |= 1;
303 		pci_generate_msi(sc->vsc_pi, 0);
304 	}
305 }
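/*
 * In the non-MSI-X case, bit 0 of the ISR is the "virtqueue interrupt"
 * bit defined by the legacy virtio PCI spec; the guest's read of
 * VTCFG_R_ISR clears it again (see pci_vtnet_read below).
 */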
306 
307 /*
308  * Called to send a buffer chain out to the tap device
309  */
310 static void
311 pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
312 		 int len)
313 {
314 	char pad[60];
315 
316 	if (sc->vsc_tapfd == -1)
317 		return;
318 
319 	/*
320 	 * If the length is < 60, pad out to that length and add the
321 	 * extra zeroed segment to the iov. The caller guarantees that
322 	 * an extra iov slot is always available.
323 	 */
324 	if (len < 60) {
325 		memset(pad, 0, 60 - len);
326 		iov[iovcnt].iov_base = pad;
327 		iov[iovcnt].iov_len = 60 - len;
328 		iovcnt++;
329 	}
330 	(void) writev(sc->vsc_tapfd, iov, iovcnt);
331 }
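/*
 * The 60-byte figure above is the minimum Ethernet frame length,
 * ETHER_MIN_LEN (64) minus ETHER_CRC_LEN (4), since frames written to
 * the tap device do not carry an FCS.
 */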
332 
333 /*
334  *  Called when there is read activity on the tap file descriptor.
335  * Each buffer posted by the guest is assumed to be able to contain
336  * an entire ethernet frame + rx header.
337  *  MP note: the dummybuf is only used for discarding frames, so there
338  * is no need for it to be per-vtnet or locked.
339  */
340 static uint8_t dummybuf[2048];
341 
342 static void
343 pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
344 {
345 	struct virtio_desc *vd;
346 	struct virtio_used *vu;
347 	struct vring_hqueue *hq;
348 	struct virtio_net_rxhdr *vrx;
349 	uint8_t *buf;
350 	int i;
351 	int len;
352 	int ndescs;
353 	int didx, uidx, aidx;	/* descriptor, avail and used index */
354 
355 	/*
356 	 * Should never be called without a valid tap fd
357 	 */
358 	assert(sc->vsc_tapfd != -1);
359 
360 	/*
361 	 * However, this may be called before the rx ring has been
362 	 * set up, or while the guest is resetting the device.
363 	 */
364 	if (!sc->vsc_rx_ready || sc->resetting) {
365 		/*
366 		 * Drop the packet and try later.
367 		 */
368 		(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
369 		return;
370 	}
371 
372 	/*
373 	 * Calculate the number of available rx buffers
374 	 */
375 	hq = &sc->vsc_hq[VTNET_RXQ];
376 
377 	ndescs = hq_num_avail(hq);
378 
379 	if (ndescs == 0) {
380 		/*
381 		 * Drop the packet and try later
382 		 */
383 		(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
384 
385 		if (notify_on_empty(sc))
386 			vtnet_generate_interrupt(sc, VTNET_RXQ);
387 
388 		return;
389 	}
390 
391 	aidx = hq->hq_cur_aidx;
392 	uidx = *hq->hq_used_idx;
393 	for (i = 0; i < ndescs; i++) {
394 		/*
395 		 * 'aidx' indexes into an array of descriptor indexes
396 		 */
397 		didx = hq->hq_avail_ring[aidx % hq->hq_size];
398 		assert(didx >= 0 && didx < hq->hq_size);
399 
400 		vd = &hq->hq_dtable[didx];
401 
402 		/*
403 		 * Get a pointer to the rx header, and use the
404 		 * data immediately following it for the packet buffer.
405 		 */
406 		vrx = paddr_guest2host(vtnet_ctx(sc), vd->vd_addr, vd->vd_len);
407 		buf = (uint8_t *)(vrx + 1);
408 
409 		len = read(sc->vsc_tapfd, buf,
410 			   vd->vd_len - sizeof(struct virtio_net_rxhdr));
411 
412 		if (len < 0 && errno == EWOULDBLOCK) {
413 			break;
414 		}
415 
416 		/*
417 		 * The only valid field in the rx packet header is the
418 		 * number of buffers, which is always 1 without TSO
419 		 * support.
420 		 */
421 		memset(vrx, 0, sizeof(struct virtio_net_rxhdr));
422 		vrx->vrh_bufs = 1;
423 
424 		/*
425 		 * Write this descriptor into the used ring
426 		 */
427 		vu = &hq->hq_used_ring[uidx % hq->hq_size];
428 		vu->vu_idx = didx;
429 		vu->vu_tlen = len + sizeof(struct virtio_net_rxhdr);
430 		uidx++;
431 		aidx++;
432 	}
433 
434 	/*
435 	 * Update the used pointer, and signal an interrupt if allowed
436 	 */
437 	*hq->hq_used_idx = uidx;
438 	hq->hq_cur_aidx = aidx;
439 
440 	if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0)
441 		vtnet_generate_interrupt(sc, VTNET_RXQ);
442 }
443 
444 static void
445 pci_vtnet_tap_callback(int fd, enum ev_type type, void *param)
446 {
447 	struct pci_vtnet_softc *sc = param;
448 
449 	pthread_mutex_lock(&sc->rx_mtx);
450 	sc->rx_in_progress = 1;
451 	pci_vtnet_tap_rx(sc);
452 	sc->rx_in_progress = 0;
453 	pthread_mutex_unlock(&sc->rx_mtx);
454 
455 }
456 
457 static void
458 pci_vtnet_ping_rxq(struct pci_vtnet_softc *sc)
459 {
460 	/*
461 	 * A qnotify means that the rx process can now begin
462 	 */
463 	if (sc->vsc_rx_ready == 0) {
464 		sc->vsc_rx_ready = 1;
465 	}
466 }
467 
468 static void
469 pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vring_hqueue *hq)
470 {
471 	struct iovec iov[VTNET_MAXSEGS + 1];
472 	struct virtio_desc *vd;
473 	struct virtio_used *vu;
474 	int i;
475 	int plen;
476 	int tlen;
477 	int uidx, aidx, didx;
478 
479 	uidx = *hq->hq_used_idx;
480 	aidx = hq->hq_cur_aidx;
481 	didx = hq->hq_avail_ring[aidx % hq->hq_size];
482 	assert(didx >= 0 && didx < hq->hq_size);
483 
484 	vd = &hq->hq_dtable[didx];
485 
486 	/*
487 	 * Run through the chain of descriptors, ignoring the
488 	 * first header descriptor. However, include the header
489 	 * length in the total length that will be put into the
490 	 * used queue.
491 	 */
492 	tlen = vd->vd_len;
493 	vd = &hq->hq_dtable[vd->vd_next];
494 
495 	for (i = 0, plen = 0;
496 	     i < VTNET_MAXSEGS;
497 	     i++, vd = &hq->hq_dtable[vd->vd_next]) {
498 		iov[i].iov_base = paddr_guest2host(vtnet_ctx(sc),
499 						   vd->vd_addr, vd->vd_len);
500 		iov[i].iov_len = vd->vd_len;
501 		plen += vd->vd_len;
502 		tlen += vd->vd_len;
503 
504 		if ((vd->vd_flags & VRING_DESC_F_NEXT) == 0)
505 			break;
506 	}
507 	assert(i < VTNET_MAXSEGS);
508 
509 	DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, i + 1));
510 	pci_vtnet_tap_tx(sc, iov, i + 1, plen);
511 
512 	/*
513 	 * Return this chain to the guest via the used ring
514 	 */
515 	vu = &hq->hq_used_ring[uidx % hq->hq_size];
516 	vu->vu_idx = didx;
517 	vu->vu_tlen = tlen;
518 	hq->hq_cur_aidx = aidx + 1;
519 	*hq->hq_used_idx = uidx + 1;
520 }
521 
522 static void
523 pci_vtnet_ping_txq(struct pci_vtnet_softc *sc)
524 {
525 	struct vring_hqueue *hq = &sc->vsc_hq[VTNET_TXQ];
526 	int ndescs;
527 
528 	/*
529 	 * Calculate number of ring entries to process
530 	 */
531 	ndescs = hq_num_avail(hq);
532 
533 	if (ndescs == 0)
534 		return;
535 
536 	/* Signal the tx thread for processing */
537 	pthread_mutex_lock(&sc->tx_mtx);
538 	if (sc->tx_in_progress == 0)
539 		pthread_cond_signal(&sc->tx_cond);
540 	pthread_mutex_unlock(&sc->tx_mtx);
541 }
542 
543 /*
544  * Thread that handles processing of TX descriptors
545  */
546 static void *
547 pci_vtnet_tx_thread(void *param)
548 {
549 	struct pci_vtnet_softc *sc = (struct pci_vtnet_softc *) param;
550 	struct vring_hqueue *hq;
551 	int i, ndescs, error;
552 
553 	hq = &sc->vsc_hq[VTNET_TXQ];
554 
555 	/*
556 	 * Wait until the tx queue pointers have been initialised and
557 	 * the first tx has been signaled
558 	 */
559 	pthread_mutex_lock(&sc->tx_mtx);
560 	error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
561 	assert(error == 0);
562 
563 	for (;;) {
564 		pthread_mutex_lock(&sc->tx_mtx);
565 		for (;;) {
566 			if (sc->resetting)
567 				ndescs = 0;
568 			else
569 				ndescs = hq_num_avail(hq);
570 
571 			if (ndescs != 0)
572 				break;
573 
574 			sc->tx_in_progress = 0;
575 			error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
576 			assert(error == 0);
577 		}
578 		sc->tx_in_progress = 1;
579 		pthread_mutex_unlock(&sc->tx_mtx);
580 
581 		while (ndescs > 0) {
582 			/*
583 			 * Run through all the entries, placing them into
584 			 * iovecs and sending when an end-of-packet is found
585 			 */
586 			for (i = 0; i < ndescs; i++)
587 				pci_vtnet_proctx(sc, hq);
588 
589 			ndescs = hq_num_avail(hq);
590 		}
591 
592 		/*
593 		 * Generate an interrupt if needed.
594 		 */
595 		if (notify_on_empty(sc) ||
596 		    (*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0)
597 			vtnet_generate_interrupt(sc, VTNET_TXQ);
598 	}
599 }
600 
601 static void
602 pci_vtnet_ping_ctlq(struct pci_vtnet_softc *sc)
603 {
604 
605 	DPRINTF(("vtnet: control qnotify!\n\r"));
606 }
607 
608 static void
609 pci_vtnet_ring_init(struct pci_vtnet_softc *sc, uint64_t pfn)
610 {
611 	struct vring_hqueue *hq;
612 	int qnum = sc->vsc_curq;
613 
614 	assert(qnum < VTNET_MAXQ);
615 
616 	sc->vsc_pfn[qnum] = pfn << VRING_PFN;
617 
618 	/*
619 	 * Set up host pointers to the various parts of the
620 	 * queue
621 	 */
622 	hq = &sc->vsc_hq[qnum];
623 	hq->hq_size = pci_vtnet_qsize(qnum);
624 
625 	hq->hq_dtable = paddr_guest2host(vtnet_ctx(sc), pfn << VRING_PFN,
626 					 vring_size(hq->hq_size));
627 	hq->hq_avail_flags =  (uint16_t *)(hq->hq_dtable + hq->hq_size);
628 	hq->hq_avail_idx = hq->hq_avail_flags + 1;
629 	hq->hq_avail_ring = hq->hq_avail_flags + 2;
630 	hq->hq_used_flags = (uint16_t *)roundup2((uintptr_t)hq->hq_avail_ring,
631 						 VRING_ALIGN);
632 	hq->hq_used_idx = hq->hq_used_flags + 1;
633 	hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2);
634 
635 	/*
636 	 * Initialize queue indexes
637 	 */
638 	hq->hq_cur_aidx = 0;
639 }
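/*
 * Sketch of the layout computed above for the fixed hq_size of 1024,
 * assuming the standard legacy vring sizes (16-byte descriptors, 8-byte
 * used elements, VRING_ALIGN of 4096): the descriptor table occupies
 * bytes 0-16383, the avail flags/idx/ring start at offsets
 * 16384/16386/16388, and the used section begins at the next 4096-byte
 * boundary, offset 20480.
 */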
640 
641 static int
642 pci_vtnet_parsemac(char *mac_str, uint8_t *mac_addr)
643 {
644 	struct ether_addr *ea;
645 	char *tmpstr;
646 	char zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 };
647 
648 	tmpstr = strsep(&mac_str, "=");
649 
650 	if ((mac_str != NULL) && (!strcmp(tmpstr, "mac"))) {
651 		ea = ether_aton(mac_str);
652 
653 		if (ea == NULL || ETHER_IS_MULTICAST(ea->octet) ||
654 		    memcmp(ea->octet, zero_addr, ETHER_ADDR_LEN) == 0) {
655 			fprintf(stderr, "Invalid MAC %s\n", mac_str);
656 			return (EINVAL);
657 		} else
658 			memcpy(mac_addr, ea->octet, ETHER_ADDR_LEN);
659 	}
660 
661 	return (0);
662 }
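/*
 * Example invocation (hypothetical command line): for
 *   bhyve ... -s 2:0,virtio-net,tap0,mac=00:a0:98:12:34:56 ...
 * pci_vtnet_init() passes the string "mac=00:a0:98:12:34:56" here, and
 * the six address bytes are copied into mac_addr after checking that
 * they are neither multicast nor all zero.
 */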
663 
664 
665 static int
666 pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
667 {
668 	MD5_CTX mdctx;
669 	unsigned char digest[16];
670 	char nstr[80];
671 	char tname[MAXCOMLEN + 1];
672 	struct pci_vtnet_softc *sc;
673 	const char *env_msi;
674 	char *devname;
675 	char *vtopts;
676 	int mac_provided;
677 
678 	sc = malloc(sizeof(struct pci_vtnet_softc));
679 	memset(sc, 0, sizeof(struct pci_vtnet_softc));
680 
681 	pi->pi_arg = sc;
682 	sc->vsc_pi = pi;
683 
684 	pthread_mutex_init(&sc->vsc_mtx, NULL);
685 
686 	/*
687 	 * Use MSI instead of MSI-X if the user requests it via BHYVE_USE_MSI
688 	 */
689 	if ((env_msi = getenv("BHYVE_USE_MSI")) != NULL) {
690 		if (strcasecmp(env_msi, "yes") == 0)
691 			use_msix = 0;
692 	}
693 
694 	/*
695 	 * Attempt to open the tap device, and parse the MAC address
696 	 * if one was specified
697 	 */
698 	mac_provided = 0;
699 	sc->vsc_tapfd = -1;
700 	if (opts != NULL) {
701 		char tbuf[80];
702 		int err;
703 
704 		devname = vtopts = strdup(opts);
705 		(void) strsep(&vtopts, ",");
706 
707 		if (vtopts != NULL) {
708 			err = pci_vtnet_parsemac(vtopts, sc->vsc_macaddr);
709 			if (err != 0) {
710 				free(devname);
711 				return (err);
712 			}
713 			mac_provided = 1;
714 		}
715 
716 		strcpy(tbuf, "/dev/");
717 		strlcat(tbuf, devname, sizeof(tbuf));
718 
719 		free(devname);
720 
721 		sc->vsc_tapfd = open(tbuf, O_RDWR);
722 		if (sc->vsc_tapfd == -1) {
723 			WPRINTF(("open of tap device %s failed\n", tbuf));
724 		} else {
725 			/*
726 			 * Set non-blocking and register for read
727 			 * notifications with the event loop
728 			 */
729 			int opt = 1;
730 			if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) {
731 				WPRINTF(("tap device O_NONBLOCK failed\n"));
732 				close(sc->vsc_tapfd);
733 				sc->vsc_tapfd = -1;
734 			}
735 
736 			sc->vsc_mevp = mevent_add(sc->vsc_tapfd,
737 						  EVF_READ,
738 						  pci_vtnet_tap_callback,
739 						  sc);
740 			if (sc->vsc_mevp == NULL) {
741 				WPRINTF(("Could not register event\n"));
742 				close(sc->vsc_tapfd);
743 				sc->vsc_tapfd = -1;
744 			}
745 		}
746 	}
747 
748 	/*
749 	 * The default MAC address is the standard NetApp OUI of 00-a0-98,
750 	 * followed by the first 3 bytes of an MD5 of the PCI slot/func and VM name
751 	 */
752 	if (!mac_provided) {
753 		snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot,
754 	            pi->pi_func, vmname);
755 
756 		MD5Init(&mdctx);
757 		MD5Update(&mdctx, nstr, strlen(nstr));
758 		MD5Final(digest, &mdctx);
759 
760 		sc->vsc_macaddr[0] = 0x00;
761 		sc->vsc_macaddr[1] = 0xa0;
762 		sc->vsc_macaddr[2] = 0x98;
763 		sc->vsc_macaddr[3] = digest[0];
764 		sc->vsc_macaddr[4] = digest[1];
765 		sc->vsc_macaddr[5] = digest[2];
766 	}
767 
768 	/* initialize config space */
769 	pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET);
770 	pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
771 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
772 	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
773 
774 	if (use_msix) {
775 		/* MSI-X support */
776 		int i;
777 
778 		for (i = 0; i < VTNET_MAXQ; i++)
779 			sc->vsc_msix_table_idx[i] = VIRTIO_MSI_NO_VECTOR;
780 
781 		/*
782 		 * BAR 1 used to map MSI-X table and PBA
783 		 */
784 		if (pci_emul_add_msixcap(pi, VTNET_MAXQ, 1))
785 			return (1);
786 	} else {
787 		/* MSI support */
788 		pci_emul_add_msicap(pi, 1);
789 	}
790 
791 	pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VTNET_REGSZ);
792 
793 	sc->resetting = 0;
794 
795 	sc->rx_in_progress = 0;
796 	pthread_mutex_init(&sc->rx_mtx, NULL);
797 
798 	/*
799 	 * Initialize the tx mutex/condvar and spawn the TX processing thread.
800 	 * As of now, only a single thread for TX descriptor processing is
801 	 * spawned.
802 	 */
803 	sc->tx_in_progress = 0;
804 	pthread_mutex_init(&sc->tx_mtx, NULL);
805 	pthread_cond_init(&sc->tx_cond, NULL);
806 	pthread_create(&sc->tx_tid, NULL, pci_vtnet_tx_thread, (void *)sc);
807 	snprintf(tname, sizeof(tname), "%s vtnet%d tx", vmname, pi->pi_slot);
808 	pthread_set_name_np(sc->tx_tid, tname);
809 
810 	return (0);
811 }
812 
813 /*
814  * Function pointer array to handle queue notifications
815  */
816 static void (*pci_vtnet_qnotify[VTNET_MAXQ])(struct pci_vtnet_softc *) = {
817 	pci_vtnet_ping_rxq,
818 	pci_vtnet_ping_txq,
819 	pci_vtnet_ping_ctlq
820 };
821 
822 static uint64_t
823 vtnet_adjust_offset(struct pci_devinst *pi, uint64_t offset)
824 {
825 	/*
826 	 * Device-specific register offsets used by the guest shift
827 	 * depending on whether the MSI-X capability is enabled
828 	 */
829 	if (!pci_msix_enabled(pi)) {
830 		if (offset >= VTCFG_R_MSIX)
831 			return (offset + (VTCFG_R_CFG1 - VTCFG_R_MSIX));
832 	}
833 
834 	return (offset);
835 }
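/*
 * For example, with MSI-X disabled the first device-config byte (MAC
 * byte 0) is accessed by the guest at offset VTCFG_R_MSIX (20) and is
 * shifted up by VTCFG_R_CFG1 - VTCFG_R_MSIX (4) to offset 24, which is
 * VTNET_R_CFG0 in the MSI-X-enabled layout that the read/write handlers
 * below are written against.
 */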
836 
837 static void
838 pci_vtnet_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
839 		int baridx, uint64_t offset, int size, uint64_t value)
840 {
841 	struct pci_vtnet_softc *sc = pi->pi_arg;
842 	void *ptr;
843 
844 	if (use_msix) {
845 		if (baridx == pci_msix_table_bar(pi) ||
846 		    baridx == pci_msix_pba_bar(pi)) {
847 			pci_emul_msix_twrite(pi, offset, size, value);
848 			return;
849 		}
850 	}
851 
852 	assert(baridx == 0);
853 
854 	if (offset + size > pci_vtnet_iosize(pi)) {
855 		DPRINTF(("vtnet_write: 2big, offset %ld size %d\n",
856 			 offset, size));
857 		return;
858 	}
859 
860 	pthread_mutex_lock(&sc->vsc_mtx);
861 
862 	offset = vtnet_adjust_offset(pi, offset);
863 
864 	switch (offset) {
865 	case VTCFG_R_GUESTCAP:
866 		assert(size == 4);
867 		sc->vsc_features = value & VTNET_S_HOSTCAPS;
868 		break;
869 	case VTCFG_R_PFN:
870 		assert(size == 4);
871 		pci_vtnet_ring_init(sc, value);
872 		break;
873 	case VTCFG_R_QSEL:
874 		assert(size == 2);
875 		assert(value < VTNET_MAXQ);
876 		sc->vsc_curq = value;
877 		break;
878 	case VTCFG_R_QNOTIFY:
879 		assert(size == 2);
880 		assert(value < VTNET_MAXQ);
881 		(*pci_vtnet_qnotify[value])(sc);
882 		break;
883 	case VTCFG_R_STATUS:
884 		assert(size == 1);
885 		pci_vtnet_update_status(sc, value);
886 		break;
887 	case VTCFG_R_CFGVEC:
888 		assert(size == 2);
889 		sc->vsc_msix_table_idx[VTNET_CTLQ] = value;
890 		break;
891 	case VTCFG_R_QVEC:
892 		assert(size == 2);
893 		assert(sc->vsc_curq != VTNET_CTLQ);
894 		sc->vsc_msix_table_idx[sc->vsc_curq] = value;
895 		break;
896 	case VTNET_R_CFG0:
897 	case VTNET_R_CFG1:
898 	case VTNET_R_CFG2:
899 	case VTNET_R_CFG3:
900 	case VTNET_R_CFG4:
901 	case VTNET_R_CFG5:
902 		assert((size + offset) <= (VTNET_R_CFG5 + 1));
903 		ptr = &sc->vsc_macaddr[offset - VTNET_R_CFG0];
904 		/*
905 		 * The driver is allowed to change the MAC address
906 		 */
907 		sc->vsc_macaddr[offset - VTNET_R_CFG0] = value;
908 		if (size == 1) {
909 			*(uint8_t *) ptr = value;
910 		} else if (size == 2) {
911 			*(uint16_t *) ptr = value;
912 		} else {
913 			*(uint32_t *) ptr = value;
914 		}
915 		break;
916 	case VTCFG_R_HOSTCAP:
917 	case VTCFG_R_QNUM:
918 	case VTCFG_R_ISR:
919 	case VTNET_R_CFG6:
920 	case VTNET_R_CFG7:
921 		DPRINTF(("vtnet: write to readonly reg %ld\n\r", offset));
922 		break;
923 	default:
924 		DPRINTF(("vtnet: unknown i/o write offset %ld\n\r", offset));
925 		value = 0;
926 		break;
927 	}
928 
929 	pthread_mutex_unlock(&sc->vsc_mtx);
930 }
931 
932 uint64_t
933 pci_vtnet_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
934 	       int baridx, uint64_t offset, int size)
935 {
936 	struct pci_vtnet_softc *sc = pi->pi_arg;
937 	void *ptr;
938 	uint64_t value;
939 
940 	if (use_msix) {
941 		if (baridx == pci_msix_table_bar(pi) ||
942 		    baridx == pci_msix_pba_bar(pi)) {
943 			return (pci_emul_msix_tread(pi, offset, size));
944 		}
945 	}
946 
947 	assert(baridx == 0);
948 
949 	if (offset + size > pci_vtnet_iosize(pi)) {
950 		DPRINTF(("vtnet_read: 2big, offset %ld size %d\n",
951 			 offset, size));
952 		return (0);
953 	}
954 
955 	pthread_mutex_lock(&sc->vsc_mtx);
956 
957 	offset = vtnet_adjust_offset(pi, offset);
958 
959 	switch (offset) {
960 	case VTCFG_R_HOSTCAP:
961 		assert(size == 4);
962 		value = VTNET_S_HOSTCAPS;
963 		break;
964 	case VTCFG_R_GUESTCAP:
965 		assert(size == 4);
966 		value = sc->vsc_features; /* XXX never read ? */
967 		break;
968 	case VTCFG_R_PFN:
969 		assert(size == 4);
970 		value = sc->vsc_pfn[sc->vsc_curq] >> VRING_PFN;
971 		break;
972 	case VTCFG_R_QNUM:
973 		assert(size == 2);
974 		value = pci_vtnet_qsize(sc->vsc_curq);
975 		break;
976 	case VTCFG_R_QSEL:
977 		assert(size == 2);
978 		value = sc->vsc_curq;  /* XXX never read ? */
979 		break;
980 	case VTCFG_R_QNOTIFY:
981 		assert(size == 2);
982 		value = sc->vsc_curq;  /* XXX never read ? */
983 		break;
984 	case VTCFG_R_STATUS:
985 		assert(size == 1);
986 		value = sc->vsc_status;
987 		break;
988 	case VTCFG_R_ISR:
989 		assert(size == 1);
990 		value = sc->vsc_isr;
991 		sc->vsc_isr = 0;     /* a read clears this flag */
992 		break;
993 	case VTCFG_R_CFGVEC:
994 		assert(size == 2);
995 		value = sc->vsc_msix_table_idx[VTNET_CTLQ];
996 		break;
997 	case VTCFG_R_QVEC:
998 		assert(size == 2);
999 		assert(sc->vsc_curq != VTNET_CTLQ);
1000 		value = sc->vsc_msix_table_idx[sc->vsc_curq];
1001 		break;
1002 	case VTNET_R_CFG0:
1003 	case VTNET_R_CFG1:
1004 	case VTNET_R_CFG2:
1005 	case VTNET_R_CFG3:
1006 	case VTNET_R_CFG4:
1007 	case VTNET_R_CFG5:
1008 		assert((size + offset) <= (VTNET_R_CFG5 + 1));
1009 		ptr = &sc->vsc_macaddr[offset - VTNET_R_CFG0];
1010 		if (size == 1) {
1011 			value = *(uint8_t *) ptr;
1012 		} else if (size == 2) {
1013 			value = *(uint16_t *) ptr;
1014 		} else {
1015 			value = *(uint32_t *) ptr;
1016 		}
1017 		break;
1018 	case VTNET_R_CFG6:
1019 		assert(size != 4);
1020 		value = 0x01; /* XXX link always up */
1021 		break;
1022 	case VTNET_R_CFG7:
1023 		assert(size == 1);
1024 		value = 0; /* XXX link status in LSB */
1025 		break;
1026 	default:
1027 		DPRINTF(("vtnet: unknown i/o read offset %ld\n\r", offset));
1028 		value = 0;
1029 		break;
1030 	}
1031 
1032 	pthread_mutex_unlock(&sc->vsc_mtx);
1033 
1034 	return (value);
1035 }
1036 
1037 struct pci_devemu pci_de_vnet = {
1038 	.pe_emu = 	"virtio-net",
1039 	.pe_init =	pci_vtnet_init,
1040 	.pe_barwrite =	pci_vtnet_write,
1041 	.pe_barread =	pci_vtnet_read
1042 };
1043 PCI_EMUL_SET(pci_de_vnet);
1044