xref: /freebsd/usr.sbin/bhyve/pci_virtio_net.c (revision 595e514d0df2bac5b813d35f83e32875dbf16a83)
/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/linker_set.h>
#include <sys/select.h>
#include <sys/uio.h>
#include <sys/ioctl.h>

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>
#include <assert.h>
#include <md5.h>
#include <pthread.h>
#include <pthread_np.h>

#include "bhyverun.h"
#include "pci_emul.h"
#include "mevent.h"
#include "virtio.h"

#define VTNET_RINGSZ	1024

#define VTNET_MAXSEGS	32

/*
 * PCI config-space register offsets
 */
#define VTNET_R_CFG0	24
#define VTNET_R_CFG1	25
#define VTNET_R_CFG2	26
#define VTNET_R_CFG3	27
#define VTNET_R_CFG4	28
#define VTNET_R_CFG5	29
#define VTNET_R_CFG6	30
#define VTNET_R_CFG7	31
#define VTNET_R_MAX	31

/* Parenthesized so the macro stays safe in arbitrary expressions */
#define VTNET_REGSZ	(VTNET_R_MAX + 1)

/*
 * Host capabilities
 */
#define VTNET_S_HOSTCAPS      \
  ( 0x00000020 |	/* VIRTIO_NET_F_MAC: host supplies MAC */ \
    0x00008000 |	/* VIRTIO_NET_F_MRG_RXBUF: host can merge Rx buffers */ \
    0x00010000 |	/* VIRTIO_NET_F_STATUS: config status field available */ \
    VIRTIO_F_NOTIFY_ON_EMPTY)

/*
 * Queue definitions.
 */
#define VTNET_RXQ	0
#define VTNET_TXQ	1
#define VTNET_CTLQ	2

#define VTNET_MAXQ	3

static int use_msix = 1;

struct vring_hqueue {
	/* Internal state */
	uint16_t	hq_size;
	uint16_t	hq_cur_aidx;		/* trails behind 'avail_idx' */

	/* Host-context pointers to the queue */
	struct virtio_desc *hq_dtable;
	uint16_t	*hq_avail_flags;
	uint16_t	*hq_avail_idx;		/* monotonically increasing */
	uint16_t	*hq_avail_ring;

	uint16_t	*hq_used_flags;
	uint16_t	*hq_used_idx;		/* monotonically increasing */
	struct virtio_used *hq_used_ring;
};

/*
 * Fixed network header size
 */
struct virtio_net_rxhdr {
	uint8_t		vrh_flags;
	uint8_t		vrh_gso_type;
	uint16_t	vrh_hdr_len;
	uint16_t	vrh_gso_size;
	uint16_t	vrh_csum_start;
	uint16_t	vrh_csum_offset;
	uint16_t	vrh_bufs;
} __packed;
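
/*
 * Compile-time sanity check; a minimal sketch assuming a C11 toolchain
 * for _Static_assert. With mergeable rx buffers advertised above, the
 * header is 12 bytes (the legacy header without vrh_bufs would be 10).
 */
_Static_assert(sizeof(struct virtio_net_rxhdr) == 12,
    "virtio-net rx header must be 12 bytes with mergeable rx buffers");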

/*
 * Debug printf
 */
static int pci_vtnet_debug;
/* do/while wrapper keeps the conditional macro safe inside if/else */
#define DPRINTF(params) do { if (pci_vtnet_debug) printf params; } while (0)
#define WPRINTF(params) printf params

/*
 * Per-device softc
 */
struct pci_vtnet_softc {
	struct pci_devinst *vsc_pi;
	pthread_mutex_t vsc_mtx;
	struct mevent	*vsc_mevp;

	int		vsc_curq;
	int		vsc_status;
	int		vsc_isr;
	int		vsc_tapfd;
	int		vsc_rx_ready;
	int		resetting;

	uint32_t	vsc_features;
	uint8_t		vsc_macaddr[6];

	uint64_t	vsc_pfn[VTNET_MAXQ];
	struct	vring_hqueue vsc_hq[VTNET_MAXQ];
	uint16_t	vsc_msix_table_idx[VTNET_MAXQ];

	pthread_mutex_t	rx_mtx;
	int		rx_in_progress;

	pthread_t 	tx_tid;
	pthread_mutex_t	tx_mtx;
	pthread_cond_t	tx_cond;
	int		tx_in_progress;
};
#define	vtnet_ctx(sc)		((sc)->vsc_pi->pi_vmctx)
#define	notify_on_empty(sc)	((sc)->vsc_features & VIRTIO_F_NOTIFY_ON_EMPTY)

/*
 * Return the size of the I/O BAR that maps the virtio header and the
 * device-specific region. The size varies depending on whether MSI-X
 * is enabled.
 */
static uint64_t
pci_vtnet_iosize(struct pci_devinst *pi)
{
	if (pci_msix_enabled(pi))
		return (VTNET_REGSZ);
	else
		return (VTNET_REGSZ - (VTCFG_R_CFG1 - VTCFG_R_MSIX));
}

/*
 * Return the number of available descriptors in the vring taking care
 * of the 16-bit index wraparound.
 */
static int
hq_num_avail(struct vring_hqueue *hq)
{
	uint16_t ndesc;

	/*
	 * We're just computing (a-b) mod 2^16
	 *
	 * The only glitch here is that in standard C,
	 * uint16_t promotes to (signed) int when int has
	 * more than 16 bits (pretty much always now), so
	 * we have to force it back to unsigned.
	 */
	ndesc = (unsigned)*hq->hq_avail_idx - (unsigned)hq->hq_cur_aidx;

	assert(ndesc <= hq->hq_size);

	return (ndesc);
}
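
/*
 * Worked example of the wraparound arithmetic above: if the guest's
 * avail index has wrapped around to 3 while hq_cur_aidx still holds
 * 65533, then (unsigned)3 - (unsigned)65533 is -65530 as an int, which
 * truncates to 6 when assigned to the uint16_t 'ndesc' -- exactly the
 * number of entries the guest has published.
 */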

static uint16_t
pci_vtnet_qsize(int qnum)
{
	/* XXX no ctl queue currently */
	if (qnum == VTNET_CTLQ) {
		return (0);
	}

	/* XXX fixed currently. Maybe different for tx/rx/ctl */
	return (VTNET_RINGSZ);
}

static void
pci_vtnet_ring_reset(struct pci_vtnet_softc *sc, int ring)
{
	struct vring_hqueue *hq;

	assert(ring < VTNET_MAXQ);

	hq = &sc->vsc_hq[ring];

	/*
	 * Reset all soft state
	 */
	hq->hq_cur_aidx = 0;
}

/*
 * If the transmit thread is active then stall until it is done.
 */
static void
pci_vtnet_txwait(struct pci_vtnet_softc *sc)
{

	pthread_mutex_lock(&sc->tx_mtx);
	while (sc->tx_in_progress) {
		pthread_mutex_unlock(&sc->tx_mtx);
		usleep(10000);
		pthread_mutex_lock(&sc->tx_mtx);
	}
	pthread_mutex_unlock(&sc->tx_mtx);
}

/*
 * If the receive thread is active then stall until it is done.
 */
static void
pci_vtnet_rxwait(struct pci_vtnet_softc *sc)
{

	pthread_mutex_lock(&sc->rx_mtx);
	while (sc->rx_in_progress) {
		pthread_mutex_unlock(&sc->rx_mtx);
		usleep(10000);
		pthread_mutex_lock(&sc->rx_mtx);
	}
	pthread_mutex_unlock(&sc->rx_mtx);
}

static void
pci_vtnet_update_status(struct pci_vtnet_softc *sc, uint32_t value)
{
	int i;

	if (value == 0) {
		DPRINTF(("vtnet: device reset requested!\n"));

		sc->resetting = 1;

		/*
		 * Wait for the transmit and receive threads to finish their
		 * processing.
		 */
		pci_vtnet_txwait(sc);
		pci_vtnet_rxwait(sc);

		sc->vsc_rx_ready = 0;
		pci_vtnet_ring_reset(sc, VTNET_RXQ);
		pci_vtnet_ring_reset(sc, VTNET_TXQ);

		for (i = 0; i < VTNET_MAXQ; i++)
			sc->vsc_msix_table_idx[i] = VIRTIO_MSI_NO_VECTOR;

		sc->vsc_isr = 0;
		sc->vsc_features = 0;

		sc->resetting = 0;
	}

	sc->vsc_status = value;
}

static void
vtnet_generate_interrupt(struct pci_vtnet_softc *sc, int qidx)
{

	if (use_msix) {
		pci_generate_msix(sc->vsc_pi, sc->vsc_msix_table_idx[qidx]);
	} else {
		sc->vsc_isr |= 1;
		pci_generate_msi(sc->vsc_pi, 0);
	}
}
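
/*
 * Note on the non-MSI-X path above: the ISR bit set before raising the
 * MSI is how the guest learns why it was interrupted, and it is
 * cleared again by the guest's read of VTCFG_R_ISR in pci_vtnet_read().
 */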

/*
 * Called to send a buffer chain out to the tap device
 */
static void
pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
		 int len)
{
	char pad[60];

	if (sc->vsc_tapfd == -1)
		return;

	/*
	 * If the length is < 60, pad out to that and add the
	 * extra zero'd segment to the iov. It is guaranteed that
	 * there is always an extra iov available by the caller.
	 * (60 bytes is the minimum Ethernet frame size of 64 bytes
	 * minus the 4-byte FCS, which is not part of the data handed
	 * to the tap device.)
	 */
	if (len < 60) {
		memset(pad, 0, 60 - len);
		iov[iovcnt].iov_base = pad;
		iov[iovcnt].iov_len = 60 - len;
		iovcnt++;
	}
	(void) writev(sc->vsc_tapfd, iov, iovcnt);
}

/*
 * Called when there is read activity on the tap file descriptor.
 * Each buffer posted by the guest is assumed to be able to contain
 * an entire ethernet frame + rx header.
 *
 * MP note: the dummybuf is only used for discarding frames, so there
 * is no need for it to be per-vtnet or locked.
 */
static uint8_t dummybuf[2048];

static void
pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
{
	struct virtio_desc *vd;
	struct virtio_used *vu;
	struct vring_hqueue *hq;
	struct virtio_net_rxhdr *vrx;
	uint8_t *buf;
	int i;
	int len;
	int ndescs;
	int didx, uidx, aidx;	/* descriptor, avail and used index */

	/*
	 * Should never be called without a valid tap fd
	 */
	assert(sc->vsc_tapfd != -1);

	/*
	 * But, will be called when the rx ring hasn't yet
	 * been set up or the guest is resetting the device.
	 */
	if (!sc->vsc_rx_ready || sc->resetting) {
		/*
		 * Drop the packet and try later.
		 */
		(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
		return;
	}

	/*
	 * Calculate the number of available rx buffers
	 */
	hq = &sc->vsc_hq[VTNET_RXQ];

	ndescs = hq_num_avail(hq);

	if (ndescs == 0) {
		/*
		 * Drop the packet and try later
		 */
		(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));

		if (notify_on_empty(sc))
			vtnet_generate_interrupt(sc, VTNET_RXQ);

		return;
	}

	aidx = hq->hq_cur_aidx;
	uidx = *hq->hq_used_idx;
	for (i = 0; i < ndescs; i++) {
		/*
		 * 'aidx' indexes into an array of descriptor indexes
		 */
		didx = hq->hq_avail_ring[aidx % hq->hq_size];
		assert(didx >= 0 && didx < hq->hq_size);

		vd = &hq->hq_dtable[didx];

		/*
		 * Get a pointer to the rx header, and use the
		 * data immediately following it for the packet buffer.
		 */
		vrx = paddr_guest2host(vtnet_ctx(sc), vd->vd_addr, vd->vd_len);
		buf = (uint8_t *)(vrx + 1);

		len = read(sc->vsc_tapfd, buf,
			   vd->vd_len - sizeof(struct virtio_net_rxhdr));

		if (len < 0 && errno == EWOULDBLOCK) {
			break;
		}

		/*
		 * The only valid field in the rx packet header is the
		 * number of buffers, which is always 1 without TSO
		 * support.
		 */
		memset(vrx, 0, sizeof(struct virtio_net_rxhdr));
		vrx->vrh_bufs = 1;

		/*
		 * Write this descriptor into the used ring
		 */
		vu = &hq->hq_used_ring[uidx % hq->hq_size];
		vu->vu_idx = didx;
		vu->vu_tlen = len + sizeof(struct virtio_net_rxhdr);
		uidx++;
		aidx++;
	}

	/*
	 * Update the used pointer, and signal an interrupt if allowed
	 */
	*hq->hq_used_idx = uidx;
	hq->hq_cur_aidx = aidx;

	if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0)
		vtnet_generate_interrupt(sc, VTNET_RXQ);
}

static void
pci_vtnet_tap_callback(int fd, enum ev_type type, void *param)
{
	struct pci_vtnet_softc *sc = param;

	pthread_mutex_lock(&sc->rx_mtx);
	sc->rx_in_progress = 1;
	pci_vtnet_tap_rx(sc);
	sc->rx_in_progress = 0;
	pthread_mutex_unlock(&sc->rx_mtx);
}

static void
pci_vtnet_ping_rxq(struct pci_vtnet_softc *sc)
{
	/*
	 * A qnotify means that the rx process can now begin
	 */
	if (sc->vsc_rx_ready == 0) {
		sc->vsc_rx_ready = 1;
	}
}

static void
pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vring_hqueue *hq)
{
	struct iovec iov[VTNET_MAXSEGS + 1];
	struct virtio_desc *vd;
	struct virtio_used *vu;
	int i;
	int plen;
	int tlen;
	int uidx, aidx, didx;

	uidx = *hq->hq_used_idx;
	aidx = hq->hq_cur_aidx;
	didx = hq->hq_avail_ring[aidx % hq->hq_size];
	assert(didx >= 0 && didx < hq->hq_size);

	vd = &hq->hq_dtable[didx];

	/*
	 * Run through the chain of descriptors, ignoring the
	 * first header descriptor. However, include the header
	 * length in the total length that will be put into the
	 * used queue.
	 */
	tlen = vd->vd_len;
	vd = &hq->hq_dtable[vd->vd_next];

	for (i = 0, plen = 0;
	     i < VTNET_MAXSEGS;
	     i++, vd = &hq->hq_dtable[vd->vd_next]) {
		iov[i].iov_base = paddr_guest2host(vtnet_ctx(sc),
						   vd->vd_addr, vd->vd_len);
		iov[i].iov_len = vd->vd_len;
		plen += vd->vd_len;
		tlen += vd->vd_len;

		if ((vd->vd_flags & VRING_DESC_F_NEXT) == 0)
			break;
	}
	assert(i < VTNET_MAXSEGS);

	DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, i + 1));
	pci_vtnet_tap_tx(sc, iov, i + 1, plen);

	/*
	 * Return this chain back to the guest
	 */
	vu = &hq->hq_used_ring[uidx % hq->hq_size];
	vu->vu_idx = didx;
	vu->vu_tlen = tlen;
	hq->hq_cur_aidx = aidx + 1;
	*hq->hq_used_idx = uidx + 1;
}
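
/*
 * A worked example of the walk above (sizes are illustrative): for a
 * three-descriptor chain -- a 12-byte virtio header followed by data
 * segments of 1400 and 100 bytes -- the header descriptor is skipped,
 * iov[0..1] receive the two data segments, plen = 1500 is what gets
 * written to the tap device, and tlen = 1512 (header included) is the
 * length placed in the used ring entry.
 */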

static void
pci_vtnet_ping_txq(struct pci_vtnet_softc *sc)
{
	struct vring_hqueue *hq = &sc->vsc_hq[VTNET_TXQ];
	int ndescs;

	/*
	 * Calculate number of ring entries to process
	 */
	ndescs = hq_num_avail(hq);

	if (ndescs == 0)
		return;

	/*
	 * Signal the tx thread for processing. If it is already running
	 * it will re-check the avail ring before sleeping, so the wakeup
	 * can be skipped.
	 */
	pthread_mutex_lock(&sc->tx_mtx);
	if (sc->tx_in_progress == 0)
		pthread_cond_signal(&sc->tx_cond);
	pthread_mutex_unlock(&sc->tx_mtx);
}

/*
 * Thread which will handle processing of TX desc
 */
static void *
pci_vtnet_tx_thread(void *param)
{
	struct pci_vtnet_softc *sc = (struct pci_vtnet_softc *) param;
	struct vring_hqueue *hq;
	int i, ndescs, error;

	hq = &sc->vsc_hq[VTNET_TXQ];

	/*
	 * Wait until the tx queue pointers have been initialised and
	 * the first tx is signaled, then drop the mutex (cond_wait
	 * returns with it held) so the main loop below can acquire it.
	 */
	pthread_mutex_lock(&sc->tx_mtx);
	error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
	assert(error == 0);
	pthread_mutex_unlock(&sc->tx_mtx);

	for (;;) {
		pthread_mutex_lock(&sc->tx_mtx);
		for (;;) {
			if (sc->resetting)
				ndescs = 0;
			else
				ndescs = hq_num_avail(hq);

			if (ndescs != 0)
				break;

			sc->tx_in_progress = 0;
			error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
			assert(error == 0);
		}
		sc->tx_in_progress = 1;
		pthread_mutex_unlock(&sc->tx_mtx);

		while (ndescs > 0) {
			/*
			 * Run through all the entries, placing them into
			 * iovecs and sending when an end-of-packet is found
			 */
			for (i = 0; i < ndescs; i++)
				pci_vtnet_proctx(sc, hq);

			ndescs = hq_num_avail(hq);
		}

		/*
		 * Generate an interrupt if needed.
		 */
		if (notify_on_empty(sc) ||
		    (*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0)
			vtnet_generate_interrupt(sc, VTNET_TXQ);
	}
}

static void
pci_vtnet_ping_ctlq(struct pci_vtnet_softc *sc)
{

	DPRINTF(("vtnet: control qnotify!\n\r"));
}

static void
pci_vtnet_ring_init(struct pci_vtnet_softc *sc, uint64_t pfn)
{
	struct vring_hqueue *hq;
	int qnum = sc->vsc_curq;

	assert(qnum < VTNET_MAXQ);

	sc->vsc_pfn[qnum] = pfn << VRING_PFN;

	/*
	 * Set up host pointers to the various parts of the
	 * queue
	 */
	hq = &sc->vsc_hq[qnum];
	hq->hq_size = pci_vtnet_qsize(qnum);

	hq->hq_dtable = paddr_guest2host(vtnet_ctx(sc), pfn << VRING_PFN,
					 vring_size(hq->hq_size));
	hq->hq_avail_flags = (uint16_t *)(hq->hq_dtable + hq->hq_size);
	hq->hq_avail_idx = hq->hq_avail_flags + 1;
	hq->hq_avail_ring = hq->hq_avail_flags + 2;
	hq->hq_used_flags = (uint16_t *)roundup2((uintptr_t)hq->hq_avail_ring,
						 VRING_ALIGN);
	hq->hq_used_idx = hq->hq_used_flags + 1;
	hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2);

	/*
	 * Initialize queue indexes
	 */
	hq->hq_cur_aidx = 0;
}
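
/*
 * For reference, the guest-physical layout computed above for the fixed
 * 1024-entry rings, assuming 16-byte descriptors, 8-byte used elements
 * and the conventional 4096-byte VRING_ALIGN:
 *
 *   offset     0: descriptor table, 1024 * 16 = 16384 bytes
 *   offset 16384: avail ring, flags + idx + 1024 entries = 2052 bytes
 *   offset 20480: used ring (rounded up to the next 4096-byte boundary),
 *                 flags + idx + 1024 * 8 bytes of used elements
 */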

static int
pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
	MD5_CTX mdctx;
	unsigned char digest[16];
	char nstr[80];
	char tname[MAXCOMLEN + 1];
	struct pci_vtnet_softc *sc;
	const char *env_msi;

	sc = calloc(1, sizeof(struct pci_vtnet_softc));

	pi->pi_arg = sc;
	sc->vsc_pi = pi;

	pthread_mutex_init(&sc->vsc_mtx, NULL);

	/*
	 * Use MSI if set by user
	 */
	if ((env_msi = getenv("BHYVE_USE_MSI")) != NULL) {
		if (strcasecmp(env_msi, "yes") == 0)
			use_msix = 0;
	}

	/*
	 * Attempt to open the tap device
	 */
	sc->vsc_tapfd = -1;
	if (opts != NULL) {
		char tbuf[80];

		strlcpy(tbuf, "/dev/", sizeof(tbuf));
		strlcat(tbuf, opts, sizeof(tbuf));

		sc->vsc_tapfd = open(tbuf, O_RDWR);
		if (sc->vsc_tapfd == -1) {
			WPRINTF(("open of tap device %s failed\n", tbuf));
		} else {
			/*
			 * Set non-blocking and, only if that succeeds,
			 * register for read notifications with the
			 * event loop.
			 */
			int opt = 1;
			if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) {
				WPRINTF(("tap device O_NONBLOCK failed\n"));
				close(sc->vsc_tapfd);
				sc->vsc_tapfd = -1;
			} else {
				sc->vsc_mevp = mevent_add(sc->vsc_tapfd,
							  EVF_READ,
							  pci_vtnet_tap_callback,
							  sc);
				if (sc->vsc_mevp == NULL) {
					WPRINTF(("Could not register event\n"));
					close(sc->vsc_tapfd);
					sc->vsc_tapfd = -1;
				}
			}
		}
	}

	/*
	 * The MAC address is the standard NetApp OUI of 00-a0-98,
	 * followed by an MD5 of the vm name. The slot/func number is
	 * prepended to this for slots other than 1:0, so that
	 * a bootloader can netboot from the equivalent of slot 1.
	 */
	if (pi->pi_slot == 1 && pi->pi_func == 0) {
		snprintf(nstr, sizeof(nstr), "%s", vmname);
	} else {
		snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot,
		    pi->pi_func, vmname);
	}

	MD5Init(&mdctx);
	MD5Update(&mdctx, nstr, strlen(nstr));
	MD5Final(digest, &mdctx);

	sc->vsc_macaddr[0] = 0x00;
	sc->vsc_macaddr[1] = 0xa0;
	sc->vsc_macaddr[2] = 0x98;
	sc->vsc_macaddr[3] = digest[0];
	sc->vsc_macaddr[4] = digest[1];
	sc->vsc_macaddr[5] = digest[2];
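
	/*
	 * For reference, the same three digest bytes can be reproduced
	 * on the host with FreeBSD's md5(1), e.g. `md5 -qs "<vmname>"`
	 * for a device in slot 1, function 0: the first six hex digits
	 * of the output are vsc_macaddr[3..5]. (Illustrative; assumes
	 * the vm name fits in nstr.)
	 */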

	/* initialize config space */
	pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET);
	pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);

	if (use_msix) {
		/* MSI-X support */
		int i;

		for (i = 0; i < VTNET_MAXQ; i++)
			sc->vsc_msix_table_idx[i] = VIRTIO_MSI_NO_VECTOR;

		/*
		 * BAR 1 used to map MSI-X table and PBA
		 */
		if (pci_emul_add_msixcap(pi, VTNET_MAXQ, 1))
			return (1);
	} else {
		/* MSI support */
		pci_emul_add_msicap(pi, 1);
	}

	pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VTNET_REGSZ);

	sc->resetting = 0;

	sc->rx_in_progress = 0;
	pthread_mutex_init(&sc->rx_mtx, NULL);

	/*
	 * Initialize tx semaphore & spawn TX processing thread.
	 * As of now, only one thread for TX desc processing is
	 * spawned.
	 */
	sc->tx_in_progress = 0;
	pthread_mutex_init(&sc->tx_mtx, NULL);
	pthread_cond_init(&sc->tx_cond, NULL);
	pthread_create(&sc->tx_tid, NULL, pci_vtnet_tx_thread, (void *)sc);
	snprintf(tname, sizeof(tname), "%s vtnet%d tx", vmname, pi->pi_slot);
	pthread_set_name_np(sc->tx_tid, tname);

	return (0);
}

/*
 * Function pointer array to handle queue notifications
 */
static void (*pci_vtnet_qnotify[VTNET_MAXQ])(struct pci_vtnet_softc *) = {
	pci_vtnet_ping_rxq,
	pci_vtnet_ping_txq,
	pci_vtnet_ping_ctlq
};

static uint64_t
vtnet_adjust_offset(struct pci_devinst *pi, uint64_t offset)
{
	/*
	 * The device-specific offsets used by the guest differ
	 * depending on whether the MSI-X capability is enabled.
	 */
	if (!pci_msix_enabled(pi)) {
		if (offset >= VTCFG_R_MSIX)
			return (offset + (VTCFG_R_CFG1 - VTCFG_R_MSIX));
	}

	return (offset);
}
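
/*
 * A worked example, assuming the legacy virtio header layout where
 * VTCFG_R_MSIX is 20 and VTCFG_R_CFG1 is 24: with MSI-X disabled, the
 * device-specific region starts right after the ISR byte, so a guest
 * access at offset 20 is shifted up by 4 to VTNET_R_CFG0 (24), which
 * is what the switch statements below expect.
 */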

static void
pci_vtnet_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
		int baridx, uint64_t offset, int size, uint64_t value)
{
	struct pci_vtnet_softc *sc = pi->pi_arg;
	void *ptr;

	if (use_msix) {
		if (baridx == pci_msix_table_bar(pi) ||
		    baridx == pci_msix_pba_bar(pi)) {
			pci_emul_msix_twrite(pi, offset, size, value);
			return;
		}
	}

	assert(baridx == 0);

	if (offset + size > pci_vtnet_iosize(pi)) {
		DPRINTF(("vtnet_write: 2big, offset %ld size %d\n",
			 offset, size));
		return;
	}

	pthread_mutex_lock(&sc->vsc_mtx);

	offset = vtnet_adjust_offset(pi, offset);

	switch (offset) {
	case VTCFG_R_GUESTCAP:
		assert(size == 4);
		sc->vsc_features = value & VTNET_S_HOSTCAPS;
		break;
	case VTCFG_R_PFN:
		assert(size == 4);
		pci_vtnet_ring_init(sc, value);
		break;
	case VTCFG_R_QSEL:
		assert(size == 2);
		assert(value < VTNET_MAXQ);
		sc->vsc_curq = value;
		break;
	case VTCFG_R_QNOTIFY:
		assert(size == 2);
		assert(value < VTNET_MAXQ);
		(*pci_vtnet_qnotify[value])(sc);
		break;
	case VTCFG_R_STATUS:
		assert(size == 1);
		pci_vtnet_update_status(sc, value);
		break;
	case VTCFG_R_CFGVEC:
		assert(size == 2);
		sc->vsc_msix_table_idx[VTNET_CTLQ] = value;
		break;
	case VTCFG_R_QVEC:
		assert(size == 2);
		assert(sc->vsc_curq != VTNET_CTLQ);
		sc->vsc_msix_table_idx[sc->vsc_curq] = value;
		break;
	case VTNET_R_CFG0:
	case VTNET_R_CFG1:
	case VTNET_R_CFG2:
	case VTNET_R_CFG3:
	case VTNET_R_CFG4:
	case VTNET_R_CFG5:
		assert((size + offset) <= (VTNET_R_CFG5 + 1));
		/*
		 * The driver is allowed to change the MAC address
		 */
		ptr = &sc->vsc_macaddr[offset - VTNET_R_CFG0];
		if (size == 1) {
			*(uint8_t *) ptr = value;
		} else if (size == 2) {
			*(uint16_t *) ptr = value;
		} else {
			*(uint32_t *) ptr = value;
		}
		break;
	case VTCFG_R_HOSTCAP:
	case VTCFG_R_QNUM:
	case VTCFG_R_ISR:
	case VTNET_R_CFG6:
	case VTNET_R_CFG7:
		DPRINTF(("vtnet: write to readonly reg %ld\n\r", offset));
		break;
	default:
		DPRINTF(("vtnet: unknown i/o write offset %ld\n\r", offset));
		break;
	}

	pthread_mutex_unlock(&sc->vsc_mtx);
}

static uint64_t
pci_vtnet_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
	       int baridx, uint64_t offset, int size)
{
	struct pci_vtnet_softc *sc = pi->pi_arg;
	void *ptr;
	uint64_t value;

	if (use_msix) {
		if (baridx == pci_msix_table_bar(pi) ||
		    baridx == pci_msix_pba_bar(pi)) {
			return (pci_emul_msix_tread(pi, offset, size));
		}
	}

	assert(baridx == 0);

	if (offset + size > pci_vtnet_iosize(pi)) {
		DPRINTF(("vtnet_read: 2big, offset %ld size %d\n",
			 offset, size));
		return (0);
	}

	pthread_mutex_lock(&sc->vsc_mtx);

	offset = vtnet_adjust_offset(pi, offset);

	switch (offset) {
	case VTCFG_R_HOSTCAP:
		assert(size == 4);
		value = VTNET_S_HOSTCAPS;
		break;
	case VTCFG_R_GUESTCAP:
		assert(size == 4);
		value = sc->vsc_features; /* XXX never read ? */
		break;
	case VTCFG_R_PFN:
		assert(size == 4);
		value = sc->vsc_pfn[sc->vsc_curq] >> VRING_PFN;
		break;
	case VTCFG_R_QNUM:
		assert(size == 2);
		value = pci_vtnet_qsize(sc->vsc_curq);
		break;
	case VTCFG_R_QSEL:
		assert(size == 2);
		value = sc->vsc_curq;  /* XXX never read ? */
		break;
	case VTCFG_R_QNOTIFY:
		assert(size == 2);
		value = sc->vsc_curq;  /* XXX never read ? */
		break;
	case VTCFG_R_STATUS:
		assert(size == 1);
		value = sc->vsc_status;
		break;
	case VTCFG_R_ISR:
		assert(size == 1);
		value = sc->vsc_isr;
		sc->vsc_isr = 0;     /* a read clears this flag */
		break;
	case VTCFG_R_CFGVEC:
		assert(size == 2);
		value = sc->vsc_msix_table_idx[VTNET_CTLQ];
		break;
	case VTCFG_R_QVEC:
		assert(size == 2);
		assert(sc->vsc_curq != VTNET_CTLQ);
		value = sc->vsc_msix_table_idx[sc->vsc_curq];
		break;
	case VTNET_R_CFG0:
	case VTNET_R_CFG1:
	case VTNET_R_CFG2:
	case VTNET_R_CFG3:
	case VTNET_R_CFG4:
	case VTNET_R_CFG5:
		assert((size + offset) <= (VTNET_R_CFG5 + 1));
		ptr = &sc->vsc_macaddr[offset - VTNET_R_CFG0];
		if (size == 1) {
			value = *(uint8_t *) ptr;
		} else if (size == 2) {
			value = *(uint16_t *) ptr;
		} else {
			value = *(uint32_t *) ptr;
		}
		break;
	case VTNET_R_CFG6:
		assert(size != 4);
		value = 0x01; /* XXX link always up */
		break;
	case VTNET_R_CFG7:
		assert(size == 1);
		value = 0; /* XXX link status in LSB */
		break;
	default:
		DPRINTF(("vtnet: unknown i/o read offset %ld\n\r", offset));
		value = 0;
		break;
	}

	pthread_mutex_unlock(&sc->vsc_mtx);

	return (value);
}

struct pci_devemu pci_de_vnet = {
	.pe_emu =	"virtio-net",
	.pe_init =	pci_vtnet_init,
	.pe_barwrite =	pci_vtnet_write,
	.pe_barread =	pci_vtnet_read
};
PCI_EMUL_SET(pci_de_vnet);