xref: /freebsd/usr.sbin/bhyve/pci_virtio_net.c (revision f5f7c05209ca2c3748fd8b27c5e80ffad49120eb)
/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/linker_set.h>
#include <sys/select.h>
#include <sys/uio.h>
#include <sys/ioctl.h>

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>
#include <assert.h>
#include <md5.h>
#include <pthread.h>

#include "bhyverun.h"
#include "pci_emul.h"
#include "mevent.h"
#include "virtio.h"

#define VTNET_RINGSZ	256

#define VTNET_MAXSEGS	32

/*
 * PCI config-space register offsets
 */
#define VTNET_R_CFG0	24
#define VTNET_R_CFG1	25
#define VTNET_R_CFG2	26
#define VTNET_R_CFG3	27
#define VTNET_R_CFG4	28
#define VTNET_R_CFG5	29
#define VTNET_R_CFG6	30
#define VTNET_R_CFG7	31
#define VTNET_R_MAX	31

#define VTNET_REGSZ	(VTNET_R_MAX + 1)
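
/*
 * The CFG registers are the device-specific config window that follows
 * the common virtio header: CFG0-CFG5 hold the 6-byte MAC address
 * (read/write), and CFG6-CFG7 back the read-only link status word.
 * The offsets above correspond to the MSI-X-enabled header layout; when
 * MSI-X is disabled the guest's view is shifted down (see
 * vtnet_adjust_offset() below).
 */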

/*
 * Host capabilities
 */
#define VTNET_S_HOSTCAPS      \
  ( 0x00000020 |	/* host supplies MAC */ \
    0x00008000 |	/* host can merge Rx buffers */ \
    0x00010000 )	/* config status available */

/*
 * Queue definitions.
 */
#define VTNET_RXQ	0
#define VTNET_TXQ	1
#define VTNET_CTLQ	2

#define VTNET_MAXQ	3

static int use_msix = 1;

struct vring_hqueue {
	/* Internal state */
	uint16_t	hq_size;
	uint16_t	hq_cur_aidx;		/* trails behind 'avail_idx' */

	/* Host-context pointers to the queue */
	struct virtio_desc *hq_dtable;
	uint16_t	*hq_avail_flags;
	uint16_t	*hq_avail_idx;		/* monotonically increasing */
	uint16_t	*hq_avail_ring;

	uint16_t	*hq_used_flags;
	uint16_t	*hq_used_idx;		/* monotonically increasing */
	struct virtio_used *hq_used_ring;
};
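
/*
 * The host-side pointers above all point into a single guest-physical
 * region laid out as a legacy virtio split ring, set up by
 * pci_vtnet_ring_init(). A sketch, assuming the usual legacy ABI
 * (16-byte descriptors, VRING_ALIGN from virtio.h):
 *
 *	struct virtio_desc	dtable[hq_size];
 *	uint16_t		avail_flags;
 *	uint16_t		avail_idx;
 *	uint16_t		avail_ring[hq_size];
 *	  ... padding up to the next VRING_ALIGN boundary ...
 *	uint16_t		used_flags;
 *	uint16_t		used_idx;
 *	struct virtio_used	used_ring[hq_size];
 */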

/*
 * Fixed network header size
 */
struct virtio_net_rxhdr {
	uint8_t		vrh_flags;
	uint8_t		vrh_gso_type;
	uint16_t	vrh_hdr_len;
	uint16_t	vrh_gso_size;
	uint16_t	vrh_csum_start;
	uint16_t	vrh_csum_offset;
	uint16_t	vrh_bufs;
} __packed;
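
/*
 * This matches the layout of the virtio net header when mergeable rx
 * buffers have been negotiated (the host capabilities above advertise
 * that feature): the trailing vrh_bufs field is the "number of buffers"
 * count, which is only present in the mergeable variant of the header.
 */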

/*
 * Debug printf
 */
static int pci_vtnet_debug;
#define DPRINTF(params) do { if (pci_vtnet_debug) printf params; } while (0)
#define WPRINTF(params) printf params

/*
 * Per-device softc
 */
struct pci_vtnet_softc {
	struct pci_devinst *vsc_pi;
	pthread_mutex_t vsc_mtx;
	struct mevent	*vsc_mevp;

	int		vsc_curq;
	int		vsc_status;
	int		vsc_isr;
	int		vsc_tapfd;
	int		vsc_rx_ready;
	int		vsc_rxpend;

	uint32_t	vsc_features;
	uint8_t		vsc_macaddr[6];

	uint64_t	vsc_pfn[VTNET_MAXQ];
	struct	vring_hqueue vsc_hq[VTNET_MAXQ];
	uint16_t	vsc_msix_table_idx[VTNET_MAXQ];
};

/*
 * Return the size of the I/O BAR that maps the virtio header and the
 * device-specific region. The size depends on whether MSI-X is enabled.
 */
static uint64_t
pci_vtnet_iosize(struct pci_devinst *pi)
{
	if (pci_msix_enabled(pi))
		return (VTNET_REGSZ);
	else
		return (VTNET_REGSZ - (VTCFG_R_CFG1 - VTCFG_R_MSIX));
}

/*
 * Return the number of available descriptors in the vring taking care
 * of the 16-bit index wraparound.
 */
static int
hq_num_avail(struct vring_hqueue *hq)
{
	int ndesc;

	if (*hq->hq_avail_idx >= hq->hq_cur_aidx)
		ndesc = *hq->hq_avail_idx - hq->hq_cur_aidx;
	else
		ndesc = UINT16_MAX - hq->hq_cur_aidx + *hq->hq_avail_idx + 1;

	assert(ndesc >= 0 && ndesc <= hq->hq_size);

	return (ndesc);
}
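
/*
 * Example of the wraparound case: if the guest's avail_idx has wrapped
 * around to 2 while hq_cur_aidx is still 65534, the pending descriptors
 * are 65534, 65535, 0 and 1, and the math above yields
 * 65535 - 65534 + 2 + 1 = 4.
 */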

static uint16_t
pci_vtnet_qsize(int qnum)
{
	/* XXX no ctl queue currently */
	if (qnum == VTNET_CTLQ) {
		return (0);
	}

	/* XXX fixed currently. Maybe different for tx/rx/ctl */
	return (VTNET_RINGSZ);
}

static void
pci_vtnet_ring_reset(struct pci_vtnet_softc *sc, int ring)
{
	struct vring_hqueue *hq;

	assert(ring < VTNET_MAXQ);

	hq = &sc->vsc_hq[ring];

	/*
	 * Reset all soft state
	 */
	hq->hq_cur_aidx = 0;
}

static void
pci_vtnet_update_status(struct pci_vtnet_softc *sc, uint32_t value)
{

	if (value == 0) {
		DPRINTF(("vtnet: device reset requested !\n"));
		pci_vtnet_ring_reset(sc, VTNET_RXQ);
		pci_vtnet_ring_reset(sc, VTNET_TXQ);
		sc->vsc_rx_ready = 0;
	}

	sc->vsc_status = value;
}

/*
 * Called to send a buffer chain out to the tap device
 */
static void
pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
		 int len)
{
	char pad[60];

	if (sc->vsc_tapfd == -1)
		return;

	/*
	 * If the length is < 60 (the minimum ethernet frame length,
	 * i.e. 64 bytes less the 4-byte FCS), pad out to that and add
	 * the extra zero'd segment to the iov. It is guaranteed that
	 * there is always an extra iov available by the caller.
	 */
	if (len < 60) {
		memset(pad, 0, 60 - len);
		iov[iovcnt].iov_base = pad;
		iov[iovcnt].iov_len = 60 - len;
		iovcnt++;
	}
	(void) writev(sc->vsc_tapfd, iov, iovcnt);
}

/*
 *  Called when there is read activity on the tap file descriptor.
 * Each buffer posted by the guest is assumed to be able to contain
 * an entire ethernet frame + rx header.
 *  MP note: the dummybuf is only used for discarding frames, so there
 * is no need for it to be per-vtnet or locked.
 */
static uint8_t dummybuf[2048];

static void
pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
{
	struct virtio_desc *vd;
	struct virtio_used *vu;
	struct vring_hqueue *hq;
	struct virtio_net_rxhdr *vrx;
	uint8_t *buf;
	int i;
	int len;
	int ndescs;
	int didx, uidx, aidx;	/* descriptor, avail and used index */

	/*
	 * Should never be called without a valid tap fd
	 */
	assert(sc->vsc_tapfd != -1);

	/*
	 * But, will be called when the rx ring hasn't yet
	 * been set up.
	 */
	if (sc->vsc_rx_ready == 0) {
		/*
		 * Drop the packet and try later.
		 */
		(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
		return;
	}

	/*
	 * Calculate the number of available rx buffers
	 */
	hq = &sc->vsc_hq[VTNET_RXQ];

	ndescs = hq_num_avail(hq);

	if (ndescs == 0) {
		/*
		 * Need to wait for host notification to read
		 */
		if (sc->vsc_rxpend == 0) {
			WPRINTF(("vtnet: no rx descriptors !\n"));
			sc->vsc_rxpend = 1;
		}

		/*
		 * Drop the packet and try later
		 */
		(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
		return;
	}

	aidx = hq->hq_cur_aidx;
	uidx = *hq->hq_used_idx;
	for (i = 0; i < ndescs; i++) {
		/*
		 * 'aidx' indexes into the array of descriptor indexes
		 */
		didx = hq->hq_avail_ring[aidx % hq->hq_size];
		assert(didx >= 0 && didx < hq->hq_size);

		vd = &hq->hq_dtable[didx];

		/*
		 * Get a pointer to the rx header, and use the
		 * data immediately following it for the packet buffer.
		 */
		vrx = (struct virtio_net_rxhdr *)paddr_guest2host(vd->vd_addr);
		buf = (uint8_t *)(vrx + 1);

		len = read(sc->vsc_tapfd, buf,
			   vd->vd_len - sizeof(struct virtio_net_rxhdr));

		if (len < 0 && errno == EWOULDBLOCK) {
			break;
		}

		/*
		 * The only valid field in the rx packet header is the
		 * number of buffers, which is always 1 without TSO
		 * support.
		 */
		memset(vrx, 0, sizeof(struct virtio_net_rxhdr));
		vrx->vrh_bufs = 1;

		/*
		 * Write this descriptor into the used ring
		 */
		vu = &hq->hq_used_ring[uidx % hq->hq_size];
		vu->vu_idx = didx;
		vu->vu_tlen = len + sizeof(struct virtio_net_rxhdr);
		uidx++;
		aidx++;
	}

	/*
	 * Update the used pointer, and signal an interrupt if allowed
	 */
	*hq->hq_used_idx = uidx;
	hq->hq_cur_aidx = aidx;

	if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
		if (use_msix) {
			pci_generate_msix(sc->vsc_pi,
					  sc->vsc_msix_table_idx[VTNET_RXQ]);
		} else {
			sc->vsc_isr |= 1;
			pci_generate_msi(sc->vsc_pi, 0);
		}
	}
}
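
/*
 * Note that although VTNET_S_HOSTCAPS advertises mergeable rx buffers,
 * the receive path above always completes each frame with a single
 * buffer (vrh_bufs = 1), relying on every posted buffer being large
 * enough for a full ethernet frame plus rx header, as described above.
 */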

static void
pci_vtnet_tap_callback(int fd, enum ev_type type, void *param)
{
	struct pci_vtnet_softc *sc = param;

	pthread_mutex_lock(&sc->vsc_mtx);
	pci_vtnet_tap_rx(sc);
	pthread_mutex_unlock(&sc->vsc_mtx);
}

static void
pci_vtnet_ping_rxq(struct pci_vtnet_softc *sc)
{
	/*
	 * A qnotify means that the rx process can now begin
	 */
	if (sc->vsc_rx_ready == 0) {
		sc->vsc_rx_ready = 1;
	}

	/*
	 * If the rx queue was empty, attempt to receive a
	 * packet that was previously blocked due to no rx bufs
	 * available
	 */
	if (sc->vsc_rxpend) {
		WPRINTF(("vtnet: rx resumed\n\r"));
		sc->vsc_rxpend = 0;
		pci_vtnet_tap_rx(sc);
	}
}

static void
pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vring_hqueue *hq)
{
	struct iovec iov[VTNET_MAXSEGS + 1];
	struct virtio_desc *vd;
	struct virtio_used *vu;
	int i;
	int plen;
	int tlen;
	int uidx, aidx, didx;

	uidx = *hq->hq_used_idx;
	aidx = hq->hq_cur_aidx;
	didx = hq->hq_avail_ring[aidx % hq->hq_size];
	assert(didx >= 0 && didx < hq->hq_size);

	vd = &hq->hq_dtable[didx];

	/*
	 * Run through the chain of descriptors, ignoring the
	 * first header descriptor. However, include the header
	 * length in the total length that will be put into the
	 * used queue.
	 */
	tlen = vd->vd_len;
	vd = &hq->hq_dtable[vd->vd_next];

	for (i = 0, plen = 0;
	     i < VTNET_MAXSEGS;
	     i++, vd = &hq->hq_dtable[vd->vd_next]) {
		iov[i].iov_base = paddr_guest2host(vd->vd_addr);
		iov[i].iov_len = vd->vd_len;
		plen += vd->vd_len;
		tlen += vd->vd_len;

		if ((vd->vd_flags & VRING_DESC_F_NEXT) == 0)
			break;
	}
	assert(i < VTNET_MAXSEGS);

	DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, i + 1));
	pci_vtnet_tap_tx(sc, iov, i + 1, plen);

	/*
	 * Return this chain back to the host
	 */
	vu = &hq->hq_used_ring[uidx % hq->hq_size];
	vu->vu_idx = didx;
	vu->vu_tlen = tlen;
	hq->hq_cur_aidx = aidx + 1;
	*hq->hq_used_idx = uidx + 1;

	/*
	 * Generate an interrupt if able
	 */
	if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
		if (use_msix) {
			pci_generate_msix(sc->vsc_pi,
					  sc->vsc_msix_table_idx[VTNET_TXQ]);
		} else {
			sc->vsc_isr |= 1;
			pci_generate_msi(sc->vsc_pi, 0);
		}
	}
}

static void
pci_vtnet_ping_txq(struct pci_vtnet_softc *sc)
{
	struct vring_hqueue *hq = &sc->vsc_hq[VTNET_TXQ];
	int i;
	int ndescs;

	/*
	 * Calculate number of ring entries to process
	 */
	ndescs = hq_num_avail(hq);

	if (ndescs == 0)
		return;

	/*
	 * Run through all the entries, placing them into iovecs and
	 * sending when an end-of-packet is found
	 */
	for (i = 0; i < ndescs; i++)
		pci_vtnet_proctx(sc, hq);
}

static void
pci_vtnet_ping_ctlq(struct pci_vtnet_softc *sc)
{

	DPRINTF(("vtnet: control qnotify!\n\r"));
}

static void
pci_vtnet_ring_init(struct pci_vtnet_softc *sc, uint64_t pfn)
{
	struct vring_hqueue *hq;
	int qnum = sc->vsc_curq;

	assert(qnum < VTNET_MAXQ);

	sc->vsc_pfn[qnum] = pfn << VRING_PFN;

	/*
	 * Set up host pointers to the various parts of the
	 * queue
	 */
	hq = &sc->vsc_hq[qnum];
	hq->hq_size = pci_vtnet_qsize(qnum);

	hq->hq_dtable = paddr_guest2host(pfn << VRING_PFN);
	hq->hq_avail_flags = (uint16_t *)(hq->hq_dtable + hq->hq_size);
	hq->hq_avail_idx = hq->hq_avail_flags + 1;
	hq->hq_avail_ring = hq->hq_avail_flags + 2;
	hq->hq_used_flags = (uint16_t *)roundup2((uintptr_t)(hq->hq_avail_ring +
							     hq->hq_size),
						 VRING_ALIGN);
	hq->hq_used_idx = hq->hq_used_flags + 1;
	hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2);

	/*
	 * Initialize queue indexes
	 */
	hq->hq_cur_aidx = 0;
}
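
/*
 * Worked example, assuming VRING_ALIGN is the legacy 4096 and a queue
 * of VTNET_RINGSZ (256) descriptors: the descriptor table occupies
 * 256 * 16 = 4096 bytes, the avail ring adds 2 + 2 + 2*256 = 516
 * bytes, and the used ring then starts at the next 4096-byte boundary,
 * i.e. at offset 8192 from the page named by the guest's PFN write.
 */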

static int
pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
	MD5_CTX mdctx;
	unsigned char digest[16];
	char nstr[80];
	struct pci_vtnet_softc *sc;
	const char *env_msi;

	/*
	 * Access to guest memory is required. Fail if
	 * memory not mapped
	 */
	if (paddr_guest2host(0) == NULL)
		return (1);

	sc = malloc(sizeof(struct pci_vtnet_softc));
	memset(sc, 0, sizeof(struct pci_vtnet_softc));

	pi->pi_arg = sc;
	sc->vsc_pi = pi;

	pthread_mutex_init(&sc->vsc_mtx, NULL);

	/*
	 * Use MSI if set by user
	 */
	if ((env_msi = getenv("BHYVE_USE_MSI")) != NULL) {
		if (strcasecmp(env_msi, "yes") == 0)
			use_msix = 0;
	}

	/*
	 * Attempt to open the tap device
	 */
	sc->vsc_tapfd = -1;
	if (opts != NULL) {
		char tbuf[80];

		strcpy(tbuf, "/dev/");
		strlcat(tbuf, opts, sizeof(tbuf));

		sc->vsc_tapfd = open(tbuf, O_RDWR);
		if (sc->vsc_tapfd == -1) {
			WPRINTF(("open of tap device %s failed\n", tbuf));
		} else {
			/*
			 * Set non-blocking, and only register for read
			 * notifications with the event loop if the fd
			 * is still valid after the non-blocking ioctl.
			 */
			int opt = 1;
			if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) {
				WPRINTF(("tap device O_NONBLOCK failed\n"));
				close(sc->vsc_tapfd);
				sc->vsc_tapfd = -1;
			}

			if (sc->vsc_tapfd != -1) {
				sc->vsc_mevp = mevent_add(sc->vsc_tapfd,
							  EVF_READ,
							  pci_vtnet_tap_callback,
							  sc);
				if (sc->vsc_mevp == NULL) {
					WPRINTF(("Could not register event\n"));
					close(sc->vsc_tapfd);
					sc->vsc_tapfd = -1;
				}
			}
		}
	}

	/*
	 * The MAC address is the standard NetApp OUI of 00-a0-98,
	 * followed by an MD5 of the vm name. The slot/func number is
	 * prepended to this for slots other than 1:0, so that
	 * a bootloader can netboot from the equivalent of slot 1.
	 */
	if (pi->pi_slot == 1 && pi->pi_func == 0) {
		strlcpy(nstr, vmname, sizeof(nstr));
	} else {
		snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot,
		    pi->pi_func, vmname);
	}

	MD5Init(&mdctx);
	MD5Update(&mdctx, nstr, strlen(nstr));
	MD5Final(digest, &mdctx);

	sc->vsc_macaddr[0] = 0x00;
	sc->vsc_macaddr[1] = 0xa0;
	sc->vsc_macaddr[2] = 0x98;
	sc->vsc_macaddr[3] = digest[0];
	sc->vsc_macaddr[4] = digest[1];
	sc->vsc_macaddr[5] = digest[2];

	/* initialize config space */
	pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET);
	pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);

	if (use_msix) {
		/* MSI-X support */
		int i;

		for (i = 0; i < VTNET_MAXQ; i++)
			sc->vsc_msix_table_idx[i] = VIRTIO_MSI_NO_VECTOR;

		/*
		 * BAR 1 used to map MSI-X table and PBA
		 */
		if (pci_emul_add_msixcap(pi, VTNET_MAXQ, 1))
			return (1);
	} else {
		/* MSI support */
		pci_emul_add_msicap(pi, 1);
	}

	pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VTNET_REGSZ);

	return (0);
}
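
/*
 * A virtio-net device is typically instantiated by naming a tap(4)
 * device in a bhyve slot option, e.g. "-s 1:0,virtio-net,tap0" (the
 * "tap0" string arrives here as 'opts'). The exact option syntax is
 * parsed by the caller, not in this file.
 */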

/*
 * Function pointer array to handle queue notifications
 */
static void (*pci_vtnet_qnotify[VTNET_MAXQ])(struct pci_vtnet_softc *) = {
	pci_vtnet_ping_rxq,
	pci_vtnet_ping_txq,
	pci_vtnet_ping_ctlq
};

static uint64_t
vtnet_adjust_offset(struct pci_devinst *pi, uint64_t offset)
{
	/*
	 * The device-specific config offsets seen by the guest change
	 * depending on whether the MSI-X capability is enabled
	 */
	if (!pci_msix_enabled(pi)) {
		if (offset >= VTCFG_R_MSIX)
			return (offset + (VTCFG_R_CFG1 - VTCFG_R_MSIX));
	}

	return (offset);
}
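
/*
 * Example, assuming the usual legacy values from virtio.h
 * (VTCFG_R_MSIX = 20, VTCFG_R_CFG1 = 24): with MSI-X disabled the
 * device-specific config starts at offset 20, so a guest access at
 * offset 20 is remapped to 24 (VTNET_R_CFG0) before the switch
 * statements below dispatch on it.
 */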

static void
pci_vtnet_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
		int baridx, uint64_t offset, int size, uint64_t value)
{
	struct pci_vtnet_softc *sc = pi->pi_arg;
	void *ptr;

	if (use_msix) {
		if (baridx == pci_msix_table_bar(pi) ||
		    baridx == pci_msix_pba_bar(pi)) {
			pci_emul_msix_twrite(pi, offset, size, value);
			return;
		}
	}

	assert(baridx == 0);

	if (offset + size > pci_vtnet_iosize(pi)) {
		DPRINTF(("vtnet_write: 2big, offset %ld size %d\n",
			 offset, size));
		return;
	}

	pthread_mutex_lock(&sc->vsc_mtx);

	offset = vtnet_adjust_offset(pi, offset);

	switch (offset) {
	case VTCFG_R_GUESTCAP:
		assert(size == 4);
		sc->vsc_features = value & VTNET_S_HOSTCAPS;
		break;
	case VTCFG_R_PFN:
		assert(size == 4);
		pci_vtnet_ring_init(sc, value);
		break;
	case VTCFG_R_QSEL:
		assert(size == 2);
		assert(value < VTNET_MAXQ);
		sc->vsc_curq = value;
		break;
	case VTCFG_R_QNOTIFY:
		assert(size == 2);
		assert(value < VTNET_MAXQ);
		(*pci_vtnet_qnotify[value])(sc);
		break;
	case VTCFG_R_STATUS:
		assert(size == 1);
		pci_vtnet_update_status(sc, value);
		break;
	case VTCFG_R_CFGVEC:
		assert(size == 2);
		sc->vsc_msix_table_idx[VTNET_CTLQ] = value;
		break;
	case VTCFG_R_QVEC:
		assert(size == 2);
		assert(sc->vsc_curq != VTNET_CTLQ);
		sc->vsc_msix_table_idx[sc->vsc_curq] = value;
		break;
	case VTNET_R_CFG0:
	case VTNET_R_CFG1:
	case VTNET_R_CFG2:
	case VTNET_R_CFG3:
	case VTNET_R_CFG4:
	case VTNET_R_CFG5:
		assert((size + offset) <= (VTNET_R_CFG5 + 1));
		ptr = &sc->vsc_macaddr[offset - VTNET_R_CFG0];
		/*
		 * The driver is allowed to change the MAC address
		 */
		if (size == 1) {
			*(uint8_t *) ptr = value;
		} else if (size == 2) {
			*(uint16_t *) ptr = value;
		} else {
			*(uint32_t *) ptr = value;
		}
		break;
	case VTCFG_R_HOSTCAP:
	case VTCFG_R_QNUM:
	case VTCFG_R_ISR:
	case VTNET_R_CFG6:
	case VTNET_R_CFG7:
		DPRINTF(("vtnet: write to readonly reg %ld\n\r", offset));
		break;
	default:
		DPRINTF(("vtnet: unknown i/o write offset %ld\n\r", offset));
		value = 0;
		break;
	}

	pthread_mutex_unlock(&sc->vsc_mtx);
}

static uint64_t
pci_vtnet_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
	       int baridx, uint64_t offset, int size)
{
	struct pci_vtnet_softc *sc = pi->pi_arg;
	void *ptr;
	uint64_t value;

	if (use_msix) {
		if (baridx == pci_msix_table_bar(pi) ||
		    baridx == pci_msix_pba_bar(pi)) {
			return (pci_emul_msix_tread(pi, offset, size));
		}
	}

	assert(baridx == 0);

	if (offset + size > pci_vtnet_iosize(pi)) {
		DPRINTF(("vtnet_read: 2big, offset %ld size %d\n",
			 offset, size));
		return (0);
	}

	pthread_mutex_lock(&sc->vsc_mtx);

	offset = vtnet_adjust_offset(pi, offset);

	switch (offset) {
	case VTCFG_R_HOSTCAP:
		assert(size == 4);
		value = VTNET_S_HOSTCAPS;
		break;
	case VTCFG_R_GUESTCAP:
		assert(size == 4);
		value = sc->vsc_features; /* XXX never read ? */
		break;
	case VTCFG_R_PFN:
		assert(size == 4);
		value = sc->vsc_pfn[sc->vsc_curq] >> VRING_PFN;
		break;
	case VTCFG_R_QNUM:
		assert(size == 2);
		value = pci_vtnet_qsize(sc->vsc_curq);
		break;
	case VTCFG_R_QSEL:
		assert(size == 2);
		value = sc->vsc_curq;  /* XXX never read ? */
		break;
	case VTCFG_R_QNOTIFY:
		assert(size == 2);
		value = sc->vsc_curq;  /* XXX never read ? */
		break;
	case VTCFG_R_STATUS:
		assert(size == 1);
		value = sc->vsc_status;
		break;
	case VTCFG_R_ISR:
		assert(size == 1);
		value = sc->vsc_isr;
		sc->vsc_isr = 0;     /* a read clears this flag */
		break;
	case VTCFG_R_CFGVEC:
		assert(size == 2);
		value = sc->vsc_msix_table_idx[VTNET_CTLQ];
		break;
	case VTCFG_R_QVEC:
		assert(size == 2);
		assert(sc->vsc_curq != VTNET_CTLQ);
		value = sc->vsc_msix_table_idx[sc->vsc_curq];
		break;
	case VTNET_R_CFG0:
	case VTNET_R_CFG1:
	case VTNET_R_CFG2:
	case VTNET_R_CFG3:
	case VTNET_R_CFG4:
	case VTNET_R_CFG5:
		assert((size + offset) <= (VTNET_R_CFG5 + 1));
		ptr = &sc->vsc_macaddr[offset - VTNET_R_CFG0];
		if (size == 1) {
			value = *(uint8_t *) ptr;
		} else if (size == 2) {
			value = *(uint16_t *) ptr;
		} else {
			value = *(uint32_t *) ptr;
		}
		break;
	case VTNET_R_CFG6:
		assert(size != 4);
		value = 0x01; /* XXX link always up */
		break;
	case VTNET_R_CFG7:
		assert(size == 1);
		value = 0; /* XXX link status in LSB */
		break;
	default:
		DPRINTF(("vtnet: unknown i/o read offset %ld\n\r", offset));
		value = 0;
		break;
	}

	pthread_mutex_unlock(&sc->vsc_mtx);

	return (value);
}

struct pci_devemu pci_de_vnet = {
	.pe_emu =	"virtio-net",
	.pe_init =	pci_vtnet_init,
	.pe_barwrite =	pci_vtnet_write,
	.pe_barread =	pci_vtnet_read
};
PCI_EMUL_SET(pci_de_vnet);