xref: /freebsd/usr.sbin/bhyve/net_backends.c (revision ddc0daea20280c3a06a910b72b14ffe3f624df71)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org>
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
19  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
20  * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
21  * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
24  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  * $FreeBSD$
28  */
29 
30 /*
31  * This file implements multiple network backends (tap, netmap, ...),
32  * to be used by network frontends such as virtio-net and e1000.
33  * The API to access the backend (e.g. send/receive packets, negotiate
34  * features) is exported by net_backends.h.
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include <sys/types.h>		/* u_short etc */
41 #ifndef WITHOUT_CAPSICUM
42 #include <sys/capsicum.h>
43 #endif
44 #include <sys/ioctl.h>
45 #include <sys/mman.h>
46 #include <sys/uio.h>
47 
48 #include <net/if.h>
49 #include <net/netmap.h>
50 #include <net/netmap_virt.h>
51 #define NETMAP_WITH_LIBS
52 #include <net/netmap_user.h>
53 
54 #ifndef WITHOUT_CAPSICUM
55 #include <capsicum_helpers.h>
56 #endif
57 #include <err.h>
58 #include <errno.h>
59 #include <fcntl.h>
60 #include <stdio.h>
61 #include <stdlib.h>
62 #include <stdint.h>
63 #include <string.h>
64 #include <unistd.h>
65 #include <sysexits.h>
66 #include <assert.h>
67 #include <pthread.h>
68 #include <pthread_np.h>
69 #include <poll.h>
70 #include <assert.h>
71 
72 
73 #include "iov.h"
74 #include "mevent.h"
75 #include "net_backends.h"
76 
77 #include <sys/linker_set.h>
78 
79 /*
80  * Each network backend registers a set of function pointers that are
81  * used to implement the net backends API.
82  * This might need to be exposed if we implement backends in separate files.
83  */
84 struct net_backend {
85 	const char *prefix;	/* prefix matching this backend */
86 
87 	/*
88 	 * Routines used to initialize and cleanup the resources needed
89 	 * by a backend. The cleanup function is used internally,
90 	 * and should not be called by the frontend.
91 	 */
92 	int (*init)(struct net_backend *be, const char *devname,
93 	    net_be_rxeof_t cb, void *param);
94 	void (*cleanup)(struct net_backend *be);
95 
96 	/*
97 	 * Called to serve a guest transmit request. The scatter-gather
98 	 * vector provided by the caller has 'iovcnt' elements and contains
99 	 * the packet to send.
100 	 */
101 	ssize_t (*send)(struct net_backend *be, struct iovec *iov, int iovcnt);
102 
103 	/*
104 	 * Called to receive a packet from the backend. When the function
105 	 * returns a positive value 'len', the scatter-gather vector
106 	 * provided by the caller contains a packet with such length.
107 	 * The function returns 0 if the backend doesn't have a new packet to
108 	 * receive.
109 	 */
110 	ssize_t (*recv)(struct net_backend *be, struct iovec *iov, int iovcnt);
111 
112 	/*
113 	 * Ask the backend for the virtio-net features it is able to
114 	 * support. Possible features are TSO, UFO and checksum offloading
115 	 * in both rx and tx direction and for both IPv4 and IPv6.
116 	 */
117 	uint64_t (*get_cap)(struct net_backend *be);
118 
119 	/*
120 	 * Tell the backend to enable/disable the specified virtio-net
121 	 * features (capabilities).
122 	 */
123 	int (*set_cap)(struct net_backend *be, uint64_t features,
124 	    unsigned int vnet_hdr_len);
125 
126 	struct pci_vtnet_softc *sc;
127 	int fd;
128 
129 	/*
130 	 * Length of the virtio-net header used by the backend and the
131 	 * frontend, respectively. A zero value means that the header
132 	 * is not used.
133 	 */
134 	unsigned int be_vnet_hdr_len;
135 	unsigned int fe_vnet_hdr_len;
136 
137 	/* Size of backend-specific private data. */
138 	size_t priv_size;
139 
140 	/* Room for backend-specific data. */
141 	char opaque[0];
142 };
143 
144 SET_DECLARE(net_backend_set, struct net_backend);
145 
146 #define VNET_HDR_LEN	sizeof(struct virtio_net_rxhdr)
147 
148 #define WPRINTF(params) printf params
149 
/*
 * The tap backend
 */

/* Private data kept in the opaque area of a tap (or vmnet) backend. */
struct tap_priv {
	/* Read event handle obtained from mevent_add() in tap_init(). */
	struct mevent *mevp;
};
157 
158 static void
159 tap_cleanup(struct net_backend *be)
160 {
161 	struct tap_priv *priv = (struct tap_priv *)be->opaque;
162 
163 	if (priv->mevp) {
164 		mevent_delete(priv->mevp);
165 	}
166 	if (be->fd != -1) {
167 		close(be->fd);
168 		be->fd = -1;
169 	}
170 }
171 
172 static int
173 tap_init(struct net_backend *be, const char *devname,
174 	 net_be_rxeof_t cb, void *param)
175 {
176 	struct tap_priv *priv = (struct tap_priv *)be->opaque;
177 	char tbuf[80];
178 	int opt = 1;
179 #ifndef WITHOUT_CAPSICUM
180 	cap_rights_t rights;
181 #endif
182 
183 	if (cb == NULL) {
184 		WPRINTF(("TAP backend requires non-NULL callback\n"));
185 		return (-1);
186 	}
187 
188 	strcpy(tbuf, "/dev/");
189 	strlcat(tbuf, devname, sizeof(tbuf));
190 
191 	be->fd = open(tbuf, O_RDWR);
192 	if (be->fd == -1) {
193 		WPRINTF(("open of tap device %s failed\n", tbuf));
194 		goto error;
195 	}
196 
197 	/*
198 	 * Set non-blocking and register for read
199 	 * notifications with the event loop
200 	 */
201 	if (ioctl(be->fd, FIONBIO, &opt) < 0) {
202 		WPRINTF(("tap device O_NONBLOCK failed\n"));
203 		goto error;
204 	}
205 
206 #ifndef WITHOUT_CAPSICUM
207 	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
208 	if (caph_rights_limit(be->fd, &rights) == -1)
209 		errx(EX_OSERR, "Unable to apply rights for sandbox");
210 #endif
211 
212 	priv->mevp = mevent_add(be->fd, EVF_READ, cb, param);
213 	if (priv->mevp == NULL) {
214 		WPRINTF(("Could not register event\n"));
215 		goto error;
216 	}
217 
218 	return (0);
219 
220 error:
221 	tap_cleanup(be);
222 	return (-1);
223 }
224 
225 /*
226  * Called to send a buffer chain out to the tap device
227  */
228 static ssize_t
229 tap_send(struct net_backend *be, struct iovec *iov, int iovcnt)
230 {
231 	return (writev(be->fd, iov, iovcnt));
232 }
233 
234 static ssize_t
235 tap_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
236 {
237 	ssize_t ret;
238 
239 	/* Should never be called without a valid tap fd */
240 	assert(be->fd != -1);
241 
242 	ret = readv(be->fd, iov, iovcnt);
243 
244 	if (ret < 0 && errno == EWOULDBLOCK) {
245 		return (0);
246 	}
247 
248 	return (ret);
249 }
250 
/*
 * Report the virtio-net features the tap backend offers. There are no
 * capabilities for now, so the feature mask is always empty; 'be' is
 * not examined.
 */
static uint64_t
tap_get_cap(struct net_backend *be)
{
	const uint64_t no_features = 0;

	return (no_features);
}
257 
/*
 * Configure virtio-net features on the tap backend. Only the empty
 * configuration (no features, no virtio-net header) is accepted:
 * it yields 0, anything else yields -1.
 */
static int
tap_set_cap(struct net_backend *be, uint64_t features,
		unsigned vnet_hdr_len)
{

	if (features == 0 && vnet_hdr_len == 0)
		return (0);

	return (-1);
}
265 
266 static struct net_backend tap_backend = {
267 	.prefix = "tap",
268 	.priv_size = sizeof(struct tap_priv),
269 	.init = tap_init,
270 	.cleanup = tap_cleanup,
271 	.send = tap_send,
272 	.recv = tap_recv,
273 	.get_cap = tap_get_cap,
274 	.set_cap = tap_set_cap,
275 };
276 
277 /* A clone of the tap backend, with a different prefix. */
278 static struct net_backend vmnet_backend = {
279 	.prefix = "vmnet",
280 	.priv_size = sizeof(struct tap_priv),
281 	.init = tap_init,
282 	.cleanup = tap_cleanup,
283 	.send = tap_send,
284 	.recv = tap_recv,
285 	.get_cap = tap_get_cap,
286 	.set_cap = tap_set_cap,
287 };
288 
289 DATA_SET(net_backend_set, tap_backend);
290 DATA_SET(net_backend_set, vmnet_backend);
291 
/*
 * The netmap backend
 */

/*
 * The virtio-net features supported by netmap, one flag per line:
 * the checksum, TSO4, TSO6 and UFO bits in their HOST and GUEST variants.
 */
#define NETMAP_FEATURES ( \
		VIRTIO_NET_F_CSUM | \
		VIRTIO_NET_F_HOST_TSO4 | \
		VIRTIO_NET_F_HOST_TSO6 | \
		VIRTIO_NET_F_HOST_UFO | \
		VIRTIO_NET_F_GUEST_CSUM | \
		VIRTIO_NET_F_GUEST_TSO4 | \
		VIRTIO_NET_F_GUEST_TSO6 | \
		VIRTIO_NET_F_GUEST_UFO)
301 
302 struct netmap_priv {
303 	char ifname[IFNAMSIZ];
304 	struct nm_desc *nmd;
305 	uint16_t memid;
306 	struct netmap_ring *rx;
307 	struct netmap_ring *tx;
308 	struct mevent *mevp;
309 	net_be_rxeof_t cb;
310 	void *cb_param;
311 };
312 
313 static void
314 nmreq_init(struct nmreq *req, char *ifname)
315 {
316 
317 	memset(req, 0, sizeof(*req));
318 	strlcpy(req->nr_name, ifname, sizeof(req->nr_name));
319 	req->nr_version = NETMAP_API;
320 }
321 
322 static int
323 netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len)
324 {
325 	int err;
326 	struct nmreq req;
327 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
328 
329 	nmreq_init(&req, priv->ifname);
330 	req.nr_cmd = NETMAP_BDG_VNET_HDR;
331 	req.nr_arg1 = vnet_hdr_len;
332 	err = ioctl(be->fd, NIOCREGIF, &req);
333 	if (err) {
334 		WPRINTF(("Unable to set vnet header length %d\n",
335 				vnet_hdr_len));
336 		return (err);
337 	}
338 
339 	be->be_vnet_hdr_len = vnet_hdr_len;
340 
341 	return (0);
342 }
343 
344 static int
345 netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len)
346 {
347 	int prev_hdr_len = be->be_vnet_hdr_len;
348 	int ret;
349 
350 	if (vnet_hdr_len == prev_hdr_len) {
351 		return (1);
352 	}
353 
354 	ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len);
355 	if (ret) {
356 		return (0);
357 	}
358 
359 	netmap_set_vnet_hdr_len(be, prev_hdr_len);
360 
361 	return (1);
362 }
363 
364 static uint64_t
365 netmap_get_cap(struct net_backend *be)
366 {
367 
368 	return (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ?
369 	    NETMAP_FEATURES : 0);
370 }
371 
/*
 * Apply a feature configuration to the netmap backend. Only the header
 * length is acted upon: it is passed to netmap_set_vnet_hdr_len(), whose
 * result is returned. 'features' is not examined here.
 */
static int
netmap_set_cap(struct net_backend *be, uint64_t features,
	       unsigned vnet_hdr_len)
{
	int rc;

	rc = netmap_set_vnet_hdr_len(be, vnet_hdr_len);

	return (rc);
}
379 
380 static int
381 netmap_init(struct net_backend *be, const char *devname,
382 	    net_be_rxeof_t cb, void *param)
383 {
384 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
385 
386 	strlcpy(priv->ifname, devname, sizeof(priv->ifname));
387 	priv->ifname[sizeof(priv->ifname) - 1] = '\0';
388 
389 	priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL);
390 	if (priv->nmd == NULL) {
391 		WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)\n",
392 			devname, strerror(errno)));
393 		free(priv);
394 		return (-1);
395 	}
396 
397 	priv->memid = priv->nmd->req.nr_arg2;
398 	priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0);
399 	priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0);
400 	priv->cb = cb;
401 	priv->cb_param = param;
402 	be->fd = priv->nmd->fd;
403 
404 	priv->mevp = mevent_add(be->fd, EVF_READ, cb, param);
405 	if (priv->mevp == NULL) {
406 		WPRINTF(("Could not register event\n"));
407 		return (-1);
408 	}
409 
410 	return (0);
411 }
412 
413 static void
414 netmap_cleanup(struct net_backend *be)
415 {
416 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
417 
418 	if (priv->mevp) {
419 		mevent_delete(priv->mevp);
420 	}
421 	if (priv->nmd) {
422 		nm_close(priv->nmd);
423 	}
424 	be->fd = -1;
425 }
426 
427 static ssize_t
428 netmap_send(struct net_backend *be, struct iovec *iov,
429 	    int iovcnt)
430 {
431 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
432 	struct netmap_ring *ring;
433 	ssize_t totlen = 0;
434 	int nm_buf_size;
435 	int nm_buf_len;
436 	uint32_t head;
437 	void *nm_buf;
438 	int j;
439 
440 	ring = priv->tx;
441 	head = ring->head;
442 	if (head == ring->tail) {
443 		WPRINTF(("No space, drop %zu bytes\n", count_iov(iov, iovcnt)));
444 		goto txsync;
445 	}
446 	nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
447 	nm_buf_size = ring->nr_buf_size;
448 	nm_buf_len = 0;
449 
450 	for (j = 0; j < iovcnt; j++) {
451 		int iov_frag_size = iov[j].iov_len;
452 		void *iov_frag_buf = iov[j].iov_base;
453 
454 		totlen += iov_frag_size;
455 
456 		/*
457 		 * Split each iovec fragment over more netmap slots, if
458 		 * necessary.
459 		 */
460 		for (;;) {
461 			int copylen;
462 
463 			copylen = iov_frag_size < nm_buf_size ? iov_frag_size : nm_buf_size;
464 			memcpy(nm_buf, iov_frag_buf, copylen);
465 
466 			iov_frag_buf += copylen;
467 			iov_frag_size -= copylen;
468 			nm_buf += copylen;
469 			nm_buf_size -= copylen;
470 			nm_buf_len += copylen;
471 
472 			if (iov_frag_size == 0) {
473 				break;
474 			}
475 
476 			ring->slot[head].len = nm_buf_len;
477 			ring->slot[head].flags = NS_MOREFRAG;
478 			head = nm_ring_next(ring, head);
479 			if (head == ring->tail) {
480 				/*
481 				 * We ran out of netmap slots while
482 				 * splitting the iovec fragments.
483 				 */
484 				WPRINTF(("No space, drop %zu bytes\n",
485 				   count_iov(iov, iovcnt)));
486 				goto txsync;
487 			}
488 			nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
489 			nm_buf_size = ring->nr_buf_size;
490 			nm_buf_len = 0;
491 		}
492 	}
493 
494 	/* Complete the last slot, which must not have NS_MOREFRAG set. */
495 	ring->slot[head].len = nm_buf_len;
496 	ring->slot[head].flags = 0;
497 	head = nm_ring_next(ring, head);
498 
499 	/* Now update ring->head and ring->cur. */
500 	ring->head = ring->cur = head;
501 txsync:
502 	ioctl(be->fd, NIOCTXSYNC, NULL);
503 
504 	return (totlen);
505 }
506 
507 static ssize_t
508 netmap_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
509 {
510 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
511 	struct netmap_slot *slot = NULL;
512 	struct netmap_ring *ring;
513 	void *iov_frag_buf;
514 	int iov_frag_size;
515 	ssize_t totlen = 0;
516 	uint32_t head;
517 
518 	assert(iovcnt);
519 
520 	ring = priv->rx;
521 	head = ring->head;
522 	iov_frag_buf = iov->iov_base;
523 	iov_frag_size = iov->iov_len;
524 
525 	do {
526 		int nm_buf_len;
527 		void *nm_buf;
528 
529 		if (head == ring->tail) {
530 			return (0);
531 		}
532 
533 		slot = ring->slot + head;
534 		nm_buf = NETMAP_BUF(ring, slot->buf_idx);
535 		nm_buf_len = slot->len;
536 
537 		for (;;) {
538 			int copylen = nm_buf_len < iov_frag_size ?
539 			    nm_buf_len : iov_frag_size;
540 
541 			memcpy(iov_frag_buf, nm_buf, copylen);
542 			nm_buf += copylen;
543 			nm_buf_len -= copylen;
544 			iov_frag_buf += copylen;
545 			iov_frag_size -= copylen;
546 			totlen += copylen;
547 
548 			if (nm_buf_len == 0) {
549 				break;
550 			}
551 
552 			iov++;
553 			iovcnt--;
554 			if (iovcnt == 0) {
555 				/* No space to receive. */
556 				WPRINTF(("Short iov, drop %zd bytes\n",
557 				    totlen));
558 				return (-ENOSPC);
559 			}
560 			iov_frag_buf = iov->iov_base;
561 			iov_frag_size = iov->iov_len;
562 		}
563 
564 		head = nm_ring_next(ring, head);
565 
566 	} while (slot->flags & NS_MOREFRAG);
567 
568 	/* Release slots to netmap. */
569 	ring->head = ring->cur = head;
570 
571 	return (totlen);
572 }
573 
574 static struct net_backend netmap_backend = {
575 	.prefix = "netmap",
576 	.priv_size = sizeof(struct netmap_priv),
577 	.init = netmap_init,
578 	.cleanup = netmap_cleanup,
579 	.send = netmap_send,
580 	.recv = netmap_recv,
581 	.get_cap = netmap_get_cap,
582 	.set_cap = netmap_set_cap,
583 };
584 
585 /* A clone of the netmap backend, with a different prefix. */
586 static struct net_backend vale_backend = {
587 	.prefix = "vale",
588 	.priv_size = sizeof(struct netmap_priv),
589 	.init = netmap_init,
590 	.cleanup = netmap_cleanup,
591 	.send = netmap_send,
592 	.recv = netmap_recv,
593 	.get_cap = netmap_get_cap,
594 	.set_cap = netmap_set_cap,
595 };
596 
597 DATA_SET(net_backend_set, netmap_backend);
598 DATA_SET(net_backend_set, vale_backend);
599 
600 /*
601  * Initialize a backend and attach to the frontend.
602  * This is called during frontend initialization.
603  *  @pbe is a pointer to the backend to be initialized
604  *  @devname is the backend-name as supplied on the command line,
605  * 	e.g. -s 2:0,frontend-name,backend-name[,other-args]
606  *  @cb is the receive callback supplied by the frontend,
607  *	and it is invoked in the event loop when a receive
608  *	event is generated in the hypervisor,
609  *  @param is a pointer to the frontend, and normally used as
610  *	the argument for the callback.
611  */
612 int
613 netbe_init(struct net_backend **ret, const char *devname, net_be_rxeof_t cb,
614     void *param)
615 {
616 	struct net_backend **pbe, *nbe, *tbe = NULL;
617 	int err;
618 
619 	/*
620 	 * Find the network backend that matches the user-provided
621 	 * device name. net_backend_set is built using a linker set.
622 	 */
623 	SET_FOREACH(pbe, net_backend_set) {
624 		if (strncmp(devname, (*pbe)->prefix,
625 		    strlen((*pbe)->prefix)) == 0) {
626 			tbe = *pbe;
627 			assert(tbe->init != NULL);
628 			assert(tbe->cleanup != NULL);
629 			assert(tbe->send != NULL);
630 			assert(tbe->recv != NULL);
631 			assert(tbe->get_cap != NULL);
632 			assert(tbe->set_cap != NULL);
633 			break;
634 		}
635 	}
636 
637 	*ret = NULL;
638 	if (tbe == NULL)
639 		return (EINVAL);
640 	nbe = calloc(1, sizeof(*nbe) + tbe->priv_size);
641 	*nbe = *tbe;	/* copy the template */
642 	nbe->fd = -1;
643 	nbe->sc = param;
644 	nbe->be_vnet_hdr_len = 0;
645 	nbe->fe_vnet_hdr_len = 0;
646 
647 	/* Initialize the backend. */
648 	err = nbe->init(nbe, devname, cb, param);
649 	if (err) {
650 		free(nbe);
651 		return (err);
652 	}
653 
654 	*ret = nbe;
655 
656 	return (0);
657 }
658 
659 void
660 netbe_cleanup(struct net_backend *be)
661 {
662 
663 	if (be != NULL) {
664 		be->cleanup(be);
665 		free(be);
666 	}
667 }
668 
669 uint64_t
670 netbe_get_cap(struct net_backend *be)
671 {
672 
673 	assert(be != NULL);
674 	return (be->get_cap(be));
675 }
676 
677 int
678 netbe_set_cap(struct net_backend *be, uint64_t features,
679 	      unsigned vnet_hdr_len)
680 {
681 	int ret;
682 
683 	assert(be != NULL);
684 
685 	/* There are only three valid lengths, i.e., 0, 10 and 12. */
686 	if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN
687 		&& vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t)))
688 		return (-1);
689 
690 	be->fe_vnet_hdr_len = vnet_hdr_len;
691 
692 	ret = be->set_cap(be, features, vnet_hdr_len);
693 	assert(be->be_vnet_hdr_len == 0 ||
694 	       be->be_vnet_hdr_len == be->fe_vnet_hdr_len);
695 
696 	return (ret);
697 }
698 
/*
 * Drop the first 'tlen' bytes of the scatter-gather vector 'iov', which
 * has '*iovcnt' entries. The first entry is shortened in place; if that
 * leaves it empty it is skipped and '*iovcnt' is decremented.
 *
 * Returns the first entry of the trimmed vector. The first entry must
 * hold at least 'tlen' bytes and, if it holds exactly 'tlen' bytes, the
 * vector must have more than one entry; both conditions are asserted.
 */
static __inline struct iovec *
iov_trim(struct iovec *iov, int *iovcnt, unsigned int tlen)
{
	struct iovec *first = &iov[0];

	/* XXX short-cut: assume first segment is >= tlen */
	assert(first->iov_len >= tlen);

	first->iov_len -= tlen;
	if (first->iov_len != 0) {
		/* Bytes are left: just move the start of the entry. */
		first->iov_base = (char *)first->iov_base + tlen;
		return (first);
	}

	/* The first entry is used up: the vector now starts after it. */
	assert(*iovcnt > 1);
	*iovcnt -= 1;

	return (first + 1);
}
719 
720 ssize_t
721 netbe_send(struct net_backend *be, struct iovec *iov, int iovcnt)
722 {
723 
724 	assert(be != NULL);
725 	if (be->be_vnet_hdr_len != be->fe_vnet_hdr_len) {
726 		/*
727 		 * The frontend uses a virtio-net header, but the backend
728 		 * does not. We ignore it (as it must be all zeroes) and
729 		 * strip it.
730 		 */
731 		assert(be->be_vnet_hdr_len == 0);
732 		iov = iov_trim(iov, &iovcnt, be->fe_vnet_hdr_len);
733 	}
734 
735 	return (be->send(be, iov, iovcnt));
736 }
737 
738 /*
739  * Try to read a packet from the backend, without blocking.
740  * If no packets are available, return 0. In case of success, return
741  * the length of the packet just read. Return -1 in case of errors.
742  */
743 ssize_t
744 netbe_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
745 {
746 	/* Length of prepended virtio-net header. */
747 	unsigned int hlen = be->fe_vnet_hdr_len;
748 	int ret;
749 
750 	assert(be != NULL);
751 
752 	if (hlen && hlen != be->be_vnet_hdr_len) {
753 		/*
754 		 * The frontend uses a virtio-net header, but the backend
755 		 * does not. We need to prepend a zeroed header.
756 		 */
757 		struct virtio_net_rxhdr *vh;
758 
759 		assert(be->be_vnet_hdr_len == 0);
760 
761 		/*
762 		 * Get a pointer to the rx header, and use the
763 		 * data immediately following it for the packet buffer.
764 		 */
765 		vh = iov[0].iov_base;
766 		iov = iov_trim(iov, &iovcnt, hlen);
767 
768 		/*
769 		 * The only valid field in the rx packet header is the
770 		 * number of buffers if merged rx bufs were negotiated.
771 		 */
772 		memset(vh, 0, hlen);
773 		if (hlen == VNET_HDR_LEN) {
774 			vh->vrh_bufs = 1;
775 		}
776 	}
777 
778 	ret = be->recv(be, iov, iovcnt);
779 	if (ret > 0) {
780 		ret += hlen;
781 	}
782 
783 	return (ret);
784 }
785 
/*
 * Read a packet from the backend and discard it.
 * Returns the size of the discarded packet or zero if no packet was available.
 * A negative error code is returned in case of read error.
 */
ssize_t
netbe_rx_discard(struct net_backend *be)
{
	/*
	 * MP note: the dummybuf is only used to discard frames,
	 * so there is no need for it to be per-vtnet or locked.
	 * We only make it large enough for TSO-sized segment.
	 */
	static uint8_t dummybuf[65536 + 64];
	struct iovec scratch = {
		.iov_base = dummybuf,
		.iov_len = sizeof(dummybuf),
	};

	return (netbe_recv(be, &scratch, 1));
}
807 
808