/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
 * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * $FreeBSD$
 */

/*
 * This file implements multiple network backends (tap, netmap, ...),
 * to be used by network frontends such as virtio-net and e1000.
 * The API to access the backend (e.g. send/receive packets, negotiate
 * features) is exported by net_backends.h.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>		/* u_short etc */
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/uio.h>

#include <net/if.h>
#include <net/netmap.h>
#include <net/netmap_virt.h>
#define NETMAP_WITH_LIBS
#include <net/netmap_user.h>

#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sysexits.h>
#include <assert.h>
#include <pthread.h>
#include <pthread_np.h>
#include <poll.h>

#include "iov.h"
#include "mevent.h"
#include "net_backends.h"

#include <sys/linker_set.h>

/*
 * Each network backend registers a set of function pointers that are
 * used to implement the net backends API.
 * This might need to be exposed if we implement backends in separate files.
 */
struct net_backend {
	const char *prefix;	/* prefix matching this backend */

	/*
	 * Routines used to initialize and cleanup the resources needed
	 * by a backend. The cleanup function is used internally,
	 * and should not be called by the frontend.
	 */
	int (*init)(struct net_backend *be, const char *devname,
	    net_be_rxeof_t cb, void *param);
	void (*cleanup)(struct net_backend *be);

	/*
	 * Called to serve a guest transmit request. The scatter-gather
	 * vector provided by the caller has 'iovcnt' elements and contains
	 * the packet to send.
	 */
	ssize_t (*send)(struct net_backend *be, struct iovec *iov, int iovcnt);

	/*
	 * Called to receive a packet from the backend. When the function
	 * returns a positive value 'len', the scatter-gather vector
	 * provided by the caller contains a packet of that length.
	 * The function returns 0 if the backend does not have a new packet
	 * to receive.
	 */
	ssize_t (*recv)(struct net_backend *be, struct iovec *iov, int iovcnt);

	/*
	 * Ask the backend to enable or disable receive operation. On
	 * return from a disable operation, it is guaranteed that the
	 * receive callback won't be called until receive is enabled
	 * again. Note however that it is up to the caller to make sure
	 * that netbe_recv() is not currently being executed by another
	 * thread.
	 */
	void (*recv_enable)(struct net_backend *be);
	void (*recv_disable)(struct net_backend *be);

	/*
	 * Ask the backend for the virtio-net features it is able to
	 * support. Possible features are TSO, UFO and checksum offloading
	 * in both the rx and tx directions, for both IPv4 and IPv6.
	 */
	uint64_t (*get_cap)(struct net_backend *be);

	/*
	 * Tell the backend to enable/disable the specified virtio-net
	 * features (capabilities).
	 */
	int (*set_cap)(struct net_backend *be, uint64_t features,
	    unsigned int vnet_hdr_len);

	struct pci_vtnet_softc *sc;
	int fd;

	/*
	 * Length of the virtio-net header used by the backend and the
	 * frontend, respectively. A zero value means that the header
	 * is not used.
	 */
	unsigned int be_vnet_hdr_len;
	unsigned int fe_vnet_hdr_len;

	/* Size of backend-specific private data. */
	size_t priv_size;

	/* Room for backend-specific data. */
	char opaque[];
};

SET_DECLARE(net_backend_set, struct net_backend);

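/*
 * A new backend plugs into this mechanism by filling in a net_backend
 * template and adding it to the linker set. A minimal sketch (the
 * "null" backend and its null_*() functions are hypothetical, shown
 * only to illustrate the registration pattern used by the tap and
 * netmap backends below):
 *
 *	static struct net_backend null_backend = {
 *		.prefix = "null",
 *		.priv_size = 0,
 *		.init = null_init,
 *		.cleanup = null_cleanup,
 *		.send = null_send,
 *		.recv = null_recv,
 *		.recv_enable = null_recv_enable,
 *		.recv_disable = null_recv_disable,
 *		.get_cap = null_get_cap,
 *		.set_cap = null_set_cap,
 *	};
 *	DATA_SET(net_backend_set, null_backend);
 */
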
#define VNET_HDR_LEN	sizeof(struct virtio_net_rxhdr)
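
/*
 * For reference, the receive header declared by the virtio-net frontend
 * (struct virtio_net_rxhdr in pci_virtio_net.c) has the layout sketched
 * below; the 10-byte variant accepted by netbe_set_cap() is the same
 * structure without the trailing vrh_bufs field:
 *
 *	struct virtio_net_rxhdr {
 *		uint8_t		vrh_flags;
 *		uint8_t		vrh_gso_type;
 *		uint16_t	vrh_hdr_len;
 *		uint16_t	vrh_gso_size;
 *		uint16_t	vrh_csum_start;
 *		uint16_t	vrh_csum_offset;
 *		uint16_t	vrh_bufs;
 *	} __packed;
 */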

#define WPRINTF(params) printf params

/*
 * The tap backend
 */

struct tap_priv {
	struct mevent *mevp;
};

static void
tap_cleanup(struct net_backend *be)
{
	struct tap_priv *priv = (struct tap_priv *)be->opaque;

	if (priv->mevp) {
		mevent_delete(priv->mevp);
	}
	if (be->fd != -1) {
		close(be->fd);
		be->fd = -1;
	}
}

static int
tap_init(struct net_backend *be, const char *devname,
	 net_be_rxeof_t cb, void *param)
{
	struct tap_priv *priv = (struct tap_priv *)be->opaque;
	char tbuf[80];
	int opt = 1;
#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
#endif

	if (cb == NULL) {
		WPRINTF(("TAP backend requires non-NULL callback\n\r"));
		return (-1);
	}

	strcpy(tbuf, "/dev/");
	strlcat(tbuf, devname, sizeof(tbuf));

	be->fd = open(tbuf, O_RDWR);
	if (be->fd == -1) {
		WPRINTF(("open of tap device %s failed\n\r", tbuf));
		goto error;
	}

	/*
	 * Set non-blocking and register for read
	 * notifications with the event loop
	 */
	if (ioctl(be->fd, FIONBIO, &opt) < 0) {
		WPRINTF(("tap device FIONBIO failed\n\r"));
		goto error;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
	if (caph_rights_limit(be->fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (priv->mevp == NULL) {
		WPRINTF(("Could not register event\n\r"));
		goto error;
	}

	return (0);

error:
	tap_cleanup(be);
	return (-1);
}

/*
 * Called to send a buffer chain out to the tap device
 */
static ssize_t
tap_send(struct net_backend *be, struct iovec *iov, int iovcnt)
{
	return (writev(be->fd, iov, iovcnt));
}

static ssize_t
tap_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
{
	ssize_t ret;

	/* Should never be called without a valid tap fd */
	assert(be->fd != -1);

	ret = readv(be->fd, iov, iovcnt);

	if (ret < 0 && errno == EWOULDBLOCK) {
		return (0);
	}

	return (ret);
}

static void
tap_recv_enable(struct net_backend *be)
{
	struct tap_priv *priv = (struct tap_priv *)be->opaque;

	mevent_enable(priv->mevp);
}

static void
tap_recv_disable(struct net_backend *be)
{
	struct tap_priv *priv = (struct tap_priv *)be->opaque;

	mevent_disable(priv->mevp);
}

static uint64_t
tap_get_cap(struct net_backend *be)
{

	return (0); /* no capabilities for now */
}

static int
tap_set_cap(struct net_backend *be, uint64_t features,
		unsigned vnet_hdr_len)
{

	return ((features || vnet_hdr_len) ? -1 : 0);
}

static struct net_backend tap_backend = {
	.prefix = "tap",
	.priv_size = sizeof(struct tap_priv),
	.init = tap_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

/* A clone of the tap backend, with a different prefix. */
static struct net_backend vmnet_backend = {
	.prefix = "vmnet",
	.priv_size = sizeof(struct tap_priv),
	.init = tap_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

DATA_SET(net_backend_set, tap_backend);
DATA_SET(net_backend_set, vmnet_backend);

/*
 * The netmap backend
 */

/* The virtio-net features supported by netmap. */
#define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \
		VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \
		VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \
		VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO | \
		VIRTIO_NET_F_MRG_RXBUF)

struct netmap_priv {
	char ifname[IFNAMSIZ];
	struct nm_desc *nmd;
	uint16_t memid;
	struct netmap_ring *rx;
	struct netmap_ring *tx;
	struct mevent *mevp;
	net_be_rxeof_t cb;
	void *cb_param;
};

static void
nmreq_init(struct nmreq *req, char *ifname)
{

	memset(req, 0, sizeof(*req));
	strlcpy(req->nr_name, ifname, sizeof(req->nr_name));
	req->nr_version = NETMAP_API;
}

static int
netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len)
{
	int err;
	struct nmreq req;
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;

	nmreq_init(&req, priv->ifname);
	req.nr_cmd = NETMAP_BDG_VNET_HDR;
	req.nr_arg1 = vnet_hdr_len;
	err = ioctl(be->fd, NIOCREGIF, &req);
	if (err) {
		WPRINTF(("Unable to set vnet header length %d\n\r",
				vnet_hdr_len));
		return (err);
	}

	be->be_vnet_hdr_len = vnet_hdr_len;

	return (0);
}

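/*
 * Check whether the backend can use a given virtio-net header length,
 * by trying to set it and restoring the previous value if the probe
 * succeeds.
 */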
static int
netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len)
{
	int prev_hdr_len = be->be_vnet_hdr_len;
	int ret;

	if (vnet_hdr_len == prev_hdr_len) {
		return (1);
	}

	ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len);
	if (ret) {
		return (0);
	}

	netmap_set_vnet_hdr_len(be, prev_hdr_len);

	return (1);
}

static uint64_t
netmap_get_cap(struct net_backend *be)
{

	return (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ?
	    NETMAP_FEATURES : 0);
}

static int
netmap_set_cap(struct net_backend *be, uint64_t features,
	       unsigned vnet_hdr_len)
{

	return (netmap_set_vnet_hdr_len(be, vnet_hdr_len));
}

static int
netmap_init(struct net_backend *be, const char *devname,
	    net_be_rxeof_t cb, void *param)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;

	strlcpy(priv->ifname, devname, sizeof(priv->ifname));
	priv->ifname[sizeof(priv->ifname) - 1] = '\0';

	priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL);
	if (priv->nmd == NULL) {
		WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)\n\r",
			devname, strerror(errno)));
		return (-1);
	}

	priv->memid = priv->nmd->req.nr_arg2;
	priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0);
	priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0);
	priv->cb = cb;
	priv->cb_param = param;
	be->fd = priv->nmd->fd;

	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (priv->mevp == NULL) {
		WPRINTF(("Could not register event\n\r"));
		/* Undo the nm_open() above before failing. */
		nm_close(priv->nmd);
		return (-1);
	}

	return (0);
}

static void
netmap_cleanup(struct net_backend *be)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;

	if (priv->mevp) {
		mevent_delete(priv->mevp);
	}
	if (priv->nmd) {
		nm_close(priv->nmd);
	}
	be->fd = -1;
}

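/*
 * Copy the packet contained in the 'iov' scatter-gather list into one
 * or more netmap TX slots (chained with NS_MOREFRAG), then kick the
 * TX ring with NIOCTXSYNC.
 */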
static ssize_t
netmap_send(struct net_backend *be, struct iovec *iov,
	    int iovcnt)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
	struct netmap_ring *ring;
	ssize_t totlen = 0;
	int nm_buf_size;
	int nm_buf_len;
	uint32_t head;
	void *nm_buf;
	int j;

	ring = priv->tx;
	head = ring->head;
	if (head == ring->tail) {
		WPRINTF(("No space, drop %zu bytes\n\r", count_iov(iov, iovcnt)));
		goto txsync;
	}
	nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
	nm_buf_size = ring->nr_buf_size;
	nm_buf_len = 0;

	for (j = 0; j < iovcnt; j++) {
		int iov_frag_size = iov[j].iov_len;
		void *iov_frag_buf = iov[j].iov_base;

		totlen += iov_frag_size;

		/*
		 * Split each iovec fragment over more netmap slots, if
		 * necessary.
		 */
		for (;;) {
			int copylen;

			copylen = iov_frag_size < nm_buf_size ?
			    iov_frag_size : nm_buf_size;
			memcpy(nm_buf, iov_frag_buf, copylen);

			iov_frag_buf += copylen;
			iov_frag_size -= copylen;
			nm_buf += copylen;
			nm_buf_size -= copylen;
			nm_buf_len += copylen;

			if (iov_frag_size == 0) {
				break;
			}

			ring->slot[head].len = nm_buf_len;
			ring->slot[head].flags = NS_MOREFRAG;
			head = nm_ring_next(ring, head);
			if (head == ring->tail) {
				/*
				 * We ran out of netmap slots while
				 * splitting the iovec fragments.
				 */
				WPRINTF(("No space, drop %zu bytes\n\r",
				   count_iov(iov, iovcnt)));
				goto txsync;
			}
			nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
			nm_buf_size = ring->nr_buf_size;
			nm_buf_len = 0;
		}
	}

	/* Complete the last slot, which must not have NS_MOREFRAG set. */
	ring->slot[head].len = nm_buf_len;
	ring->slot[head].flags = 0;
	head = nm_ring_next(ring, head);

	/* Now update ring->head and ring->cur. */
	ring->head = ring->cur = head;
txsync:
	ioctl(be->fd, NIOCTXSYNC, NULL);

	return (totlen);
}

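/*
 * Copy the next received packet, possibly spanning multiple netmap
 * slots chained with NS_MOREFRAG, into the 'iov' scatter-gather list.
 */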
static ssize_t
netmap_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
	struct netmap_slot *slot = NULL;
	struct netmap_ring *ring;
	void *iov_frag_buf;
	int iov_frag_size;
	ssize_t totlen = 0;
	uint32_t head;

	assert(iovcnt);

	ring = priv->rx;
	head = ring->head;
	iov_frag_buf = iov->iov_base;
	iov_frag_size = iov->iov_len;

	do {
		int nm_buf_len;
		void *nm_buf;

		if (head == ring->tail) {
			return (0);
		}

		slot = ring->slot + head;
		nm_buf = NETMAP_BUF(ring, slot->buf_idx);
		nm_buf_len = slot->len;

		for (;;) {
			int copylen = nm_buf_len < iov_frag_size ?
			    nm_buf_len : iov_frag_size;

			memcpy(iov_frag_buf, nm_buf, copylen);
			nm_buf += copylen;
			nm_buf_len -= copylen;
			iov_frag_buf += copylen;
			iov_frag_size -= copylen;
			totlen += copylen;

			if (nm_buf_len == 0) {
				break;
			}

			iov++;
			iovcnt--;
			if (iovcnt == 0) {
				/* No space to receive. */
				WPRINTF(("Short iov, drop %zd bytes\n\r",
				    totlen));
				return (-ENOSPC);
			}
			iov_frag_buf = iov->iov_base;
			iov_frag_size = iov->iov_len;
		}

		head = nm_ring_next(ring, head);

	} while (slot->flags & NS_MOREFRAG);

	/* Release slots to netmap. */
	ring->head = ring->cur = head;

	return (totlen);
}

static void
netmap_recv_enable(struct net_backend *be)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;

	mevent_enable(priv->mevp);
}

static void
netmap_recv_disable(struct net_backend *be)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;

	mevent_disable(priv->mevp);
}

static struct net_backend netmap_backend = {
	.prefix = "netmap",
	.priv_size = sizeof(struct netmap_priv),
	.init = netmap_init,
	.cleanup = netmap_cleanup,
	.send = netmap_send,
	.recv = netmap_recv,
	.recv_enable = netmap_recv_enable,
	.recv_disable = netmap_recv_disable,
	.get_cap = netmap_get_cap,
	.set_cap = netmap_set_cap,
};

/* A clone of the netmap backend, with a different prefix. */
static struct net_backend vale_backend = {
	.prefix = "vale",
	.priv_size = sizeof(struct netmap_priv),
	.init = netmap_init,
	.cleanup = netmap_cleanup,
	.send = netmap_send,
	.recv = netmap_recv,
	.recv_enable = netmap_recv_enable,
	.recv_disable = netmap_recv_disable,
	.get_cap = netmap_get_cap,
	.set_cap = netmap_set_cap,
};

DATA_SET(net_backend_set, netmap_backend);
DATA_SET(net_backend_set, vale_backend);

/*
 * Initialize a backend and attach to the frontend.
 * This is called during frontend initialization.
 *  @ret is a pointer to the backend to be initialized
 *  @devname is the backend name as supplied on the command line,
 * 	e.g. -s 2:0,frontend-name,backend-name[,other-args]
 *  @cb is the receive callback supplied by the frontend,
 *	and it is invoked in the event loop when a receive
 *	event is generated in the hypervisor,
 *  @param is a pointer to the frontend, and normally used as
 *	the argument for the callback.
 */
int
netbe_init(struct net_backend **ret, const char *devname, net_be_rxeof_t cb,
    void *param)
{
	struct net_backend **pbe, *nbe, *tbe = NULL;
	int err;

	/*
	 * Find the network backend that matches the user-provided
	 * device name. net_backend_set is built using a linker set.
	 */
	SET_FOREACH(pbe, net_backend_set) {
		if (strncmp(devname, (*pbe)->prefix,
		    strlen((*pbe)->prefix)) == 0) {
			tbe = *pbe;
			assert(tbe->init != NULL);
			assert(tbe->cleanup != NULL);
			assert(tbe->send != NULL);
			assert(tbe->recv != NULL);
			assert(tbe->get_cap != NULL);
			assert(tbe->set_cap != NULL);
			break;
		}
	}

	*ret = NULL;
	if (tbe == NULL)
		return (EINVAL);
	nbe = calloc(1, sizeof(*nbe) + tbe->priv_size);
	if (nbe == NULL)
		return (ENOMEM);
	*nbe = *tbe;	/* copy the template */
	nbe->fd = -1;
	nbe->sc = param;
	nbe->be_vnet_hdr_len = 0;
	nbe->fe_vnet_hdr_len = 0;

	/* Initialize the backend. */
	err = nbe->init(nbe, devname, cb, param);
	if (err) {
		free(nbe);
		return (err);
	}

	*ret = nbe;

	return (0);
}
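
/*
 * Typical frontend usage, as a sketch (error handling elided;
 * rx_callback and sc are placeholders for the frontend's receive
 * handler and softc):
 *
 *	struct net_backend *be;
 *	uint64_t features;
 *
 *	if (netbe_init(&be, "tap0", rx_callback, sc) != 0)
 *		return;		// no matching backend
 *	features = netbe_get_cap(be);
 *	// ... negotiate 'features' with the guest, then:
 *	netbe_set_cap(be, features, features ? VNET_HDR_LEN : 0);
 *	netbe_rx_enable(be);
 */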

void
netbe_cleanup(struct net_backend *be)
{

	if (be != NULL) {
		be->cleanup(be);
		free(be);
	}
}

uint64_t
netbe_get_cap(struct net_backend *be)
{

	assert(be != NULL);
	return (be->get_cap(be));
}

int
netbe_set_cap(struct net_backend *be, uint64_t features,
	      unsigned vnet_hdr_len)
{
	int ret;

	assert(be != NULL);

	/* There are only three valid lengths, i.e., 0, 10 and 12. */
	if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN &&
	    vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t)))
		return (-1);

	be->fe_vnet_hdr_len = vnet_hdr_len;

	ret = be->set_cap(be, features, vnet_hdr_len);
	assert(be->be_vnet_hdr_len == 0 ||
	       be->be_vnet_hdr_len == be->fe_vnet_hdr_len);

	return (ret);
}

static __inline struct iovec *
iov_trim(struct iovec *iov, int *iovcnt, unsigned int tlen)
{
	struct iovec *riov;

	/* XXX short-cut: assume first segment is >= tlen */
	assert(iov[0].iov_len >= tlen);

	iov[0].iov_len -= tlen;
	if (iov[0].iov_len == 0) {
		assert(*iovcnt > 1);
		*iovcnt -= 1;
		riov = &iov[1];
	} else {
		iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen);
		riov = &iov[0];
	}

	return (riov);
}
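
/*
 * Example: with iov = { {hdr, 12}, {data, 1500} } and tlen = 12, the
 * header segment is consumed entirely, *iovcnt drops from 2 to 1 and
 * the returned pointer is &iov[1], i.e. just the data segment.
 */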

ssize_t
netbe_send(struct net_backend *be, struct iovec *iov, int iovcnt)
{

	assert(be != NULL);
	if (be->be_vnet_hdr_len != be->fe_vnet_hdr_len) {
		/*
		 * The frontend uses a virtio-net header, but the backend
		 * does not. We ignore it (as it must be all zeroes) and
		 * strip it.
		 */
		assert(be->be_vnet_hdr_len == 0);
		iov = iov_trim(iov, &iovcnt, be->fe_vnet_hdr_len);
	}

	return (be->send(be, iov, iovcnt));
}

/*
 * Try to read a packet from the backend, without blocking.
 * If no packets are available, return 0. In case of success, return
 * the length of the packet just read. Return a negative error code
 * in case of errors.
 */
ssize_t
netbe_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
{
	/* Length of prepended virtio-net header. */
	unsigned int hlen = be->fe_vnet_hdr_len;
	ssize_t ret;

	assert(be != NULL);

	if (hlen && hlen != be->be_vnet_hdr_len) {
		/*
		 * The frontend uses a virtio-net header, but the backend
		 * does not. We need to prepend a zeroed header.
		 */
		struct virtio_net_rxhdr *vh;

		assert(be->be_vnet_hdr_len == 0);

		/*
		 * Get a pointer to the rx header, and use the
		 * data immediately following it for the packet buffer.
		 */
		vh = iov[0].iov_base;
		iov = iov_trim(iov, &iovcnt, hlen);

		/*
		 * The only valid field in the rx packet header is the
		 * number of buffers if merged rx bufs were negotiated.
		 */
		memset(vh, 0, hlen);
		if (hlen == VNET_HDR_LEN) {
			vh->vrh_bufs = 1;
		}
	}

	ret = be->recv(be, iov, iovcnt);
	if (ret > 0) {
		ret += hlen;
	}

	return (ret);
}

/*
 * Read a packet from the backend and discard it.
 * Returns the size of the discarded packet or zero if no packet was available.
 * A negative error code is returned in case of read error.
 */
ssize_t
netbe_rx_discard(struct net_backend *be)
{
	/*
	 * MP note: the dummybuf is only used to discard frames,
	 * so there is no need for it to be per-vtnet or locked.
	 * We only make it large enough for a TSO-sized segment.
	 */
	static uint8_t dummybuf[65536 + 64];
	struct iovec iov;

	iov.iov_base = dummybuf;
	iov.iov_len = sizeof(dummybuf);

	return (netbe_recv(be, &iov, 1));
}

void
netbe_rx_disable(struct net_backend *be)
{

	be->recv_disable(be);
}

void
netbe_rx_enable(struct net_backend *be)
{

	be->recv_enable(be);
}