xref: /freebsd/usr.sbin/bhyve/net_backends.c (revision 4b50c451720d8b427757a6da1dd2bb4c52cd9e35)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org>
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
19  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
20  * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
21  * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
24  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  * $FreeBSD$
28  */
29 
30 /*
31  * This file implements multiple network backends (tap, netmap, ...),
32  * to be used by network frontends such as virtio-net and e1000.
33  * The API to access the backend (e.g. send/receive packets, negotiate
34  * features) is exported by net_backends.h.
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include <sys/types.h>		/* u_short etc */
41 #ifndef WITHOUT_CAPSICUM
42 #include <sys/capsicum.h>
43 #endif
44 #include <sys/ioctl.h>
45 #include <sys/mman.h>
46 #include <sys/uio.h>
47 
48 #include <net/if.h>
49 #include <net/netmap.h>
50 #include <net/netmap_virt.h>
51 #define NETMAP_WITH_LIBS
52 #include <net/netmap_user.h>
53 
54 #ifndef WITHOUT_CAPSICUM
55 #include <capsicum_helpers.h>
56 #endif
57 #include <err.h>
58 #include <errno.h>
59 #include <fcntl.h>
60 #include <stdio.h>
61 #include <stdlib.h>
62 #include <stdint.h>
63 #include <string.h>
64 #include <unistd.h>
65 #include <sysexits.h>
66 #include <assert.h>
67 #include <pthread.h>
68 #include <pthread_np.h>
69 #include <poll.h>
70 #include <assert.h>
71 
72 
73 #include "debug.h"
74 #include "iov.h"
75 #include "mevent.h"
76 #include "net_backends.h"
77 
78 #include <sys/linker_set.h>
79 
80 /*
81  * Each network backend registers a set of function pointers that are
82  * used to implement the net backends API.
83  * This might need to be exposed if we implement backends in separate files.
84  */
85 struct net_backend {
86 	const char *prefix;	/* prefix matching this backend */
87 
88 	/*
89 	 * Routines used to initialize and cleanup the resources needed
90 	 * by a backend. The cleanup function is used internally,
91 	 * and should not be called by the frontend.
92 	 */
93 	int (*init)(struct net_backend *be, const char *devname,
94 	    net_be_rxeof_t cb, void *param);
95 	void (*cleanup)(struct net_backend *be);
96 
97 	/*
98 	 * Called to serve a guest transmit request. The scatter-gather
99 	 * vector provided by the caller has 'iovcnt' elements and contains
100 	 * the packet to send.
101 	 */
102 	ssize_t (*send)(struct net_backend *be, struct iovec *iov, int iovcnt);
103 
104 	/*
105 	 * Called to receive a packet from the backend. When the function
106 	 * returns a positive value 'len', the scatter-gather vector
107 	 * provided by the caller contains a packet with such length.
108 	 * The function returns 0 if the backend doesn't have a new packet to
109 	 * receive.
110 	 */
111 	ssize_t (*recv)(struct net_backend *be, struct iovec *iov, int iovcnt);
112 
113 	/*
114 	 * Ask the backend to enable or disable receive operation in the
115 	 * backend. On return from a disable operation, it is guaranteed
116 	 * that the receive callback won't be called until receive is
117 	 * enabled again. Note however that it is up to the caller to make
118 	 * sure that netbe_recv() is not currently being executed by another
119 	 * thread.
120 	 */
121 	void (*recv_enable)(struct net_backend *be);
122 	void (*recv_disable)(struct net_backend *be);
123 
124 	/*
125 	 * Ask the backend for the virtio-net features it is able to
126 	 * support. Possible features are TSO, UFO and checksum offloading
127 	 * in both rx and tx direction and for both IPv4 and IPv6.
128 	 */
129 	uint64_t (*get_cap)(struct net_backend *be);
130 
131 	/*
132 	 * Tell the backend to enable/disable the specified virtio-net
133 	 * features (capabilities).
134 	 */
135 	int (*set_cap)(struct net_backend *be, uint64_t features,
136 	    unsigned int vnet_hdr_len);
137 
138 	struct pci_vtnet_softc *sc;
139 	int fd;
140 
141 	/*
142 	 * Length of the virtio-net header used by the backend and the
143 	 * frontend, respectively. A zero value means that the header
144 	 * is not used.
145 	 */
146 	unsigned int be_vnet_hdr_len;
147 	unsigned int fe_vnet_hdr_len;
148 
149 	/* Size of backend-specific private data. */
150 	size_t priv_size;
151 
152 	/* Room for backend-specific data. */
153 	char opaque[0];
154 };
155 
156 SET_DECLARE(net_backend_set, struct net_backend);
157 
158 #define VNET_HDR_LEN	sizeof(struct virtio_net_rxhdr)
159 
160 #define WPRINTF(params) PRINTLN params
161 
162 /*
163  * The tap backend
164  */
165 
/* Private data stored in the 'opaque' area of a tap backend. */
struct tap_priv {
	/* Read event registered on the backend descriptor by tap_init(). */
	struct mevent	*mevp;
};
169 
170 static void
171 tap_cleanup(struct net_backend *be)
172 {
173 	struct tap_priv *priv = (struct tap_priv *)be->opaque;
174 
175 	if (priv->mevp) {
176 		mevent_delete(priv->mevp);
177 	}
178 	if (be->fd != -1) {
179 		close(be->fd);
180 		be->fd = -1;
181 	}
182 }
183 
184 static int
185 tap_init(struct net_backend *be, const char *devname,
186 	 net_be_rxeof_t cb, void *param)
187 {
188 	struct tap_priv *priv = (struct tap_priv *)be->opaque;
189 	char tbuf[80];
190 	int opt = 1;
191 #ifndef WITHOUT_CAPSICUM
192 	cap_rights_t rights;
193 #endif
194 
195 	if (cb == NULL) {
196 		WPRINTF(("TAP backend requires non-NULL callback"));
197 		return (-1);
198 	}
199 
200 	strcpy(tbuf, "/dev/");
201 	strlcat(tbuf, devname, sizeof(tbuf));
202 
203 	be->fd = open(tbuf, O_RDWR);
204 	if (be->fd == -1) {
205 		WPRINTF(("open of tap device %s failed", tbuf));
206 		goto error;
207 	}
208 
209 	/*
210 	 * Set non-blocking and register for read
211 	 * notifications with the event loop
212 	 */
213 	if (ioctl(be->fd, FIONBIO, &opt) < 0) {
214 		WPRINTF(("tap device O_NONBLOCK failed"));
215 		goto error;
216 	}
217 
218 #ifndef WITHOUT_CAPSICUM
219 	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
220 	if (caph_rights_limit(be->fd, &rights) == -1)
221 		errx(EX_OSERR, "Unable to apply rights for sandbox");
222 #endif
223 
224 	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
225 	if (priv->mevp == NULL) {
226 		WPRINTF(("Could not register event"));
227 		goto error;
228 	}
229 
230 	return (0);
231 
232 error:
233 	tap_cleanup(be);
234 	return (-1);
235 }
236 
237 /*
238  * Called to send a buffer chain out to the tap device
239  */
240 static ssize_t
241 tap_send(struct net_backend *be, struct iovec *iov, int iovcnt)
242 {
243 	return (writev(be->fd, iov, iovcnt));
244 }
245 
246 static ssize_t
247 tap_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
248 {
249 	ssize_t ret;
250 
251 	/* Should never be called without a valid tap fd */
252 	assert(be->fd != -1);
253 
254 	ret = readv(be->fd, iov, iovcnt);
255 
256 	if (ret < 0 && errno == EWOULDBLOCK) {
257 		return (0);
258 	}
259 
260 	return (ret);
261 }
262 
263 static void
264 tap_recv_enable(struct net_backend *be)
265 {
266 	struct tap_priv *priv = (struct tap_priv *)be->opaque;
267 
268 	mevent_enable(priv->mevp);
269 }
270 
271 static void
272 tap_recv_disable(struct net_backend *be)
273 {
274 	struct tap_priv *priv = (struct tap_priv *)be->opaque;
275 
276 	mevent_disable(priv->mevp);
277 }
278 
/*
 * Report the virtio-net features offered by the tap backend.
 * The result is always 0; 'be' is not used.
 */
static uint64_t
tap_get_cap(struct net_backend *be)
{
	const uint64_t no_features = 0;	/* no capabilities for now */

	return (no_features);
}
285 
/*
 * The tap backend has no features to enable and does not use a
 * virtio-net header: return 0 only when both 'features' and
 * 'vnet_hdr_len' are zero, -1 otherwise. 'be' is not used.
 */
static int
tap_set_cap(struct net_backend *be, uint64_t features,
		unsigned vnet_hdr_len)
{

	if (features == 0 && vnet_hdr_len == 0)
		return (0);

	return (-1);
}
293 
294 static struct net_backend tap_backend = {
295 	.prefix = "tap",
296 	.priv_size = sizeof(struct tap_priv),
297 	.init = tap_init,
298 	.cleanup = tap_cleanup,
299 	.send = tap_send,
300 	.recv = tap_recv,
301 	.recv_enable = tap_recv_enable,
302 	.recv_disable = tap_recv_disable,
303 	.get_cap = tap_get_cap,
304 	.set_cap = tap_set_cap,
305 };
306 
307 /* A clone of the tap backend, with a different prefix. */
308 static struct net_backend vmnet_backend = {
309 	.prefix = "vmnet",
310 	.priv_size = sizeof(struct tap_priv),
311 	.init = tap_init,
312 	.cleanup = tap_cleanup,
313 	.send = tap_send,
314 	.recv = tap_recv,
315 	.recv_enable = tap_recv_enable,
316 	.recv_disable = tap_recv_disable,
317 	.get_cap = tap_get_cap,
318 	.set_cap = tap_set_cap,
319 };
320 
321 DATA_SET(net_backend_set, tap_backend);
322 DATA_SET(net_backend_set, vmnet_backend);
323 
324 /*
325  * The netmap backend
326  */
327 
/*
 * The virtio-net features supported by netmap: checksum, TSO4, TSO6
 * and UFO for both host and guest, plus mergeable rx buffers.
 */
#define NETMAP_FEATURES (					\
		VIRTIO_NET_F_CSUM |				\
		VIRTIO_NET_F_HOST_TSO4 |			\
		VIRTIO_NET_F_HOST_TSO6 |			\
		VIRTIO_NET_F_HOST_UFO |				\
		VIRTIO_NET_F_GUEST_CSUM |			\
		VIRTIO_NET_F_GUEST_TSO4 |			\
		VIRTIO_NET_F_GUEST_TSO6 |			\
		VIRTIO_NET_F_GUEST_UFO |			\
		VIRTIO_NET_F_MRG_RXBUF)
334 
335 struct netmap_priv {
336 	char ifname[IFNAMSIZ];
337 	struct nm_desc *nmd;
338 	uint16_t memid;
339 	struct netmap_ring *rx;
340 	struct netmap_ring *tx;
341 	struct mevent *mevp;
342 	net_be_rxeof_t cb;
343 	void *cb_param;
344 };
345 
346 static void
347 nmreq_init(struct nmreq *req, char *ifname)
348 {
349 
350 	memset(req, 0, sizeof(*req));
351 	strlcpy(req->nr_name, ifname, sizeof(req->nr_name));
352 	req->nr_version = NETMAP_API;
353 }
354 
355 static int
356 netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len)
357 {
358 	int err;
359 	struct nmreq req;
360 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
361 
362 	nmreq_init(&req, priv->ifname);
363 	req.nr_cmd = NETMAP_BDG_VNET_HDR;
364 	req.nr_arg1 = vnet_hdr_len;
365 	err = ioctl(be->fd, NIOCREGIF, &req);
366 	if (err) {
367 		WPRINTF(("Unable to set vnet header length %d",
368 				vnet_hdr_len));
369 		return (err);
370 	}
371 
372 	be->be_vnet_hdr_len = vnet_hdr_len;
373 
374 	return (0);
375 }
376 
377 static int
378 netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len)
379 {
380 	int prev_hdr_len = be->be_vnet_hdr_len;
381 	int ret;
382 
383 	if (vnet_hdr_len == prev_hdr_len) {
384 		return (1);
385 	}
386 
387 	ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len);
388 	if (ret) {
389 		return (0);
390 	}
391 
392 	netmap_set_vnet_hdr_len(be, prev_hdr_len);
393 
394 	return (1);
395 }
396 
397 static uint64_t
398 netmap_get_cap(struct net_backend *be)
399 {
400 
401 	return (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ?
402 	    NETMAP_FEATURES : 0);
403 }
404 
/*
 * Apply the negotiated configuration to the netmap backend. Only the
 * header length is acted upon; 'features' is not examined.
 * Returns the result of netmap_set_vnet_hdr_len().
 */
static int
netmap_set_cap(struct net_backend *be, uint64_t features,
	       unsigned vnet_hdr_len)
{
	int error;

	error = netmap_set_vnet_hdr_len(be, vnet_hdr_len);

	return (error);
}
412 
413 static int
414 netmap_init(struct net_backend *be, const char *devname,
415 	    net_be_rxeof_t cb, void *param)
416 {
417 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
418 
419 	strlcpy(priv->ifname, devname, sizeof(priv->ifname));
420 	priv->ifname[sizeof(priv->ifname) - 1] = '\0';
421 
422 	priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL);
423 	if (priv->nmd == NULL) {
424 		WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)",
425 			devname, strerror(errno)));
426 		free(priv);
427 		return (-1);
428 	}
429 
430 	priv->memid = priv->nmd->req.nr_arg2;
431 	priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0);
432 	priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0);
433 	priv->cb = cb;
434 	priv->cb_param = param;
435 	be->fd = priv->nmd->fd;
436 
437 	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
438 	if (priv->mevp == NULL) {
439 		WPRINTF(("Could not register event"));
440 		return (-1);
441 	}
442 
443 	return (0);
444 }
445 
446 static void
447 netmap_cleanup(struct net_backend *be)
448 {
449 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
450 
451 	if (priv->mevp) {
452 		mevent_delete(priv->mevp);
453 	}
454 	if (priv->nmd) {
455 		nm_close(priv->nmd);
456 	}
457 	be->fd = -1;
458 }
459 
460 static ssize_t
461 netmap_send(struct net_backend *be, struct iovec *iov,
462 	    int iovcnt)
463 {
464 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
465 	struct netmap_ring *ring;
466 	ssize_t totlen = 0;
467 	int nm_buf_size;
468 	int nm_buf_len;
469 	uint32_t head;
470 	void *nm_buf;
471 	int j;
472 
473 	ring = priv->tx;
474 	head = ring->head;
475 	if (head == ring->tail) {
476 		WPRINTF(("No space, drop %zu bytes", count_iov(iov, iovcnt)));
477 		goto txsync;
478 	}
479 	nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
480 	nm_buf_size = ring->nr_buf_size;
481 	nm_buf_len = 0;
482 
483 	for (j = 0; j < iovcnt; j++) {
484 		int iov_frag_size = iov[j].iov_len;
485 		void *iov_frag_buf = iov[j].iov_base;
486 
487 		totlen += iov_frag_size;
488 
489 		/*
490 		 * Split each iovec fragment over more netmap slots, if
491 		 * necessary.
492 		 */
493 		for (;;) {
494 			int copylen;
495 
496 			copylen = iov_frag_size < nm_buf_size ? iov_frag_size : nm_buf_size;
497 			memcpy(nm_buf, iov_frag_buf, copylen);
498 
499 			iov_frag_buf += copylen;
500 			iov_frag_size -= copylen;
501 			nm_buf += copylen;
502 			nm_buf_size -= copylen;
503 			nm_buf_len += copylen;
504 
505 			if (iov_frag_size == 0) {
506 				break;
507 			}
508 
509 			ring->slot[head].len = nm_buf_len;
510 			ring->slot[head].flags = NS_MOREFRAG;
511 			head = nm_ring_next(ring, head);
512 			if (head == ring->tail) {
513 				/*
514 				 * We ran out of netmap slots while
515 				 * splitting the iovec fragments.
516 				 */
517 				WPRINTF(("No space, drop %zu bytes",
518 				   count_iov(iov, iovcnt)));
519 				goto txsync;
520 			}
521 			nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
522 			nm_buf_size = ring->nr_buf_size;
523 			nm_buf_len = 0;
524 		}
525 	}
526 
527 	/* Complete the last slot, which must not have NS_MOREFRAG set. */
528 	ring->slot[head].len = nm_buf_len;
529 	ring->slot[head].flags = 0;
530 	head = nm_ring_next(ring, head);
531 
532 	/* Now update ring->head and ring->cur. */
533 	ring->head = ring->cur = head;
534 txsync:
535 	ioctl(be->fd, NIOCTXSYNC, NULL);
536 
537 	return (totlen);
538 }
539 
540 static ssize_t
541 netmap_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
542 {
543 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
544 	struct netmap_slot *slot = NULL;
545 	struct netmap_ring *ring;
546 	void *iov_frag_buf;
547 	int iov_frag_size;
548 	ssize_t totlen = 0;
549 	uint32_t head;
550 
551 	assert(iovcnt);
552 
553 	ring = priv->rx;
554 	head = ring->head;
555 	iov_frag_buf = iov->iov_base;
556 	iov_frag_size = iov->iov_len;
557 
558 	do {
559 		int nm_buf_len;
560 		void *nm_buf;
561 
562 		if (head == ring->tail) {
563 			return (0);
564 		}
565 
566 		slot = ring->slot + head;
567 		nm_buf = NETMAP_BUF(ring, slot->buf_idx);
568 		nm_buf_len = slot->len;
569 
570 		for (;;) {
571 			int copylen = nm_buf_len < iov_frag_size ?
572 			    nm_buf_len : iov_frag_size;
573 
574 			memcpy(iov_frag_buf, nm_buf, copylen);
575 			nm_buf += copylen;
576 			nm_buf_len -= copylen;
577 			iov_frag_buf += copylen;
578 			iov_frag_size -= copylen;
579 			totlen += copylen;
580 
581 			if (nm_buf_len == 0) {
582 				break;
583 			}
584 
585 			iov++;
586 			iovcnt--;
587 			if (iovcnt == 0) {
588 				/* No space to receive. */
589 				WPRINTF(("Short iov, drop %zd bytes",
590 				    totlen));
591 				return (-ENOSPC);
592 			}
593 			iov_frag_buf = iov->iov_base;
594 			iov_frag_size = iov->iov_len;
595 		}
596 
597 		head = nm_ring_next(ring, head);
598 
599 	} while (slot->flags & NS_MOREFRAG);
600 
601 	/* Release slots to netmap. */
602 	ring->head = ring->cur = head;
603 
604 	return (totlen);
605 }
606 
607 static void
608 netmap_recv_enable(struct net_backend *be)
609 {
610 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
611 
612 	mevent_enable(priv->mevp);
613 }
614 
615 static void
616 netmap_recv_disable(struct net_backend *be)
617 {
618 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
619 
620 	mevent_disable(priv->mevp);
621 }
622 
623 static struct net_backend netmap_backend = {
624 	.prefix = "netmap",
625 	.priv_size = sizeof(struct netmap_priv),
626 	.init = netmap_init,
627 	.cleanup = netmap_cleanup,
628 	.send = netmap_send,
629 	.recv = netmap_recv,
630 	.recv_enable = netmap_recv_enable,
631 	.recv_disable = netmap_recv_disable,
632 	.get_cap = netmap_get_cap,
633 	.set_cap = netmap_set_cap,
634 };
635 
636 /* A clone of the netmap backend, with a different prefix. */
637 static struct net_backend vale_backend = {
638 	.prefix = "vale",
639 	.priv_size = sizeof(struct netmap_priv),
640 	.init = netmap_init,
641 	.cleanup = netmap_cleanup,
642 	.send = netmap_send,
643 	.recv = netmap_recv,
644 	.recv_enable = netmap_recv_enable,
645 	.recv_disable = netmap_recv_disable,
646 	.get_cap = netmap_get_cap,
647 	.set_cap = netmap_set_cap,
648 };
649 
650 DATA_SET(net_backend_set, netmap_backend);
651 DATA_SET(net_backend_set, vale_backend);
652 
653 /*
654  * Initialize a backend and attach to the frontend.
655  * This is called during frontend initialization.
656  *  @pbe is a pointer to the backend to be initialized
657  *  @devname is the backend-name as supplied on the command line,
658  * 	e.g. -s 2:0,frontend-name,backend-name[,other-args]
659  *  @cb is the receive callback supplied by the frontend,
660  *	and it is invoked in the event loop when a receive
661  *	event is generated in the hypervisor,
662  *  @param is a pointer to the frontend, and normally used as
663  *	the argument for the callback.
664  */
665 int
666 netbe_init(struct net_backend **ret, const char *devname, net_be_rxeof_t cb,
667     void *param)
668 {
669 	struct net_backend **pbe, *nbe, *tbe = NULL;
670 	int err;
671 
672 	/*
673 	 * Find the network backend that matches the user-provided
674 	 * device name. net_backend_set is built using a linker set.
675 	 */
676 	SET_FOREACH(pbe, net_backend_set) {
677 		if (strncmp(devname, (*pbe)->prefix,
678 		    strlen((*pbe)->prefix)) == 0) {
679 			tbe = *pbe;
680 			assert(tbe->init != NULL);
681 			assert(tbe->cleanup != NULL);
682 			assert(tbe->send != NULL);
683 			assert(tbe->recv != NULL);
684 			assert(tbe->get_cap != NULL);
685 			assert(tbe->set_cap != NULL);
686 			break;
687 		}
688 	}
689 
690 	*ret = NULL;
691 	if (tbe == NULL)
692 		return (EINVAL);
693 	nbe = calloc(1, sizeof(*nbe) + tbe->priv_size);
694 	*nbe = *tbe;	/* copy the template */
695 	nbe->fd = -1;
696 	nbe->sc = param;
697 	nbe->be_vnet_hdr_len = 0;
698 	nbe->fe_vnet_hdr_len = 0;
699 
700 	/* Initialize the backend. */
701 	err = nbe->init(nbe, devname, cb, param);
702 	if (err) {
703 		free(nbe);
704 		return (err);
705 	}
706 
707 	*ret = nbe;
708 
709 	return (0);
710 }
711 
712 void
713 netbe_cleanup(struct net_backend *be)
714 {
715 
716 	if (be != NULL) {
717 		be->cleanup(be);
718 		free(be);
719 	}
720 }
721 
722 uint64_t
723 netbe_get_cap(struct net_backend *be)
724 {
725 
726 	assert(be != NULL);
727 	return (be->get_cap(be));
728 }
729 
730 int
731 netbe_set_cap(struct net_backend *be, uint64_t features,
732 	      unsigned vnet_hdr_len)
733 {
734 	int ret;
735 
736 	assert(be != NULL);
737 
738 	/* There are only three valid lengths, i.e., 0, 10 and 12. */
739 	if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN
740 		&& vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t)))
741 		return (-1);
742 
743 	be->fe_vnet_hdr_len = vnet_hdr_len;
744 
745 	ret = be->set_cap(be, features, vnet_hdr_len);
746 	assert(be->be_vnet_hdr_len == 0 ||
747 	       be->be_vnet_hdr_len == be->fe_vnet_hdr_len);
748 
749 	return (ret);
750 }
751 
/*
 * Drop the first 'tlen' bytes from the scatter-gather list 'iov',
 * modifying it in place. The bytes must all belong to the first
 * segment. If that segment becomes empty it is skipped: '*iovcnt' is
 * decremented and a pointer to the second segment is returned;
 * otherwise the base of the first segment is advanced and 'iov' is
 * returned.
 */
static __inline struct iovec *
iov_trim(struct iovec *iov, int *iovcnt, unsigned int tlen)
{
	struct iovec *first = &iov[0];

	/* XXX short-cut: assume first segment is >= tlen */
	assert(first->iov_len >= tlen);

	first->iov_len -= tlen;
	if (first->iov_len != 0) {
		first->iov_base = (void *)((uintptr_t)first->iov_base + tlen);
		return (first);
	}

	/* The first segment is used up; there must be another one. */
	assert(*iovcnt > 1);
	*iovcnt -= 1;

	return (first + 1);
}
772 
773 ssize_t
774 netbe_send(struct net_backend *be, struct iovec *iov, int iovcnt)
775 {
776 
777 	assert(be != NULL);
778 	if (be->be_vnet_hdr_len != be->fe_vnet_hdr_len) {
779 		/*
780 		 * The frontend uses a virtio-net header, but the backend
781 		 * does not. We ignore it (as it must be all zeroes) and
782 		 * strip it.
783 		 */
784 		assert(be->be_vnet_hdr_len == 0);
785 		iov = iov_trim(iov, &iovcnt, be->fe_vnet_hdr_len);
786 	}
787 
788 	return (be->send(be, iov, iovcnt));
789 }
790 
791 /*
792  * Try to read a packet from the backend, without blocking.
793  * If no packets are available, return 0. In case of success, return
794  * the length of the packet just read. Return -1 in case of errors.
795  */
796 ssize_t
797 netbe_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
798 {
799 	/* Length of prepended virtio-net header. */
800 	unsigned int hlen = be->fe_vnet_hdr_len;
801 	int ret;
802 
803 	assert(be != NULL);
804 
805 	if (hlen && hlen != be->be_vnet_hdr_len) {
806 		/*
807 		 * The frontend uses a virtio-net header, but the backend
808 		 * does not. We need to prepend a zeroed header.
809 		 */
810 		struct virtio_net_rxhdr *vh;
811 
812 		assert(be->be_vnet_hdr_len == 0);
813 
814 		/*
815 		 * Get a pointer to the rx header, and use the
816 		 * data immediately following it for the packet buffer.
817 		 */
818 		vh = iov[0].iov_base;
819 		iov = iov_trim(iov, &iovcnt, hlen);
820 
821 		/*
822 		 * The only valid field in the rx packet header is the
823 		 * number of buffers if merged rx bufs were negotiated.
824 		 */
825 		memset(vh, 0, hlen);
826 		if (hlen == VNET_HDR_LEN) {
827 			vh->vrh_bufs = 1;
828 		}
829 	}
830 
831 	ret = be->recv(be, iov, iovcnt);
832 	if (ret > 0) {
833 		ret += hlen;
834 	}
835 
836 	return (ret);
837 }
838 
/*
 * Read a packet from the backend and discard it.
 * Returns the size of the discarded packet or zero if no packet was available.
 * A negative error code is returned in case of read error.
 */
ssize_t
netbe_rx_discard(struct net_backend *be)
{
	/*
	 * MP note: the dummybuf is only used to discard frames,
	 * so there is no need for it to be per-vtnet or locked.
	 * We only make it large enough for TSO-sized segment.
	 */
	static uint8_t dummybuf[65536 + 64];
	struct iovec sink = {
		.iov_base = dummybuf,
		.iov_len = sizeof(dummybuf),
	};

	return (netbe_recv(be, &sink, 1));
}
860 
861 void
862 netbe_rx_disable(struct net_backend *be)
863 {
864 
865 	return be->recv_disable(be);
866 }
867 
868 void
869 netbe_rx_enable(struct net_backend *be)
870 {
871 
872 	return be->recv_enable(be);
873 }
874